
    h!                        S r SSKrSSKrSSKJr  SSKJr  SSKJ	r	  SSK
JrJrJr   \r SSKJr   SSKJr   " S	 S
\5      r SSKJr   " S S\5      r\" 5       rS rSS jr  SS jr  SS jrSS jr SS jr!S r"\" 5       r#g! \ a    \\4r Nif = f! \ a	    SSKJr   Ntf = f! \ a	    SSKJr   Nf = f! \ a     Ngf = f)z?
An interface to html5lib that mimics the lxml.html interface.
    N)
HTMLParser)TreeBuilder)etree)ElementXHTML_NAMESPACE_contains_block_level_tag)urlopen)urlparsec                   "    \ rS rSrSrSS jrSrg)r      z*An html5lib HTML parser with lxml as tree.c                 @    [         R                  " U 4U[        S.UD6  g N)stricttree)_HTMLParser__init__r   selfr   kwargss      G/var/www/html/env/lib/python3.13/site-packages/lxml/html/html5parser.pyr   HTMLParser.__init__   s    TM&{MfM     NF__name__
__module____qualname____firstlineno____doc__r   __static_attributes__r   r   r   r   r      s    4Nr   r   )XHTMLParserc                   "    \ rS rSrSrSS jrSrg)r"   '   z+An html5lib XHTML Parser with lxml as tree.c                 @    [         R                  " U 4U[        S.UD6  g r   )_XHTMLParserr   r   r   s      r   r   XHTMLParser.__init__*   s    !!$RvKR6Rr   r   Nr   r   r   r   r   r"   r"   '   s    9	Sr   r"   c                 h    U R                  U5      nUb  U$ U R                  S[        < SU< 35      $ )N{})findr   )r   tagelems      r   	_find_tagr.   0   s.    99S>D99#677r   c                     [        U [        5      (       d  [        S5      eUc  [        n0 nUc  [        U [        5      (       a  SnUb  XS'   UR
                  " U 40 UD6R                  5       $ )z
Parse a whole document into a string.

If `guess_charset` is true, or if the input is not Unicode but a
byte string, the `chardet` library will perform charset guessing
on the string.
string requiredT
useChardet)
isinstance_strings	TypeErrorhtml_parserbytesparsegetroot)htmlguess_charsetparseroptionss       r   document_fromstringr=   7   sp     dH%%)**~GD%!8!8   -<<((0022r   c                    [        U [        5      (       d  [        S5      eUc  [        n0 nUc  [        U [        5      (       a  SnUb  X$S'   UR
                  " U S40 UD6nU(       aV  [        US   [        5      (       a>  U(       a7  US   R                  5       (       a  [        R                  " SUS   -  5      eUS	 U$ )aH  Parses several HTML elements, returning a list of elements.

The first item in the list may be a string.  If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.

If `guess_charset` is true, the `chardet` library will perform charset
guessing on the string.
r0   Fr1   divr   zThere is leading text: %r)	r2   r3   r4   r5   r6   parseFragmentstripr   ParserError)r9   no_leading_textr:   r;   r<   childrens         r   fragments_fromstringrE   O   s     dH%%)**~GD%!8!8   -##D%;7;HJx{H55{  ""''(C(0)4 5 5Or   c                    [        U [        5      (       d  [        S5      e[        U5      n[	        XUU(       + S9nU(       aa  [        U[        5      (       d  Sn[        U5      nU(       a6  [        US   [        5      (       a  US   Ul        US	 UR                  U5        U$ U(       d  [        R                  " S5      e[        U5      S:  a  [        R                  " S5      eUS   nUR                  (       aB  UR                  R                  5       (       a#  [        R                  " SUR                  -  5      eS	Ul        U$ )
a  Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.

If 'create_parent' is true (or is a tag name) then a parent node
will be created to encapsulate the HTML in a single element.  In
this case, leading or trailing text is allowed.

If `guess_charset` is true, the `chardet` library will perform charset
guessing on the string.
r0   )r:   r;   rC   r?   r   zNo elements found   zMultiple elements foundzElement followed by text: %rN)r2   r3   r4   boolrE   r   textextendr   rB   lentailrA   )r9   create_parentr:   r;   accept_leading_textelementsnew_rootresults           r   fragment_fromstringrR   q   s    dH%%)**}-#&//1H -22!M=)(1+x00 (QKOOH% 344
8}q 9::a[F{{v{{((** > LMMFKMr   c                    [        U [        5      (       d  [        S5      e[        XUS9nU SS n[        U[        5      (       a  UR                  SS5      nUR                  5       R                  5       nUR                  S5      (       d  UR                  S5      (       a  U$ [        US	5      n[        U5      (       a  U$ [        US
5      n[        U5      S:X  ak  UR                  (       a  UR                  R                  5       (       d;  US   R                  (       a"  US   R                  R                  5       (       d  US   $ [        U5      (       a	  SUl        U$ SUl        U$ )a  Parse the html, returning a single element/document.

This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.

'base_url' will set the document's base_url attribute (and the tree's
docinfo.URL)

If `guess_charset` is true, or if the input is not Unicode but a
byte string, the `chardet` library will perform charset guessing
on the string.
r0   )r;   r:   N2   asciireplacez<htmlz	<!doctypeheadbodyrG   r   r?   span)r2   r3   r4   r=   r6   decodelstriplower
startswithr.   rK   rI   rA   rL   r   r,   )r9   r:   r;   docstartrW   rX   s          r   
fromstringra      s0    dH%%)**
d,9;C "IE% Wi0LLN  "E  E$4$4[$A$A
S&!D 4yy
S&!D 	D	Q		1B1Bbd2hmm&9&9&;&;Aw
 !&& K Kr   c                     Uc  [         n[        U [        5      (       d  U nUc  SnO2[        U 5      (       a  [	        U 5      nUc  SnO[        U S5      nUc  Sn0 nU(       a  XS'   UR                  " U40 UD6$ )a
  Parse a filename, URL, or file-like object into an HTML document
tree.  Note: this returns a tree, not an element.  Use
``parse(...).getroot()`` to get the document root.

If ``guess_charset`` is true, the ``useChardet`` option is passed into
html5lib to enable character detection.  This option is on by default
when parsing from URLs, off by default when parsing from file(-like)
objects (which tend to return Unicode more often than not), and on by
default when parsing from a file path (which is read in binary mode).
FTrbr1   )r5   r2   r3   _looks_like_urlr	   openr7   )filename_url_or_filer:   r;   fpr<   s        r   r7   r7      s     ~*H55! !M	-	.	.)*  M&-  MG  -<<&g&&r   c                     [        U 5      S   nU(       d  g[        R                  S:X  a$  U[        R                  ;   a  [        U5      S:X  a  gg)Nr   Fwin32rG   T)r
   sysplatformstringascii_lettersrK   )strschemes     r   rd   rd      sB    c]1F
,,'
!f***Fq r   )NN)FNN)$r    rj   rl   html5libr   r    html5lib.treebuilders.etree_lxmlr   lxmlr   	lxml.htmlr   r   r   
basestringr3   	NameErrorr6   rn   urllib2r	   ImportErrorurllib.requestr
   urllib.parser"   r&   xhtml_parserr.   r=   rE   rR   ra   r7   rd   r5   r   r   r   <module>r{      s      . 8  I IH'&!
N N!4Sl S =L830 0548D -237)X3l!'H
 lk  s|H  '&'  &%&  		sE   B B B) B; 
BBB&%B&)B87B8;CC