
    hDN                       S SK Jr  S SKJr  S SKJr  SSKJrJrJ	r	  SSK
JrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJr   " S S5      r " S	 S
\5      r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S S\5      r$ " S S\5      r%\" SS9      S!S j5       r&\" SS9 S"       S#S jj5       r'g )$    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                  J    \ rS rSrSrS	S jrS
S jrSS jr\SS j5       r	Sr
g)MessDetectorPlugin    zm
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
c                    [         e)z0
Determine if given character should be fed in.
NotImplementedErrorself	characters     G/var/www/html/env/lib/python3.13/site-packages/charset_normalizer/md.pyeligibleMessDetectorPlugin.eligible&   
     "!    c                    [         e)zq
The main routine to be executed upon character.
Insert the logic in witch the text would be considered chaotic.
r   r!   s     r$   feedMessDetectorPlugin.feed,   s
    
 "!r(   c                    [         e)z2
Permit to reset the plugin to the initial state.
r   r"   s    r$   resetMessDetectorPlugin.reset3   r'   r(   c                    [         e)zm
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; No restriction gt 0.
r   r-   s    r$   ratioMessDetectorPlugin.ratio9   s
     "!r(    Nr#   strreturnboolr#   r5   r6   Noner6   r9   r6   float)__name__
__module____qualname____firstlineno____doc__r%   r*   r.   propertyr1   __static_attributes__r3   r(   r$   r   r       s*    
""" " "r(   r   c                  P    \ rS rSrS	S jrS
S jrSS jrS	S jr\SS j5       r	Sr
g) TooManySymbolOrPunctuationPluginB   c                J    SU l         SU l        SU l        S U l        SU l        g )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr-   s    r$   __init__)TooManySymbolOrPunctuationPlugin.__init__C   s*    '("#%&04!,1#r(   c                "    UR                  5       $ Nisprintabler!   s     r$   r%   )TooManySymbolOrPunctuationPlugin.eligibleK       $$&&r(   c                D   U =R                   S-  sl         XR                  :w  av  U[        ;  al  [        U5      (       a  U =R                  S-  sl        OFUR                  5       SL a3  [        U5      (       a#  [        U5      SL a  U =R                  S-  sl        Xl        g )Nr   F   )	rJ   rK   r   r   rH   isdigitr   r   rI   r!   s     r$   r*   %TooManySymbolOrPunctuationPlugin.feedN   s    " 222!==i((''1,'!!#u,i((	*e3""a'"$-!r(   c                .    SU l         SU l        SU l        g Nr   )rH   rJ   rI   r-   s    r$   r.   &TooManySymbolOrPunctuationPlugin.reset`   s    "# !r(   c                    U R                   S:X  a  gU R                  U R                  -   U R                   -  nUS:  a  U$ S$ )Nr           333333?)rJ   rH   rI   )r"   ratio_of_punctuations     r$   r1   &TooManySymbolOrPunctuationPlugin.ratioe   sO      A% ##d&8&88!!'" (<s'B#KKr(   )rJ   rL   rK   rH   rI   Nr:   r4   r8   r;   r=   r>   r?   r@   rM   r%   r*   r.   rB   r1   rC   r3   r(   r$   rE   rE   B   s,    2'.$
 L Lr(   rE   c                  P    \ rS rSrS	S jrS
S jrSS jrS	S jr\SS j5       r	Sr
g)TooManyAccentuatedPluginq   c                     SU l         SU l        g rZ   rJ   _accentuated_countr-   s    r$   rM   !TooManyAccentuatedPlugin.__init__r   s    %&'(r(   c                "    UR                  5       $ rP   )isalphar!   s     r$   r%   !TooManyAccentuatedPlugin.eligiblev   s      ""r(   c                z    U =R                   S-  sl         [        U5      (       a  U =R                  S-  sl        g g Nr   )rJ   r
   rg   r!   s     r$   r*   TooManyAccentuatedPlugin.feedy   s4    ")$$##q(# %r(   c                     SU l         SU l        g rZ   rf   r-   s    r$   r.   TooManyAccentuatedPlugin.reset   s     !"#r(   c                j    U R                   S:  a  gU R                  U R                   -  nUS:  a  U$ S$ )N   r]   gffffff?rf   )r"   ratio_of_accentuations     r$   r1   TooManyAccentuatedPlugin.ratio   s=      1$'+'>'>AVAV'V(=(E$N3Nr(   )rg   rJ   Nr:   r4   r8   r;   ra   r3   r(   r$   rc   rc   q   s,    )#)$ O Or(   rc   c                  P    \ rS rSrS	S jrS
S jrSS jrS	S jr\SS j5       r	Sr
g)UnprintablePlugin   c                     SU l         SU l        g rZ   )_unprintable_countrJ   r-   s    r$   rM   UnprintablePlugin.__init__   s    '(%&r(   c                    gNTr3   r!   s     r$   r%   UnprintablePlugin.eligible       r(   c                x    [        U5      (       a  U =R                  S-  sl        U =R                  S-  sl        g rm   )r   ry   rJ   r!   s     r$   r*   UnprintablePlugin.feed   s/    )$$##q(#"r(   c                    SU l         g rZ   )ry   r-   s    r$   r.   UnprintablePlugin.reset   s
    "#r(   c                \    U R                   S:X  a  gU R                  S-  U R                   -  $ )Nr   r]   rr   rJ   ry   r-   s    r$   r1   UnprintablePlugin.ratio   s/      A%''!+t/D/DDDr(   r   Nr:   r4   r8   r;   ra   r3   r(   r$   rv   rv      s,    '#
$ E Er(   rv   c                  P    \ rS rSrS	S jrS
S jrSS jrS	S jr\SS j5       r	Sr
g)SuspiciousDuplicateAccentPlugin   c                .    SU l         SU l        S U l        g rZ   _successive_countrJ   _last_latin_characterr-   s    r$   rM   (SuspiciousDuplicateAccentPlugin.__init__   s    &'%&15"r(   c                F    UR                  5       =(       a    [        U5      $ rP   )rj   r   r!   s     r$   r%   (SuspiciousDuplicateAccentPlugin.eligible   s      ":x	'::r(   c                   U =R                   S-  sl         U R                  b  [        U5      (       a  [        U R                  5      (       a  UR                  5       (       a4  U R                  R                  5       (       a  U =R                  S-  sl        [        U5      [        U R                  5      :X  a  U =R                  S-  sl        Xl        g rm   )rJ   r   r
   isupperr   r   r!   s     r$   r*   $SuspiciousDuplicateAccentPlugin.feed   s    "&&2y))t99::  ""t'A'A'I'I'K'K&&!+&Y'=9S9S+TT&&!+&%."r(   c                .    SU l         SU l        S U l        g rZ   r   r-   s    r$   r.   %SuspiciousDuplicateAccentPlugin.reset   s    !" !%)"r(   c                \    U R                   S:X  a  gU R                  S-  U R                   -  $ )Nr   r]   rV   )rJ   r   r-   s    r$   r1   %SuspiciousDuplicateAccentPlugin.ratio   s/      A%&&*d.C.CCCr(   )rJ   r   r   Nr:   r4   r8   r;   ra   r3   r(   r$   r   r      s,    6;/*
 D Dr(   r   c                  P    \ rS rSrS	S jrS
S jrSS jrS	S jr\SS j5       r	Sr
g)SuspiciousRange   c                .    SU l         SU l        S U l        g rZ   )"_suspicious_successive_range_countrJ   _last_printable_seenr-   s    r$   rM   SuspiciousRange.__init__   s    78/%&04!r(   c                "    UR                  5       $ rP   rQ   r!   s     r$   r%   SuspiciousRange.eligible   rT   r(   c                Z   U =R                   S-  sl         UR                  5       (       d  [        U5      (       d
  U[        ;   a  S U l        g U R                  c  Xl        g [        U R                  5      n[        U5      n[        X#5      (       a  U =R                  S-  sl        Xl        g rm   )rJ   isspacer   r   r   r    is_suspiciously_successive_ranger   )r"   r#   unicode_range_aunicode_range_bs       r$   r*   SuspiciousRange.feed   s    " i((88(,D%$$,(1%&3D4M4M&N&3I&>+OMM33q83$-!r(   c                .    SU l         SU l        S U l        g rZ   )rJ   r   r   r-   s    r$   r.   SuspiciousRange.reset   s     !23/$(!r(   c                `    U R                   S::  a  gU R                  S-  U R                   -  nU$ )N   r]   rV   )rJ   r   )r"   ratio_of_suspicious_range_usages     r$   r1   SuspiciousRange.ratio   s<      B& 33a7!!2"' /.r(   )rJ   r   r   Nr:   r4   r8   r;   ra   r3   r(   r$   r   r      s*    5
'..)
 / /r(   r   c                  P    \ rS rSrS	S jrS
S jrSS jrS	S jr\SS j5       r	Sr
g)SuperWeirdWordPlugin   c                    SU l         SU l        SU l        SU l        SU l        SU l        SU l        SU l        SU l        SU l	        g )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrJ   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr-   s    r$   rM   SuperWeirdWordPlugin.__init__   sQ     !$%() */!). %&)*!)*!() r(   c                    gr|   r3   r!   s     r$   r%   SuperWeirdWordPlugin.eligible
  r~   r(   c                   UR                  5       (       Ga  U =R                  U-  sl        [        U5      (       a  U =R                  S-  sl        U R                  SL ak  [        U5      SL d  [        U5      (       aM  [        U5      SL a?  [        U5      SL a1  [        U5      SL a#  [        U5      SL a  [        U5      SL a  SU l        [        U5      (       d@  [        U5      (       d0  [        U5      (       d   [        U5      (       d  [        U5      (       a  U =R                  S-  sl        g U R                  (       d  g UR                  5       (       d!  [        U5      (       d  [        U5      (       Ga-  U R                  (       Ga  U =R                  S-  sl        [!        U R                  5      nU =R"                  U-  sl        US:  a  U R                  U-  S:  a  SU l        O[        U R                  S   5      (       a^  U R                  S   R'                  5       (       a<  [)        S U R                   5       5      SL a  U =R*                  S-  sl        SU l        O,U R                  S:X  a  SU l        U =R*                  S-  sl        US:  a  U R                  (       a  [-        U R                  [/        S	U5      5       VVs/ s H  u  p4UR'                  5       (       d  M  UPM      nnnSnU(       a  [!        U5      U-  S
::  a  SnU(       d  U =R*                  S-  sl        SU l        U R$                  (       aD  U =R0                  S-  sl        U =R2                  [!        U R                  5      -  sl        SU l        SU l        SU l        S	U l        S	U l        g US;  aB  UR5                  5       SL a.  [7        U5      (       a  SU l        U =R                  U-  sl        g g g g s  snnf )Nr   FT   g      ?c              3  @   #    U  H  oR                  5       v   M     g 7frP   )r   ).0_s     r$   	<genexpr>,SuperWeirdWordPlugin.feed.<locals>.<genexpr>7  s     >AIIKKs      r   r^   r   >   -<=>r   |~)rj   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   lenrJ   r   r   allr   zipranger   r   rW   r   )r"   r#   buffer_lengthcicamel_case_dstprobable_camel_caseds          r$   r*   SuperWeirdWordPlugin.feed  s:   LLI%Li(())Q.)((E1i(E1^I5N5N9%.i(E1	*e3	*e3I&%/+/(y!!Y''y))y))9%%((A-(||>)#<#<Y@W@Wlll!!$T\\!2M!!]2!!,,}<C04D- #4<<#344R(0022>>>%G,,1,04D---204D-,,1,"t'?'? !$DLL%=2I J" Jyy{  J  "
 .3$!s>':]'Jc'Q+/(+,,1,04D-(($$)$))S->>),1)',D$DL()D%'(D$@@!!#u,)$$(,D%LLI%L % - A1"s   O/+O/c                t    SU l         SU l        SU l        SU l        SU l        SU l        SU l        SU l        g )Nr   Fr   )r   r   r   r   r   rJ   r   r   r-   s    r$   r.   SuperWeirdWordPlugin.reset^  sA    $)!#(   !$%!#$ r(   c                v    U R                   S::  a  U R                  S:X  a  gU R                  U R                  -  $ )N
   r   r]   )r   r   r   rJ   r-   s    r$   r1   SuperWeirdWordPlugin.ratioh  s7    r!d&>&>!&C((4+@+@@@r(   )
r   r   r   r   r   rJ   r   r   r   r   Nr:   r4   r8   r;   ra   r3   r(   r$   r   r      s.    *O&b% A Ar(   r   c                  T    \ rS rSrSrS
S jrSS jrSS jrS
S jr\	SS j5       r
Srg	)CjkInvalidStopPluginip  u   
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
can be easily detected. Searching for the overuse of '丅' and '丄'.
c                     SU l         SU l        g rZ   _wrong_stop_count_cjk_character_countr-   s    r$   rM   CjkInvalidStopPlugin.__init__v  s    &')*!r(   c                    gr|   r3   r!   s     r$   r%   CjkInvalidStopPlugin.eligiblez  r~   r(   c                    US;   a  U =R                   S-  sl         g [        U5      (       a  U =R                  S-  sl        g g )N>      丄   丅r   )r   r   r   r!   s     r$   r*   CjkInvalidStopPlugin.feed}  s?    &""a'")%%*% r(   c                     SU l         SU l        g rZ   r   r-   s    r$   r.   CjkInvalidStopPlugin.reset  s    !"$%!r(   c                V    U R                   S:  a  gU R                  U R                   -  $ )N   r]   r   r   r-   s    r$   r1   CjkInvalidStopPlugin.ratio  s*    $$r)%%(A(AAAr(   r   Nr:   r4   r8   r;   )r=   r>   r?   r@   rA   rM   r%   r*   r.   rB   r1   rC   r3   r(   r$   r   r   p  s1    
++& B Br(   r   c                  P    \ rS rSrS	S jrS
S jrSS jrS	S jr\SS j5       r	Sr
g)ArchaicUpperLowerPlugini  c                f    SU l         SU l        SU l        SU l        SU l        S U l        SU l        g )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrJ   _last_alpha_seen_current_ascii_onlyr-   s    r$   rM    ArchaicUpperLowerPlugin.__init__  s9    	45,23*890%&,0)- r(   c                    gr|   r3   r!   s     r$   r%    ArchaicUpperLowerPlugin.eligible  r~   r(   c                   UR                  5       =(       a    [        U5      nUSL nU(       a  U R                  S:  a  U R                  S::  aA  UR                  5       SL a.  U R                  SL a  U =R
                  U R                  -  sl        SU l        SU l        S U l        SU l        U =R                  S-  sl	        SU l        g U R                  SL a  UR                  5       SL a  SU l        U R                  b  UR                  5       (       a  U R                  R                  5       (       d4  UR                  5       (       aS  U R                  R                  5       (       a4  U R                  SL a  U =R                  S-  sl        SU l        OSU l        OSU l        U =R                  S-  sl	        U =R                  S-  sl        Xl        g )NFr   @   r   TrV   )rj   r   r   rW   r   r   r   r   r   rJ   isasciir   islower)r"   r#   is_concerned	chunk_seps       r$   r*   ArchaicUpperLowerPlugin.feed  s    ((*J/?	/J E)	==A44:%%'50,,588668 23D.34D0$(D!DI!!Q&!'+D$##t+	0A0A0Cu0L',D$  ,!!##(=(=(E(E(G(G!!##(=(=(E(E(G(G99$66!;6 %DI $DI!	",,1, )r(   c                f    SU l         SU l        SU l        SU l        S U l        SU l        SU l        g )Nr   FT)rJ   r   r   r   r   r   r   r-   s    r$   r.   ArchaicUpperLowerPlugin.reset  s9     !/0,-.*340 $	#' r(   c                V    U R                   S:X  a  gU R                  U R                   -  $ )Nr   r]   )rJ   r   r-   s    r$   r1   ArchaicUpperLowerPlugin.ratio  s*      A%77$:O:OOOr(   )r   rJ   r   r   r   r   r   Nr:   r4   r8   r;   ra   r3   r(   r$   r   r     s-    .(*T( P Pr(   r   c                  P    \ rS rSrS	S jrS	S jrS
S jrSS jr\SS j5       r	Sr
g)ArabicIsolatedFormPlugini  c                     SU l         SU l        g rZ   rJ   _isolated_form_countr-   s    r$   rM   !ArabicIsolatedFormPlugin.__init__  s    %&)*!r(   c                     SU l         SU l        g rZ   r  r-   s    r$   r.   ArabicIsolatedFormPlugin.reset  s     !$%!r(   c                    [        U5      $ rP   )r   r!   s     r$   r%   !ArabicIsolatedFormPlugin.eligible  s    ##r(   c                z    U =R                   S-  sl         [        U5      (       a  U =R                  S-  sl        g g rm   )rJ   r   r	  r!   s     r$   r*   ArabicIsolatedFormPlugin.feed  s4    ""9--%%*% .r(   c                Z    U R                   S:  a  gU R                  U R                   -  nU$ )Nrr   r]   r  )r"   isolated_form_usages     r$   r1   ArabicIsolatedFormPlugin.ratio  s0      1$%)%>%>AVAV%V""r(   r  Nr:   r4   r8   r;   )r=   r>   r?   r@   rM   r.   r%   r*   rB   r1   rC   r3   r(   r$   r  r    s*    +&$+ # #r(   r     )maxsizec                .   U b  Uc  gX:X  a  gSU ;   a  SU;   a  gSU ;   d  SU;   a  gSU ;   d  SU;   a  SU ;   d  SU;   a  gU R                  S5      UR                  S5      p2U H  nU[        ;   a  M  XC;   d  M    g   U S;   US;   peU(       d  U(       a  SU ;   d  SU;   a  gU(       a  U(       a  gS	U ;   d  S	U;   a  SU ;   d  SU;   a  gU S
:X  d  US
:X  a  gSU ;   d  SU;   d  U S;   a-  US;   a'  SU ;   d  SU;   a  gSU ;   d  SU;   a  gU S
:X  d  US
:X  a  gg)zY
Determine if two Unicode range seen next to each other can be considered as suspicious.
TFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r  r  PunctuationForms)splitr	   )r   r   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r$   r   r     sw    /"9)/!g&@o%)G 	?"g&@&+*H 	c"c" '
 00!	  	
	

 	33 ' 	, E_$<,?"h/&AO#u'?m+-/O 	 E_$<3377O+}/Oo%O)Cm+-/Or(   i   c           	     $   [         R                  5        Vs/ s H	  o3" 5       PM     nn[        U 5      S-   nSnUS:  a  SnOUS::  a  SnOSn[        U S-   [	        U5      5       Hh  u  pU H,  n
U
R                  U5      (       d  M  U
R                  U5        M.     U	S	:  a  X-  S	:X  d
  XS-
  :X  d  MO  [        S
 U 5       5      nXa:  d  Mh    O   U(       a  [        S5      nUR                  [        SU SU SU 35        [        U 5      S:  a8  UR                  [        SU SS  35        UR                  [        SU SS  35        U H2  nUR                  [        UR                   SUR                   35        M4     [        US5      $ s  snf )zo
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
r   r]   i   r   r  r      
r   c              3  8   #    U  H  oR                   v   M     g 7frP   )r1   )r   dts     r$   r   mess_ratio.<locals>.<genexpr>`  s     !?Yr((Ys   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r%   r*   sumr   logr   	__class__r1   round)decoded_sequencemaximum_thresholddebugmd_class	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr#   indexdetectorloggerr+  s                r$   
mess_ratior?  A  s    $6#D#D#F+#Fx
#F  + &'!+F O|13)	4,.),/) 04 7vG	!H  ++i( "
 AI%CqHqj !!?Y!??O3 H /0

11R0SSdetdu v!!2 35	
  2%JJu0@"0E/FGHJJu.>su.E-FGHBJJub
;<  !$$[+s   FN)r   
str | Noner   r@  r6   r7   )g?F)r4  r5   r5  r<   r6  r7   r6   r<   )(
__future__r   	functoolsr   loggingr   constantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rE   rc   rv   r   r   r   r   r   r  r   r?  r3   r(   r$   <module>rF     s;   "   
    *" "D,L'9 ,L^O1 O6E* E0"D&8 "DJ./( ./bsA- sAlB- B>IP0 IPX#1 #8 4FF2<F	F FR 4IN4%4%.34%BF4%
4% 4%r(   