mò
EäFc           @   s  d  Z  d Z d k Z d k Z d k Z d k Z d k Z d k Z y d k	 l
 Z
 Wn: e j
 o. y d k l
 Z
 Wq e j
 o q Xn Xd „  Z d f  d „  ƒ  YZ d f  d „  ƒ  YZ d	 e i i i f d
 „  ƒ  YZ d e i f d „  ƒ  YZ d e f d „  ƒ  YZ d S(   st  
Each XmlEntry object represents a page, as read from an XML source

The MediaWikiXmlHandler can be used for the XML given by Special:Export
as well as for XML dumps.

The XmlDump class reads a pages_current XML dump (like the ones offered on
http://download.wikimedia.org/wikipedia/de/) and offers a generator over
XmlEntry objects which can be used by other bots.

For fastest processing, XmlDump uses the cElementTree library if available
(this comes included with Python 2.5, and can be downloaded from
http://www.effbot.org/ for earlier versions). If not found, it falls back
to the older method using regular expressions.
s8   $Id: xmlreader.py 3726 2007-06-20 13:28:05Z wikipedian $N(   s	   iterparsec         C   sš   |  p d Sn d } d } t i d |  ƒ } | o | i d ƒ } n t i d |  ƒ } | o | i d ƒ } n |  d j o d } d } n | | f S(   s¾   
    Parses the characters within a restrictions tag and returns
    strings representing user groups allowed to edit and to move
    a page, where None means there are no restrictions.
    s   edit=([^:]*)i   s   move=([^:]*)t   sysopN(   NN(	   t   restrictionst   Nonet   editRestrictiont   moveRestrictiont   ret   searcht   editLockMatcht   groupt   moveLockMatch(   R   R	   R   R   R   (    (    t=   /home/cat/public_html/arm/mrobe/wiki/pywikipedia/xmlreader.pyt   parseRestrictions!   s     
t   XmlEntryc           B   s   t  Z d  Z d „  Z RS(   s   
    Represents a page.
    c
   
      C   s[   | |  _  | |  _ | |  _ | i ƒ  |  _ | |  _ | |  _ | |  _ | |  _	 |	 |  _
 d  S(   N(   t   titlet   selft   idt   textt   usernamet   stript   ipeditt	   timestampR   R   t
   revisionid(
   R   R   R   R   R   R   R   R   R   R   (    (    R
   t   __init__:   s    							(   t   __name__t
   __module__t   __doc__R   (    (    (    R
   R   6   s    t   XmlHeaderEntryc           B   s   t  Z d  Z d „  Z RS(   s#   
    Represents a header entry
    c         C   s1   d |  _ d |  _ d |  _ d |  _ h  |  _ d  S(   Nu    (   R   t   sitenamet   baset	   generatort   caset
   namespaces(   R   (    (    R
   R   J   s
    				(   R   R   R   R   (    (    (    R
   R   F   s    t   MediaWikiXmlHandlerc           B   s>   t  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   Nc         C   sG   t  i i i i |  ƒ t |  _ t |  _ d  |  _
 d |  _ d |  _ d  S(   Nu    (   t   xmlt   saxt   handlert   ContentHandlerR   R   t   Falset   inRevisionTagt   inContributorTagR   t   headercallbackR   R   (   R   (    (    R
   R   R   s    				c         C   s   | |  _  d  S(   N(   t   callbackR   (   R   R)   (    (    R
   t   setCallback]   s    c         C   s   | |  _  d  S(   N(   R(   R   (   R   R(   (    (    R
   t   setHeaderCallback`   s    c         C   s;  d  |  _ | d j o d  |  _ d  |  _ d  |  _ n| d j o t |  _ né| d j o t |  _	 nÏ| d j o d |  _ d |  _
 n¬| d j oV |  i	 o d |  _ d |  _ q7|  i o d |  _ d |  _ q7d |  _ d |  _ nI| d	 j o d	 |  _ d |  _ t |  _ n| d
 j o d	 |  _ d |  _ t |  _ nñ | d j o d |  _ d |  _ nÎ | d j o d |  _ d |  _ n« | d j o d |  _ d |  _ nˆ |  i o} | d j o t ƒ  |  _ q7| d d d d g j o | |  _ q7| d j o) d |  _ d |  _ t | d ƒ |  _ q7n d  S(   Nt   paget   revisiont   contributorR   u    R   t   useridR   R   t   ipR   R   R   t   siteinfoR   R   R   R   t	   namespacet   key(   R   R   t   destinationt   nameR   R   R   t   TrueR&   R'   R   R/   R   R   R   R%   R   R   R   R(   R   t   headerR2   t   intt   attrst   namespaceid(   R   R5   R9   (    (    R
   t   startElementc   s^    				
	
									
		c      
   C   s¼  | d j o t |  _ nŸ| d j o t |  i ƒ \ |  _ |  _ ns| d j o|  i } x' | o | d d j o | d  } q_ Wd i	 | i
 d ƒ ƒ } |  i d d	 !|  i d
 d !|  i d d !|  i d d !|  i d d !|  i d d !} |  i i ƒ  |  _ t |  i |  i | |  i |  i | |  i |  i |  i ƒ	 } t |  _ |  i | ƒ n` |  i oU | d j o |  i |  i i |  i <q¸| d j o |  i |  i ƒ d  |  _ q¸n d  S(   NR.   R   R-   iÿÿÿÿs   
 u   
s   
i    i   i   i   i   i
   i   i   i   i   i   i   R2   R1   (   R5   R%   R   R'   R   R   R   R   R   t   joint   splitR   R   R   R   R   R   R   R   t   entryR&   R)   R(   R2   R7   R   R:   R   (   R   R5   R   R   R>   (    (    R
   t
   endElement•   s*    	 V9	
c         C   sÂ  |  i d j o |  i | 7_ nœ|  i d j o |  i | 7_ ny|  i d j o |  i | 7_ nV|  i d j o |  i | 7_ n3|  i d j o |  i | 7_ n|  i d j o |  i | 7_ ní |  i d j o |  i	 | 7_	 nÊ |  i
 o¿ |  i d j o |  i i | 7_ q¾|  i d	 j o |  i i | 7_ q¾|  i d
 j o |  i i | 7_ q¾|  i d j o |  i i | 7_ q¾|  i d j o |  i | 7_ q¾n d  S(   NR   R   R   R   R   R   R   R   R   R   R   R2   (   R   R4   R   t   dataR   R   R   R   R   R   R(   R7   R   R   R   R   R2   (   R   R@   (    (    R
   t
   characters¹   s2    
(   R   R   R   R*   R+   R;   R?   RA   (    (    (    R
   R    Q   s   				2	$t   XmlParserThreadc           B   s    t  Z d  Z d „  Z d „  Z RS(   sß   
    This XML parser will run as a single thread. This allows the XmlDump
    generator to yield pages before the parser has finished reading the
    entire dump.
    
    There surely are more elegant ways to do this.
    c         C   s&   t  i i |  ƒ | |  _ | |  _ d  S(   N(   t	   threadingt   ThreadR   R   t   filenameR#   (   R   RE   R#   (    (    R
   R   Þ   s    	c         C   s   t  i i |  i |  i ƒ d  S(   N(   R!   R"   t   parseR   RE   R#   (   R   (    (    R
   t   runã   s    (   R   R   R   R   RG   (    (    (    R
   RB   Ö   s    	t   XmlDumpc           B   s2   t  Z d  Z d „  Z d „  Z d „  Z d „  Z RS(   sp  
    Represents an XML dump file. Reads the local file at initialization,
    parses it, and offers access to the resulting XmlEntries via a generator.
    
    NOTE: This used to be done by a SAX parser, but the solution with regular
    expressions is about 10 to 20 times faster. The cElementTree version is
    again much, much faster than the regex solution.
    c         C   s   | |  _  d  S(   N(   RE   R   (   R   RE   (    (    R
   R   ð   s    c         C   s>   d GHd t  ƒ  j o t i d ƒ |  i ƒ  Sn |  i ƒ  Sd S(   s3   Return a generator that will yield XmlEntry objectss   Reading XML dump...t	   iterparseur   NOTE: cElementTree not found. Using slower fallback solution. Consider installing the python-celementtree package.N(   t   globalst	   wikipediat   outputR   t   regex_parset	   new_parse(   R   (    (    R
   RF   ó   s     c         c   sÜ  t  |  i d d ƒ} d } xº| D]²\ } } | d j o" | d d j o | d } q" n | d j o | d j o | } q" n | d j oC| i	 d | j o/| i
 d	 | ƒ } | i
 d
 | ƒ } | i
 d | ƒ } | i d | ƒ }
 |
 i
 d
 | ƒ } |
 i
 d | ƒ } |
 i d | ƒ } | i
 d | ƒ } | p | i
 d | ƒ } |
 i
 d | ƒ } t | ƒ \ } }	 t d | d | d | p d d | d t | ƒ d | d | d |	 d | ƒ 	V| i ƒ  q" q" Wd S(   s/   Generator using cElementTree iterparse functiont   eventst   startt   ends   start-nsi    t    i   s   {%s}pages	   {%s}titles   {%s}ids   {%s}restrictionss   {%s}revisions   {%s}timestamps   {%s}contributors   {%s}ips   {%s}usernames   {%s}textR   R   R   u    R   R   R   R   R   R   N(   s   starts   ends   start-ns(   RI   R   RE   t   contextR   t   roott   eventt   elemt   urit   tagt   findtextR   t   pageidR   t   findR-   R   R   R.   t   ipeditorR   R   R   R   R   R   t   boolt   clear(   R   R   RW   R\   R.   RU   RZ   R   R   R   R-   R   R   R   R   RV   RS   RT   (    (    R
   RN   ü   sB      
!	
c         c   s,  t  i d d d d d d d d d	 d
 d d d d d d t  i ƒ }
 t i |  i d d t i	 ƒ  i
 ƒ  d d ƒ} t } d } x›| p“| i ƒ  } | | 7} | d j o
 t } q | i d ƒ oR| i d d ƒ } | i d d ƒ } | i d d ƒ } | i d d ƒ } |
 i | ƒ } | p d  GH| GHd } q#d } | i d! ƒ p d } | i d" ƒ } t | ƒ \ } }	 | i d# ƒ o | i d# ƒ } t } n | i d$ ƒ } t } t d% | i d% ƒ d& | i d' ƒ d! | d# | d( | d) | i d) ƒ d* | d+ |	 d, | i d, ƒ ƒ 	Vq q Wd- S(.   s#  
        Generator which reads some lines from the XML dump file, and
        parses them to create XmlEntry objects. Stops when the end of file is
        reached.

        NOTE: This is very slow. It's only a fallback solution for users who
        haven't installed cElementTree.
        s	   <page>\s*s    <title>(?P<title>.+?)</title>\s*s   <id>(?P<pageid>\d+?)</id>\s*s8   (<restrictions>(?P<restrictions>.+?)</restrictions>)?\s*s   <revision>\s*s    <id>(?P<revisionid>\d+?)</id>\s*s,   <timestamp>(?P<timestamp>.+?)</timestamp>\s*s   <contributor>\s*s*   (<username>(?P<username>.+?)</username>\s*s2   <id>(?P<userid>\d+?)</id>|<ip>(?P<ip>.+?)</ip>)\s*s   </contributor>\s*s   (?P<minor>(<minor />))?\s*s+   (?:<comment>(?P<comment>.+?)</comment>\s*)?sR   (<text xml:space="preserve">(?P<text>.*?)</text>|<text xml:space="preserve" />)\s*s   </revision>\s*s   </page>t   rt   encodingt   errorst   replaceu    RR   u   </page>
s   &gt;t   >s   &lt;t   <s   &quot;t   "s   &amp;t   &s#   ERROR: could not parse these lines:R   R   R   R0   R   R   RZ   R   R   R   R   R   N(   R   t   compilet   DOTALLt   Rpaget   codecst   openR   RE   RK   t   getSiteR`   t   fR%   t   eoft   linest   readlinet   lineR6   t   endswithRb   R   t   mR   R   R   R   R   R   R   R   R   (   R   R   R   Rn   R   Rm   Ro   Rs   R   R   Ri   Rq   R   (    (    R
   RM   %  sR     E! 



(   R   R   R   R   RF   RN   RM   (    (    (    R
   RH   ç   s
    				)(   R   t   __version__RC   t   timet   xml.saxR!   Rj   R   RK   t   xml.etree.cElementTreeRI   t   ImportErrort   cElementTreeR   R   R   R"   R#   R$   R    RD   RB   t   objectRH   (   R!   R   RH   R   Ru   RK   R   RC   RI   R   Rj   RB   Rt   R    (    (    R
   t   ?   s$   		
	…
