Usuário(a):GoEThe/Bloqueio.py

#!/usr/local/bin/python
# -*- coding: utf-8 -*-


"""
Wikipedia:Discussão de bloqueio - Robô para listar novos pedidos de desbloqueio automaticamente.

Este robô irá apanhar todos os novos pedidos da categoria 
especificada e adicioná-los às páginas determinadas 
substituindo a região especificada automaticamente.

Também fará uma lista dos casos abertos da mesma maneira. 
Ajuste as variáveis para bater certo.
"""

import re, sys, string
sys.path.append('')
import wikipedia, catlib, config

#**************
#* Variables: *
#**************

# Description of 'terms' in the new and open dict()s:
# category - Category to draw list of new cases from.
# target - Target page to add list of new cases to.
# section -
#   Section of target page to replace, which is delimited
#   by <!-- BEGIN [section] --> <!-- END [section] -->
# titlemask -
#   The title mask is a regex; whatever it matches is removed
#   from the title shown in the link list.
# exclude -
#   Page exclusion regex.  List the pages that you don't
#   want to have included in the output list.
#   Example:
#   Template\:Medcab2$|Wikipedia\:Mediation Cabal\/Complaints$
# action - Update action text (passed to wikipedia.setAction()).
#
# NOTE: every literal containing non-ASCII characters is a unicode
# literal.  Page titles and page text are unicode, and a byte-string
# regex holding UTF-8 bytes never matches them under Python 2.

# New cases
new = dict()
new['category'] = '!Pedidos_de_desbloqueio'
new['target'] = u'Wikipédia:Pedidos a administradores/Discussão de bloqueio'
new['section'] = 'NewCases'
new['titlemask'] = u"^Wikipédia:Pedidos a administradores/Discussão de bloqueio/"
new['exclude'] = r"^Template\:.*|^User:.*"
new['action'] = "A actualizar pedidos novos..."

# Open cases
# NOTE: this name shadows the builtin open().  It is kept because
# MedCabBot refers to this dict by that name.
open = dict()
open['category'] = '!2'
open['target'] = u'Wikipédia:Pedidos a administradores/Discussão de bloqueio'
open['section'] = 'OpenCases'
open['titlemask'] = u"^Wikipédia:Pedidos a administradores/Discussão de bloqueio/"
open['exclude'] = r"^Template\:.*|^User:.*"
open['action'] = "A actualizar pedidos abertos"

# Description of 'terms' in the status dict():
# tmpl - Name of the status template (inserted into a regex as-is).

status = dict()
status['tmpl'] = u'Mediação'

# This should be run infrequently, using relatively short delays for the processing.
# Both the get (read) and put (write) throttles are set to the same delay.
wikipedia.get_throttle.setDelay(5) # 5 seconds
wikipedia.put_throttle.setDelay(5) # 5 seconds

# *******************
# * MedCabBot Class *
# *******************
class MedCabBot:
  """Rebuilds the case lists on their target page(s).

  For each of the module-level ``new`` and ``open`` configurations the
  bot reads the configured category, builds a bulleted wikitext list of
  its pages and substitutes that list for the region of the target page
  delimited by ``<!-- BEGIN section -->`` / ``<!-- END section -->``.
  """

  # Matches HTML comments, including comments spanning several lines.
  _COMMENT_REGEX = re.compile(r'<!--.*?-->', re.S)
  # Matches either template delimiter; used for nesting-depth counting.
  _DELIMITER_REGEX = re.compile(r'\{\{|\}\}')

  def __init__(self):
    """No state is kept on the instance."""
    pass

  def run(self):
    """Update the new and open case lists.

    When both lists live on the same target page, the page is fetched
    once, both sections are regenerated and at most one edit is saved.
    Otherwise each target page is processed independently.
    """
    if new['target'] != open['target']:
      self._update_list(new, u'New Cases List')
      self._update_list(open, u'Open Cases List')
      return

    wikipedia.output(u'Processing Cases Lists')
    page_target = wikipedia.Page(wikipedia.getSite(), new['target'])
    page_data = page_target.get()
    new_page_data = self.process_category(new, page_data)
    open_page_data = self.process_category(open, new_page_data)

    new_changed = new_page_data != page_data
    open_changed = open_page_data != new_page_data
    if not (new_changed or open_changed):
      # Nothing to save; tell the user and exit.
      wikipedia.output(u'Cases Lists are already up-to-date')
      return

    # The edit summary names only the section(s) that really changed.
    if new_changed and open_changed:
      action = new['action'] + ' & ' + open['action']
    elif new_changed:
      action = new['action']
    else:
      action = open['action']
    wikipedia.output(u'Updating Cases Lists')
    wikipedia.setAction(action)
    page_target.put(open_page_data)

  def _update_list(self, pgt, label):
    """Regenerate one list on its own target page, saving only on change.

    pgt   - configuration dict (see the module-level ``new`` / ``open``).
    label - human-readable list name used in the status messages.
    """
    wikipedia.output(u'Processing ' + label)
    page_target = wikipedia.Page(wikipedia.getSite(), pgt['target'])
    page_data = page_target.get()
    updated = self.process_category(pgt, page_data)
    if updated != page_data:
      wikipedia.output(u'Updating ' + label)
      wikipedia.setAction(pgt['action'])
      page_target.put(updated)
    else:
      # Otherwise, tell the user and move on.
      wikipedia.output(label + u' is already up-to-date')

  @staticmethod
  def _to_unicode(value):
    """Return value as unicode, decoding byte strings as UTF-8.

    Page titles and page text are unicode; a byte-string regex holding
    non-ASCII characters would never match them under Python 2.
    """
    if isinstance(value, type(u'')):
      return value
    return value.decode('utf-8')

  def process_category(self, pgt, page_data):
    """Return page_data with the configured section rebuilt.

    pgt       - configuration dict with the keys 'category', 'section',
                'titlemask' and 'exclude'.
    page_data - current wikitext of the target page.

    The section is always regenerated, so an empty category yields an
    empty list and closed cases drop off the page.
    """
    category = pgt['category']
    exclude_regex = re.compile(self._to_unicode(pgt['exclude']))
    titlemask_regex = re.compile(self._to_unicode(pgt['titlemask']))

    # Materialise the category members so they can be counted;
    # articles() may return a list or a generator.
    cat = catlib.Category(wikipedia.getSite(), 'Categoria:' + category)
    pages = list(cat.articles())
    total = len(pages)

    wikipedia.output(u'A processar ' + str(total) + u' páginas de pedidos de desbloqueio.')

    chunks = [u"\n"]
    for index, page in enumerate(pages):
      title = page.title()
      progress = str(index + 1) + u' of ' + str(total) + u' ' + title

      # Pages matching the exclusion regex are reported but not listed.
      if exclude_regex.match(title):
        wikipedia.output(progress + u' - Skipping')
        continue
      wikipedia.output(progress)

      # Link to the full title, displaying it with the mask removed.
      chunks.append(u'* [[' + title + u'|' + titlemask_regex.sub(u'', title) + u']]')

      # Optional details come from the status template on the case page.
      params = self.get_tmpl_params(page, status['tmpl']) or {}
      mediators = params.get('mediators', u'')
      if mediators != '':
        chunks.append(u' — Mediator(s): ' + mediators)
      chunks.append(u"\n")
      comment = params.get('comment', u'')
      if comment != '':
        chunks.append(u'** Comment: ' + comment + u"\n")

    return self._replace_section(pgt['section'], u''.join(chunks), page_data)

  def _replace_section(self, section, body, page_data):
    """Substitute body for every BEGIN/END-delimited region named section.

    Returns page_data unchanged when the delimiters are not present.
    """
    block = u'<!-- BEGIN ' + section + u' -->' + body + u'<!-- END ' + section + u' -->'
    # The section name is literal text, so escape it before building the regex.
    name = re.escape(section)
    region = re.compile(u'<!-- BEGIN ' + name + u' -->.*?<!-- END ' + name + u' -->', re.S)
    # A function replacement keeps backslashes in the generated wikitext
    # from being interpreted as regex group references.
    return region.sub(lambda match: block, page_data)

  def _find_template_end(self, text, start):
    """Return the index of the '}}' closing the template open at start.

    start must lie inside the template, after its opening '{{'.  Nested
    templates are skipped by counting delimiter depth.  Returns None
    when the template is never closed.
    """
    depth = 1
    for match in self._DELIMITER_REGEX.finditer(text, start):
      if match.group() == u'{{':
        depth += 1
      else:
        depth -= 1
        if depth == 0:
          return match.start()
    return None

  def get_tmpl_params(self, page, tmpl_name):
    """Return the named parameters of the first tmpl_name template on page.

    page      - object whose get() returns the page wikitext.
    tmpl_name - template name; it is inserted into a regex as-is and
                matched case-insensitively.

    Returns a dict mapping parameter names to values (both stripped).
    Only parameters that start on their own line and contain '=' are
    recognised.  Returns None when the template is not found or is not
    properly closed.
    """
    # Strip comments so commented-out templates are ignored.
    page_data = self._COMMENT_REGEX.sub(u'', page.get())

    # Find the start of the parameter list: '{{name', optional
    # non-word characters, then the first pipe.
    opener = re.compile(r'\{\{' + self._to_unicode(tmpl_name) + r'\W*?\|', re.I | re.S)
    m = opener.search(page_data)
    if not m:
      return None

    param_start = m.end()
    param_end = self._find_template_end(page_data, param_start)
    if param_end is None:
      # Unbalanced braces: treat the template as absent.
      return None

    params = dict()
    # Each parameter sits on its own line, optionally led by pipes.
    for param in re.split(r"\n[\|]*", page_data[param_start:param_end]):
      if param == '':
        continue
      # Split only on the first equal sign; unnamed parameters are ignored.
      temp = param.split('=', 1)
      if len(temp) > 1:
        params[temp[0].strip()] = temp[1].strip()
    return params

if __name__ == "__main__":
  # Run the bot once; wikipedia.stopme() is called even if the run raises.
  try:
    MedCabBot().run()
  finally:
    wikipedia.stopme()