python正则表达式

写了一段python跟踪网络文学更新的代码,球猫说用正则分析网页更好,尝试学习之

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os.path as Path
import threading
import time
import shutil
import urllib2
import urllib
import requests
import json
# Last value seen by heart_beat; stays '' until the first successful poll.
fn=''
def wx(ip):
  """Send a markdown message to the qyapi.weixin.qq.com webhook.

  Args:
    ip: Message text, placed in the markdown ``content`` field. Despite the
      name, callers pass arbitrary text rather than an IP address.

  The message and the webhook's response body are both echoed to stdout.

  Raises:
    Whatever ``requests.post`` raises, including a timeout error when the
    server does not answer within 10 seconds.
  """
  print(ip)
  adata = {
    "msgtype": "markdown",
    "markdown": {
        "content": ip
    }
  }
  aheaders = {'Content-Type': 'application/json'}
  # NOTE(review): the webhook key is a credential embedded in source; consider
  # loading it from configuration or the environment instead.
  url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=3fff2eb1-a575-7777-f9a2-9813c7857777"
  # requests waits forever by default; bound the call so a stalled connection
  # cannot hang the caller (and with it the polling timer chain).
  response = requests.post(url, headers=aheaders, data=json.dumps(adata), timeout=10)
  print(response.text)
def heart_beat():
  """Poll the novel site once, notify on change, then re-arm a 600 s timer.

  Downloads ``https://m.999xs.com/`` and cuts out the fragment that starts at
  ``<p>更新`` and ends with the ``</a></p>`` that closes the chapter link.
  The fragment is compared with the module-level ``fn``: the first fragment
  seen is only remembered, while a later fragment that differs is sent through
  ``wx`` and then stored in ``fn``.

  A failed download, or a page without the expected markers, is reported on
  stdout and skipped, leaving ``fn`` untouched so that a transient failure
  cannot trigger a false "update" notification. The next poll is scheduled in
  every case.
  """
  global fn
  url = "https://m.999xs.com/"
  start_marker = '<p>更新'
  end_marker = '章</a></p>'
  close_tag = '</a></p>'
  try:
    # Fetch the page; close the response even if read() fails.
    try:
      res_data = urllib2.urlopen(urllib2.Request(url), timeout=30)
      try:
        page = res_data.read()
      finally:
        res_data.close()
    except Exception as e:
      print("error: %r" % (e,))
      return

    # Locate the fragment. find() returns -1 when a marker is missing, which
    # must not be used as a slice index; the end marker is searched from the
    # start marker onward so an earlier occurrence cannot be picked up.
    start = page.find(start_marker)
    end = page.find(end_marker, start) if start >= 0 else -1
    if start < 0 or end < 0:
      print("error12")
      return
    snippet = page[start:end + len(end_marker)]
    # Trim at the first closing tag inside the fragment.
    snippet = snippet[:snippet.find(close_tag) + len(close_tag)]

    print('\n')
    print(snippet)
    print('\n')

    try:
      if fn != '' and fn != snippet:
        wx(snippet)
      # Only reached when wx() succeeded (or was not needed), so a failed
      # notification is retried on the next poll.
      fn = snippet
    except Exception as e:
      print("error12: %r" % (e,))
  finally:
    threading.Timer(600, heart_beat).start()
# Announce startup on the webhook, then schedule the first poll 10 seconds
# from now; heart_beat re-arms its own timer after each run.
wx('xiaoshuo start...')
threading.Timer(10, heart_beat).start()

首先通过文件读取获取网页内容

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
fo = open("xiaoshuo.txt", "r+")
str = fo.read()
fo.close()
print "读取的字符串是 : ", str

然后就是从字符串中分析出需要的信息了
flags : re.search(pattern, string, flags=0) 的第三个参数,可选,表示匹配模式,比如忽略大小写、多行模式等,多个标志可以用 | 组合,具体参数为:
re.I 忽略大小写
re.L 表示特殊字符集 \w, \W, \b, \B, \s, \S 依赖于当前区域设置(locale)
re.M 多行模式
re.S 使 . 匹配包括换行符在内的任意字符(默认情况下 . 不匹配换行符)
re.U 表示特殊字符集 \w, \W, \b, \B, \d, \D, \s, \S 依赖于 Unicode 字符属性数据库
re.X 为了增加可读性,忽略空格和 # 后面的注释

勉强可用

searchObj = re.search( r'.*<p>更新:(.*?)</p>.*', str, re.M|re.I)
if searchObj:
   print "更新 : ", searchObj.group(1)
else:
   print "Nothing found!!"
searchObj = re.search( r'.*<p>最新:.*html">(.*?)</a>.*', str, re.M|re.I)
if searchObj:
   print "章节 : ", searchObj.group(1)
else:
   print "Nothing found!!"

包装成方法

def fenxi(str):
  """Extract the update time and the latest chapter title from page HTML.

  Args:
    str: HTML text of the page. The name shadows the builtin; it is kept so
      existing callers are unaffected.

  Returns:
    A ``(sj, zj)`` tuple. ``sj`` is the text between the first ``<p>更新:``
    and the next ``</p>``. ``zj`` is the text of the first link after
    ``<p>最新:`` whose opening tag ends in ``html">``. Each is ``None`` when
    its pattern is not found. ``.`` does not match newlines here, so every
    match is confined to a single line.
  """
  # re.search scans the whole string, so the patterns need no leading or
  # trailing '.*' (they only cause extra backtracking on long lines); re.M is
  # omitted because there are no ^/$ anchors.
  sj_match = re.search(r'<p>更新:(.*?)</p>', str, re.I)
  # Lazy '.*?' picks the first link after the label; a greedy '.*' would jump
  # to the last 'html">' on the line and return the wrong link text.
  zj_match = re.search(r'<p>最新:.*?html">(.*?)</a>', str, re.I)
  sj = sj_match.group(1) if sj_match else None
  zj = zj_match.group(1) if zj_match else None
  return sj, zj

用于追更

  # Fragment meant to sit inside the polling function: `ip` holds the downloaded
  # page on entry and `fn` is the module-level last-seen value.
  try:
    # sj: update-time text, zj: latest-chapter text (either may be None).
    sj,zj=fenxi(ip)
    if sj:
      if fn!='':
        # Notify only when the update time differs from the remembered one.
        if fn!=sj:
          # `ip` is reused from here on as the outgoing markdown message.
          ip = ">更新时间:<font color=\"comment\">" + sj + "</font> \n"
          if zj:
            ip = ip + ">最新章节:<font color=\"comment\">" + zj + "</font>\n"
          wx(ip)
          fn=sj
      else:
        # First successful parse: remember the value without notifying.
        fn=sj

  # NOTE(review): the bare except also swallows errors raised by wx(), and
  # reports them as a parse error.
  except:
    print "解析错误"

标签: none

添加新评论