Posts tagged 'Readability'

A Python Implementation of Readability

October 15, 2012

http://yanghao.org/tools/readability is a Python reimplementation of the JavaScript readability; the full source is at https://github.com/kingwkb/readability

I ran into a few main problems while using it:

1. Copyright notices are easily extracted as body text; the class names of the copyright blocks on many Chinese sites cannot all be enumerated.
2. Comment sections are hard to suppress through weighting, so they are often extracted by mistake.
3. Pages whose body text is split across multiple divs are not extracted completely.
4. Body content laid out in tables is extracted only partially.
5. Title extraction does not measure the similarity between the text of the <title> tag and headings such as <h1>, so title accuracy is low.

I made some modifications to address these problems.
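For problem 5 in particular, one direction is to keep the heading whose text overlaps most with the <title> text. The sketch below is only an illustration and not part of the modified code: pick_title, the longest-common-substring measure, and the 0.5 threshold are all assumptions of mine.

# -*- coding: utf-8 -*-
# Hypothetical title picker: compare each <h1>/<h2> text against the
# <title> text and fall back to <title> when nothing overlaps enough.

def longest_common_substring(a, b):
    # Classic dynamic-programming longest common substring, O(len(a)*len(b)).
    best = 0
    lengths = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                lengths[i][j] = lengths[i - 1][j - 1] + 1
                best = max(best, lengths[i][j])
    return best

def pick_title(title_text, headings):
    # headings: candidate strings taken from <h1>, <h2>, ...
    best_heading, best_score = None, 0.0
    for h in headings:
        if not h:
            continue
        score = longest_common_substring(title_text, h) / float(len(h))
        if score > best_score:
            best_heading, best_score = h, score
    # Require at least half of the heading to appear inside <title>.
    return best_heading if best_score >= 0.5 else title_text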

The modified code is shown below:

# -*- coding: utf-8 -*-
from __future__ import division

import os
import sys
import urllib
import urllib2
import urlparse
import re
import HTMLParser
import math
import posixpath
import time
import chardet
from BeautifulSoup import BeautifulSoup
#from bs4 import BeautifulSoup

class Readability:

    regexps = {
        'unlikelyCandidates': re.compile("combx|comment|community|disqus|extra|foot|header|menu|"
                                         "remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|"
                                         "pagination|pager|popup|tweet|twitter|login|Submit", re.I),
        'okMaybeItsACandidate': re.compile("and|article|body|column|main|shadow", re.I),
        'positive': re.compile("article|body|content|entry|hentry|main|page|pagination|post|text|"
                               "blog|story", re.I),
        'negative': re.compile("combx|comment|com|contact|foot|footer|footnote|masthead|media|"
                               "meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|Submit|submit|"
                               "shopping|tags|tool|widget|time|source", re.I),
        'extraneous': re.compile("print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|"
                                 "sign|single", re.I),
        'divToPElements': re.compile("<(blockquote|dl|div|ol|ul|iframe)", re.I),
        'replaceBrs': re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I),
        'replaceFonts': re.compile("<(/?)font[^>]*>", re.I),
        'trim': re.compile("^\s+|\s+$", re.I),
        'normalize': re.compile("\s{2,}", re.I),
        'killBreaks': re.compile("(<br\s*/?>(\s|&nbsp;?)*)+", re.I),
        'videos': re.compile("http://(www\.)?(youtube|vimeo)\.com", re.I),
        'skipFootnoteLink': re.compile("^\s*((\[|\【)?[a-z0-9]{1,2}(\]|\】)?|^|edit|citation needed)\s*$", re.I),
        # Match: next, continue, >, >>, » but not >| or »|, which usually mean "last".
        'nextLink': re.compile(u"(下一页|下页|更多|【\d*】\s*|next|>|»)", re.I),
        'prevLink': re.compile(u"(上一页|上页|^\d+\s*$|prev|earl|old|new|<|«)", re.I),
        'comment': re.compile("<!--(.|\s)*?-->"),
    }
    chinese_punctuation = [u',', u'。', u'!', u'、', u'?']
    chinese_copyright = [u'版权与责任说明', u'凡本网注明', u'版权说明', u'免责声明', u'版权属于', u'可能已被删除',
                         u'声明:', u'版 权 所 有', u'声明:', u'Copyright', u'版权申明:',
                         u'独家稿件声明', u'依法追究责任', u'保留追究其法律责任的权利']
    chinese_time_or_source = re.compile(u"(发表时间:|发表时间:|发布时间:|发步时间:|201[0-9]年\d{1,2}月\d{1,2}日\s*\d{2}:\d{2}|201[0-9][\-\./][0-9]{1,2}[\-\./][0-9]{1,2}\s*\d{2}:\d{2}(:\d{2})?|来源:|来源:|发布会员:|当前位置:)", re.I)

    chinese_others = [u'相关', u'热点', u'推荐', u'专题', u'更多', u'相关新闻', u'相关链接',
                      u'相关报道', u'相关阅读', u'相关文章', u'今日推荐', u'上一页',
                      u'上一篇', u'下一页', u'下一篇', u'延伸阅读', u'责编:',
                      u'本文被浏览', u'收藏', u'复制网址', u'复制成功',
                      u'分享到', u'loading...', u'加载中', u'开通微博', u'发布会员:',
                      u'——', u'第1页', u'播主:', u' 播主:', u'网友评论',
                      u'邮箱:', u'传真:', u'电话:', u'字号', u'&gt;',
                      u'联系方式:', u'加入收藏', u'出错了', u'未经授权']
    def __init__(self, input, url):
        """
        url = "http://yanghao.org/blog/"
        htmlcode = urllib2.urlopen(url).read().decode('utf-8')

        readability = Readability(htmlcode, url)

        print readability.title
        print readability.content
        """
        self.candidates = {}

        self.input = input
        self.url = url
        # Strip HTML comments, collapse runs of <br> into paragraph breaks,
        # and replace <font> tags with <span>.
        self.input = self.regexps['comment'].sub("", self.input)
        self.input = self.regexps['replaceBrs'].sub("</p><p>", self.input)
        self.input = self.regexps['replaceFonts'].sub("<\g<1>span>", self.input)

        self.html = BeautifulSoup(self.input)
#        print self.html.originalEncoding
#        print self.html
        self.removeDisplayNone()
        self.removeScript()
        self.removeStyle()
        self.removeLink()

        self.title = self.getArticleTitle()
        self.content = self.grabArticle()

    def removeDisplayNone(self):
        for elem in self.html.findAll(attrs={'style': 'display:none'}):
            elem.extract()

    def removeScript(self):
        for elem in self.html.findAll("script"):
            elem.extract()

    def removeStyle(self):
        for elem in self.html.findAll("style"):
            elem.extract()

    def removeLink(self):
        for elem in self.html.findAll("link"):
            elem.extract()

    def grabArticle(self):

        # Pass 1: drop nodes whose id/class looks like navigation, comments,
        # ads, etc., then turn block-free divs/textareas/tds into <p>.
        for elem in self.html.findAll(True):
            #print elem.get("id")
            unlikelyMatchString = elem.get('id', '') + elem.get('class', '')

            if self.regexps['unlikelyCandidates'].search(unlikelyMatchString) and \
               not self.regexps['okMaybeItsACandidate'].search(unlikelyMatchString) and \
               elem.name != 'body':
                elem.extract()
                continue

            if elem.name == 'div' or elem.name == 'textarea' or elem.name == 'td':
                #print elem.get('id')
                s = elem.renderContents(encoding=None)
                if not self.regexps['divToPElements'].search(s):
                    elem.name = 'p'

        # Pass 2: score every paragraph and propagate the score to its
        # parent (in full) and grandparent (halved).
        for node in self.html.findAll('p'):

            parentNode = node.parent
            if not parentNode:
                continue
            grandParentNode = parentNode.parent
            innerText = node.text

            if len(innerText) < 20:
                continue

            parentHash = hash(str(parentNode))
            grandParentHash = hash(str(grandParentNode))

            if parentHash not in self.candidates:
                self.candidates[parentHash] = self.initializeNode(parentNode)

            if grandParentNode and grandParentHash not in self.candidates:
                self.candidates[grandParentHash] = self.initializeNode(grandParentNode)

            # Reward Chinese punctuation: real body text is dense in it.
            contentScore = 1
            for punctuation in self.chinese_punctuation:
                contentScore += innerText.count(punctuation) * 5
            if contentScore == 1:
                contentScore -= min(math.floor(len(innerText) / 30), 3)

            # Heavily penalize copyright notices, boilerplate phrases and
            # time/source lines.
            for c in self.chinese_copyright:
                contentScore -= innerText.count(c) * 1000

            for c in self.chinese_others:
                contentScore -= innerText.count(c) * 100

            if self.chinese_time_or_source.search(innerText):
                contentScore -= len(innerText) * 100
            #if contentScore < -200:
            #    node.extract()
            contentScore += min(math.floor(len(innerText) / 100), 3)

            self.candidates[parentHash]['score'] += contentScore

            if grandParentNode:
                self.candidates[grandParentHash]['score'] += contentScore / 2

        # Pass 3: scale each candidate by (1 - link density) and keep the best.
        topCandidate = None

        for key in self.candidates:
            self.candidates[key]['score'] = self.candidates[key]['score'] * \
                (1 - self.getLinkDensity(self.candidates[key]['node']))

            if not topCandidate or self.candidates[key]['score'] > topCandidate['score']:
                topCandidate = self.candidates[key]

        content = ''

        if topCandidate:
            content = topCandidate['node']
            #print content
            content = self.cleanArticle(content)

        # Reject results with no Chinese punctuation or too little text.
        contentScore = 0
        for punctuation in self.chinese_punctuation:
            contentScore += content.count(punctuation) * 5
        parser = HTMLParser.HTMLParser()
        content = parser.unescape(content)
        if contentScore == 0:
            return
        if len(content) < 30:
            return
        return content

    def cleanArticle(self, content):

        self.cleanStyle(content)
        self.clean(content, 'h1')
        self.clean(content, 'object')
        self.cleanConditionally(content, "form")

        if len(content.findAll('h2')) == 1:
            self.clean(content, 'h2')

        self.clean(content, 'iframe')
        self.cleanNextLink(content)
        self.cleanConditionally(content, "table")
        self.cleanConditionally(content, "ul")
        self.cleanConditionally(content, "div")

        self.fixImagesPath(content)

        content = content.text
        #content = content.renderContents(encoding=None)

        #content = self.regexps['killBreaks'].sub("<br />", content)
        content = re.compile("(&nbsp;)+", re.I).sub(" ", content)
        return content

    def clean(self, e, tag):

        targetList = e.findAll(tag)
        isEmbed = 0
        if tag == 'object' or tag == 'embed':
            isEmbed = 1

        for target in targetList:
            attributeValues = ""
            for attribute in target.attrs:
                attributeValues += target[attribute[0]]

            # Keep embedded YouTube/Vimeo players.
            if isEmbed and self.regexps['videos'].search(attributeValues):
                continue

            if isEmbed and self.regexps['videos'].search(target.renderContents(encoding=None)):
                continue
            target.extract()

    def cleanStyle(self, e):

        for elem in e.findAll(True):
            del elem['class']
            del elem['id']
            del elem['style']

    def cleanNextLink(self, e):
        tagsList = e.findAll('a')
        for node in tagsList:
            if self.regexps['nextLink'].search(node.text) or self.regexps['prevLink'].search(node.text):
                node.extract()

    def cleanTimeAndSource(self, e):
        # NOTE: currently identical to cleanNextLink.
        tagsList = e.findAll('a')
        for node in tagsList:
            if self.regexps['nextLink'].search(node.text) or self.regexps['prevLink'].search(node.text):
                node.extract()

    def cleanConditionally(self, e, tag):
        tagsList = e.findAll(tag)

        for node in tagsList:
            weight = self.getClassWeight(node)
            hashNode = hash(str(node))
            if hashNode in self.candidates:
                contentScore = self.candidates[hashNode]['score']
            else:
                contentScore = 0

            if weight + contentScore < 0:
                node.extract()
            else:
                p = len(node.findAll("p"))
                img = len(node.findAll("img"))
                li = len(node.findAll("li")) - 100
                input = len(node.findAll("input"))
                embedCount = 0
                embeds = node.findAll("embed")
                for embed in embeds:
                    if not self.regexps['videos'].search(embed.get('src', '')):
                        embedCount += 1
                linkDensity = self.getLinkDensity(node)
                contentLength = len(node.text)
                toRemove = False

                if img > p:
                    toRemove = True
                elif li > p and tag != "ul" and tag != "ol":
                    toRemove = True
                elif input > math.floor(p / 3):
                    toRemove = True
                elif contentLength < 25 and (img == 0 or img > 2):
                    toRemove = True
                elif weight < 25 and linkDensity > 0.2:
                    toRemove = True
                elif weight >= 25 and linkDensity > 0.5:
                    toRemove = True
                elif (embedCount == 1 and contentLength < 35) or embedCount > 1:
                    toRemove = True

                if toRemove:
                    node.extract()

    def getArticleTitle(self):
        title = ''
        try:
            title = self.html.find('title').text
        except:
            pass

        return title

    def initializeNode(self, node):
        contentScore = 0

        if node.name == 'div':
            contentScore += 5
        elif node.name == 'blockquote':
            contentScore += 3
        elif node.name == 'form':
            contentScore -= 3
        elif node.name == 'th':
            contentScore -= 5

        contentScore += self.getClassWeight(node)

        return {'score': contentScore, 'node': node}

    def getClassWeight(self, node):
        weight = 0
        className = node.get('class')
        if className:
            if self.regexps['negative'].search(className):
                weight -= 25
            if self.regexps['positive'].search(className):
                weight += 25

        nodeId = node.get('id')
        if nodeId:
            if self.regexps['negative'].search(nodeId):
                weight -= 25
            if self.regexps['positive'].search(nodeId):
                weight += 25

        return weight

    def getLinkDensity(self, node):
        links = node.findAll('a')
        textLength = len(node.text)

        if textLength == 0:
            return 0
        linkLength = 0
        for link in links:
            linkLength += len(link.text)

        return linkLength / textLength

    def fixImagesPath(self, node):
        imgs = node.findAll('img')
        for img in imgs:
            src = img.get('src', None)
            if not src:
                img.extract()
                continue

            # Resolve relative image URLs against the page URL.
            if 'http://' != src[:7] and 'https://' != src[:8]:
                newSrc = urlparse.urljoin(self.url, src)

                newSrcArr = urlparse.urlparse(newSrc)
                newPath = posixpath.normpath(newSrcArr[2])
                newSrc = urlparse.urlunparse((newSrcArr.scheme, newSrcArr.netloc, newPath,
                                              newSrcArr.params, newSrcArr.query, newSrcArr.fragment))
                img['src'] = newSrc
def decode(s):
    # Guess the page encoding with chardet; treat GB2312 as its superset GB18030.
    if type(s) == str:
        code = chardet.detect(s)["encoding"]
        if code == "GB2312":
            s = s.decode("gb18030")
        else:
            s = s.decode(code)
    return s

def main():
    url = 'http://futures.jrj.com.cn/2012/07/03074513663141.shtml'
    response = urllib2.urlopen(url)
    html_str = response.read()

    encode_str = decode(html_str).encode('utf-8')
    #print encode_str
    readability = Readability(encode_str, url)
    print readability.content

if __name__ == '__main__':
    start = time.time()
    main()
    #get_baidu_html()
    end = time.time()
    print 'Time Used: %r' % (end - start)

An Analysis of the readability Source Code

October 12, 2012

From: http://stblog.baidu-tech.com/?p=79

Taking readability as the example for analysis (http://code.google.com/p/arc90labs-readability): readability is a JavaScript library that rewrites a page's DOM tree with an automated extraction algorithm and renders the extracted article in the browser. Its workflow:

1. Strip JavaScript, CSS, and similar HTML tags

2. Filter nodes against a defined rule set, based on string features of their id and class attributes

3. Compute node weights (a condensed sketch follows this list)

a) adjust the weight by node type

b) adjust by id and class features

c) adjust by the number of comma-separated clauses

d) adjust by text length

e) propagate part of the weight to parent nodes

f) adjust by the ratio of link text
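A minimal, self-contained sketch of steps (a)-(f); the regexes are abbreviated and the constants follow the Python port above rather than readability.js's exact values, so treat all names here as assumptions:

import re

POSITIVE = re.compile("article|body|content|entry|main|page|post|text", re.I)
NEGATIVE = re.compile("combx|comment|contact|foot|masthead|meta|promo|related|sidebar|sponsor", re.I)

def class_id_weight(attrs):
    # (b) +/-25 when the id/class string matches tell-tale patterns;
    # attrs is assumed to be a plain dict of the node's attributes.
    s = (attrs.get('id') or '') + (attrs.get('class') or '')
    weight = 25 if POSITIVE.search(s) else 0
    return weight - (25 if NEGATIVE.search(s) else 0)

def base_score(tag_name, attrs):
    # (a) node-type prior plus (b) the id/class weight.
    prior = {'div': 5, 'blockquote': 3, 'form': -3, 'th': -5}.get(tag_name, 0)
    return prior + class_id_weight(attrs)

def paragraph_score(text):
    # (c) one point per comma-separated clause, (d) plus up to 3 points for
    # raw text length. In the full algorithm this score is added to the
    # paragraph's parent and, halved, to its grandparent (step e).
    return 1 + text.count(',') + min(len(text) // 100, 3)

def final_score(raw_score, link_text_len, text_len):
    # (f) scale by (1 - link density) so navigation-heavy blocks sink.
    density = link_text_len / float(text_len) if text_len else 0.0
    return raw_score * (1 - density)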

4. Locate the content-leading text node, taking sibling nodes into account

The node with the largest contentScore becomes the content-leading node, Max;

Max's sibling nodes are then tried: any sibling whose contentScore clears a certain threshold and meets a few hard requirements is added to the resulting DOM tree.
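Step 4 as a sketch over toy node dicts. The max(10, 20% of Max) threshold and the long-paragraph fallback follow the spirit of readability.js, but the node representation is invented here for illustration:

def merge_siblings(siblings):
    # siblings: dicts for Max and its siblings,
    # e.g. {'name': 'div', 'text': '...', 'score': 120}
    top = max(siblings, key=lambda n: n['score'])
    threshold = max(10, top['score'] * 0.2)
    article = []
    for node in siblings:
        if node is top or node['score'] >= threshold:
            article.append(node)
        elif node['name'] == 'p' and len(node['text']) > 80:
            # Example of a "hard requirement": a plain paragraph with
            # enough text is kept even though its own score is low.
            article.append(node)
    return article

siblings = [
    {'name': 'div', 'text': 'nav nav nav', 'score': -5},
    {'name': 'div', 'text': 'the article body ...', 'score': 120},
    {'name': 'p', 'text': 'a long trailing paragraph, ' * 5, 'score': 3},
]
# keeps the body div and the long <p>, drops the navigation block
print(merge_siblings(siblings))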