# -*- coding: utf-8 -*-
from __future__ import division
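# A Readability-style article extractor tuned for Chinese news pages: it
# scores candidate blocks by punctuation density and boilerplate keywords,
# then returns the text of the best-scoring block. Python 2, BeautifulSoup 3.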
import re
import math
import time
import urllib2
import urlparse
import posixpath
import HTMLParser
import chardet
from BeautifulSoup import BeautifulSoup
#from bs4 import BeautifulSoup


class Readability:

    regexps = {
        'unlikelyCandidates': re.compile(
            r"combx|comment|community|disqus|extra|foot|header|menu|"
            r"remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|"
            r"pagination|pager|popup|tweet|twitter|login|Submit", re.I),
        'okMaybeItsACandidate': re.compile(
            r"and|article|body|column|main|shadow", re.I),
        'positive': re.compile(
            r"article|body|content|entry|hentry|main|page|pagination|post|text|"
            r"blog|story", re.I),
        'negative': re.compile(
            r"combx|comment|com|contact|foot|footer|footnote|masthead|media|"
            r"meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|Submit|submit|"
            r"shopping|tags|tool|widget|time|source", re.I),
        'extraneous': re.compile(
            r"print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|"
            r"sign|single", re.I),
        'divToPElements': re.compile(
            r"<(blockquote|dl|div|ol|ul|iframe)", re.I),
        'replaceBrs': re.compile(
            r"(<br[^>]*>[ \n\r\t]*){2,}", re.I),
        'replaceFonts': re.compile(
            r"<(/?)font[^>]*>", re.I),
        'trim': re.compile(
            r"^\s+|\s+$", re.I),
        'normalize': re.compile(
            r"\s{2,}", re.I),
        # "&nbsp;" here is an assumption: the published copy appears to have
        # rendered the entity as a literal non-breaking space.
        'killBreaks': re.compile(
            r"(<br\s*/?>(\s|&nbsp;?)*)+", re.I),
        'videos': re.compile(
            r"http://(www\.)?(youtube|vimeo)\.com", re.I),
        'skipFootnoteLink': re.compile(
            ur"^\s*((\[|\【)?[a-z0-9]{1,2}(\]|\】)?|^|edit|citation needed)\s*$", re.I),
        # Match: next, continue, >, >>, » but not >| or »|, as those usually
        # mean "last page".
        'nextLink': re.compile(
            ur"(下一页|下页|更多|【\d*】\s*|next|>|»)", re.I),
        'prevLink': re.compile(
            ur"(上一页|上页|^\d+\s*$|prev|earl|old|new|<|«)", re.I),
        'comment': re.compile(
            r"<!--(.|\s)*?-->"),
    }
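    # Chinese-specific heuristics: punctuation that signals running prose,
    # plus copyright/boilerplate phrases whose presence heavily penalizes a
    # candidate block (see grabArticle).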
    chinese_punctuation = [u',', u'。', u'!', u'、', u'?']
    chinese_copyright = [u'版权与责任说明', u'凡本网注明', u'版权说明', u'免责声明', u'版权属于', u'可能已被删除',
                         u'声明:', u'版 权 所 有', u'声明:', u'Copyright', u'版权申明:',
                         u'独家稿件声明', u'依法追究责任', u'保留追究其法律责任的权利']
    chinese_time_or_source = re.compile(
        ur"(发表时间:|发表时间:|发布时间:|发步时间:|"
        ur"201[0-9]年\d{1,2}月\d{1,2}日\s*\d{2}:\d{2}|"
        ur"201[0-9][\-\./][0-9]{1,2}[\-\./][0-9]{1,2}\s*\d{2}:\d{2}(:\d{2})?|"
        ur"来源:|来源:|发布会员:|当前位置:)", re.I)
    chinese_others = [u'相关', u'热点', u'推荐', u'专题', u'更多', u'相关新闻', u'相关链接',
                      u'相关报道', u'相关阅读', u'相关文章', u'今日推荐', u'上一页',
                      u'上一篇', u'下一页', u'下一篇', u'延伸阅读', u'责编:',
                      u'本文被浏览', u'收藏', u'复制网址', u'复制成功',
                      u'分享到', u'loading…', u'加载中', u'开通微博', u'发布会员:',
                      u'——', u'第1页', u'播主:', u' 播主:', u'网友评论',
                      u'邮箱:', u'传真:', u'电话:', u'字号', u'>',
                      u'联系方式:', u'加入收藏', u'出错了', u'未经授权']
    def __init__(self, input, url):
        """
        url = "http://yanghao.org/blog/"
        htmlcode = urllib2.urlopen(url).read().decode('utf-8')
        readability = Readability(htmlcode, url)
        print readability.title
        print readability.content
        """
        self.candidates = {}
        self.input = input
        self.url = url
        self.input = self.regexps['comment'].sub("", self.input)
        self.input = self.regexps['replaceBrs'].sub("</p><p>", self.input)
        self.input = self.regexps['replaceFonts'].sub(r"<\g<1>span>", self.input)
        self.html = BeautifulSoup(self.input)
        self.removeDisplayNone()
        self.removeScript()
        self.removeStyle()
        self.removeLink()
        self.title = self.getArticleTitle()
        self.content = self.grabArticle()
    def removeDisplayNone(self):
        for elem in self.html.findAll(attrs={'style': 'display:none'}):
            elem.extract()

    def removeScript(self):
        for elem in self.html.findAll("script"):
            elem.extract()

    def removeStyle(self):
        for elem in self.html.findAll("style"):
            elem.extract()

    def removeLink(self):
        for elem in self.html.findAll("link"):
            elem.extract()
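
    # Core of the algorithm: strip unlikely nodes, promote childless <div>s
    # to <p>, score each paragraph's parent and grandparent, and keep the
    # highest-scoring candidate after a link-density penalty.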
    def grabArticle(self):
        # Pass 1: drop elements whose id/class look like boilerplate, and
        # convert divs/textareas/tds without block-level children into <p>.
        for elem in self.html.findAll(True):
            unlikelyMatchString = elem.get('id', '') + elem.get('class', '')
            if self.regexps['unlikelyCandidates'].search(unlikelyMatchString) and \
               not self.regexps['okMaybeItsACandidate'].search(unlikelyMatchString) and \
               elem.name != 'body':
                elem.extract()
                continue
            if elem.name in ('div', 'textarea', 'td'):
                s = elem.renderContents(encoding=None)
                if not self.regexps['divToPElements'].search(s):
                    elem.name = 'p'
        # Pass 2: score every paragraph, crediting its parent with the full
        # score and its grandparent with half.
        for node in self.html.findAll('p'):
            parentNode = node.parent
            grandParentNode = parentNode.parent
            innerText = node.text
            if not parentNode or len(innerText) < 20:
                continue
            parentHash = hash(str(parentNode))
            grandParentHash = hash(str(grandParentNode))
            if parentHash not in self.candidates:
                self.candidates[parentHash] = self.initializeNode(parentNode)
            if grandParentNode and grandParentHash not in self.candidates:
                self.candidates[grandParentHash] = self.initializeNode(grandParentNode)
            contentScore = 1
            # Chinese punctuation is a strong signal of running prose.
            for punctuation in self.chinese_punctuation:
                contentScore += innerText.count(punctuation) * 5
            if contentScore == 1:
                contentScore -= min(math.floor(len(innerText) / 30), 3)
            # Copyright notices, "related articles" blocks and datelines are
            # near-certain boilerplate, so they are penalized heavily.
            for c in self.chinese_copyright:
                contentScore -= innerText.count(c) * 1000
            for c in self.chinese_others:
                contentScore -= innerText.count(c) * 100
            if self.chinese_time_or_source.search(innerText):
                contentScore -= len(innerText) * 100
            contentScore += min(math.floor(len(innerText) / 100), 3)
            self.candidates[parentHash]['score'] += contentScore
            if grandParentNode:
                self.candidates[grandParentHash]['score'] += contentScore / 2
        # Apply a link-density penalty, then pick the best candidate.
        topCandidate = None
        for key in self.candidates:
            self.candidates[key]['score'] = self.candidates[key]['score'] * \
                (1 - self.getLinkDensity(self.candidates[key]['node']))
            if not topCandidate or self.candidates[key]['score'] > topCandidate['score']:
                topCandidate = self.candidates[key]
        content = ''
        if topCandidate:
            content = self.cleanArticle(topCandidate['node'])
        contentScore = 0
        for punctuation in self.chinese_punctuation:
            contentScore += content.count(punctuation) * 5
        parser = HTMLParser.HTMLParser()
        content = parser.unescape(content)
        # Reject extractions with no Chinese punctuation or almost no text.
        if contentScore == 0:
            return ''
        if len(content) < 30:
            return ''
        return content
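
    # Post-extraction cleanup, below: strip inline styles, stray headers,
    # forms, non-video embeds and low-value tables/lists/divs from the
    # winning candidate, then flatten it to plain text.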
    def cleanArticle(self, content):
        self.cleanStyle(content)
        self.clean(content, 'h1')
        self.clean(content, 'object')
        self.cleanConditionally(content, "form")
        # A single <h2> is probably the article title repeated, not a section.
        if len(content.findAll('h2')) == 1:
            self.clean(content, 'h2')
        self.clean(content, 'iframe')
        self.cleanNextLink(content)
        self.cleanConditionally(content, "table")
        self.cleanConditionally(content, "ul")
        self.cleanConditionally(content, "div")
        self.fixImagesPath(content)
        content = content.text
        # "(&nbsp;)+" is an assumption: the entity appears to have been
        # rendered as a literal space in the published copy of this pattern.
        content = re.compile("(&nbsp;)+", re.I).sub(" ", content)
        return content
    def clean(self, e, tag):
        targetList = e.findAll(tag)
        isEmbed = 0
        if tag == 'object' or tag == 'embed':
            isEmbed = 1
        for target in targetList:
            attributeValues = ""
            # BeautifulSoup 3 stores attrs as (name, value) tuples.
            for attribute in target.attrs:
                attributeValues += target[attribute[0]]
            # Keep embeds that point at known video hosts.
            if isEmbed and self.regexps['videos'].search(attributeValues):
                continue
            if isEmbed and self.regexps['videos'].search(target.renderContents(encoding=None)):
                continue
            target.extract()
    def cleanStyle(self, e):
        for elem in e.findAll(True):
            del elem['class']
            del elem['id']
            del elem['style']

    def cleanNextLink(self, e):
        for node in e.findAll('a'):
            if self.regexps['nextLink'].search(node.text) or \
               self.regexps['prevLink'].search(node.text):
                node.extract()
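
    # Arc90-style conditional cleaning: remove a node when its class weight
    # plus content score is negative, or when its image/list/input counts
    # and link density look like navigation rather than article content.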
    def cleanConditionally(self, e, tag):
        tagsList = e.findAll(tag)
        for node in tagsList:
            weight = self.getClassWeight(node)
            hashNode = hash(str(node))
            if hashNode in self.candidates:
                contentScore = self.candidates[hashNode]['score']
            else:
                contentScore = 0
            if weight + contentScore < 0:
                node.extract()
            else:
                p = len(node.findAll("p"))
                img = len(node.findAll("img"))
                # The -100 offset effectively disables the "too many list
                # items" rule below.
                li = len(node.findAll("li")) - 100
                input = len(node.findAll("input"))
                embedCount = 0
                for embed in node.findAll("embed"):
                    if not self.regexps['videos'].search(embed.get('src', '')):
                        embedCount += 1
                linkDensity = self.getLinkDensity(node)
                contentLength = len(node.text)
                toRemove = False
                if img > p:
                    toRemove = True
                elif li > p and tag != "ul" and tag != "ol":
                    toRemove = True
                elif input > math.floor(p / 3):
                    toRemove = True
                elif contentLength < 25 and (img == 0 or img > 2):
                    toRemove = True
                elif weight < 25 and linkDensity > 0.2:
                    toRemove = True
                elif weight >= 25 and linkDensity > 0.5:
                    toRemove = True
                elif (embedCount == 1 and contentLength < 35) or embedCount > 1:
                    toRemove = True
                if toRemove:
                    node.extract()
    def getArticleTitle(self):
        title = ''
        try:
            title = self.html.find('title').text
        except AttributeError:
            pass
        return title

    def initializeNode(self, node):
        contentScore = 0
        if node.name == 'div':
            contentScore += 5
        elif node.name == 'blockquote':
            contentScore += 3
        elif node.name == 'form':
            contentScore -= 3
        elif node.name == 'th':
            contentScore -= 5
        contentScore += self.getClassWeight(node)
        return {'score': contentScore, 'node': node}
    def getClassWeight(self, node):
        weight = 0
        # Tag.__contains__ in BeautifulSoup 3 tests child contents, not
        # attributes, so look attributes up with get().
        if node.get('class'):
            if self.regexps['negative'].search(node['class']):
                weight -= 25
            if self.regexps['positive'].search(node['class']):
                weight += 25
        if node.get('id'):
            if self.regexps['negative'].search(node['id']):
                weight -= 25
            if self.regexps['positive'].search(node['id']):
                weight += 25
        return weight
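
    # Link density = linked characters / total characters: 1.0 means the
    # node is pure links. E.g. 40 linked chars out of 200 gives 0.2, the
    # cutoff used for low-weight nodes in cleanConditionally.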
    def getLinkDensity(self, node):
        links = node.findAll('a')
        textLength = len(node.text)
        if textLength == 0:
            return 0
        linkLength = 0
        for link in links:
            linkLength += len(link.text)
        return linkLength / textLength
    def fixImagesPath(self, node):
        # Resolve relative image URLs against the page URL and normalize
        # the path; drop images with no src at all.
        for img in node.findAll('img'):
            src = img.get('src', None)
            if not src:
                img.extract()
                continue
            if src[:7] != 'http://' and src[:8] != 'https://':
                newSrc = urlparse.urljoin(self.url, src)
                newSrcArr = urlparse.urlparse(newSrc)
                newPath = posixpath.normpath(newSrcArr[2])
                newSrc = urlparse.urlunparse((newSrcArr.scheme, newSrcArr.netloc, newPath,
                                              newSrcArr.params, newSrcArr.query, newSrcArr.fragment))
                img['src'] = newSrc
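

# Decode raw bytes to unicode using chardet's guess; gb18030 is a superset
# of GB2312, so it is used for pages declared (often inaccurately) as GB2312.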
def decode(s):
    if isinstance(s, str):
        code = chardet.detect(s)["encoding"]
        if code == "GB2312":
            s = s.decode("gb18030")
        else:
            s = s.decode(code)
    return s
def main():
    url = 'http://futures.jrj.com.cn/2012/07/03074513663141.shtml'
    response = urllib2.urlopen(url)
    html_str = response.read()
    encode_str = decode(html_str).encode('utf-8')
    readability = Readability(encode_str, url)
    print readability.content


if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print 'Time Used: %r' % (end - start)