跳转至

Python feedparser 教程

feedparser 号称是一个 universal feed parser(通用订阅源解析器),使用它可以轻松地从任何 RSS 或 Atom 订阅源中获取标题、链接和文章条目。使用 pip install feedparser 命令即可安装该模块。

RSS 是 RDF Site Summary 的缩写(RDF 是 Resource Description Framework 的缩写),指的是用 XML 语言来描述网站摘要。下面使用 feedparser 模块对页面进行解析和过滤,返回需要的信息:

用 feedparser.parse 输出页面信息

test_feedparser.py
#encoding:utf-8
import feedparser

def test(url='http://blog.csdn.net/together_cz/article'):
    """Parse *url* with feedparser and print a summary of the result.

    Args:
        url: Address of the page or feed to parse.

    Returns:
        None. Everything is written to standard output.

    Raises:
        KeyError: The fields below are read with ``[]`` lookups, so a result
            that lacks one of them (for example ``status`` or
            ``feed['html']``) raises and the error propagates to the caller.
    """
    one_page_dict = feedparser.parse(url)  # the parse result is dict-like
    print(one_page_dict)
    # Keys that may appear in the result (ten in total):
    # ['feed', 'status', 'version', 'encoding', 'bozo', 'headers', 'href',
    #  'namespaces', 'entries', 'bozo_exception']
    print(one_page_dict.keys())
    print('-'*20)
    print('访问页面链接href为:%s' % one_page_dict['href'])
    # Fixed: this line previously read `ne_page_dict`, an undefined name that
    # raised NameError on every call.
    print('页面返回headers信息为:%s' % one_page_dict['headers'])
    print('页面version信息为:%s ' % one_page_dict['version'])
    print('页面状态码为:%s' % one_page_dict['status'])
    print('页面语言类型为:%s ' % one_page_dict['feed']['html']['lang'])
    print('页面meta信息为:meta-content=%s meta-name=%s' % (one_page_dict['feed']['meta']['content'],one_page_dict['feed']['meta']['name']))
if __name__ == '__main__':
    # Run the demo against a few well-known sites, one at a time.
    url_list=['http://www.baidu.com','http://www.vmall.com','http://www.taobao.com']
    for one_url in url_list:
        print('当前url为--->', one_url)
        # The try/except lives inside the loop so that every URL is parsed;
        # previously it sat after the loop and only the last URL was tried.
        try:
            test(one_url)
        except Exception as exc:
            # Keep going with the next URL, but show what went wrong instead
            # of swallowing the error silently.
            print('*'*20)
            print(repr(exc))
            print('-'*20)

用 feedparser.parse 获取 Google Trends 热门搜索(Hot Trends)RSS feed 列表

GTrend.py
import feedparser
import ssl

# Country name -> code used as the `pn` query parameter of the feed URL.
dict_nations = {
    'UNITED_STATES': 'p1',
    'ARGENTINA': 'p30',
    'AUSTRALIA': 'p8',
    'AUSTRIA': 'p44',
    'BELGIUM': 'p41',
    'BRAZIL': 'p18',
    'CANADA': 'p13',
    'CHILE': 'p38',
    'COLOMBIA': 'p32',
    'CZECHIA': 'p43',
    'DENMARK': 'p49',
    'EGYPT': 'p29',
    'FINLAND': 'p50',
    'FRANCE': 'p16',
    'GERMANY': 'p15',
    'GREECE': 'p48',
    'HONG_KONG': 'p10',
    'HUNGARY': 'p45',
    'INDIA': 'p3',
    'INDONESIA': 'p19',
    'IRELAND': 'p54',
    'ISRAEL': 'p6',
    'ITALY': 'p27',
    'JAPAN': 'p4',
    'KENYA': 'p37',
    'MALAYSIA': 'p34',
    'MEXICO': 'p21',
    'NETHERLANDS': 'p17',
    'NEW_ZEALAND': 'p53',
    'NIGERIA': 'p52',
    'NORWAY': 'p51',
    'PHILIPPINES': 'p25',
    'POLAND': 'p31',
    'PORTUGAL': 'p47',
    'ROMANIA': 'p39',
    'RUSSIA': 'p14',
    'SAUDI_ARABIA': 'p36',
    'SINGAPORE': 'p5',
    'SOUTH_AFRICA': 'p40',
    'SOUTH_KOREA': 'p23',
    'SPAIN': 'p26',
    'SWEDEN': 'p42',
    'SWITZERLAND': 'p46',
    'TAIWAN': 'p12',
    'THAILAND': 'p33',
    'TURKEY': 'p24',
    'UKRAINE': 'p35',
    'UNITED_KINGDOM': 'p9',
    'VIETNAM': 'p28',
}

# Example URLs; the `pn` value is one of the codes in dict_nations.
# United States (web page):
# https://trends.google.com/trends/hottrends#pn=p1
# United States (RSS feed):
# https://trends.google.com/trends/hottrends/atom/feed?pn=p1
# United Kingdom (web page):
# https://trends.google.com/trends/hottrends#pn=p9
# United Kingdom (RSS feed):
# https://trends.google.com/trends/hottrends/atom/feed?pn=p9

# Feed URL template: the `{}` placeholder is filled with a country code
# (e.g. 'p1') via str.format.
# NOTE(review): the examples above use https://trends.google.com, while this
# template uses http://www.google.com — confirm which host/scheme is intended.
TRENDING_URL = 'http://www.google.com/trends/hottrends/atom/feed?pn={}'

def hot(feed_url):
    """Return the entry titles of the feed at *feed_url*.

    The feed is parsed with ``feedparser.parse`` and the ``title`` of every
    item under ``entries`` is collected into a list.

    Args:
        feed_url: URL of the feed to read.

    Returns:
        A list of titles, or ``None`` if anything goes wrong while parsing
        or collecting them (the error is printed, not raised).
    """
    try:
        parsed = feedparser.parse(feed_url)
        return [entry['title'] for entry in parsed['entries']]
    except Exception as err:
        # Best-effort helper: report the problem and fall through to None.
        print('ERR hot terms failed!', str(err))
    return None

if __name__ == "__main__":
    # Replace the default HTTPS context factory with the unverified one when
    # the ssl module provides it. NOTE(review): this turns off certificate
    # verification for the whole process — acceptable for a demo only.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context
    # Build the United States feed URL and print its entry titles.
    country_code = dict_nations.get("UNITED_STATES")
    feed_url = TRENDING_URL.format(country_code)
    print(hot(feed_url))

下文展示了使用 feedparser.parse 方法的代码片段

twitterbot.py
# Required module: import feedparser [as alias]
# Or: from feedparser import parse [as alias]
def run(self):
        """Parse every feed in ``self.feeds`` and tweet the matching new entries.

        Each entry accepted by ``entryMatches`` is counted in ``self.n_seen``.
        An entry whose id is not already in ``self.posted`` is passed to
        ``self.sendTweet``. When ``self.throttle`` is positive and
        ``self.n_tweeted`` has reached it, a message is printed and the
        method returns early.

        NOTE(review): ``self.n_tweeted`` and ``self.posted`` are not modified
        here; presumably ``sendTweet`` updates them — confirm.
        """
        for feed_url in self.feeds:
            for item in feedparser.parse(feed_url).entries:
                if not entryMatches(item):
                    continue
                self.n_seen += 1
                # Entries without an ID are identified by their link instead
                if "id" not in item:
                    item.id = item.link
                # Skip anything that was already posted
                if item.id in self.posted:
                    continue
                self.sendTweet(item)
                # Stop the whole run once the tweet limit has been reached
                if self.throttle > 0 and self.n_tweeted >= self.throttle:
                    print(f"Max number of papers met ({self.throttle}), stopping now")
                    return

    # Print statistics of a given run