摘要:今天介紹一下基于asyncio和aiohttp的異步爬蟲(chóng)的編寫(xiě),解析html用的是xpath。通過(guò)輸入問(wèn)題,該爬蟲(chóng)能爬取關(guān)于健康方面的數(shù)據(jù)。先讀取規(guī)則,再爬取數(shù)據(jù)。
今天介紹一下基于asyncio和aiohttp的異步爬蟲(chóng)的編寫(xiě),解析html用的是xpath。
該爬蟲(chóng)實(shí)現(xiàn)了以下功能:
1.讀取csv文件中的爬取規(guī)則,根據(jù)規(guī)則爬取數(shù)據(jù);代碼中添加了對(duì)3個(gè)網(wǎng)站的不同提取規(guī)則,如有需要,還可以繼續(xù)添加;
2.將爬取到的數(shù)據(jù)保存到mysql數(shù)據(jù)庫(kù)中。
通過(guò)輸入問(wèn)題,該爬蟲(chóng)能爬取關(guān)于健康方面的數(shù)據(jù)。
具體代碼如下:
# coding:utf-8
"""
Async health-data spider based on asyncio + aiohttp; html parsed with xpath.

Per-site crawl rules (url template and xpath expressions) are read from
rules.csv; scraped records are stored into MySQL through aiomysql.
"""
from lxml import etree
import csv
import re
import os
import asyncio
import aiohttp
import aiomysql
from datetime import datetime

from config import Config


class HealthSpider(object):
    """Crawl one site (described by a rule row from rules.csv) for a keyword."""

    def __init__(self, user_id, keyword, url, hrule, drule, count, trule):
        self.user_id = user_id
        self.keyword = keyword
        self.url = url            # entry (search-result) url of the site
        self.hrule = hrule        # xpath: item urls on the list page
        self.drule = drule        # xpath: content block on a detail page
        self.count = count        # items to keep; also discriminates the site layout
        self.trule = trule        # xpath: title on a detail page
        self.headers = ""         # replaced by a real dict in main()
        self.urls_done = []       # urls already crawled
        self.urls_will = []       # urls waiting to be crawled
        self.spider_data = {}     # result: {"user_id", "keyword", "data": [...]}

    @staticmethod
    def handle_flag(html):
        """Strip inline style="...;" attributes so the html renders cleanly.

        :param html: html fragment as a string
        :return: the fragment with style attributes removed
        """
        # BUG FIX: the original pattern literal had unescaped double quotes
        # inside a double-quoted string and did not even compile.
        pattern = re.compile(r' style=".*?;"', re.S)
        return pattern.sub("", html)

    async def get_html(self, url, session):
        """Fetch *url* and return the body text (None for non-2xx statuses).

        :param url: page to fetch
        :param session: shared aiohttp.ClientSession
        :raises Exception: on any network/timeout error
        """
        try:
            async with session.get(url, headers=self.headers, timeout=5) as resp:
                if resp.status in (200, 201):
                    return await resp.text()
        except Exception as e:
            raise Exception("數(shù)據(jù)搜索錯(cuò)誤") from e

    def get_url(self, resp):
        """Fill self.urls_will with detail-page urls taken from the list page.

        :param resp: html of the list page
        """
        root = etree.HTML(str(resp))
        items = root.xpath(self.hrule)
        # dxy.com (count == 5) yields relative urls that need the host prefixed;
        # the other configured sites already give absolute urls.
        if self.count == 5:
            self.urls_will = ["https://dxy.com" + i for i in items[:5]]
        else:
            self.urls_will = list(items[:self.count])

    async def get_data(self, url, session, pool):
        """Crawl one detail page and insert the record into MySQL.

        :param url: detail-page url
        :param session: shared aiohttp.ClientSession
        :param pool: aiomysql connection pool
        """
        html = await self.get_html(url, session)
        root = etree.HTML(str(html))
        html_data = ""
        try:
            title = "".join(root.xpath(self.trule))
        except Exception:
            title = ""
        try:
            data = root.xpath(self.drule)
            if data:
                # Sites differ in whether the rule matches many nodes or one.
                if self.count == 3:
                    html_data = "".join(map(etree.tounicode, data))
                else:
                    html_data = etree.tounicode(data[0])
                # Drop style attributes before storing/serving the fragment.
                html_data = HealthSpider.handle_flag(html_data)
        except Exception:
            # BUG FIX: was `html_data = []` — keep the type consistent (str).
            html_data = ""
        self.urls_done.append(url)
        # Persist: user id, keyword, date, main url, sub url, title, html data.
        if html_data:
            self.spider_data["data"].append({"title": title,
                                             "html_data": html_data})
            spide_date = datetime.now()
            record = (self.user_id, self.keyword, spide_date, self.url, url,
                      title, html_data)
            stmt = ("INSERT INTO spider_data (user_id, keyword, spide_date, "
                    "main_url, sub_url, title, html_data) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            try:
                async with pool.acquire() as conn:
                    async with conn.cursor() as cur:
                        await cur.execute(stmt, record)
            except Exception:
                # Best effort: a failed insert must not abort the crawl.
                pass

    async def start_spider(self, pool):
        """Crawl queued urls until self.count records have been collected.

        :param pool: aiomysql connection pool
        :return: the self.spider_data dict
        """
        async with aiohttp.ClientSession() as session:
            self.spider_data["user_id"] = self.user_id
            self.spider_data["keyword"] = self.keyword
            self.spider_data["data"] = []
            # Stop when the queue is empty or enough records were gathered.
            while self.urls_will and len(self.spider_data["data"]) != self.count:
                url = self.urls_will.pop()
                if url not in self.urls_done:
                    await self.get_data(url, session, pool)
            return self.spider_data

    async def main(self, loop):
        """Entry coroutine: open a MySQL pool, seed the url queue, crawl.

        :param loop: event loop the aiomysql pool is bound to
        :return: the crawl result dict
        """
        # Browser-like request headers.
        self.headers = {
            "Accept": "text/html, application/xhtml+xml, "
                      "application/xml;q=0.9,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-Hans-CN, zh-Hans; q=0.5",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063"
        }
        pool = await aiomysql.create_pool(host=Config.DB_HOST,
                                          port=Config.DB_PORT,
                                          user=Config.DB_USER,
                                          password=Config.DB_PASSWORD,
                                          db=Config.DB_NAME, loop=loop,
                                          charset="utf8", autocommit=True)
        async with aiohttp.ClientSession() as session:
            # Fetch the list page once and seed the url queue.
            html = await self.get_html(self.url, session)
            self.get_url(html)
            data = await self.start_spider(pool)
            return data


def get_rules(keyword):
    """Read per-site xpath crawl rules from rules.csv.

    :param keyword: search keyword interpolated into each rule's url template
    :return: list of dicts with keys url/hrule/drule/count/trule
    """
    csv_dict = []
    path = os.path.join(os.path.dirname(__file__), "rules.csv")
    # BUG FIX: mode "rU" was removed in Python 3.11; csv wants newline="".
    with open(path, "r", newline="") as f:
        for line in csv.DictReader(f):
            csv_dict.append({"url": line["url"].format(keyword),
                             "hrule": line["hrule"],
                             "drule": line["drule"],
                             "count": int(line["count"]),
                             "trule": line["trule"]})
    return csv_dict


def start_spider(keyword):
    """Crawl every configured site for *keyword* and return merged records.

    :param keyword: the health question to search for
    :return: list of {"title", "html_data"} dicts
    :raises Exception: when the rule file cannot be read
    """
    try:
        data_list = get_rules(keyword)
    except Exception as e:
        raise Exception("搜索規(guī)則獲取失敗") from e
    spider_data = []
    tasks = []
    loop = asyncio.get_event_loop()
    for rule in data_list:
        spider = HealthSpider(1, keyword, rule["url"], rule["hrule"],
                              rule["drule"], rule["count"], rule["trule"])
        # One task per site, run concurrently on the same loop.
        tasks.append(asyncio.ensure_future(spider.main(loop)))
    loop.run_until_complete(asyncio.wait(tasks))
    try:
        for task in tasks:
            spider_data.extend(task.result()["data"])
    except Exception:
        # A failed site must not discard results from the others.
        pass
    # Brief delay so underlying connections can close before the loop stops.
    loop.run_until_complete(asyncio.sleep(0.250))
    loop.close()
    return spider_data


if __name__ == "__main__":
    # Crawl content related to "what to do about a cold".
    start_spider("感冒了怎么辦")
下面講一下代碼中某些方法的作用:
1.handle_flag()方法用于去掉html字符串中的style樣式標(biāo)簽,保留html中的其他標(biāo)簽,便于前端的展示;
2.get_data()方法用于爬取具體數(shù)據(jù),并使用aiomysql將爬取到的數(shù)據(jù)保存到數(shù)據(jù)庫(kù);
數(shù)據(jù)庫(kù)的配置文件config.py:
# coding=utf-8


class Config(object):
    """Database connection settings for the spider."""

    DB_ENGINE = "mysql"
    DB_HOST = "127.0.0.1"
    DB_PORT = 3306
    DB_USER = "root"
    DB_PASSWORD = "wyzane"
    DB_NAME = "db_tornado"
    DB_OPTIONS = {
        # BUG FIX: the inner quotes were unescaped double quotes and broke
        # the string literal; single quotes keep the same SQL statement.
        "init_command": "SET sql_mode='STRICT_TRANS_TABLES'",
        "charset": "utf8mb4",
    }
3.get_rules()方法用于從rules.csv文件中讀取爬取的規(guī)則。因?yàn)檫@里同時(shí)爬取了3個(gè)不同的網(wǎng)站,由于每個(gè)網(wǎng)站解析html的xpath規(guī)則不同,并且每個(gè)網(wǎng)站提取的數(shù)據(jù)條數(shù)不同,所以把這些規(guī)則寫(xiě)到了rules.csv文件(一個(gè)可以用Excel打開(kāi)的逗號(hào)分隔文本文件)中。先讀取規(guī)則,再爬取數(shù)據(jù)。
以上就是基于asyncio的異步爬蟲(chóng)的代碼,如有錯(cuò)誤,歡迎交流指正!
文章版權(quán)歸作者所有,未經(jīng)允許請(qǐng)勿轉(zhuǎn)載,若此文章存在違規(guī)行為,您可以聯(lián)系管理員刪除。
轉(zhuǎn)載請(qǐng)注明本文地址:http://systransis.cn/yun/42252.html
摘要:這篇文章的題目有點(diǎn)大,但這并不是說(shuō)我自覺(jué)對(duì)爬蟲(chóng)這塊有多大見(jiàn)解,我只不過(guò)是想將自己的一些經(jīng)驗(yàn)付諸于筆,對(duì)于如何寫(xiě)一個(gè)爬蟲(chóng)框架,我想一步一步地結(jié)合具體代碼來(lái)講述如何從零開(kāi)始編寫(xiě)一個(gè)自己的爬蟲(chóng)框架年到如今,我花精力比較多的一個(gè)開(kāi)源項(xiàng)目算是了,這是 showImg(https://segmentfault.com/img/remote/1460000018513379); 這篇文章的題目有點(diǎn)大...
摘要:而的異步非阻塞特性能夠完美的解決這一問(wèn)題。爬蟲(chóng)機(jī)器人功能實(shí)現(xiàn)我使用編寫(xiě)的機(jī)器人是用來(lái)抓取來(lái)自游民星空的圖片。也是使用裝飾器進(jìn)行回調(diào)函數(shù)注冊(cè),使用進(jìn)行消息更新。當(dāng)沒(méi)有指令時(shí),會(huì)顯示一些能夠查看的圖片類(lèi)型。 原文鏈接 前言 aiotg 可以通過(guò)異步調(diào)用telegram api的方式來(lái)構(gòu)建bot,因?yàn)闆Q定開(kāi)發(fā)一個(gè)爬蟲(chóng)功能的bot,所以網(wǎng)絡(luò)請(qǐng)求阻塞是比較嚴(yán)重的性能障礙。而asyncio的異步非...
摘要:一般用進(jìn)程池維護(hù),的設(shè)為數(shù)量。多線(xiàn)程爬蟲(chóng)多線(xiàn)程版本可以在單進(jìn)程下進(jìn)行異步采集,但線(xiàn)程間的切換開(kāi)銷(xiāo)也會(huì)隨著線(xiàn)程數(shù)的增大而增大。異步協(xié)程爬蟲(chóng)引入了異步協(xié)程語(yǔ)法。 Welcome to the D-age 對(duì)于網(wǎng)絡(luò)上的公開(kāi)數(shù)據(jù),理論上只要由服務(wù)端發(fā)送到前端都可以由爬蟲(chóng)獲取到。但是Data-age時(shí)代的到來(lái),數(shù)據(jù)是新的黃金,毫不夸張的說(shuō),數(shù)據(jù)是未來(lái)的一切?;诮y(tǒng)計(jì)學(xué)數(shù)學(xué)模型的各種人工智能的出現(xiàn)...
摘要:開(kāi)始,加入了新的語(yǔ)法,和這兩個(gè)關(guān)鍵字,也成了標(biāo)準(zhǔn)庫(kù),這對(duì)于我們寫(xiě)異步的程序來(lái)說(shuō)就是如虎添翼,讓我們輕而易舉的實(shí)現(xiàn)一個(gè)定向抓取新聞的異步爬蟲(chóng)。網(wǎng)址池異步爬蟲(chóng)的所有流程不能單單用一個(gè)循環(huán)來(lái)完成,它是多個(gè)循環(huán)至少兩個(gè)相互作用共同完成的。 showImg(https://segmentfault.com/img/bVbsjjR?w=742&h=487); Python寫(xiě)爬蟲(chóng)是非常方便的,爬取的...
摘要:蜂鳥(niǎo)網(wǎng)圖片簡(jiǎn)介今天玩點(diǎn)新鮮的,使用一個(gè)新庫(kù),利用它提高咱爬蟲(chóng)的爬取速度。上下文不在提示,自行搜索相關(guān)資料即可創(chuàng)建一個(gè)對(duì)象,然后用該對(duì)象去打開(kāi)網(wǎng)頁(yè)??梢赃M(jìn)行多項(xiàng)操作,比如等代碼中等待網(wǎng)頁(yè)數(shù)據(jù)返回創(chuàng)建線(xiàn)程,方法負(fù)責(zé)安排執(zhí)行中的任務(wù)。 1. 蜂鳥(niǎo)網(wǎng)圖片-簡(jiǎn)介 今天玩點(diǎn)新鮮的,使用一個(gè)新庫(kù) aiohttp ,利用它提高咱爬蟲(chóng)的爬取速度。 安裝模塊常規(guī)套路 pip install aiohtt...
閱讀 2346·2021-11-23 09:51
閱讀 1152·2021-11-22 13:52
閱讀 3623·2021-11-10 11:35
閱讀 1203·2021-10-25 09:47
閱讀 3008·2021-09-07 09:58
閱讀 1073·2019-08-30 15:54
閱讀 2830·2019-08-29 14:21
閱讀 3041·2019-08-29 12:20