摘要:有什麼不懂的麻煩去轉盤網找我,因為這個也是我開發的,上面會及時更新qq群號,這裡不留qq號啥的,以免被系統給K了。
因為要做觀點,觀點的屋子類似於知乎的話題,所以得想辦法把它給爬下來,搞了半天最終還是妥妥的搞定了,代碼是python寫的,不懂的麻煩自學哈!懂的直接看代碼,絕對可用。
#coding:utf-8 """ @author:haoning @create time:2015.8.5 """ from __future__ import division # 精確除法 from Queue import Queue from __builtin__ import False import json import os import re import platform import uuid import urllib import urllib2 import sys import time import MySQLdb as mdb from bs4 import BeautifulSoup reload(sys) sys.setdefaultencoding( "utf-8" ) headers = { "User-Agent" : "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0", "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8", "X-Requested-With":"XMLHttpRequest", "Referer":"https://www.zhihu.com/topics", "Cookie":"__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a" } DB_HOST = "127.0.0.1" DB_USER = "root" DB_PASS = "root" queue= Queue() #接收隊(duì)列 nodeSet=set() keywordSet=set() stop=0 offset=-20 level=0 maxLevel=7 counter=0 base="" conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "zhihu", charset="utf8") conn.autocommit(False) curr = conn.cursor() def get_html(url): try: req = urllib2.Request(url) response = urllib2.urlopen(req,None,3) #在這里應(yīng)該加入代理 html = response.read() return html except: pass return None def getTopics(): url = "https://www.zhihu.com/topics" print url try: req = urllib2.Request(url) response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞? 
html = response.read().decode("utf-8") print html soup = BeautifulSoup(html) lis = soup.find_all("li", {"class" : "zm-topic-cat-item"}) for li in lis: data_id=li.get("data-id") name=li.text curr.execute("select id from classify_new where name=%s",(name)) y= curr.fetchone() if not y: curr.execute("INSERT INTO classify_new(data_id,name)VALUES(%s,%s)",(data_id,name)) conn.commit() except Exception as e: print "get topic error",e def get_extension(name): where=name.rfind(".") if where!=-1: return name[where:len(name)] return None def which_platform(): sys_str = platform.system() return sys_str def GetDateString(): when=time.strftime("%Y-%m-%d",time.localtime(time.time())) foldername = str(when) return foldername def makeDateFolder(par,classify): try: if os.path.isdir(par): newFolderName=par + "http://" + GetDateString() + "http://" +str(classify) if which_platform()=="Linux": newFolderName=par + "/" + GetDateString() + "/" +str(classify) if not os.path.isdir( newFolderName ): os.makedirs( newFolderName ) return newFolderName else: return None except Exception,e: print "kk",e return None def download_img(url,classify): try: extention=get_extension(url) if(extention is None): return None req = urllib2.Request(url) resp = urllib2.urlopen(req,None,3) dataimg=resp.read() name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention top="E://topic_pic" folder=makeDateFolder(top, classify) filename=None if folder is not None: filename =folder+"http://"+name try: if "e82bab09c_m" in str(url): return True if not os.path.exists(filename): file_object = open(filename,"w+b") file_object.write(dataimg) file_object.close() return "/room/default/"+GetDateString()+"/"+str(classify)+"/"+name else: print "file exist" return None except IOError,e1: print "e1=",e1 pass except Exception as e: print "eee",e pass return None #如果沒(méi)有下載下來(lái)就利用原來(lái)網(wǎng)站的鏈接 def getChildren(node,name): global queue,nodeSet try: url="https://www.zhihu.com/topic/"+str(node)+"/hot" 
html=get_html(url) if html is None: return soup = BeautifulSoup(html) p_ch="父話(huà)題" node_name=soup.find("div", {"id" : "zh-topic-title"}).find("h1").text topic_cla=soup.find("div", {"class" : "child-topic"}) if topic_cla is not None: try: p_ch=str(topic_cla.text) aList = soup.find_all("a", {"class" : "zm-item-tag"}) #獲取所有子節(jié)點(diǎn) if u"子話(huà)題" in p_ch: for a in aList: token=a.get("data-token") a=str(a).replace(" ","").replace(" ","").replace(" ","") start=str(a).find(">") end=str(a).rfind("") new_node=str(str(a)[start+1:end]) curr.execute("select id from rooms where name=%s",(new_node)) #先保證名字絕不相同 y= curr.fetchone() if not y: print "y=",y,"new_node=",new_node,"token=",token queue.put((token,new_node,node_name)) except Exception as e: print "add queue error",e except Exception as e: print "get html error",e def getContent(n,name,p,top_id): try: global counter curr.execute("select id from rooms where name=%s",(name)) #先保證名字絕不相同 y= curr.fetchone() print "exist?? ",y,"n=",n if not y: url="https://www.zhihu.com/topic/"+str(n)+"/hot" html=get_html(url) if html is None: return soup = BeautifulSoup(html) title=soup.find("div", {"id" : "zh-topic-title"}).find("h1").text pic_path=soup.find("a",{"id":"zh-avartar-edit-form"}).find("img").get("src") description=soup.find("div",{"class":"zm-editable-content"}) if description is not None: description=description.text if (u"未歸類(lèi)" in title or u"根話(huà)題" in title): #允許入庫(kù),避免死循環(huán) description=None tag_path=download_img(pic_path,top_id) print "tag_path=",tag_path if (tag_path is not None) or tag_path==True: if tag_path==True: tag_path=None father_id=2 #默認(rèn)為雜談 curr.execute("select id from rooms where name=%s",(p)) results = curr.fetchall() for r in results: father_id=r[0] name=title curr.execute("select id from rooms where name=%s",(name)) #先保證名字絕不相同 y= curr.fetchone() print "store see..",y if not y: friends_num=0 temp = time.time() x = time.localtime(float(temp)) create_time = time.strftime("%Y-%m-%d 
%H:%M:%S",x) # get time now create_time creater_id=None room_avatar=tag_path is_pass=1 has_index=0 reason_id=None #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id ######################有資格入庫(kù)的內(nèi)容 counter=counter+1 curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)) conn.commit() #必須時(shí)時(shí)進(jìn)入數(shù)據(jù)庫(kù),不然找不到父節(jié)點(diǎn) if counter % 200==0: print "current node",name,"num",counter except Exception as e: print "get content error",e def work(): global queue curr.execute("select id,node,parent,name from classify where status=1") results = curr.fetchall() for r in results: top_id=r[0] node=r[1] parent=r[2] name=r[3] try: queue.put((node,name,parent)) #首先放入隊(duì)列 while queue.qsize() >0: n,p=queue.get() #頂節(jié)點(diǎn)出隊(duì) getContent(n,p,top_id) getChildren(n,name) #出隊(duì)內(nèi)容的子節(jié)點(diǎn) conn.commit() except Exception as e: print "what"s wrong",e def new_work(): global queue curr.execute("select id,data_id,name from classify_new_copy where status=1") results = curr.fetchall() for r in results: top_id=r[0] data_id=r[1] name=r[2] try: get_topis(data_id,name,top_id) except: pass def get_topis(data_id,name,top_id): global queue url = "https://www.zhihu.com/node/TopicsPlazzaListV2" isGet = True; offset = -20; data_id=str(data_id) while isGet: offset = offset + 20 values = {"method": "next", "params": "{"topic_id":"+data_id+","offset":"+str(offset)+","hash_id":""}"} try: msg=None try: data = urllib.urlencode(values) request = urllib2.Request(url,data,headers) response = urllib2.urlopen(request,None,5) html=response.read().decode("utf-8") json_str = json.loads(html) ms=json_str["msg"] if len(ms) <5: break msg=ms[0] except Exception as e: print "eeeee",e #print msg if msg is 
not None: soup = BeautifulSoup(str(msg)) blks = soup.find_all("div", {"class" : "blk"}) for blk in blks: page=blk.find("a").get("href") if page is not None: node=page.replace("/topic/","") #將更多的種子入庫(kù) parent=name ne=blk.find("strong").text try: queue.put((node,ne,parent)) #首先放入隊(duì)列 while queue.qsize() >0: n,name,p=queue.get() #頂節(jié)點(diǎn)出隊(duì) size=queue.qsize() if size > 0: print size getContent(n,name,p,top_id) getChildren(n,name) #出隊(duì)內(nèi)容的子節(jié)點(diǎn) conn.commit() except Exception as e: print "what"s wrong",e except urllib2.URLError, e: print "error is",e pass if __name__ == "__main__": i=0 while i<400: new_work() i=i+1
說下數據庫的問題,我這裡就不傳附件了,看字段自己建立,因為這確實太簡單了,我是用的mysql,你看自己的需求自己建。
有什麼不懂的麻煩去轉盤網找我,因為這個也是我開發的,上面會及時更新qq群號,這裡不留qq號啥的,以免被系統給K了。
文章版權歸作者所有,未經允許請勿轉載,若此文章存在違規行為,您可以聯繫管理員刪除。
轉載請註明本文地址:http://systransis.cn/yun/41199.html
摘要:話題精華即為知乎的高票回答。下面的項目中還包含了另外一個爬取的知乎的動態。 作者:William本文為原創文章,轉載請註明作者及出處 Electron 可以讓你使用純 JavaScript 調用 Chrome 豐富的原生的接口來創造桌面應用。你可以把它看作一個專注於桌面應用的 Node.js 的變體,而不是 Web 服務器。其基於瀏覽器的應用方式可以極方便的做各種響應式的交互,接下來介...
摘要:我是一個知乎輕微重度用戶,之前寫了一隻爬蟲幫我爬取並分析它的數據,我感覺這個過程還是挺有意思,因為這是一個不斷給自己創造問題又去解決問題的過程。所以這隻爬蟲還有登陸知乎搜索題目的功能。 我一直覺得,爬蟲是許多web開發人員難以迴避的點。我們也應該或多或少的去接觸這方面,因為可以從爬蟲中學習到web開發中應當掌握的一些基本知識。而且,它還很有趣。 我是一個知乎輕微重度用戶,之前寫了一隻爬...
摘要:用將倒放這次讓我們一個用做一個小工具將動態圖片倒序播放發現引力波的機構使用的包美國科學家日宣布,他們去年月首次探測到引力波。宣布這一發現的,是激光干涉引力波天文臺的負責人。這個機構誕生於上世紀年代,進行引力波觀測已經有近年。 那些年我們寫過的爬蟲 從寫 nodejs 的第一個爬蟲開始陸陸續續寫了好幾個爬蟲,從爬拉勾網上的職位信息到爬豆瓣上的租房帖子,再到去爬知乎上的妹子照片什麼的,爬蟲...
摘要:網站信息採集在編寫爬蟲之前可能需要先了解和搜集網站信息協議也稱為爬蟲協議機器人協議等的全稱是網絡爬蟲排除標準,網站通過協議告訴搜索引擎哪些頁面可以抓取,哪些頁面不能抓取。 網站信息採集 在編寫爬蟲之前可能需要先了解和搜集網站信息 robots.txt Robots協議(也稱為爬蟲協議、機器人協議等)的全稱是網絡爬蟲排除標準(Robots Exclusion Protocol),網站通過...
摘要:代碼運行完成以後,微信被打開了。能不能像前面打開知乎一樣,使用這個屬性呢也行,也不行。滑動屏幕使用的命令為,滑動屏幕需要使用坐標信息。單獨使用控制手機在 想開發網頁爬蟲,發現被反爬了?想對 App 抓包,發現數據被加密了?不要擔心,使用 Airtest 開發 App 爬蟲,只要人眼能看到,你就能抓到,最快只需要2分鐘,兼容 Unity3D、Cocos2dx-*、Android 原生 A...
閱讀 1216·2019-08-30 15:55
閱讀 964·2019-08-30 15:55
閱讀 2162·2019-08-30 15:44
閱讀 2895·2019-08-29 14:17
閱讀 1140·2019-08-29 12:45
閱讀 3316·2019-08-26 10:48
閱讀 3142·2019-08-23 18:18
閱讀 2613·2019-08-23 16:47