

Python: grabbing hot topics from "Zhihu", after cracking newbie accounts on "Caoliu"

Enlightenment / 1642 reads


The previous post was a fun one: a Python script for credential-stuffing accounts on the domestic forum "Caoliu": https://www.52pojie.cn/thread...
Many readers pointed out that the forum uses Google verification, so even a cracked account is of little use. In fact, freshly registered accounts still work; for the credential-stuffing attack on "Caoliu" accounts itself, please refer to the previous post.

* * *

This time, let's look at how to scrape topics from "Zhihu". Zhihu is about as hot as community sites get right now, and its content quality is relatively high compared with other communities. We often need to crawl its trending topics, and not just for show: your own project may happen to need exactly this.
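Before diving into the old script, here is what the topic-list parsing boils down to. A minimal Python 3 sketch using only the standard library; the `zm-topic-cat-item` class and `data-id` attribute follow the markup the script below targets, but the parser class, sample HTML, and topic names are my own invention:

```python
from html.parser import HTMLParser

class TopicCatParser(HTMLParser):
    """Collect (data_id, name) pairs from <li class="zm-topic-cat-item" data-id="...">name</li>."""
    def __init__(self):
        super().__init__()
        self.topics = []      # collected (data_id, name) pairs
        self._current = None  # data-id of the <li> we are inside, if any

    def handle_starttag(self, tag, attrs):
        a = dict(attrs)
        if tag == "li" and "zm-topic-cat-item" in a.get("class", ""):
            self._current = a.get("data-id")

    def handle_data(self, data):
        if self._current is not None and data.strip():
            self.topics.append((self._current, data.strip()))
            self._current = None

# Made-up sample markup in the shape of the real topic category list.
sample = ('<ul><li class="zm-topic-cat-item" data-id="253">生活方式</li>'
          '<li class="zm-topic-cat-item" data-id="27">經濟學</li></ul>')
p = TopicCatParser()
p.feed(sample)
print(p.topics)  # [('253', '生活方式'), ('27', '經濟學')]
```

The script below does the same thing with BeautifulSoup's `find_all("li", {"class": "zm-topic-cat-item"})`, which is more tolerant of messy real-world HTML.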

Between programmers, code speaks loudest. Here it is; let's study it together:

"""
@author:haoning
[url=home.php?mod=space&uid=365491]@create[/url] time:2015.8.5
"""
from future import division # 精確除法
from Queue import Queue
from builtin import False
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding( "utf-8" )

headers = {
"User-Agent" : "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
"X-Requested-With":"XMLHttpRequest",
"Referer":"https://www.zhihu.com/topics",
"Cookie":"__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a"
}

DB_HOST = "127.0.0.1"
DB_USER = "root"
DB_PASS = "root"

queue = Queue()  # work queue
nodeSet = set()
keywordSet = set()
stop = 0
offset = -20
level = 0
maxLevel = 7
counter = 0
base = ""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, "zhihu", charset="utf8")
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None

def getTopics():
    url = "https://www.zhihu.com/topics"
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be added here
        html = response.read().decode("utf-8")
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all("li", {"class": "zm-topic-cat-item"})

        for li in lis:
            data_id = li.get("data-id")
            name = li.text
            curr.execute("select id from classify_new where name=%s", (name,))
            y = curr.fetchone()
            if not y:
                curr.execute("INSERT INTO classify_new(data_id,name) VALUES(%s,%s)", (data_id, name))
        conn.commit()
    except Exception as e:
        print "get topic error", e
     

def get_extension(name):
    where = name.rfind(".")
    if where != -1:
        return name[where:]
    return None

def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when = time.strftime("%Y-%m-%d", time.localtime(time.time()))
    foldername = str(when)
    return foldername

def makeDateFolder(par, classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + "\\" + GetDateString() + "\\" + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + "/" + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
    return None

def download_img(url, classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E:\\topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "\\" + name
        try:
            if "e82bab09c_m" in str(url):  # the default placeholder avatar, skip it
                return True
            if not os.path.exists(filename):
                file_object = open(filename, "w+b")
                file_object.write(dataimg)
                file_object.close()
                return "/room/default/" + GetDateString() + "/" + str(classify) + "/" + name
            else:
                print "file exist"
                return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "eee", e
    return None  # if the download failed, fall back to the original site's link

def getChildren(node, name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = "父話題"  # "parent topic" marker text on the page
        node_name = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
        topic_cla = soup.find("div", {"class": "child-topic"})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all("a", {"class": "zm-item-tag"})  # all child nodes
                if u"子話題" in p_ch:  # "child topics"
                    for a in aList:
                        token = a.get("data-token")
                        a = str(a).replace("\n", "").replace("\t", "").replace("\r", "")
                        start = str(a).find(">")
                        end = str(a).rfind("<")
                        new_node = str(str(a)[start + 1:end])
                        curr.execute("select id from rooms where name=%s", (new_node,))  # make sure the name is unique
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e
     
 

def getContent(n, name, p, top_id):
    try:
        global counter
        curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title = soup.find("div", {"id": "zh-topic-title"}).find("h1").text
            pic_path = soup.find("a", {"id": "zh-avartar-edit-form"}).find("img").get("src")
            description = soup.find("div", {"class": "zm-editable-content"})
            if description is not None:
                description = description.text

            if u"未歸類" in title or u"根話題" in title:  # still allow these into the DB, but avoid an infinite loop
                description = None

            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if tag_path is not None:  # True means "image skipped but topic still qualifies"
                if tag_path == True:
                    tag_path = None
                father_id = 2  # default parent: misc
                curr.execute("select id from rooms where name=%s", (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute("select id from rooms where name=%s", (name,))  # make sure the name is unique
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # current time
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    # content qualified for the database
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit immediately, otherwise the parent node cannot be found
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e

def work():
    global queue
    curr.execute("select id,node,parent,name from classify where status=1")
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        node = r[1]
        parent = r[2]
        name = r[3]
        try:
            queue.put((node, name, parent))  # seed the queue
            while queue.qsize() > 0:
                n, name, p = queue.get()  # dequeue the head node
                getContent(n, name, p, top_id)
                getChildren(n, name)  # enqueue the dequeued node's children
            conn.commit()
        except Exception as e:
            print "what's wrong", e
         

def new_work():
    global queue
    curr.execute("select id,data_id,name from classify_new_copy where status=1")
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except:
            pass

def get_topis(data_id, name, top_id):
    global queue
    url = "https://www.zhihu.com/node/TopicsPlazzaListV2"
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {"method": "next",
                  "params": json.dumps({"topic_id": int(data_id), "offset": offset, "hash_id": ""})}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode("utf-8")
                json_str = json.loads(html)
                ms = json_str["msg"]
                if len(ms) < 5:
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e
            #print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all("div", {"class": "blk"})
                for blk in blks:
                    page = blk.find("a").get("href")
                    if page is not None:
                        node = page.replace("/topic/", "")  # store more seeds
                        parent = name
                        ne = blk.find("strong").text
                        try:
                            queue.put((node, ne, parent))  # seed the queue
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # dequeue the head node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the dequeued node's children
                            conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e
         
     

if __name__ == "__main__":
    i = 0
    while i < 400:
        new_work()
        i = i + 1
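The trickiest line in the script is the POST body sent to `/node/TopicsPlazzaListV2`, whose `params` field is a JSON string that must be quoted exactly right. Building it with `json.dumps` sidesteps the quoting problem entirely. A Python 3 sketch (the endpoint and field names follow the script above; the helper name is mine):

```python
import json
from urllib.parse import urlencode

def page_payload(topic_id, offset):
    """Build the form body for one page of the topic plaza listing (20 items per page)."""
    params = {"topic_id": int(topic_id), "offset": offset, "hash_id": ""}
    return urlencode({"method": "next", "params": json.dumps(params)})

# Pages are fetched at offsets 0, 20, 40, ... until the server returns fewer results.
body = page_payload("253", 20)
print(body)
```

The hand-concatenated string in the original is equivalent when it works, but one misplaced quote silently produces invalid JSON that the server rejects.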

The code is quite simple; anyone with a bit of Python background can follow it, and the comments are clear. Feel free to discuss and study it quietly. My humble offering.
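The crawl order underneath it all (seed a queue, dequeue a topic, store it, enqueue its children) is a plain breadth-first traversal. A minimal Python 3 sketch over an in-memory tree; the topic names and tree here are made up for illustration:

```python
from collections import deque

# A made-up topic tree standing in for Zhihu's parent/child topic links.
children = {
    "root": ["technology", "life"],
    "technology": ["python", "crawlers"],
    "life": [],
    "python": [],
    "crawlers": [],
}

def crawl_order(seed):
    """Visit topics breadth-first, like the queue loop in the script above."""
    queue = deque([seed])
    visited = []
    while queue:
        node = queue.popleft()   # dequeue the head node
        visited.append(node)     # "store" it (the script inserts into MySQL here)
        queue.extend(children.get(node, []))  # enqueue its child topics
    return visited

print(crawl_order("root"))  # ['root', 'technology', 'life', 'python', 'crawlers']
```

Breadth-first order is what guarantees a parent topic is already in the `rooms` table before any of its children look it up for `father_id`.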

文章版權(quán)歸作者所有,未經(jīng)允許請勿轉(zhuǎn)載,若此文章存在違規(guī)行為,您可以聯(lián)系管理員刪除。

When reprinting, please cite the original address: http://systransis.cn/yun/42793.html

