江西居道科技有限公司主營業務包含網站建設、APP開發、小程序開發、網絡推廣、SEO優化。網編人員免不了要幫客戶進行一些網站維護操作,但是,各行各業特性不同,我們同事每次幫客戶維護網站時,都需要獲取大量的素材,圖片性質的素材倒是好辦,直接上百度圖片上去找,但是,文字內容就不好弄了。
我們的網絡編輯人員以往都是去一些客戶同行網站上搜集相關的素材,然后稍加整理,但是,這種方式需要消耗大量的人力,而且沒什么技巧可言,純粹是人工操作;有鑒于此,我們程序開發人員采用 Python 寫了一個多線程分頁采集網站段落內容的腳本工具,根據設定好的參數自動對指定網站進行采集,提取網站上的段落內容,并將內容保存到本機,現公布相關代碼,方便大家使用,轉載請注明出處!
#!/usr/bin/python
"""Multi-threaded, paginated paragraph scraper.

Settings are read from ``config.json`` in the working directory.  For every
listing page the script collects the ``<a href>`` targets, keeps the ones that
pass the configured URL filters, downloads each of them, strips the markup and
appends every paragraph whose length lies strictly between ``textminsize`` and
``textmaxsize`` to the results file (``RESULT_FILE``).
"""
import json
import os  # noqa: F401  (unused here; kept from the original import list)
import re
import sys  # noqa: F401  (unused here; kept from the original import list)
import threading
import time
from urllib.parse import urljoin

import colorama
import requests

colorama.init(autoreset=True)

CONFIG_FILE = 'config.json'
RESULT_FILE = "采集結果.txt"
PAGE_PLACEHOLDER = '[pageindex]'
REQUEST_TIMEOUT = 3  # seconds, applied to every HTTP request

# Load the configuration.  The encoding is explicit so that the non-ASCII
# entries are decoded the same way on every platform.
with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)


def toInt(num):
    """Return ``int(num)``, treating the empty string as 0.

    Raises:
        ValueError / TypeError: propagated from ``int()`` for other
            non-numeric values.
    """
    if num != '':
        return int(num)
    return 0


start_ = toInt(data['start'])               # first page index
end_ = toInt(data['end'])                   # last page index
url_ = data['url']                          # entry URL, may contain [pageindex]
urlinclude = data['urlinclude']             # a link must contain one of these
urlunclude = data['urlunclude']             # a link must contain none of these
textinclude = data['textinclude']           # a page must contain this text
textunclude = data['textunclude']           # a paragraph must contain none of these
textreplace = data['textreplace']           # substrings removed from a paragraph
textminsize = toInt(data['textminsize'])    # exclusive lower bound on paragraph length
textmaxsize = toInt(data['textmaxsize'])    # exclusive upper bound on paragraph length
encoding_ = data['encoding']                # page encoding
starttag = data['starttag']                 # marker where the content starts
endtag = data['endtag']                     # marker where the content ends
sleepTime = toInt(data['sleep'])            # delay between requests, in seconds
jsonkey = data['jsonkey']                   # field holding the HTML in a JSON reply
headers_ = data['headers']                  # HTTP request headers
todayStr = time.strftime("%Y%m%d", time.localtime())
total = 0                                   # number of paragraphs saved so far
if encoding_ == '':
    encoding_ = 'utf-8'

# Worker threads share ``total`` and append to the same files.
_io_lock = threading.Lock()

# Compiled once; both patterns are applied to every downloaded page.
_HREF_RE = re.compile(r'<a.*?href="(.*?)".*?', re.I)
_TAG_RE = re.compile(r'</?\w+[^>]*>')


def doLog(vstr):
    """Append a timestamped line to today's log file.

    An empty ``vstr`` writes a blank separator line instead.
    """
    with _io_lock:
        with open(todayStr + ".log", 'a', encoding='utf-8') as fo:
            if vstr != "":
                stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                fo.write(stamp + "\t" + vstr + "\n")
            else:
                fo.write("\n")


def saveText(vstr):
    """Filter one paragraph and append it to the results file.

    The paragraph is dropped when it contains any non-empty entry of
    ``textunclude``; otherwise every non-empty entry of ``textreplace`` is
    removed from it before it is written.

    Returns:
        ``""`` when the paragraph was rejected by ``textunclude``; ``None``
        otherwise (including when ``vstr`` is empty and nothing is written).
    """
    global total
    if vstr == '':
        return None
    for banned in textunclude:
        if banned != '' and banned in vstr:
            return ""
    for junk in textreplace:
        if junk != '':
            vstr = vstr.replace(junk, '')
    print("\033[0;32;40m\t 收集的內容長度:" + str(len(vstr)) + "\t\033[0m ")
    # The counter update and the append are done under the lock because
    # several worker threads call this function concurrently.
    with _io_lock:
        total += 1
        with open(RESULT_FILE, 'a', encoding='utf-8') as fo:
            fo.write(vstr + "\n")
    return None


def _fetch(url):
    """GET ``url``; return the response, or ``None`` when the request fails."""
    try:
        response = requests.get(url, headers=headers_ or {},
                                timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # A single unreachable page must not abort the rest of the crawl.
        print("\033[0;31;40m\t" + url + "\t請求失敗\033[0m ")
        doLog('請求失敗:' + url + "\t" + str(err))
        return None
    # Decode with the configured page encoding rather than requests' guess.
    response.encoding = encoding_
    return response


def _listing_html(vurl):
    """Return the HTML of a listing page, or ``None`` when it is unavailable.

    When ``jsonkey`` is set the reply is parsed as JSON and the HTML is taken
    from that field.
    """
    response = _fetch(vurl)
    if response is None:
        return None
    if jsonkey == '':
        return response.text
    try:
        payload = response.json()
    except ValueError:
        payload = None
    fragment = payload.get(jsonkey, '') if isinstance(payload, dict) else ''
    if not isinstance(fragment, str) or fragment == '':
        print("================== ERROR ===================")
        return None
    return fragment


def _link_allowed(href):
    """Apply the ``urlunclude`` / ``urlinclude`` filters to a raw href."""
    for banned in urlunclude:
        if banned in href:
            print("\033[0;31;40m\t" + href + "\t無效\033[0m ")
            return False
    if len(urlinclude) > 0:
        return any(required in href for required in urlinclude)
    return True


def _slice_content(html_):
    """Cut ``html_`` down to the span between ``starttag`` and ``endtag``.

    A marker that is empty or not found leaves that side of the document
    uncut.
    """
    if starttag == "" and endtag == "":
        return html_
    begin = 0
    finish = len(html_)
    if starttag != "":
        # find() returns -1 when the marker is absent; clamp it so the slice
        # does not degenerate to the last character of the page.
        begin = max(html_.find(starttag), 0)
    if endtag != "":
        finish = html_.find(endtag, begin)
    if begin >= finish:
        finish = len(html_)
    return html_[begin:finish]


def _clean(fragment):
    """Normalise non-breaking spaces and trim surrounding whitespace."""
    # NOTE(review): the published listing shows a space-for-space replace
    # here, which does nothing; presumably the first argument was originally
    # "&nbsp;" or U+00A0 and was lost in extraction -- confirm against the
    # distributed script.  Both forms are normalised.
    return fragment.replace("&nbsp;", " ").replace("\u00a0", " ").strip()


def _harvest(page_url, html_):
    """Strip the markup from ``html_`` and save the paragraphs that fit."""
    # Force a break after every paragraph so one-line HTML still splits.
    html_ = html_.replace('</p>', "</p>\r\n")
    html_ = _TAG_RE.sub('', html_)
    for chunk in html_.split("\r\n"):
        para = _clean(chunk)
        if textminsize < len(para) < textmaxsize:
            saveText(para)
        elif len(para) > textmaxsize:
            print(page_url + '的內容長度為:' + str(len(para)))
            # Too long: try again on single line breaks.
            for piece in para.replace("\r", "\n").split("\n"):
                sub = _clean(piece)
                print('當前段落長度為:' + str(len(sub)))
                if textminsize < len(sub) < textmaxsize:
                    saveText(sub)
        else:
            print('段落不符合設定要求' + str(len(para)))


def getFromUrl(vurl):
    """Crawl one listing page and harvest every linked page that qualifies.

    Args:
        vurl: URL of the listing page; an empty string is ignored.
    """
    if vurl == '':
        return
    print('即將從' + vurl + '頁面獲取可用鏈接')
    listing = _listing_html(vurl)
    if listing is None:
        return
    # A set drops duplicate links.
    for href in set(_HREF_RE.findall(listing)):
        if not _link_allowed(href):
            continue
        # Resolve relative and protocol-relative links against the listing URL.
        target = urljoin(vurl, href)
        if not target.lower().startswith(('http://', 'https://')):
            continue  # javascript:, mailto:, bare fragments, ...
        if sleepTime > 0:
            print('延時' + str(sleepTime) + '秒后開始采集')
            time.sleep(sleepTime)
        doLog('開始采集:' + target)
        response = _fetch(target)
        html_ = response.text if response is not None else ''
        if html_ != '':
            if len(textinclude) > 0 and textinclude not in html_:
                print(target + "\t不存在特定內容,視為無效!")
            else:
                _harvest(target, _slice_content(html_))
        # NOTE(review): the original indentation was lost; this print is
        # assumed to belong to the accepted-link branch -- confirm.
        print(target)
    print(vurl + " 采集完成,退出線程\n")


def main():
    """Start one worker per page (or crawl the single URL) and report totals."""
    print("程序成功啟動")
    first_page = max(start_, 1)
    last_page = max(end_, first_page)
    started = time.time()
    print('江西居道科技有限公司為您提供技術服務,m.xhjnt.cn,轉載請注明出處')
    if PAGE_PLACEHOLDER in url_:
        workers = [
            threading.Thread(
                target=getFromUrl,
                args=(url_.replace(PAGE_PLACEHOLDER, str(page)),),
            )
            for page in range(first_page, last_page + 1)
        ]
        for worker in workers:
            worker.start()
            if sleepTime > 0:
                print('延時' + str(sleepTime) + '秒后繼續')
                time.sleep(sleepTime)
        # Wait here so the summary is printed exactly once, after every
        # worker has finished -- including workers that ended early.
        for worker in workers:
            worker.join()
    else:
        getFromUrl(url_)
    print('任務已完成,共用時:' + '%.2f' % (time.time() - started) + 's')
    print('共計:' + str(total))


if __name__ == "__main__":
    main()
此外,還需要一個 config.json 配置文件,用來設定一些參數信息,代碼如下:
{"start":1,"end":2,"url":"http://m.xhjnt.cn/articleslist.html","urlinclude":["jsruixi/vip_doc"],"urlunclude":[],"textinclude":"</h1>","textunclude":["___","www.","://"],"textreplace":["南京","1、","2、","3、","4、","5、","6、","7、","8、","9、","①、","①.","②、","②.","③、","③.","④、","④.","⑤、","⑤.","⑥、","⑥.","⑦、","⑦.","⑧、","⑧.","⑨、","⑨.","⑩、","⑩.","⑴、","⑴.","⑵、","⑵.","⑶、","⑶.","⑷、","⑷.","⑸、","⑸.","⑹、","⑹.","⑺、","⑺.","⑻、","⑻.","⑼、","⑼.","⑽、","⑽.","一、","一.","二、","二.","三、","三.","四、","四.","五、","五.","六、","六.","七、","七.","八、","八.","九、","九.","十、","十.","1)、","1).","2)、","2).","3)、","3).","4)、","4).","5)、","5).","6)、","6).","7)、","7).","8)、","8).","①","⑴","1)","②","⑵","2)","③","⑶","3)","④","⑷","4)","⑤","⑸","5)","⑥","⑹","6)","⑦","⑺","7)","⑧","⑻","8)","⑨","⑼","⑩","⑽","(1)","(2)","(3)","(4)","(5)","(6)","(7)","(8)","(9)","(10)"],"textminsize":100,"textmaxsize":300,"encoding":"utf-8","starttag":"</h1>","endtag":"<div class=\"p-details-pre-nex\" id=\"pDetailsPreNext\">","sleep":3,"jsonkey":"","headers":{"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3","Accept - Encoding":"gzip, deflate, br","Accept-Language":"zh-CN,zh;q=0.9","Connection":"Keep-Alive","Host":"m.xhjnt.cn","User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}}
相關腳本提供下載,點擊下載
運行效果如下圖: