Python爬蟲實戰(zhàn):6個主流小說平臺爬取教程總結(jié)
很多小伙伴學(xué)習(xí)Python的初衷就是為了爬取小說,方便又快捷~
辣么今天咱們來分享6個主流小說平臺的爬取教程~
一、流程步驟
流程基本都差不多,只是看網(wǎng)站具體加密反爬,咱們再進行解密。
實現(xiàn)爬蟲的第一步:
1、去抓包分析,分析數(shù)據(jù)在什么地方。
1. 打開開發(fā)者工具
2. 刷新網(wǎng)頁
3. 找數(shù)據(jù) --> 通過關(guān)鍵字搜索
2、獲取小說內(nèi)容
1. 目標(biāo)網(wǎng)址
2. 獲取網(wǎng)頁源代碼請求小說鏈接地址,解析出來 。
3. 請求小說內(nèi)容數(shù)據(jù)包鏈接:
4. 獲取加密內(nèi)容 --> ChapterContent
5. 進行解密 --> 分析加密規(guī)則 是通過什么樣方式 什么樣代碼進行加密
3、獲取響應(yīng)數(shù)據(jù)
response.text 獲取文本數(shù)據(jù) 字符串
response.json() 獲取json數(shù)據(jù) 完整json數(shù)據(jù)格式
response.content 獲取二進制數(shù)據(jù) 圖片 視頻 音頻 特定格式文件
二、案例
1、書旗
環(huán)境模塊
[環(huán)境使用]:Python 3.8、Pycharm;[模塊使用]:requests、execjs、re
源碼展示
# ---- Shuqi (書旗) novel scraper ----
# Flow: fetch the table-of-contents page, pull chapter ids/names with a
# regex, then for each chapter request the encrypted body and decrypt it
# by calling the site's own JS routine through PyExecJS.

# Data-request module
import requests
# Regex module
import re
# Runs JavaScript from Python (the decryption lives in JS)
import execjs

# Masquerade as a normal browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.3'
}
# Request link -> table-of-contents page (URL redacted by the author)
html = '網(wǎng)址屏蔽了,不然過不了'
# Send the request and grab the page source
html_ = requests.get(url=html, headers=headers).text
# Novel title
name = re.findall(r'<title>(.*?)-書旗網(wǎng)</title>', html_)[0]
# Chapter ids and chapter names
info = re.findall(r'data-clog="chapter\$\$chapterid=(\d+)&bid=8826245">\d+\.(.*?)</a>', html_, re.S)
print(name)

# Compile the decryption JS ONCE, before the loop (the original re-opened
# and re-compiled the file on every chapter and never closed the handle).
with open('書旗.js', encoding='utf-8') as f:
    js_code = execjs.compile(f.read())

for chapter_id, index in info:
    title = index.strip()
    print(chapter_id, title)
    # Chapter reader page
    url = f'https://網(wǎng)址屏蔽了,不然過不了/reader?bid=8826245&cid={chapter_id}'
    # Send request -> <Response [200]> response object
    response = requests.get(url=url, headers=headers)
    html_data = response.text
    # Suffix that builds the real content-API link
    data = re.findall(r'contUrlSuffix":"\?(.*?)","shelf', html_data)[0].replace('amp;', '')
    # Build the chapter data-package link
    link = 'https://c13.網(wǎng)址屏蔽了,不然過不了.com/pcapi/chapter/contentfree/?' + data
    json_data = requests.get(url=link, headers=headers).json()
    # Key lookup: the encrypted chapter body
    ChapterContent = json_data['ChapterContent']
    # Decrypt via the site's JS, then normalise <br/> markup into newlines
    result = js_code.call('_decodeCont', ChapterContent).replace('<br/><br/>', '\n').replace('<br/>', '')
    # Append the chapter to the novel's text file
    with open(f'{name}.txt', mode='a', encoding='utf-8') as v:
        v.write(title)
        v.write('\n')
        v.write(result)
        v.write('\n')
    print(result)
效果展示
2、塔讀
環(huán)境模塊
[環(huán)境使用]:Python 3.8、Pycharm;[模塊使用]:requests --> pip install requests、execjs --> pip install pyexecjs、re
源碼
# ---- Tadu (塔讀) novel scraper ----
# Flow: fetch the index page, extract chapter ids/titles, compute the
# anti-ad "data-limit" token via the site's JS, then pull each chapter's
# JSON content, strip the injected ad paragraphs and save the text.

# Data-request module
import requests
# Regex module
import re
# Runs JavaScript from Python
import execjs

# Masquerade as a normal browser
headers = {
    'Host': '網(wǎng)址屏蔽了,以免不過',
    'Referer': '網(wǎng)址屏蔽了,以免不過',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
}
# Request link (URL redacted by the author)
link = '網(wǎng)址屏蔽了,以免不過'
link_data = requests.get(url=link, headers=headers).text
# Novel title
name = re.findall(r'book_name" content="(.*?)">', link_data)[0]
# Chapter ids and chapter names (first 9 anchors are not chapters)
info = re.findall(r'href="/book/\d+/(\d+)/" target="_blank">(.*?)</a>', link_data)[9:]

# Compile the data-limit JS ONCE, before the loop (the original re-opened
# and re-compiled the file per chapter and leaked the file handle).
with open('塔讀.js', encoding='utf-8') as j:
    js_code = execjs.compile(j.read())

page = 1
for chapter_id, title in info:
    print(chapter_id, title)
    # Ad-marker parameter computed by the site's own JS
    data_limit = js_code.call('o', chapter_id)
    print(data_limit)
    # Chapter content endpoint (redacted)
    url = f'網(wǎng)址屏蔽了,以免不過/{page}'
    # Send request -> <Response [200]> means success
    response = requests.get(url=url, headers=headers)
    # JSON response -> dict
    json_data = response.json()
    content = json_data['data']['content']
    # Strip ad paragraphs (basic version -- may need upgrading later)
    content_1 = re.sub(f'<p data-limit="{data_limit}">.*?</p>', '', content)
    # Extract the remaining paragraphs of actual novel text
    result = re.findall('<p data-limit=".*?">(.*?)</p>', content_1)
    # Merge the list into one string
    string = '\n'.join(result)
    # Append the chapter to the novel's text file
    with open(f'{name}.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(string)
        f.write('\n')
    print(string)
    page += 1
效果展示
3、飛盧
環(huán)境模塊
[環(huán)境使用]:Python 3.8、Pycharm;[模塊使用]:requests >>> 數(shù)據(jù)請求模塊、parsel >>> 數(shù)據(jù)解析模塊、re 正則表達式
源碼展示
# Data-request module
import requests
# HTML parsing module
import parsel
# Regular expressions
import re
import base64


def get_content(img):
    """Run OCR on chapter-image bytes and return the recognised text.

    Fetches an OAuth access token, posts the base64-encoded image to the
    OCR endpoint, and joins the recognised lines with newlines.
    """
    token_url = "https://aip.網(wǎng)址屏蔽,不然不過審.com/oauth/2.0/token"
    token_params = {
        "grant_type": "client_credentials",
        "client_id": "",
        "client_secret": ""
    }
    access_token = str(requests.post(token_url, params=token_params).json().get("access_token"))
    # The OCR API wants the image as a base64 string
    img_b64 = base64.b64encode(img).decode("utf-8")
    ocr_url = "網(wǎng)址屏蔽,不然不過審" + access_token
    ocr_headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    ocr_response = requests.post(url=ocr_url, headers=ocr_headers, data={'image': img_b64})
    # One recognised line per entry in words_result
    return '\n'.join(line['words'] for line in ocr_response.json()['words_result'])
# Masquerade as a normal browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}
# Table-of-contents page (URL redacted by the author)
link = '網(wǎng)址屏蔽,不然不過審'
# Fetch the page text and wrap it in a parsel selector in one go
link_selector = parsel.Selector(requests.get(url=link, headers=headers).text)
# Book title
name = link_selector.css('#novelName::text').get()
# Every chapter link on the page
href = link_selector.css('.DivTr a::attr(href)').getall()
# Iterate the chapter links (the first 58 anchors are not chapters).
# Free chapters ship as plain HTML text; VIP chapters come back as an
# image, so those are routed through the OCR helper get_content().
for index in href[58:]:
    url = 'https:' + index
    print(url)
    # Send request -> <Response [200]> response object
    response = requests.get(url=url, headers=headers)
    html_data = response.text
    # Turn the HTML text into a parsable selector object
    selector = parsel.Selector(html_data)
    # Chapter title (copy the CSS path straight from devtools)
    title = selector.css('.c_l_title h1::text').get()
    # Chapter paragraphs -- only populated for plain-text chapters
    content_list = selector.css('div.noveContent p::text').getall()
    if len(content_list) > 2:
        # Free chapter: merge the paragraph list into one string
        content = '\n'.join(content_list)
    else:
        # VIP chapter: the text is rendered as an image, so request the
        # image and run OCR on it.
        info = re.findall(r"image_do3\((.*?)\)", html_data)[0].split(',')
        img = 'https://read.faloo.com/Page4VipImage.aspx'
        img_data = {
            'num': '0',
            'o': '3',
            'id': '724903',
            'n': info[3],
            'ct': '1',
            'en': info[4],
            't': '0',
            'font_size': '16',
            'font_color': '666666',
            'FontFamilyType': '1',
            'backgroundtype': '0',
            'u': '15576696742',
            'time': '',
            'k': info[6].replace("'", ""),
        }
        img_content = requests.get(url=img, params=img_data, headers=headers).content
        # Text recognition: extract the words from the image
        content = get_content(img=img_content)
    # Single save path for both branches (the original duplicated this block)
    with open(name + '.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
識別效果
4、縱橫中文
環(huán)境模塊
解釋器: python 3.8
編輯器: pycharm 2022.3
crypto-js
requests
源碼展示
# ---- Zongheng (縱橫中文) novel scraper ----
# Flow: fetch the chapter page, pull the hidden encrypted payload out of
# the #ejccontent div, then decrypt it with the site's crypto-js routine.

import execjs
import requests
import re

# Session cookies (left empty by the author -- fill in your own)
cookies = {
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': '網(wǎng)址屏蔽了,不過審',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

response = requests.get('網(wǎng)址屏蔽了,不過審', cookies=cookies, headers=headers)
html_data = response.text
# Encrypted chapter payload hidden in an invisible div
i = re.findall('<div style="display:none" id="ejccontent">(.*?)</div>', html_data)[0]
# Read the decryption JS with a context manager (the original leaked the handle)
with open('demo.js', mode='r', encoding='utf-8') as js_file:
    ctx = execjs.compile(js_file.read())
result = ctx.call('sdk', i)
print(result)
5、筆趣閣
模塊環(huán)境
[相關(guān)模塊]:<第三方模塊>requests >>> pip install requests、parsel;<內(nèi)置模塊>re;[開發(fā)環(huán)境]:環(huán)境:python 3.8,編輯器:pycharm 2021.2
源碼展示
# ---- Biquge (筆趣閣) novel scraper ----
# Flow: fetch the book index, zip chapter titles with their links, then
# download each chapter, strip the site's boilerplate and save it.

import requests  # third-party module: pip install requests
import parsel    # third-party module
import re        # built-in module

url = 'https://網(wǎng)址屏蔽/book/88109/'
# Masquerade: user-agent carries the browser's basic identity
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
# Send request -> response object
response = requests.get(url=url, headers=headers)
print(response)
selector = parsel.Selector(response.text)
# Chapter titles
title = selector.css('.zjlist dd a::text').getall()
# Chapter links
link = selector.css('.zjlist dd a::attr(href)').getall()

# Pair each title with its link
for name, p in zip(title, link):
    passage_url = '網(wǎng)址屏蔽' + p
    # Fetch the chapter page
    response_1 = requests.get(url=passage_url, headers=headers)
    # Extract the raw chapter HTML
    re_data = re.findall('<div id="content"> (.*?)</div>', response_1.text)[0]
    # Strip the site's self-promotion boilerplate
    text = re_data.replace('筆趣閣 www.網(wǎng)址屏蔽.net,最快更新<a href="https://網(wǎng)址屏蔽/book/88109/">盜墓筆記 (全本)</a>', '')
    text = text.replace('最新章節(jié)!<br><br>', '').replace(' ', '')
    # Turn paragraph breaks into newlines
    text = text.replace('<br /><br />', '\n')
    print(text)
    passage = name + '\n' + text
    # BUG FIX: the original wrote an empty string here, so nothing was
    # ever saved; also open with an explicit encoding.
    with open('盜墓筆記.txt', mode='a', encoding='utf-8') as file:
        file.write(passage)
6、起點
環(huán)境模塊
python3.8 解釋器版本
pycharm 代碼編輯器
requests 第三方模塊
代碼展示
# ---- Qidian (起點) novel scraper ----
# Flow: fetch the chapter page, collect the five arguments the site's JS
# decryptor needs, call it via PyExecJS, then save the chapter text.

import re
import requests  # third-party module: install separately
import subprocess
from functools import partial
# Work around execjs encoding errors on Windows; must run BEFORE importing execjs
subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")
import execjs

headers = {
    # BUG FIX: the original had a bareword here (a syntax error);
    # paste your own cookie string -- the author removed theirs.
    'cookie': ''
}
# Compile the site's decryption JS (read via a context manager so the
# file handle is closed -- the original leaked it)
with open('起點.js', mode='r', encoding='utf-8') as js_file:
    ctx = execjs.compile(js_file.read())
url = 'https://網(wǎng)址屏蔽/chapter/1035614679/755998264/'
response = requests.get(url=url, headers=headers)
html_data = response.text
# The five arguments the 'sdk' decryption function expects
arg1 = re.findall('"content":"(.*?)"', html_data)[0]
arg2 = url.split('/')[-2]
arg3 = '0'
arg4 = re.findall('"fkp":"(.*?)"', html_data)[0]
arg5 = '1'
result = ctx.call('sdk', arg1, arg2, arg3, arg4, arg5)
print(result)
# Pull the chapter body and normalise the escaped paragraph markers
text = re.findall('"content":"(.*?)","riskInfo"', html_data)[0]
text = text.replace('\\u003cp>', '\n')
# Save with a context manager (original never closed the file)
with open('1.txt', mode='w', encoding='utf-8') as f:
    f.write(text)
源碼我都打包好了,還有詳細(xì)視頻講解,文末名片自取,備注【小說】快速通過。
好了,今天的分享就到這里了,下次見~