脚本宝典收集整理的这篇文章主要介绍了 Python 笔趣阁爬虫案例,脚本宝典觉得挺不错的,现在分享给大家,也给大家做个参考。
一个爬虫练习案例, 不做任何商用.侵删.
主体:
"""
这个爬虫脚本可以在笔趣阁中搜索相应的小说并爬取。
针对 https://www.biqugeq.com/ 使用的爬虫程序,仅作为练习,不做任何商用。
"""
import os
import time

import requests
from lxml import etree
from prettytable import PrettyTable

from bqg_config import Config
class BQG:
    """Interactive crawler that searches the site for a novel and saves it.

    Attributes:
        _c: ``Config`` instance holding the URLs, XPath expressions, request
            headers and delay used by the crawler.
        _bookName: Book title typed by the user; used as the search query
            and as the output file name.
        _session: ``requests`` session shared by every request, carrying
            the headers taken from the configuration.
        f: Output text file, opened in append mode with UTF-8 encoding.
    """

    # Site watermark / advertisement fragments removed from chapter text.
    _AD_SNIPPETS = (
        "(https://www.biqumo.com/0_269/2243417.html)",
        "请记住本书首发域名:https://www.biqumo.com。笔趣阁手机版阅读网址:https://m.biqumo.com",
        "(https://www.biqumo.com/2_2784/57553374.html)",
    )

    def __init__(self):
        """Prompt for the book title, prepare the session, open the output.

        The home page is requested once before anything else so that the
        session has visited the site before the search request is sent.
        """
        self._c = Config()
        self._bookName = input("请输入你要搜索的书名: \t")
        self._session = requests.session()
        self._session.headers = self._c.headers
        self._session.get(self._c.head_url)
        # os.path.join replaces a hand-written "\" separator, which was an
        # invalid escape sequence inside a normal string literal.
        path = os.path.join(self.path(), "{}.txt".format(self._bookName))
        print("当前保存路径为: \t" + path)
        self.f = open(path, "a", encoding="utf8")

    def start(self):
        """Run the crawl: search, pick a book, download every chapter.

        The output file is closed on every exit path, including the
        "nothing found" case and any exception raised while downloading.
        """
        try:
            search_page = self._session.get(
                self._c.search_url + self._bookName)
            url = self.searchHtmlPrint(res=etree.HTML(search_page.text))
            if not url:
                # Nothing was selected, so there is no chapter list to fetch.
                return
            page_list = self.getHrefList(url)
            count = len(page_list)
            for done, page_url in enumerate(page_list, start=1):
                self.save(url=page_url)
                self.sleep()
                # Count finished chapters so the bar ends at 100%.
                num = int(done / count * 100)
                print('\r当前进度:{0}{1}%'.format('▉' * num, num), end='')
        finally:
            self.f.close()

    def searchHtmlPrint(self, res):
        """Show the search results and return the URL of the chosen book.

        Args:
            res: Parsed search-result page (an lxml element tree).

        Returns:
            The absolute URL of the selected book, or an empty string when
            the search has no results or the entered index is not valid.
        """
        tables = res.xpath(self._c.searchTable)
        if not tables:
            print("暂无此书!")
            return ""
        table = tables[0]
        count = len(table.xpath(self._c.searchTableCount))
        if count <= 0:
            print("暂无此书!")
            return ""

        books = table.xpath(self._c.searchTableBook)
        authors = table.xpath(self._c.searchTableAuthor)
        self.tablePrint(count=count, books=books, authors=authors)

        num = input("请输入图书序号开始下载: \t").strip()
        # Only a plain number inside the listed range may reach the XPath.
        if not num.isdecimal() or not 1 <= int(num) <= count:
            print("序号无效!")
            return ""
        hrefs = table.xpath(self._c.href(num=int(num)))
        if not hrefs:
            print("序号无效!")
            return ""
        return self._c.head_url + hrefs[0]

    def sleep(self):
        """Pause for the configured delay between two chapter requests."""
        time.sleep(self._c.timeSleep)

    @staticmethod
    def tablePrint(count, books, authors):
        """Print the found books as a numbered table.

        Args:
            count: Maximum number of rows to print.
            books: Book titles, in result order.
            authors: Author names, parallel to ``books``.
        """
        table = PrettyTable(['序号', '书名', '作者名'])
        rows = zip(books[:count], authors[:count])
        for index, (book, author) in enumerate(rows, start=1):
            table.add_row([index, book, author])
        print(table)

    def path(self):
        """Ask for the output directory and make sure it exists.

        Returns:
            The directory typed by the user, or the configured default when
            the user just presses Enter. The directory is created when it
            is missing, so the returned path is always usable.
        """
        input_path = input(
            "默认保存路径为 D:\\novel 目录下 ,若不更改请直接按回车, 更改直接输入即可. \t"
        ).strip()
        path = input_path or self._c.path
        os.makedirs(path, exist_ok=True)
        return path

    def _getEtree(self, url):
        """Download ``url`` with the shared session and parse it with lxml.

        Args:
            url: Absolute URL of the page to fetch.

        Returns:
            The parsed document as returned by ``etree.HTML``.
        """
        res = self._session.get(url)
        # NOTE(review): "gbk123" is not a codec name Python knows, so this
        # line does not select GBK decoding. Confirm whether "gbk" was
        # intended before changing it; the value is kept as found.
        res.encoding = "gbk123"
        return etree.HTML(res.text)

    def getHrefList(self, url):
        """Return the absolute URLs of the chapters listed on a book page.

        Args:
            url: Absolute URL of the book's chapter index.

        Returns:
            A list of absolute chapter URLs in page order.
        """
        # The first 12 links are skipped; presumably a "latest chapters"
        # block that repeats entries from the full list - TODO confirm.
        not_head_list = self._getEtree(url).xpath(self._c.book_list)[12:]
        return [self._c.head_url + href for href in not_head_list]

    def save(self, url):
        """Fetch one chapter and append its title and text to the file.

        Args:
            url: Absolute URL of the chapter page.
        """
        info = self._getEtree(url)
        titles = info.xpath(self._c.info_name)
        # A page without a title still gets its text saved.
        info_name = titles[0] if titles else ""

        parts = []
        for chunk in info.xpath(self._c.info):
            # Drop ideographic-space indentation and embedded newlines.
            chunk = chunk.replace("\u3000", "").replace("\n", "")
            for snippet in self._AD_SNIPPETS:
                chunk = chunk.replace(snippet, "")
            parts.append(chunk)

        self.f.write(info_name)
        self.f.write("\r\n\r\n")
        self.f.write("".join(parts))
        self.f.write("\r\n\r\n")
if __name__ == "__main__":
    # Script entry point: build the interactive crawler and run it.
    BQG().start()
配置:
"""
专门存放 bqg_search 中使用到的 xpath 的一个类
"""
class Config:
    """Settings used by the BQG crawler, kept apart so they are easy to edit.

    Every value is exposed through a property. Only ``path`` has a setter;
    all other values are read-only.

    Attributes:
        _headers: Request headers (a browser User-Agent).
        _timeSleep: Delay in seconds between two chapter requests.
        _head_url: Site root, prefixed to the relative links found in pages.
        _search_url: Search endpoint; the query text is appended to it.
        _searchTable: XPath of the result list on the search page.
        _searchTableBook: XPath of the book titles inside the result list.
        _searchTableAuthor: XPath of the author names inside the result list.
        _searchTableCount: XPath of the result rows, counted to decide
            whether anything was found.
        _path: Default output directory.
        _book_list: XPath of the chapter links on a book page.
        _info_name: XPath of the chapter title on a chapter page.
        _info: XPath of the chapter text on a chapter page.
    """

    def __init__(self):
        """Populate all settings with their built-in values."""
        self._headers = {
            # Browser-like User-Agent so requests are not rejected outright.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/96.0.4664.110 Safari/537.36",
        }
        self._timeSleep = 2
        self._head_url = "https://www.biqugeq.com"
        self._search_url = "https://www.biqugeq.com/search/?ie=gbk&siteid=xszww.com&q="
        self._searchTable = '//div[@class="l bd"]/ul'
        self._searchTableBook = 'li/span[2]/a/text()'
        self._searchTableAuthor = 'li/span[4]/text()'
        self._searchTableCount = '//div[@class="l bd"]/ul/li'
        # The backslash is escaped: "\n" here would be a newline character.
        self._path = "D:\\novel"
        self._book_list = '//div[@class="listmain"]/dl/dd/a/@href'
        self._info_name = '//div[@class="content"]/h1/text()'
        self._info = '//div[@id="content"]/text()'

    @staticmethod
    def href(num):
        """Return the XPath of the link of result row ``num`` (1-based)."""
        return 'li[{}]/span[2]/a/@href'.format(num)

    @property
    def searchTable(self):
        """XPath of the search result list (read-only)."""
        return self._searchTable

    @property
    def searchTableBook(self):
        """XPath of the book titles in the result list (read-only)."""
        return self._searchTableBook

    @property
    def searchTableAuthor(self):
        """XPath of the author names in the result list (read-only)."""
        return self._searchTableAuthor

    @property
    def searchTableCount(self):
        """XPath of the result rows (read-only)."""
        return self._searchTableCount

    @property
    def timeSleep(self):
        """Delay in seconds between chapter requests (read-only)."""
        return self._timeSleep

    @property
    def head_url(self):
        """Site root URL (read-only)."""
        return self._head_url

    @property
    def search_url(self):
        """Search URL to which the query is appended (read-only)."""
        return self._search_url

    @property
    def headers(self):
        """Request headers dictionary (read-only)."""
        return self._headers

    @property
    def path(self):
        """Default output directory; the only setting that can be changed."""
        return self._path

    @path.setter
    def path(self, path):
        self._path = path

    @property
    def book_list(self):
        """XPath of the chapter links on a book page (read-only)."""
        return self._book_list

    @property
    def info_name(self):
        """XPath of the chapter title (read-only)."""
        return self._info_name

    @property
    def info(self):
        """XPath of the chapter text (read-only)."""
        return self._info
以上是脚本宝典为你收集整理的python笔趣阁爬虫案例全部内容,希望文章能够帮你解决python笔趣阁爬虫案例所遇到的问题。
本图文内容来源于网友网络收集整理提供,作为学习参考使用,版权属于原作者。
如您有任何意见或建议可联系处理。小编QQ:384754419,请注明来意。