[Data Collection and Fusion Technology] Major Assignment 3
```python
import requests


def img_download(pic_url, count):  # download one image given its url
    if len(pic_url) > 4 and pic_url[-4] == '.':  # take the saved file's extension from the url suffix
        end_with = pic_url[-4:]
    else:
        end_with = ''
    path = './EX3_1_images/' + 'img' + str(count) + end_with
    try:  # fetch and save the image
        resp = requests.get(pic_url)
        resp = resp.content
        f = open(path, 'wb')
        f.write(resp)
        f.close()
        print('Download img', str(count), ' successfully!')
        # report success
    except Exception as err:
        print('Fail to download img', str(count),
              ' with error as ', err, '!')
        # report failure
```
Single-threaded download
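The write-up does not show the code for the single-threaded variant; the following is a minimal sketch, assuming (as in the multi-threaded code below) that the current page's image urls are collected in a list `imgs` and `cc` is the running image counter:

```python
# Single-threaded sketch: download images one after another (assumed names: imgs, cc).
for img in imgs:
    img_download(img, cc)  # blocks until this image is saved
    cc += 1
```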
Multi-threaded download
```python
import threading

if t:  # t: flag selecting the multi-threaded path
    each_page_threads = []
    for img in imgs:
        T = threading.Thread(target=img_download, args=(img, cc))
        cc += 1
        T.daemon = False  # non-daemon threads: let every download finish before exit
        T.start()
        each_page_threads.append(T)
    for each_thread in each_page_threads:
        each_thread.join()  # wait for all downloads on this page
```
Most of this overlaps with earlier assignments, so the implementation was fairly straightforward. The image-download and page-navigation logic could still be refined to avoid downloading duplicate images.
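One way to do that, not in the original code, is to keep a set of urls that have already been fetched and skip repeats; a minimal sketch:

```python
downloaded = set()  # urls already fetched (hypothetical helper, not in the original)

def img_download_once(pic_url, count):
    if pic_url in downloaded:  # seen before: skip the duplicate
        return
    downloaded.add(pic_url)
    img_download(pic_url, count)
```

If combined with the multi-threaded downloader, access to the set should be guarded with a `threading.Lock`.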
Requirement: reproduce assignment ① using the Scrapy framework.
Output: same as assignment ①.
1) Implementation process
```python
import scrapy
from scrapy.selector import Selector as selector

from ..items import Ex32Item


class Ex32Spider(scrapy.Spider):
    name = 'ex3_2'
    allowed_domains = ['www.weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']
    count = 0  # counts the items generated, i.e. the image urls scraped

    def parse(self, response):
        try:
            data = response.body.decode()
        except Exception:  # response body is not decodable text
            return
        s = selector(text=data)
        pics = s.xpath('//img/@src').extract()  # scrape the image urls
        for pic in pics:  # wrap each url in an item for the pipeline
            if self.count >= 115:  # stop once enough images are collected
                break
            if len(pic) < 5:  # skip urls too short to be valid image paths
                continue
            item = Ex32Item()
            item['url'] = pic
            item['count'] = self.count
            self.count += 1
            yield item
        links = s.xpath('//@href').extract()  # scrape links to other pages
        for link in links:
            if self.count >= 115:  # enough images, stop following links
                break
            try:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
                # parse the new page the same way, repeating until enough images are found
            except Exception:
                continue  # this link failed and more images are needed: try the next one
```
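The spider fills an `Ex32Item` with two fields. The project's items.py is not shown in the write-up, so the following is a minimal sketch of what it must at least contain:

```python
import scrapy


class Ex32Item(scrapy.Item):
    url = scrapy.Field()    # the image url to download
    count = scrapy.Field()  # running index used to name the saved file
```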
```python
import requests
from itemadapter import ItemAdapter


def img_download(pic_url, count):  # download one image given its url
    if len(pic_url) > 4 and pic_url[-4] == '.':  # take the saved file's extension from the url suffix
        end_with = pic_url[-4:]
    else:
        end_with = ''
    path = (r'D:\031904115\Data Collecting EX\EX3\EX3_2_images/'
            + 'img' + str(count) + end_with)
    try:  # fetch and save the image
        resp = requests.get(pic_url)
        resp = resp.content
        f = open(path, 'wb')
        f.write(resp)
        f.close()
        print('Download img', str(count), ' successfully!')
        # report success
    except Exception as err:
        print('Fail to download img', str(count),
              ' with error as ', err, '!')
        # report failure


class Ex32Pipeline:
    def process_item(self, item, spider):
        print(str(item['count']), ':', item['url'])  # show the image info
        img_download(item['url'], item['count'])     # download the image
        return item
```
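For `process_item` to be invoked, the pipeline must be enabled in the project's settings.py, which the write-up omits. Assuming the project package is named `ex3_2`, the registration would look like this; the same pattern applies to assignment ③'s `Ex33Pipeline` below:

```python
# settings.py (assumed package name; 300 is an arbitrary priority)
ITEM_PIPELINES = {
    'ex3_2.pipelines.Ex32Pipeline': 300,
}
```

The spider is then started from the project root with `scrapy crawl ex3_2`.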
Assignment ② was mainly about porting the assignment-① crawler into the Scrapy framework. Working through it made me more familiar with the framework and with more of the operations it supports.
| No. | Movie Title | Director | Actor | Quote | Rating | Cover |
| --- | --- | --- | --- | --- | --- | --- |
| 1 | 肖申克的救赎 | 弗兰克·德拉邦特 | 蒂姆·罗宾斯 | 希望让人自由 | 9.7 | ./imgs/xsk.jpg |
| 2 | ... | ... | ... | ... | ... | ... |
Fetching the HTML document
Scraping the movie information
Since the director and the actors sit in the same text node, extra string processing is needed to separate them.
In practice, the extraction above proved inaccurate for the actor and quote fields.
The cause is that some movies on the page have no quote or no actor information, so a flat list extraction shifts and pairs later movies with the wrong quotes.
Solution: iterate over each movie's li node and extract the quote within that node, substituting 'None' when it is absent, so a missing quote no longer shifts the list.
```python
import scrapy
from scrapy.selector import Selector as selector

from ..items import Ex33Item


class Ex33Spider(scrapy.Spider):
    name = 'ex3_3'
    allowed_domains = ['movie.douban.com']  # domain only, so the offsite filter works
    start_urls = ['https://movie.douban.com/top250/']
    url = start_urls[0][:-1]
    for i in range(1, 10):
        start_urls.append(url + '?start=' + str(i * 25))  # build and queue the urls of all ten pages

    def parse(self, response):
        try:
            data = response.body.decode()
        except Exception:  # response body is not decodable text
            return
        s = selector(text=data)
        movies = s.xpath('//ol/li')  # the li element of each movie
        ranks = movies.xpath('.//em/text()').extract()  # ranks
        director_and_actor = movies.xpath('.//div[@class="bd"]/p/text()[position()=1]').extract()  # director and actor text
        quotes = []
        scores = movies.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()').extract()  # ratings
        titles = movies.xpath('.//img/@alt').extract()  # Chinese titles
        imgs = movies.xpath('.//img/@src').extract()  # cover image urls
        for movie in movies:  # scrape the quote of each movie individually
            a = movie.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()').extract_first()
            quote = a if a else 'None'
            quotes.append(quote)
        directors = []
        actors = []
        for m in director_and_actor:  # split the director/actor text, keeping only the first director and the first actor
            m = m.replace('\n', '')  # strip newlines
            m = m.split(':')
            if len(m) > 1:
                director = m[1].split()[0]
                try:
                    actor = m[2].split()[0]
                except IndexError:  # no actor listed for this movie
                    actor = 'None'
                directors.append(director)
                actors.append(actor)
        for i in range(len(movies)):  # pack everything into items
            item = Ex33Item()
            item['rank'] = ranks[i]
            item['title'] = titles[i]
            item['director'] = directors[i]
            item['actor'] = actors[i]
            item['quote'] = quotes[i]
            item['score'] = scores[i]
            item['img'] = imgs[i]
            yield item
```
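As before, items.py is not shown; a minimal sketch matching the seven fields the spider fills:

```python
import scrapy


class Ex33Item(scrapy.Item):
    rank = scrapy.Field()      # position on the Top 250 list
    title = scrapy.Field()     # Chinese movie title
    director = scrapy.Field()  # first listed director
    actor = scrapy.Field()     # first listed actor, 'None' if absent
    quote = scrapy.Field()     # one-line quote, 'None' if absent
    score = scrapy.Field()     # rating
    img = scrapy.Field()       # cover image url
```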
```python
import sqlite3

import requests
from itemadapter import ItemAdapter


class MovieDB:
    def openDB(self):
        self.con = sqlite3.connect("movies.db")
        self.cursor = self.con.cursor()
        try:  # create the table
            self.cursor.execute("create table movies (序号 varchar(16),电影名称 varchar(16),导演 varchar(64),演员 varchar(32),简介 varchar(100),电影评分 varchar(16),电影封面 varchar(100),constraint pk_weather primary key (序号))")
        except Exception:  # table already exists: clear the old rows and refill
            self.cursor.execute("delete from movies")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, item):
        try:  # insert one row
            self.cursor.execute("insert into movies (序号,电影名称,导演,演员,简介,电影评分,电影封面) values (?,?,?,?,?,?,?)",
                                (item['rank'], item['title'], item['director'], item['actor'], item['quote'], item['score'], item['img']))
            print('Movie:', item['title'], ' saved successfully!')
        except Exception as err:
            print(err)


def img_download(pic_url, title):  # download one image given its url
    if len(pic_url) > 4 and pic_url[-4] == '.':  # take the saved file's extension from the url suffix
        end_with = pic_url[-4:]
    else:
        end_with = ''
    path = (r'D:\031904115\Data Collecting EX\EX3\EX3_3_images/'
            + 'img' + title + end_with)
    try:  # fetch and save the image
        resp = requests.get(pic_url)
        resp = resp.content
        f = open(path, 'wb')
        f.write(resp)
        f.close()
        print('Download img', title, ' successfully!')
        # report success
    except Exception as err:
        print('Fail to download img', title,
              ' with error as:', err)
        # report failure


class Ex33Pipeline:
    def __init__(self):  # create the database helper
        self.db = MovieDB()

    def open_spider(self, spider):  # open the database when the spider starts
        self.db.openDB()

    def process_item(self, item, spider):  # save the row and download the cover
        self.db.insert(item)
        img_download(item['img'], item['title'])
        return item

    def close_spider(self, spider):  # close the database when the spider finishes
        self.db.closeDB()
```
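After a run (started with `scrapy crawl ex3_3`, with the pipeline registered in settings.py as in assignment ②), the saved rows can be spot-checked directly with sqlite3; a small verification sketch:

```python
import sqlite3

con = sqlite3.connect("movies.db")
for row in con.execute("select 序号, 电影名称, 电影评分 from movies limit 3"):
    print(row)  # e.g. ('1', '肖申克的救赎', '9.7')
con.close()
```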
Assignment ③ pulled together essentially all the crawling techniques covered so far, and completing it gave me a small sense of achievement. Repeatedly uncovering mistakes along the way was a real headache, but I fixed what I could. There is still plenty lacking, and I hope that with practice I can complete tasks like this more elegantly.