使用 you-get 多进程下载 cd_project_red 的所有“赛博朋克2077”相关视频
日期: 2020-12-05 分类: 跨站数据测试 1372次阅读
先贴上代码
有空补全详解
脉络:
- 进入cd_project_red的个人空间
- 找到视频信息的接口,发现返回的数据是 JSON
- 遍历所有视频的description、title、bvid
- 用正则选出所有含"赛博朋克2077"的视频,用其bvid建一个list
- 通过多进程you-get,下载list中的所有bvid(这一步有更好建议,希望大佬留言)
import sys
from you_get import common as you_get #导入you-get库
import requests
import random
import re
from concurrent.futures import ProcessPoolExecutor
# Pool of desktop Chrome User-Agent strings. get_bvid_raw() picks one at
# random for every request it sends.
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Module-level accumulator: get_bvid_raw() appends every matching bvid here
# and main() reads it to schedule the downloads.
bvids = []
# NOTE(review): not referenced anywhere in the visible script; kept in case
# other code relies on the name.
bvids_list = []
def _iter_video_records(node):
    """Yield, in document order, every JSON object under *node* that has a "bvid" key."""
    if isinstance(node, dict):
        if "bvid" in node:
            yield node
        for value in node.values():
            yield from _iter_video_records(value)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_video_records(item)


def get_bvid_raw(i):
    """Collect the bvids of all "赛博朋克2077" videos on result page *i*.

    Fetches page *i* of the space search API for mid 271442527, parses the
    JSON response, and appends the ``bvid`` of every video whose title or
    description contains the keyword to the module-level ``bvids`` list.

    The title, description and bvid are read from the *same* JSON object.
    Extracting them with three separate regular expressions and zipping the
    results pairs the wrong bvid with a title as soon as any of the keys
    appears an extra time or is missing somewhere in the payload.

    Args:
        i: 1-based page number substituted into the ``pn`` query parameter.

    Returns:
        The shared module-level ``bvids`` list (all matches collected so
        far, not only the ones from this page).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    keyword = "赛博朋克2077"
    headers = {
        'User-Agent': random.choice(user_agents)
    }
    domain = "https://api.bilibili.com/x/space/arc/search?mid=271442527&ps=30&tid=0&pn={}&keyword=&order=pubdate&jsonp=jsonp".format(i)
    # A timeout keeps a stalled connection from hanging the whole script.
    response = requests.get(domain, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of reporting "0 videos found".
    response.raise_for_status()
    payload = response.json()

    for record in _iter_video_records(payload):
        bvid = record.get("bvid")
        searchable = (record.get("title"), record.get("description"))
        matches = any(
            isinstance(text, str) and keyword in text for text in searchable
        )
        if isinstance(bvid, str) and matches:
            bvids.append(bvid)
    return bvids
def download(bvid, directory=r'E:\1'):
    """Download one Bilibili video (and its playlist) with you-get.

    you-get is driven through its command-line entry point, so the arguments
    are passed by temporarily replacing ``sys.argv``.

    Args:
        bvid: Video id appended to ``https://www.bilibili.com/video/``.
        directory: Output directory handed to you-get's ``-o`` option.
            Defaults to the original hard-coded ``E:\\1``.
    """
    url = 'https://www.bilibili.com/video/' + bvid
    saved_argv = sys.argv
    # Equivalent to running: you-get --playlist -o <directory> <url>
    sys.argv = ['you-get', '--playlist', '-o', directory, url]
    try:
        you_get.main()
    finally:
        # Restore the caller's argv even if you-get raises or exits, so that
        # calling download() in-process does not leak the fake command line.
        sys.argv = saved_argv
def main(pages=6, max_workers=48):
    """Collect matching bvids from the first *pages* result pages and download them.

    Args:
        pages: Number of result pages to scan, starting at page 1.
        max_workers: Size of the process pool used for the downloads.
    """
    for i in range(1, pages + 1):
        get_bvid_raw(i)
        # Running total after each page.
        print("第1-{}页".format(i)+"有"+str(len(bvids)))
    print(len(bvids))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        jobs = [(bvid, executor.submit(download, bvid)) for bvid in bvids]
        # executor.map() only re-raises worker errors when its result iterator
        # is consumed, so an unconsumed map() hides every failed download.
        # Checking each future makes the failures visible.
        for bvid, job in jobs:
            error = job.exception()
            if error is not None:
                print("下载失败: {} ({!r})".format(bvid, error))
# Run only when executed as a script. main() starts a ProcessPoolExecutor, and
# under the "spawn" start method (the default on Windows) each worker
# re-imports this module, so the entry point must not run at import time.
if __name__ == "__main__":
    main()
除特别声明,本站所有文章均为原创,如需转载请以超级链接形式注明出处:SmartCat's Blog
精华推荐