您当前的位置:首页 > IT编程 > python
| C语言 | Java | VB | VC | python | Android | TensorFlow | C++ | oracle | 学术与代码 | cnn卷积神经网络 | gnn | 图像修复 | Keras | 数据集 | Neo4j | 自然语言处理 | 深度学习 | 医学CAD | 医学影像 | 超参数 | pointnet | pytorch | 异常检测 | Transformers | 情感分类 | 知识图谱 |

自学教程:python 爬取影视网站下载链接

51自学网 2021-10-30 22:37:05
  python
这篇教程python 爬取影视网站下载链接写得很实用,希望能帮到您。

项目地址:

https://github.com/GriffinLewis2001/Python_movie_links_scraper

运行效果

导入模块

import requests,refrom requests.cookies import RequestsCookieJarfrom fake_useragent import UserAgentimport os,pickle,threading,timeimport concurrent.futuresfrom goto import with_goto

爬虫主代码

# Browser-like request headers: the site rejects clients without a desktop UA.
SEND_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
}


def _fetch_html(url):
    """GET *url* with browser headers and the site cookie; return decoded HTML.

    Shared helper replacing the header/cookie/request boilerplate that was
    duplicated across get_content_url_name, get_content and get_page.
    """
    cookie_jar = RequestsCookieJar()
    # Hard-coded anti-bot cookie copied from a browser session -- presumably
    # expires eventually; refresh it if requests start failing. TODO confirm.
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # BUG FIX: the original passed the header dict positionally, which
    # requests.get() binds to `params` (the query string), not `headers`,
    # so the headers were never actually sent.
    response = requests.get(url, headers=SEND_HEADERS, cookies=cookie_jar)
    response.encoding = 'utf-8'
    return response.text


def get_content_url_name(url):
    """Return [(href, title), ...] for every thumbnail link on *url*."""
    content = _fetch_html(url)
    reg = re.compile(r'<a href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  class="thumbnail-img" title="(.*?)"')
    return reg.findall(content)


def get_content(url):
    """Return the UTF-8 decoded HTML body of *url*."""
    return _fetch_html(url)


def search_durl(url):
    """Resolve a detail-page *url* into its list of download links.

    The page embeds an obfuscated token as {'\\x64...\\x6d':'...'} -- hex
    escapes spelling 'decriptParam' -- which is forwarded to the site's
    /downloadList endpoint to obtain the page listing the actual links.
    """
    content = get_content(url)
    # Double backslashes: the page source contains literal '\x64' sequences.
    reg = re.compile(r"{'\\x64\\x65\\x63\\x72\\x69\\x70\\x74\\x50\\x61\\x72\\x61\\x6d':'(.*?)'}")
    index = reg.findall(content)[0]
    # Strip the trailing '.html' (5 chars) and append the download route.
    download_url = url[:-5] + '/downloadList?decriptParam=' + index
    content = get_content(download_url)
    reg1 = re.compile('title=".*?" \nhref="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow" ')
    return reg1.findall(content)


def get_page(url):
    """Return [(href, title, text), ...] for the search-result links on *url*."""
    content = _fetch_html(url)
    reg = re.compile(r'<a target="_blank" class="title" href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  title="(.*?)">(.*?)</a>')
    return reg.findall(content)


def main():
    """One interactive round: prompt for a title, list matches, print links.

    Rewritten with structured control flow; the original used the
    third-party `goto` module's @with_goto / `label .end` hack, which is
    not valid standard Python.
    """
    print("=========================================================")
    name = input("请输入剧名(输入quit退出):")
    if name == "quit":
        exit()
    url = "http://www.yikedy.co/search?query=" + name
    dlist = get_page(url)
    print("\n")
    # Keep only results whose title contains the query string.
    matches = [item for item in dlist if name in item[1]]
    if not matches:
        # Replaces the original goto '.end' label.
        print("没找到或不想看\n")
        return
    for num, item in enumerate(matches):
        print(f"{num} {item[1]}")
    dest = int(input("\n\n请输入剧的编号(输100跳过此次搜寻):"))
    if dest == 100:
        print("没找到或不想看\n")
        return
    print("\n以下为下载链接:\n")
    # Out-of-range numbers other than 100 fall through silently,
    # matching the original behaviour.
    if 0 <= dest < len(matches):
        for durl in search_durl(matches[dest][0]):
            print(f"{durl}\n")
        print("\n")

完整代码

"""Scrape download links for a TV show / movie from www.yikedy.co."""

import concurrent.futures  # NOTE(review): unused in this script
import os                  # NOTE(review): unused in this script
import pickle              # NOTE(review): unused in this script
import re
import threading           # NOTE(review): unused in this script
import time                # NOTE(review): unused in this script

import requests
from fake_useragent import UserAgent  # NOTE(review): unused in this script
from requests.cookies import RequestsCookieJar

# `from goto import with_goto` removed: main() now uses structured control
# flow instead of the non-standard goto/label hack.

# Browser-like request headers: the site rejects clients without a desktop UA.
SEND_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
}


def _fetch_html(url):
    """GET *url* with browser headers and the site cookie; return decoded HTML.

    Shared helper replacing the header/cookie/request boilerplate that was
    duplicated across get_content_url_name, get_content and get_page.
    """
    cookie_jar = RequestsCookieJar()
    # Hard-coded anti-bot cookie copied from a browser session -- presumably
    # expires eventually; refresh it if requests start failing. TODO confirm.
    cookie_jar.set("mttp", "9740fe449238", domain="www.yikedy.co")
    # BUG FIX: the original passed the header dict positionally, which
    # requests.get() binds to `params` (the query string), not `headers`,
    # so the headers were never actually sent.
    response = requests.get(url, headers=SEND_HEADERS, cookies=cookie_jar)
    response.encoding = 'utf-8'
    return response.text


def get_content_url_name(url):
    """Return [(href, title), ...] for every thumbnail link on *url*."""
    content = _fetch_html(url)
    reg = re.compile(r'<a href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  class="thumbnail-img" title="(.*?)"')
    return reg.findall(content)


def get_content(url):
    """Return the UTF-8 decoded HTML body of *url*."""
    return _fetch_html(url)


def search_durl(url):
    """Resolve a detail-page *url* into its list of download links.

    The page embeds an obfuscated token as {'\\x64...\\x6d':'...'} -- hex
    escapes spelling 'decriptParam' -- which is forwarded to the site's
    /downloadList endpoint to obtain the page listing the actual links.
    """
    content = get_content(url)
    # Double backslashes: the page source contains literal '\x64' sequences.
    reg = re.compile(r"{'\\x64\\x65\\x63\\x72\\x69\\x70\\x74\\x50\\x61\\x72\\x61\\x6d':'(.*?)'}")
    index = reg.findall(content)[0]
    # Strip the trailing '.html' (5 chars) and append the download route.
    download_url = url[:-5] + '/downloadList?decriptParam=' + index
    content = get_content(download_url)
    reg1 = re.compile('title=".*?" \nhref="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow" ')
    return reg1.findall(content)


def get_page(url):
    """Return [(href, title, text), ...] for the search-result links on *url*."""
    content = _fetch_html(url)
    reg = re.compile(r'<a target="_blank" class="title" href="(.*?)" rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  rel="external nofollow"  title="(.*?)">(.*?)</a>')
    return reg.findall(content)


def main():
    """One interactive round: prompt for a title, list matches, print links.

    Rewritten with structured control flow; the original used the
    third-party `goto` module's @with_goto / `label .end` hack, which is
    not valid standard Python.
    """
    print("=========================================================")
    name = input("请输入剧名(输入quit退出):")
    if name == "quit":
        exit()
    url = "http://www.yikedy.co/search?query=" + name
    dlist = get_page(url)
    print("\n")
    # Keep only results whose title contains the query string.
    matches = [item for item in dlist if name in item[1]]
    if not matches:
        # Replaces the original goto '.end' label.
        print("没找到或不想看\n")
        return
    for num, item in enumerate(matches):
        print(f"{num} {item[1]}")
    dest = int(input("\n\n请输入剧的编号(输100跳过此次搜寻):"))
    if dest == 100:
        print("没找到或不想看\n")
        return
    print("\n以下为下载链接:\n")
    # Out-of-range numbers other than 100 fall through silently,
    # matching the original behaviour.
    if 0 <= dest < len(matches):
        for durl in search_durl(matches[dest][0]):
            print(f"{durl}\n")
        print("\n")


if __name__ == "__main__":
    # Guarded entry point: the original ran the loop at import time.
    print("本软件由CLY.所有\n\n")
    while True:
        main()

以上就是python 爬取影视网站下载链接的详细内容,更多关于python 爬取下载链接的资料请关注51zixue.net其它相关文章!


Python爬虫之爬取我爱我家二手房数据
pytorch中DataLoader()过程中遇到的一些问题
万事OK自学网:51自学网_软件自学网_CAD自学网自学excel、自学PS、自学CAD、自学C语言、自学css3实例,是一个通过网络自主学习工作技能的自学平台,网友喜欢的软件自学网站。