These are just rough notes and still need to be organized properly; I will update them later.
The approach below is entry-level and involves a fair amount of manual work.
First, install the required packages (the later examples also use the lxml parser, which needs pip3 install lxml):
pip3 install requests
pip3 install beautifulsoup4
1. Scraping Autohome (autohome.com.cn)
#!/usr/bin/env python
# coding:utf-8
import requests
from bs4 import BeautifulSoup

# 1. Download the page
ret = requests.get(url="https://www.autohome.com.cn/news/")
# print(ret)                      # a Response object
# ret.encoding = "gbk"            # set the encoding explicitly
# print(ret.apparent_encoding)
ret.encoding = ret.apparent_encoding  # use the encoding detected from the page itself
# print(ret.text)

# 2. Parse: pull out the content we want with BeautifulSoup
soup = BeautifulSoup(ret.text, 'html.parser')  # the 'lxml' parser is faster if installed

# To filter by class, use the class_ keyword (note the trailing underscore)
# div = soup.find(name='div', id='auto-channel-lazyload-article', class_='article-wrapper')  # find the outer DIV
div = soup.find(name='div', attrs={"id": "auto-channel-lazyload-article", "class": "article-wrapper"})  # attrs-dict style

li_list = div.find_all(name='li')
for li in li_list:
    h3 = li.find(name='h3')
    if not h3:
        continue
    print(h3.text)
    a = li.find('a')
    # print(a.attrs)
    print(a.get('href'))
    p = li.find(name='p')
    print(p.text)
    print('----->' * 20)
    img = li.find(name='img')
    src = img.get('src')
    filename = src.rsplit('__', maxsplit=1)[1]
    down_img = requests.get(url='https:' + src)
    with open(filename, 'wb') as f:
        f.write(down_img.content)
Of course, from the for-loop output onward you can write the content to a file or a database instead, depending on what you need.
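For example, here is a minimal sketch of writing each item into a CSV file rather than printing it; the file name news.csv and the column names are my own choices, not anything taken from the page:

import csv
import requests
from bs4 import BeautifulSoup

ret = requests.get(url="https://www.autohome.com.cn/news/")
ret.encoding = ret.apparent_encoding
soup = BeautifulSoup(ret.text, 'html.parser')
div = soup.find(name='div', attrs={"id": "auto-channel-lazyload-article"})

# Write one title / link / summary row per news item into news.csv (hypothetical file name)
with open('news.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'link', 'summary'])
    for li in div.find_all(name='li'):
        h3 = li.find(name='h3')
        if not h3:
            continue
        writer.writerow([h3.text, li.find('a').get('href'), li.find(name='p').text])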
2. Logging in to Chouti (dig.chouti.com)
#!/usr/bin/env python
# coding:utf-8
import requests

# Always send request headers; visit an ordinary page first,
# and make the request look as much like a real browser as possible.

# 1. Visit the page first and grab the (not yet authorized) cookie
ret = requests.get(
    url="https://dig.chouti.com/all/hot/recent/1",
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    })
# print(ret.text)
r1_cookie_dict = ret.cookies.get_dict()

# 2. Log in: send the username and password, carrying the unauthorized cookie.
#    Watch out for anti-crawler measures.
response_login = requests.post(
    url="https://dig.chouti.com/login",
    data={
        "phone": "8618912600100",
        "password": "wodemima",
        "oneMonth": "1"
    },
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    },
    cookies=r1_cookie_dict)
# print(response_login.text)
# cookie_dict = response_login.cookies.get_dict()  # the cookie returned by the second request

# Upvote a post
r1 = requests.post(
    url="https://dig.chouti.com/link/vote?linksId=20630611",
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'},
    cookies=r1_cookie_dict)
print(r1.text)
# {"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53074732774","likedTime":"1530752755154000","lvCount":"21","nick":"aabbccdd","uvCount":"1","voteTime":"小于1分钟前"}}}
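The same cookie bookkeeping can also be done with requests.Session, which stores and resends cookies automatically; a minimal sketch using the same URLs and form fields as above (the phone number and password are still just placeholders):

import requests

session = requests.Session()
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
})

# First visit: the session records the unauthorized cookie by itself
session.get("https://dig.chouti.com/all/hot/recent/1")

# Log in: the stored cookie is sent automatically, and any new cookie is kept
session.post("https://dig.chouti.com/login",
             data={"phone": "8618912600100", "password": "wodemima", "oneMonth": "1"})

# Upvote: still no manual cookie handling needed
r = session.post("https://dig.chouti.com/link/vote?linksId=20630611")
print(r.text)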
A few small requests and bs4 snippets:
#!/usr/bin/env python
# coding:utf-8
import requests, re
from bs4 import BeautifulSoup

'''
requests.get(url="http://www.baidu.com")   # same as requests.request(method="get", url="xxx")
requests.post(url="http://www.baidu.com")  # same as requests.request(method="post", url="xxx")

Parameters you can pass:
    url:     the address
    params:  parameters appended to the URL query string
    headers: request headers
    cookies: cookies
    data:    request body data
The ones above must be memorized.
'''
ret = requests.get(
    url="https://www.baidu.com/s",
    params={"wd": "王历宏"},  # params are URL-encoded, e.g. https://www.baidu.com/s?wd=%E6%9D%8E%E5%81%A5
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    },
)
ret.encoding = ret.apparent_encoding
# print(ret.text)
soup = BeautifulSoup(ret.text, 'html.parser')
div = soup.find(name='span', attrs={"class": "nums_text"})
# lis = re.findall(r"\d+", div.text)
# print("".join(lis))
print(div.text)

'''
### The json parameter
requests.post(
    url="http://www.baidu.com",
    # json={
    #     'name': 'alex',
    #     'passwd': '123456',
    # },
    headers={},
    cookies={},
    # If you are not sure whether the server expects form data or a JSON payload,
    # use the form below (requires: import json).
    data=json.dumps({
        'name': 'alex',
        'pwd': '123456',
    }))
'''

## Uploading files

# auth: HTTP basic auth (the browser login popup)
from requests.auth import HTTPBasicAuth, HTTPDigestAuth

res = requests.get(
    'https://api.github.com/user', auth=HTTPBasicAuth("abc@163.com", "11223344")
    # 'https://api.github.com/user', auth=HTTPDigestAuth("abc@163.com", "11223344")  # a different auth scheme
)
print(res.text)

# timeout: how long to wait before giving up
# allow_redirects: whether to follow redirects

## proxies
'''
proxies = {
    "http": "61.172.249.96:80",
    "https": "http://61.185.219.126:3128",
}
ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
proxies2 = {"http://10.20.1.128": "http://10.10.1.10:5323"}
'''

# A proxy dict plus username/password authentication
'''
from requests.auth import HTTPProxyAuth

proxy_dict = {
    'http': '77.75.105.165',
    'https': '77.75.105.166'
}
auth = HTTPProxyAuth('username', 'mypwd')
r = requests.get("http://www.google.com", proxies=proxy_dict, auth=auth)
'''
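The timeout, allow_redirects and proxies parameters noted above combine naturally with a try/except, so that one bad request does not crash the crawler. A rough sketch with a hypothetical fetch() helper (the proxy address in the comment is only a placeholder):

import requests

def fetch(url, proxies=None):
    # Hypothetical helper: returns the page text on success, None on any request error
    try:
        resp = requests.get(
            url,
            headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'},
            timeout=5,              # give up after 5 seconds
            allow_redirects=True,   # follow 3xx redirects (this is the default)
            proxies=proxies,        # e.g. {"http": "http://10.0.0.1:8080"} -- placeholder address
        )
        resp.raise_for_status()     # treat 4xx/5xx responses as errors
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.RequestException as e:
        print("request failed:", e)
        return None

html = fetch("https://www.autohome.com.cn/news/")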
The homework I handed in still has quite a few problems.
#!/usr/bin/env python
# coding:utf-8
import requests
from bs4 import BeautifulSoup

username = input("Enter your GitHub username: ")
pwd = input("Enter your GitHub password: ")
print("Please wait a few seconds...")

# 1. Open the login page
ret1 = requests.get(
    url="https://github.com/login",
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    })
r1_cookie_dict = ret1.cookies.get_dict()  # cookies from the first request
soup1 = BeautifulSoup(ret1.text, features='lxml')
token1 = soup1.find(name="input", attrs={"name": "authenticity_token"}).get("value")  # CSRF token from the page
# print(token1)  # check that authenticity_token was found

# 2. Perform the login
ret2 = requests.post(
    url="https://github.com/session",
    data={
        "commit": "Sign in",
        "utf8": "✓",
        "authenticity_token": token1,
        "login": username,
        "password": pwd,
    },
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    },
    cookies=r1_cookie_dict  # carry the first-request cookies
)
r2_cookie_dict = ret2.cookies.get_dict()  # cookies returned after a successful login
# print(ret2.text)  # this step is indeed a bit slow

# 3. The assignment asks for personal info, so open the settings/profile page
ret3 = requests.get(
    url="https://github.com/settings/profile",
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    },
    cookies=r2_cookie_dict  # carry the post-login cookies
)
# print(ret3.text)

# 4. Find and print the profile info
soup3 = BeautifulSoup(ret3.text, features='lxml')
user_info_name = soup3.find(name="input", attrs={"name": "user[profile_name]"}).get("value")
user_info_email = soup3.find(name="select", attrs={"name": "user[profile_email]"}).get("option")  # probably wrong: .get() reads an attribute, not the <option> children
user_info_bio = soup3.find(name="textarea", attrs={"name": "user[profile_bio]"}).get("value")
user_info_url = soup3.find(name="input", attrs={"name": "user[profile_blog]"}).get("value")
user_info_company = soup3.find(name="input", attrs={"name": "user[profile_company]"}).get("value")
user_info_location = soup3.find(name="input", attrs={"name": "user[profile_location]"}).get("value")
print('Name: ', user_info_name)
print('Public email: ', user_info_email)
print('Bio: ', user_info_bio)
print('URL: ', user_info_url)
print('Company: ', user_info_company)
print('Location: ', user_info_location)

'''
Below is the API approach; I tried it, and it returns the info as a dict directly.

from requests.auth import HTTPBasicAuth
res = requests.get(
    'https://api.github.com/user',
    auth=HTTPBasicAuth(username, pwd)
)
print(res.text)
'''
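On the line flagged above: .get('option') reads a tag attribute, not the <option> children, and a <textarea>'s value lives in its text rather than in a value attribute. A possible fix, assuming the chosen e-mail is the <option> carrying the selected attribute (I have not verified the current GitHub markup):

# Hedged alternative for the e-mail and bio fields (assumes the chosen <option> has a selected attribute)
email_select = soup3.find(name="select", attrs={"name": "user[profile_email]"})
selected_option = email_select.find("option", selected=True) if email_select else None
user_info_email = selected_option.text.strip() if selected_option else None

bio_area = soup3.find(name="textarea", attrs={"name": "user[profile_bio]"})
user_info_bio = bio_area.text.strip() if bio_area else None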
Below is the instructor's feedback, which was really helpful:
1. Please read up on Python's PEP 8 style guide.
2. Always write complete request headers; don't expose your crawler requests so nakedly. That is a bad habit.
3. It would be best to move your code comments into proper documentation.
4. Wrap every request in a try/except. This is very important in crawling: you have to keep the crawler running stably.
5. Your code should be organized into functions.
6. Pay attention to project structure whenever you write a project.
7. The homework is well done. That said, bs4 is not actually used much in production; pyquery or path-based selection is far more common (a rough sketch follows below).
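On point 7, here is a rough sketch of the same Autohome extraction done with path-based (XPath) selection through lxml, just to illustrate the style the instructor means; the selectors are my guess at the page structure, not something taken from the feedback:

import requests
from lxml import etree

ret = requests.get(url="https://www.autohome.com.cn/news/")
ret.encoding = ret.apparent_encoding
tree = etree.HTML(ret.text)

# Select every <li> under the article list, then pull the pieces out with relative XPath
for li in tree.xpath('//div[@id="auto-channel-lazyload-article"]//li'):
    title = li.xpath('.//h3/text()')
    if not title:
        continue
    href = li.xpath('.//a/@href')
    summary = li.xpath('.//p/text()')
    print(title[0], href[0] if href else '', summary[0] if summary else '')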