快速上手Requests

首先确认一下您的python已经安装requests库，如果没安装可以使用如下命令进行安装

pip install requests

发送请求

requests库提供的具有请求功能的方法有七种：

requests.request(method,url,**kwargs) 构造并发送一个request，返回response对象
requests.head(url,**kwargs) 发送head请求，返回response对象
requests.get(url,params=None,**kwargs) 发送GET请求，返回response对象
requests.post(url,data=None,json=None,**kwargs) 发送POST请求，返回response对象
requests.put(url,data=None,**kwargs) 发送PUT请求，返回response对象
requests.patch(url,data=None,**kwargs) 发送PATCH请求，返回response对象
requests.delete(url,**kwargs) 发送DELETE请求，返回response对象

参数：

method：请求方式，get,post等
url：拟获取页面的url链接
params：可选，字典或字节序列，作为参数增加到url中
data：可选，字典、字节序列或文件对象，作为request的内容
json：可选，json格式的数据，作为request的内容
headers：字典，HTTP请求头（可用于模拟浏览器进行访问）
cookies：字典或cookieJar对象发送cookies
files：字典，向服务器传输文件时使用的字段
auth：元组，用来支持http认证功能
timeout：用于设定超时时间，单位为秒。
proxies：字典，用来设置代理服务器
allow_redirects：开关，表示是否允许对url进行重定向，默认为True
stream：开关，为False时立即下载响应内容，为True时推迟到访问响应内容时才下载，默认为False
verify：开关，用于认证SSL证书，默认为True
cert：用于设置保存本地SSL证书路径

1.requests.request(method,url,**kwargs)

import requests

# Browser-like User-Agent. Adjacent string literals are concatenated with no
# separator, so every fragment must carry its own trailing space.
header = {
    'User-Agent': 'Mozilla/5.0 '
                  '(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
}
# Send a GET request with the custom headers; a Response object is returned.
response = requests.request('get', 'https://api.github.com/events', headers=header)

print(response.url)          # URL that was requested
print(response.status_code)  # HTTP status code
print(response.encoding)     # encoding used to decode response.text
print(response.text)         # response body decoded to str
print(response.headers)      # response headers
print(response.cookies)      # cookies set by the server
print(response.json())       # response body parsed as JSON

2.requests.head(url,**kwargs)

import requests

# A HEAD request asks for the headers only.
response = requests.head('https://github.com/get')
print('text:', response.text)        # no body content is returned for HEAD
print('headers:', response.headers)  # response headers
cookie_pairs = response.cookies.items()
print(cookie_pairs)                  # cookies as a list of (name, value) tuples

3.requests.get(url,params=None,**kwargs)

import requests

url = "http://119.36.87.130/course/newsview.php"
# Query-string parameters; requests encodes them and appends them to the URL.
kv = {'pid': '189', 'id': '2'}
response = requests.get(url, params=kv)
print(response.text)
print(response.url)     # http://119.36.87.130/course/newsview.php?pid=189&id=2
print(response.json())  # json() parses the body and returns the decoded data

你还可以将一个列表作为值传入：

import requests

url = "http://119.36.87.130/course/newsview.php"
# A list value is expanded into one key=value pair per element.
kv = {'pid': '189', 'id': ['2', '3']}
response = requests.get(url, params=kv)
print(response.url)  # http://119.36.87.130/course/newsview.php?pid=189&id=2&id=3

利用正则表达式抓取网页内容

import requests
import re

url = 'http://www.runoob.com/python3/python3-reg-expressions.html'
response = requests.get(url)
# Force the decoding used for response.text instead of relying on detection.
response.encoding = 'UTF-8'
# re.S lets "." match newlines so the pattern can span several lines of HTML.
pattern = re.compile('id="content">.*?<h1>(.*?)</h1>.*?<p>(.*?)</p><p>(.*?)</p>.*?<p>(.*?)</p>.*?<p>(.*?)</p>.*?<p>(.*?)</p>', re.S)
text = pattern.search(response.text)

# search() returns None when the page no longer matches the expected layout;
# guard against that instead of failing with AttributeError on .groups().
if text is None:
    print('pattern not found in page')
else:
    for i in text.groups():
        print(i)

抓取二进制文件：以图像为例，BytesIO创建内存对象存储数据，Image打开图像获得图像对象；也可以用上下文管理器的方式将内容直接写入文件，这种方式同样适合音频、视频等文件

import requests
from io import BytesIO
from PIL import Image

url = 'https://img.hsxhn.com/images/2018/05/11/120x120.jpg'
r = requests.get(url)
# r.content holds the raw response bytes; BytesIO exposes them as an
# in-memory file object that Image.open can read.
i = Image.open(BytesIO(r.content))
print(i.format, i.size, i.mode)  # file format, size, and pixel mode (e.g. RGB)
# i.show() could be called here to display the picture.
i.save('requests_log.png')  # write the image data to a file

4.requests.post(url,data=None,json=None,**kwargs)

Requests支持以form表单形式发送post请求，只需要将请求的参数构造成一个字典，然后传给requests.post()的data参数即可

import requests

url = 'http://httpbin.org/post'
kv = {'id': 'admin'}
r = requests.post(url, data=kv)  # a dict passed as data= is sent as form data
body = r.json()                  # decode the JSON response into a dict
print(body['form'])              # the form-encoded fields from the response

上传文件：files参数指定上传文件，上传的文件在主体数据中

import requests

url = 'http://httpbin.org/post'
# Open the file in a context manager so the handle is closed as soon as the
# upload finishes, even if the request raises.
with open('report.txt', 'rb') as upload:
    kv = {'file': upload}
    r = requests.post(url, files=kv)
print(r.text)

请求响应

response对象包含服务器对HTTP请求的响应信息

import requests

# Browser-like User-Agent built from adjacent string literals.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/67.0.3396.79 Safari/537.36'
    ),
}
r = requests.get('http://docs.python-requests.org/en/master/', headers=headers)

# Walk through the attributes exposed by the Response object, one per line.
print('chardet提供的编码:', r.apparent_encoding)
print('响应字节内容:', r.content)
print('响应cookies:', r.cookies.items())
print('请求到响应之间的时间:', r.elapsed)
print('响应编码:', r.encoding)
print('响应头信息:', r.headers)
print('头信息中的server:', r.headers['Server'])
print('请求历史记录:', r.history)
print('迭代响应数据:', r.iter_lines())
# r.json() is left out here — presumably because this page is HTML, not JSON.
print('返回解析的响应头链接:', r.links)
print('返回状态码:', r.status_code)
print('响应str内容:', r.text)
print('响应URL:', r.url)
print('返回发送的头参数:', r.request.headers)

Cookie

import requests

url = 'https://www.baidu.com'
response = requests.get(url)
jar = response.cookies
print(jar)          # a RequestsCookieJar object; list() can expand it
print(jar.items())  # cookies as a list of (name, value) tuples

使用cookies进行登录

import requests

url = 'http://httpbin.org/cookies'

# Option 1: send the cookie through a raw Cookie header. The header value
# must be "name=value" pairs; a bare value carries no cookie name.
headers = {'Cookie': 'a=123456'}
response = requests.get(url, headers=headers)

# Option 2 (equivalent): let requests build the header from a dict.
# cookies = {'a': '123456'}
# response = requests.get(url, cookies=cookies)

print(response.text)

代理设置

对于某些网站，在测试的时候请求几次，能正常获取内容。但是一旦开始大规模爬取，对于大规模且频繁的请求，网站可能会弹出验证码，或者跳转到登录认证页面，更甚者可能会直接封禁客户端的IP，导致一定时间段内无法访问。

那么，为了防止这种情况发生，我们需要设置代理来解决这个问题，这就需要用到proxies参数

import requests

url = "https://www.taobao.com"
# Map each URL scheme to the proxy server that should carry its traffic.
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
r = requests.get(url, proxies=proxies)

超时

大多数对外部服务器的请求都应该附加一个超时值，以防服务器没有及时响应。默认情况下，除非显式设置超时值，否则请求不会超时。如果没有超时，您的代码可能会挂起几分钟甚至更长时间

一旦你的客户端连接到服务器并发送了HTTP请求，读取超时就是客户端等待服务器发送响应的秒数

为超时指定单个值，如下所示：

r = requests.get('https://github.com',timeout = 5)

该超时值将同时应用于connect超时和read超时。如果您想分别设置这两个值，请指定一个元组（connect超时, read超时）：

r = requests.get('https://github.com', timeout=(3.05, 27))

如果远程服务器速度非常慢，您可以将None作为超时值传入，让Requests永久等待响应：

r = requests.get('https://github.com', timeout=None)

抓取网页实例

#!/usr/bin/env python
#coding:utf-8

import json
from multiprocessing import Pool
from requests.exceptions import RequestException
import requests
import re

# Scraper for a single movie-board page.
class get_parse:
    """Download one board page and extract its movie entries.

    The page is fetched with a browser-like User-Agent and parsed with a
    single regular expression that captures, per ``<dd>`` entry: the board
    index, the poster ``data-src`` URL, the title link text, the "star"
    paragraph, the "releasetime" paragraph, and the integer and fraction
    parts of the score.
    """

    _headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'}
    # Seconds to wait for the server; without a timeout requests can hang
    # indefinitely on an unresponsive host.
    _timeout = 10
    # Raw string so that "\d" reaches the regex engine without relying on an
    # invalid string escape; re.S lets "." span newlines in the HTML.
    _pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
        re.S)

    def __init__(self, url):
        """Remember the page *url* to be fetched by :meth:`get_page`."""
        self.url = url

    def get_page(self):
        """Yield one dict per movie entry found on the page at ``self.url``.

        Each dict has the keys ``id``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``. Nothing is yielded when the request fails
        (the failure is logged as a warning) or when the status code is
        not 200.
        """
        # Imported locally so this class does not depend on a module-level
        # import that the surrounding script does not have.
        import logging

        # Only the network call can raise RequestException, so keep the try
        # body down to that single statement.
        try:
            response = requests.get(
                self.url, headers=self._headers, timeout=self._timeout)
        except RequestException as e:
            logging.getLogger(__name__).warning(
                'request to %s failed: %s', self.url, e)
            return
        if response.status_code != 200:
            return
        for j in self._pattern.findall(response.text):
            yield {
                'id': j[0],
                'image': j[1],
                'title': j[2],
                'actor': j[3].strip(),
                'time': j[4].strip(),
                # The score is split across two tags on the page; join them.
                'score': j[5] + j[6],
            }

    def write_file(self, data):
        """Append *data* to ``resule.txt`` as one JSON document per line.

        ``ensure_ascii=False`` keeps non-ASCII text readable in the file
        instead of escaping it.
        """
        # The with-block closes the file; no explicit close() is needed.
        with open('resule.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')

# Process one board page: print every entry and append it to the output file.
def main(offset):
    """Scrape the board page starting at *offset* and persist its entries.

    The offset is appended to the board URL as its ``offset`` query value.
    Each entry yielded by the parser is printed and then written to file.
    """
    parser = get_parse('http://maoyan.com/board/4?offset=' + str(offset))
    for movie in parser.get_page():
        print(movie)
        parser.write_file(movie)  # append this entry to the output file

if __name__ == '__main__':
    # Scrape the offsets 0, 10, ..., 90 in parallel worker processes.
    # Pool.map behaves like the built-in map but spreads the calls over the
    # pool; the with-block releases the worker processes once it returns.
    with Pool() as pool:
        pool.map(main, range(0, 100, 10))

Catalog