Basic Usage of Several Web-Crawling Libraries

Using crawler modules

Posted by Mr.Children on December 21, 2017

I. Modules used in web crawlers

1. Requests

import requests
url = 'http://www.baidu.com'
response = requests.get(url)
print(response.status_code)   # the HTTP status code
print(response.text)
print(response.content)   # raw bytes, useful for images and other binary data
print(response.cookies)
# response.text may come out garbled when the encoding is guessed wrong; either decode the bytes yourself
print(response.content.decode("utf-8"))
# or set the encoding explicitly before reading .text
response.encoding = "utf-8"
print(response.text)
# other request methods
requests.post(" ")
requests.put(" ")
requests.delete(" ")
requests.head(" ")
requests.options(" ")
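For requests that carry a body, here is a minimal sketch of a form POST against httpbin.org, which simply echoes back whatever it receives (the payload fields are made up for illustration):
import requests
payload = {"name": "tom", "age": "22"}   # illustrative form fields
response = requests.post("http://httpbin.org/post", data=payload)
print(response.status_code)
print(response.json()["form"])   # httpbin echoes the submitted form data back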
# parsing JSON
import requests
import json
response = requests.get("http://httpbin.org/get")   # an endpoint that returns JSON (baidu.com returns HTML, so .json() would fail there)
print(response.json())
print(json.loads(response.text))   # equivalent to response.json()
print(type(response.json()))       # parsed JSON is a dict
#####################################################  
#a response object exposes many useful attributes
response = requests.get("http://www.baidu.com")
print(type(response.status_code),response.status_code)
print(type(response.headers),response.headers)
print(type(response.cookies),response.cookies)
print(type(response.url),response.url)
print(type(response.history),response.history) 
####################################################
1. Getting cookies from a response (used to maintain a session)
import requests
response = requests.get("http://www.baidu.com")
print(response.cookies)
for key, value in response.cookies.items():
    print(key + "=" + value)
# a Session object keeps cookies across requests
import requests
s = requests.Session()
s.get("http://httpbin.org/cookies/set/number/123456")
response = s.get("http://httpbin.org/cookies")   # the cookie set above is sent back automatically
print(response.text)
2. Certificate verification
response = requests.get("https://www.baidu.com", verify=False)   # verify=False skips SSL certificate checks (useful for sites with invalid certificates)
3. Setting proxies
import requests
proxies = {
    "http": "http://127.0.0.1:9999",
    "https": "http://127.0.0.1:8888"
}
response = requests.get("http://www.baidu.com", proxies=proxies)
# if the proxy requires a username and password, put them in the URL:
proxies = {
    "http": "http://user:password@127.0.0.1:9999"
}
# for SOCKS proxies, first run  pip install "requests[socks]"
proxies = {
    "http": "socks5://127.0.0.1:9999",
    "https": "socks5://127.0.0.1:8888"
}
4. Setting a timeout with the timeout= parameter (a short sketch follows below)
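A minimal sketch of the timeout parameter, using httpbin.org/delay/5 (which waits five seconds before replying) so that a one-second timeout is guaranteed to trigger:
import requests
from requests.exceptions import ReadTimeout
try:
    response = requests.get("http://httpbin.org/delay/5", timeout=1)   # give up after one second
    print(response.status_code)
except ReadTimeout:
    print("request timed out")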
5. Adding request headers
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
response = requests.get("http://www.baidu.com", headers=headers)
6. HTTP basic authentication
import requests
from requests.auth import HTTPBasicAuth
response = requests.get("http://120.27.34.24:9001/",auth=HTTPBasicAuth("user","123"))
print(response.status_code)  
#a shorter way to pass the credentials
import requests
response = requests.get("http://120.27.34.24:9001/",auth=("user","123"))
print(response.status_code)  
7. Handling exceptions
import requests
from requests.exceptions import ReadTimeout,ConnectionError,RequestException
try:
    response = requests.get("http://httpbin.org/get",timout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("timeout")
except ConnectionError:
    print("connection Error")
except RequestException:
    print("error")

2. BeautifulSoup

from bs4 import BeautifulSoup  
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
soup = BeautifulSoup(html,'lxml')  
print(soup.prettify())  
print(soup.title)
print(soup.title.name)
print(soup.title.string)
print(soup.title.parent.name)
print(soup.p)
print(soup.p["class"])
print(soup.a)
print(soup.find_all('a'))
print(soup.find(id='link3'))
for link in soup.find_all('a'):
    print(link.get('href')) 
#get the text content
print(soup.get_text())
#nested selection: walk down through child and grandchild nodes
print(soup.head.title.string)
#.contents returns all direct children as a list
print(soup.p.contents)
#.children is a generator, so iterate over it
for i, child in enumerate(soup.p.children):
    print(i, child)
soup.a.next_siblings      #generator over all following siblings
soup.a.previous_siblings  #generator over all preceding siblings
soup.a.next_sibling       #the next sibling node
soup.a.previous_sibling   #the previous sibling node
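Because next_siblings and previous_siblings are generators, here is a short sketch of actually printing them (reusing the soup object from above):
for i, sibling in enumerate(soup.a.next_siblings):
    print(i, sibling)   # every sibling that follows the first <a> tag
print(list(soup.a.previous_siblings))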
##############################################
#standard selectors: find_all() and find()
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''  
from bs4 import BeautifulSoup  
soup = BeautifulSoup(html,'lxml')  
print(soup.find_all('ul'))   
for ul in soup.find_all('ul'):
    for i in ul.find_all("li"):
        print(i)
#attrs  
print(soup.find_all(attrs={'id':'list-1'}))  
print(soup.find_all(attrs={'name':'elements'}))  
#text  
print(soup.find_all(text="Foo"))  
##############################################  
#CSS selectors, used with select()
# "." selects by class, "#" selects by id
# "tag1, tag2" matches all tag1 and all tag2 elements
# "tag1 tag2" matches all tag2 elements nested inside tag1
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''  
from bs4 import BeautifulSoup  
soup = BeautifulSoup(html,'lxml')  
print(soup.select('.panel .panel-heading'))  
print(soup.select('ul li'))  
print(soup.select('#list-2 .element'))  
#get_text() returns the text content
for li in soup.select('li'):  
    print(li.get_text())  
#read an attribute with [attribute name] or attrs[attribute name]
for ul in soup.select('ul'):
    print(ul['id'])  
    print(ul.attrs['id'])  

3. Regular Expression

Basic rules of Python regular expressions

re.match(pattern, string, flags=0)
import re
content= "hello 123 4567 World_This is a regex Demo"
result = re.match('^hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$',content)
print(result)
print(result.group())  
#span() gives the (start, end) positions of the match
print(result.span())  
#use parentheses () to capture a specific target group
import re
content= "hello 1234567 World_This is a regex Demo"
result = re.match('^hello\s(\d+)\sWorld.*Demo$',content)
print(result)
print(result.group())
print(result.group(1))
print(result.span())  
#special characters such as $ and . must be escaped with \
import re
content= "price is $5.00"
result = re.match('price is \$5\.00',content)
print(result)
print(result.group())  
#tips: prefer general (non-literal) patterns, use parentheses to capture the target,
#prefer non-greedy matching, and pass re.S when the text contains newlines
re.search scans the whole string and returns the first successful match
import re   
content = "extra things hello 123455 world_this is a Re Extra things"  
result = re.search("hello.*?(\d+).*?Re",content)  
print(result)  
print(result.group())  
print(result.group(1))  
#re.findall returns all matching substrings as a list
html = '''<div id="songs-list">
    <h2 class="title">经典老歌</h2>
    <p class="introduction">
        经典老歌列表
    </p>
    <ul id="list" class="list-group">
        <li data-view="2">一路上有你</li>
        <li data-view="7">
            <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
        </li>
        <li data-view="4" class="active">
            <a href="/3.mp3" singer="齐秦">往事随风</a>
        </li>
        <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
        <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
        <li data-view="5">
            <a href="/6.mp3" singer="邓丽君">但愿人长久</a>
        </li>
    </ul>
</div>'''  
results = re.findall('<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>',html,re.S)  
print(results)  
for result in results:
    print(result[1])
#\s*? absorbs the optional newlines and whitespace, since some entries span lines and some do not
#(<a.*?>)? is optional because some <li> entries contain an <a> tag and some do not
#? means the preceding group matches one time or zero times
re.sub replaces every match in the string and returns the resulting string
content = "Extra things hello 123455 World_this is a regex Demo extra things"
content = re.sub('\d+','',content)
print(content)   
import re
content = "Extra things hello 123455 World_this is a regex Demo extra things"
content = re.sub('(\d+)',r'\1 7890',content)  
#\1 refers back to the first captured group; the r prefix keeps the backslash from being treated as an escape
print(content)  
#re.compile compiles the regular expression into a pattern object so it can be reused
content= """hello 12345 world_this fan"""  
pattern =re.compile("hello.*fan",re.S)
result = re.match(pattern,content)
print(result)
print(result.group())
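As a short follow-up, the compiled pattern object can be reused directly through its own methods (same pattern and content as above):
result = pattern.search(content)   # pattern objects expose match/search/findall/sub
print(result.group())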

4. selenium

##basic use of the selenium module
from selenium import webdriver  
url = 'http://www.baidu.com'  
driver = webdriver.Firefox()  
driver.get(url)   
print(driver.page_source)   
##locating elements with selenium
name = driver.find_element_by_id('q')
name = driver.find_element_by_css_selector("#q")
name = driver.find_element_by_xpath('//*[@id="q"]')
#the main locator helpers (a sketch of the newer By-based form follows the list below)
find_element_by_id
find_element_by_xpath
find_element_by_css_selector
find_element_by_tag_name
find_element_by_class_name
find_element_by_name
find_element_by_link_text  
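Note that newer Selenium releases (4.x) drop the find_element_by_* helpers in favor of a single find_element method that takes a By locator; a minimal equivalent sketch, assuming the same element ids as above:
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
name = driver.find_element(By.ID, 'q')               # replaces find_element_by_id('q')
name = driver.find_element(By.CSS_SELECTOR, '#q')    # replaces find_element_by_css_selector('#q')
name = driver.find_element(By.XPATH, '//*[@id="q"]') # replaces find_element_by_xpath(...)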
##simple element interaction
from selenium import webdriver
import time
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
#locate the search input box
input_str = driver.find_element_by_id('q')
input_str.send_keys('外套')
time.sleep(1)
#clear the input box
input_str.clear()
input_str.send_keys('秋裤')
#locate the search button and click it
button = driver.find_element_by_class_name('btn-search')
button.click()
#going back and forward in the browser history
driver.get('http://www.baidu.com')
driver.get('http://www.taobao.com')
driver.get('http://www.python.org/')
driver.back()
driver.forward()  
##controlling the scroll bar with selenium
from selenium import webdriver
import time
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
driver.maximize_window()
#drag the scroll bar to the bottom of the page
js = "var q = document.documentElement.scrollTop=100000"
driver.execute_script(js)
time.sleep(3)
driver.save_screenshot('screenshot.png')   # path where the screenshot will be saved
#scroll back to the top of the page
js = "var q = document.documentElement.scrollTop=0"
driver.execute_script(js)
#change the number assigned to scrollTop to scroll to any intermediate position
##Explicit waits and implicit waits
The difference between Selenium's explicit wait and implicit wait:
1. Explicit wait
Principle: explicitly wait for a specific condition, such as an element appearing or becoming clickable;
keep waiting until the condition is met, and raise an exception if it is still not met within the allotted time.
(In short: do not act until the element appears, and raise an exception on timeout. A minimal sketch follows below.)
2. Implicit wait
Principle: set a waiting time on the driver when the browser object is created;
whenever an element cannot be found, the driver waits up to that long for it to appear.
Note: with an implicit wait, the browser keeps polling the page within the time
you set until the element we need is found.
Syntax: driver.implicitly_wait(10)
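A minimal explicit-wait sketch using WebDriverWait and expected_conditions (the element id 'q' is just an assumed example):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
# wait up to 10 seconds for the element with id "q" to be present in the DOM
wait = WebDriverWait(driver, 10)
input_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
print(input_box)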

5. XPath syntax

1. Sample document
<?xml version="1.0" encoding="ISO-8859-1"?>
<bookstore>
<book>
  <title lang="eng">Harry Potter</title>
  <price>29.99</price>
</book>
<book>
  <title lang="eng">Learning XML</title>
  <price>39.95</price>
</book>
</bookstore>
2. Selecting nodes
Expression   Description
nodename     selects all child nodes of the named node
/            selects from the root node
//           selects matching nodes anywhere in the document, regardless of their position
.            selects the current node
..           selects the parent of the current node
@            selects attributes
3. Examples
Path expression   Result
bookstore         selects all child nodes of the bookstore element
/bookstore        selects the root element bookstore
bookstore/book    selects all book elements that are children of bookstore
//book            selects all book elements, no matter where they are in the document
bookstore//book   selects all book elements that are descendants of bookstore, no matter where they sit below it
//@lang           selects all attributes named lang
4. Predicate examples (predicates find a specific node, or nodes containing a specific value)
Path expression                 Result
/bookstore/book[1]              selects the first book element that is a child of bookstore
/bookstore/book[last()]         selects the last book element that is a child of bookstore
/bookstore/book[last()-1]       selects the next-to-last book element that is a child of bookstore
/bookstore/book[position()<3]   selects the first two book elements that are children of bookstore
//title[@lang]                  selects all title elements that have an attribute named lang
//title[@lang='eng']            selects all title elements whose lang attribute value is eng
/bookstore/book[price>35.00]    selects all book children of bookstore whose price element is greater than 35.00
/bookstore/book[price>35.00]/title   selects the title elements of those book children of bookstore whose price is greater than 35.00
5. Selecting unknown nodes
Wildcard        Description
/bookstore/*    selects all child elements of bookstore
//*             selects all elements in the document
//title[@*]     selects all title elements that have at least one attribute
6. Path expressions
#absolute path
/step/step/
#relative path
step/step/
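To tie the syntax above together, here is a small sketch that runs a few of these expressions against the sample bookstore document with lxml (covered in the next section):
from lxml import etree
xml = '''<bookstore>
<book><title lang="eng">Harry Potter</title><price>29.99</price></book>
<book><title lang="eng">Learning XML</title><price>39.95</price></book>
</bookstore>'''
root = etree.XML(xml)
print(root.xpath('//title/text()'))                              # ['Harry Potter', 'Learning XML']
print(root.xpath('//@lang'))                                     # ['eng', 'eng']
print(root.xpath('/bookstore/book[1]/title/text()'))             # ['Harry Potter']
print(root.xpath('/bookstore/book[price>35.00]/title/text()'))   # ['Learning XML']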

6. Using lxml

#lxml automatically fixes up broken HTML (note the unclosed <li> below)
from lxml import etree
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)
result = etree.tostring(html)
print(result)  
1. Reading from a file
#create a file named hello.html with the following content
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
from lxml import etree   
html = etree.parse('hello.html', etree.HTMLParser())   # etree.parse reads a file; etree.HTML expects an HTML string, not a file name
result = etree.tostring(html,pretty_print=True)  
print(result)
2. Getting all li tags
from lxml import etree
html = etree.parse('hello.html', etree.HTMLParser())
print(type(html))
result = html.xpath('//li')
print(result)   # a list of elements
print(len(result))
print(type(result))
print(type(result[0]))
#get the class attribute of every li tag
result = html.xpath('//li/@class')
print(result)
# ['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']
#get the <a> tag under li whose href attribute is link1.html
result = html.xpath('//li/a[@href="link1.html"]')
print (result[0].text)  
#get all <span> tags under li (double slash, because span is a descendant rather than a direct child)
result = html.xpath('//li//span')
result = html.xpath('//li/a//@class')   # class attributes of <a> tags (and their descendants) under li
#get the href of the <a> inside the last li tag
result = html.xpath('//li[last()]/a/@href')
#get the <a> inside the next-to-last li
result = html.xpath('//li[last()-1]/a')
#get the tag name of the element whose class is bold
result = html.xpath("//*[@class='bold']")
print(result[0].tag)
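One more small sketch, since extracting text is usually the next step: ending an XPath expression with text() returns the text nodes themselves (same hello.html as above; the third item's text sits inside a <span>, so it does not appear here):
result = html.xpath('//li/a/text()')
print(result)   # ['first item', 'second item', 'fourth item', 'fifth item']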

II. Web crawler examples

1. Pexels and Unsplash image crawlers

import requests
from lxml import etree
import os
import time
from multiprocessing import Pool
import sys

headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'}
path = "/mnt/e/desktop/pic/"
def get_pic(url):
    r = requests.get(url, headers=headers)   # named r rather than re, to avoid shadowing the regex module name
    selector = etree.HTML(r.text)
    img_html = selector.xpath("//div[@class='hide-featured-badge  hide-favorite-badge']")
    for img_src in img_html:
        try:
            t = img_src.xpath("article/a[1]/img/@src")[0]
        except IndexError:
            continue
        img_url = t.split("?")[0]
        img_name = t.split("?")[0].split("/")[-1].split("-")[-1]
        print(img_url, img_name)
        print("starting download of {}".format(img_name))
        if not os.path.exists(path):
            os.makedirs(path)
        # switch the working directory to path so the files are saved there
        os.chdir(path)
        # skip images that have already been downloaded
        if os.path.exists(img_name):
            print("already downloaded, skipping")
            continue
        res = requests.get(img_url,headers=headers)
        f = open(img_name,'wb')
        f.write(res.content)
        f.close()
        print('{} downloaded'.format(img_name))
    time.sleep(1)
if __name__ == "__main__":
    urls = ['https://www.pexels.com/{}?page={}'.format(sys.argv[1],str(i)) for i in range(1,50)] 
    pool = Pool(processes=4)
    pool.map(get_pic,urls)
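Assuming the script above is saved as, say, pexels_spider.py (the file name is an assumption), the part of the Pexels URL to crawl is passed as the only command-line argument and substituted into the page URLs, for example a search path such as:
python pexels_spider.py search/cat/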
    
    
import requests
import json
import os
import time
from multiprocessing import Pool
import sys
path= "E://desktop/11/"

def get_pic(url):
    r = requests.get(url)
    # the image links can be found in the JSON responses under Network/XHR in the browser dev tools
    target = json.loads(r.text)
    for i in target['results']:
        img_name = i['id'] + '.jpg'
        full_url = i['urls']['full']
        print(img_name,full_url)
        if not os.path.exists(path):
            os.makedirs(path)
        os.chdir(path)
        if os.path.exists(img_name):
            print("already downloaded, skipping")
            continue
        print("starting download of {}".format(img_name))
        res = requests.get(full_url)
        f = open(img_name,'wb')
        f.write(res.content)
        f.close()
        print("{}下载完成".format(img_name))
    time.sleep(1)
if __name__ == "__main__":
    urls = ['https://unsplash.com/napi/search/photos?query={}&page={}'.format(sys.argv[1], str(i)) for i in range(1, 50)]
    pool = Pool(processes=4)
    pool.map(get_pic,urls)
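As with the Pexels script, the search keyword comes from the command line; assuming this one is saved as unsplash_spider.py (an assumed name), a run might look like:
python unsplash_spider.py cat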