Basic Usage of Several Web-Crawling Libraries

Using crawler modules

Posted by Mr.Children on December 21, 2017

I. Modules used in web crawlers

1. Requests

import requests
url = 'http://www.baidu.com'
response = requests.get(url)
print(response.status_code)   # the HTTP status code
print(response.text)
print(response.content)   # raw bytes, useful for images and other binary data
print(response.cookies)
# response.text may come out garbled when the encoding is guessed wrong; either decode the bytes yourself
print(response.content.decode("utf-8"))
# or set the encoding explicitly before reading .text
response.encoding = "utf-8"
print(response.text)
# other request methods
requests.post(" ")
requests.put(" ")
requests.delete(" ")
requests.head(" ")
requests.options(" ")
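For requests that carry a body, here is a minimal sketch of a form POST against httpbin.org, which simply echoes back whatever it receives (the payload fields are made up for illustration):
import requests
payload = {"name": "tom", "age": "22"}   # illustrative form fields
response = requests.post("http://httpbin.org/post", data=payload)
print(response.status_code)
print(response.json()["form"])   # httpbin echoes the submitted form data back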
# parsing JSON
import requests
import json
response = requests.get("http://httpbin.org/get")   # an endpoint that returns JSON (baidu.com returns HTML, so .json() would fail there)
print(response.json())
print(json.loads(response.text))   # equivalent to response.json()
print(type(response.json()))       # parsed JSON is a dict
#####################################################  
#a response object exposes many useful attributes
response = requests.get("http://www.baidu.com")
print(type(response.status_code),response.status_code)
print(type(response.headers),response.headers)
print(type(response.cookies),response.cookies)
print(type(response.url),response.url)
print(type(response.history),response.history) 
####################################################
1. Getting cookies from a response (used to maintain a session)
import requests
response = requests.get("http://www.baidu.com")
print(response.cookies)
for key, value in response.cookies.items():
    print(key + "=" + value)
# a Session object keeps cookies across requests
import requests
s = requests.Session()
s.get("http://httpbin.org/cookies/set/number/123456")
response = s.get("http://httpbin.org/cookies")   # the cookie set above is sent back automatically
print(response.text)
2. Certificate verification
response = requests.get("https://www.baidu.com", verify=False)   # verify=False skips SSL certificate checks (useful for sites with invalid certificates)
3. Setting proxies
import requests
proxies = {
    "http": "http://127.0.0.1:9999",
    "https": "http://127.0.0.1:8888"
}
response = requests.get("http://www.baidu.com", proxies=proxies)
# if the proxy requires a username and password, put them in the URL:
proxies = {
    "http": "http://user:password@127.0.0.1:9999"
}
# for SOCKS proxies, first run  pip install "requests[socks]"
proxies = {
    "http": "socks5://127.0.0.1:9999",
    "https": "socks5://127.0.0.1:8888"
}
4. Setting a timeout with the timeout= parameter (a short sketch follows below)
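A minimal sketch of the timeout parameter, using httpbin.org/delay/5 (which waits five seconds before replying) so that a one-second timeout is guaranteed to trigger:
import requests
from requests.exceptions import ReadTimeout
try:
    response = requests.get("http://httpbin.org/delay/5", timeout=1)   # give up after one second
    print(response.status_code)
except ReadTimeout:
    print("request timed out")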
5. Adding request headers
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
response = requests.get("http://www.baidu.com", headers=headers)
6. HTTP basic authentication
import requests
from requests.auth import HTTPBasicAuth
response = requests.get("http://120.27.34.24:9001/",auth=HTTPBasicAuth("user","123"))
print(response.status_code)  
#a shorter way to pass the credentials
import requests
response = requests.get("http://120.27.34.24:9001/",auth=("user","123"))
print(response.status_code)  
7. Handling exceptions
import requests
from requests.exceptions import ReadTimeout,ConnectionError,RequestException
try:
    response = requests.get("http://httpbin.org/get",timout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("timeout")
except ConnectionError:
    print("connection Error")
except RequestException:
    print("error")

2. BeautifulSoup

from bs4 import BeautifulSoup  
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
soup = BeautifulSoup(html,'lxml')  
print(soup.prettify())  
print(soup.title)
print(soup.title.name)
print(soup.title.string)
print(soup.title.parent.name)
print(soup.p)
print(soup.p["class"])
print(soup.a)
print(soup.find_all('a'))
print(soup.find(id='link3'))
for link in soup.find_all('a'):
    print(link.get('href')) 
#get the text content
print(soup.get_text())
#nested selection: walk down through child and grandchild nodes
print(soup.head.title.string)
#.contents returns all direct children as a list
print(soup.p.contents)
#.children is a generator, so iterate over it
for i, child in enumerate(soup.p.children):
    print(i, child)
soup.a.next_siblings      #generator over all following siblings
soup.a.previous_siblings  #generator over all preceding siblings
soup.a.next_sibling       #the next sibling node
soup.a.previous_sibling   #the previous sibling node
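Because next_siblings and previous_siblings are generators, here is a short sketch of actually printing them (reusing the soup object from above):
for i, sibling in enumerate(soup.a.next_siblings):
    print(i, sibling)   # every sibling that follows the first <a> tag
print(list(soup.a.previous_siblings))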
##############################################
#standard selectors: find_all() and find()
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''  
from bs4 import BeautifulSoup  
soup = BeautifulSoup(html,'lxml')  
print(soup.find_all('ul'))   
for ul in soup.find_all('ul'):
    for i in ul.find_all("li"):
        print(i)
#attrs  
print(soup.find_all(attrs={'id':'list-1'}))  
print(soup.find_all(attrs={'name':'elements'}))  
#text  
print(soup.find_all(text="Foo"))  
##############################################  
#CSS selectors, used with select()
# "." selects by class, "#" selects by id
# "tag1, tag2" matches all tag1 and all tag2 elements
# "tag1 tag2" matches all tag2 elements nested inside tag1
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''  
from bs4 import BeautifulSoup  
soup = BeautifulSoup(html,'lxml')  
print(soup.select('.panel .panel-heading'))  
print(soup.select('ul li'))  
print(soup.select('#list-2 .element'))  
#get_text() returns the text content
for li in soup.select('li'):  
    print(li.get_text())  
#read an attribute with [attribute name] or attrs[attribute name]
for ul in soup.select('ul'):
    print(ul['id'])  
    print(ul.attrs['id'])  

3. Regular Expression

Basic rules of Python regular expressions

re.match(pattern, string, flags=0)
import re
content= "hello 123 4567 World_This is a regex Demo"
result = re.match('^hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$',content)
print(result)
print(result.group())  
#span() gives the (start, end) positions of the match
print(result.span())  
#use parentheses () to capture a specific target group
import re
content= "hello 1234567 World_This is a regex Demo"
result = re.match('^hello\s(\d+)\sWorld.*Demo$',content)
print(result)
print(result.group())
print(result.group(1))
print(result.span())  
#special characters such as $ and . must be escaped with \
import re
content= "price is $5.00"
result = re.match('price is \$5\.00',content)
print(result)
print(result.group())  
#tips: prefer general (non-literal) patterns, use parentheses to capture the target,
#prefer non-greedy matching, and pass re.S when the text contains newlines
re.search scans the whole string and returns the first successful match
import re   
content = "extra things hello 123455 world_this is a Re Extra things"  
result = re.search("hello.*?(\d+).*?Re",content)  
print(result)  
print(result.group())  
print(result.group(1))  
#re.findall returns all matching substrings as a list
html = '''<div id="songs-list">
    <h2 class="title">经典老歌</h2>
    <p class="introduction">
        经典老歌列表
    </p>
    <ul id="list" class="list-group">
        <li data-view="2">一路上有你</li>
        <li data-view="7">
            <a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
        </li>
        <li data-view="4" class="active">
            <a href="/3.mp3" singer="齐秦">往事随风</a>
        </li>
        <li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
        <li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
        <li data-view="5">
            <a href="/6.mp3" singer="邓丽君">但愿人长久</a>
        </li>
    </ul>
</div>'''  
results = re.findall('<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>',html,re.S)  
print(results)  
for result in results:
    print(result[1])
#\s*? absorbs the optional newlines and whitespace, since some entries span lines and some do not
#(<a.*?>)? is optional because some <li> entries contain an <a> tag and some do not
#? means the preceding group matches one time or zero times
re.sub replaces every match in the string and returns the resulting string
content = "Extra things hello 123455 World_this is a regex Demo extra things"
content = re.sub('\d+','',content)
print(content)   
import re
content = "Extra things hello 123455 World_this is a regex Demo extra things"
content = re.sub('(\d+)',r'\1 7890',content)  
#\1 refers back to the first captured group; the r prefix keeps the backslash from being treated as an escape
print(content)  
#re.compile compiles the regular expression into a pattern object so it can be reused
content= """hello 12345 world_this fan"""  
pattern =re.compile("hello.*fan",re.S)
result = re.match(pattern,content)
print(result)
print(result.group())
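As a short follow-up, the compiled pattern object can be reused directly through its own methods (same pattern and content as above):
result = pattern.search(content)   # pattern objects expose match/search/findall/sub
print(result.group())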

4. selenium

##basic use of the selenium module
from selenium import webdriver  
url = 'http://www.baidu.com'  
driver = webdriver.Firefox()  
driver.get(url)   
print(driver.page_source)   
##locating elements with selenium
name = driver.find_element_by_id('q')
name = driver.find_element_by_css_selector("#q")
name = driver.find_element_by_xpath('//*[@id="q"]')
#the main locator helpers (a sketch of the newer By-based form follows the list below)
find_element_by_id
find_element_by_xpath
find_element_by_css_selector
find_element_by_tag_name
find_element_by_class_name
find_element_by_name
find_element_by_link_text  
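Note that newer Selenium releases (4.x) drop the find_element_by_* helpers in favor of a single find_element method that takes a By locator; a minimal equivalent sketch, assuming the same element ids as above:
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
name = driver.find_element(By.ID, 'q')               # replaces find_element_by_id('q')
name = driver.find_element(By.CSS_SELECTOR, '#q')    # replaces find_element_by_css_selector('#q')
name = driver.find_element(By.XPATH, '//*[@id="q"]') # replaces find_element_by_xpath(...)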
##simple element interaction
from selenium import webdriver
import time
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
#locate the search input box
input_str = driver.find_element_by_id('q')
input_str.send_keys('外套')
time.sleep(1)
#clear the input box
input_str.clear()
input_str.send_keys('秋裤')
#locate the search button and click it
button = driver.find_element_by_class_name('btn-search')
button.click()
#going back and forward in the browser history
driver.get('http://www.baidu.com')
driver.get('http://www.taobao.com')
driver.get('http://www.python.org/')
driver.back()
driver.forward()  
##controlling the scroll bar with selenium
from selenium import webdriver
import time
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
driver.maximize_window()
#drag the scroll bar to the bottom of the page
js = "var q = document.documentElement.scrollTop=100000"
driver.execute_script(js)
time.sleep(3)
driver.save_screenshot('screenshot.png')   # path where the screenshot will be saved
#scroll back to the top of the page
js = "var q = document.documentElement.scrollTop=0"
driver.execute_script(js)
#change the number assigned to scrollTop to scroll to any intermediate position
##Explicit waits and implicit waits
The difference between Selenium's explicit wait and implicit wait:
1. Explicit wait
Principle: explicitly wait for a specific condition, such as an element appearing or becoming clickable;
keep waiting until the condition is met, and raise an exception if it is still not met within the allotted time.
(In short: do not act until the element appears, and raise an exception on timeout. A minimal sketch follows below.)
2. Implicit wait
Principle: set a waiting time on the driver when the browser object is created;
whenever an element cannot be found, the driver waits up to that long for it to appear.
Note: with an implicit wait, the browser keeps polling the page within the time
you set until the element we need is found.
Syntax: driver.implicitly_wait(10)
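A minimal explicit-wait sketch using WebDriverWait and expected_conditions (the element id 'q' is just an assumed example):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
# wait up to 10 seconds for the element with id "q" to be present in the DOM
wait = WebDriverWait(driver, 10)
input_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
print(input_box)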

5. XPath syntax

1. Sample document
<?xml version="1.0" encoding="ISO-8859-1"?>
<bookstore>
<book>
  <title lang="eng">Harry Potter</title>
  <price>29.99</price>
</book>
<book>
  <title lang="eng">Learning XML</title>
  <price>39.95</price>
</book>
</bookstore>
2. Selecting nodes
Expression   Description
nodename     selects all child nodes of the named node
/            selects from the root node
//           selects matching nodes anywhere in the document, regardless of their position
.            selects the current node
..           selects the parent of the current node
@            selects attributes
3. Examples
Path expression   Result
bookstore         selects all child nodes of the bookstore element
/bookstore        selects the root element bookstore
bookstore/book    selects all book elements that are children of bookstore
//book            selects all book elements, no matter where they are in the document
bookstore//book   selects all book elements that are descendants of bookstore, no matter where they sit below it
//@lang           selects all attributes named lang
4. Predicate examples (predicates find a specific node, or nodes containing a specific value)
Path expression                 Result
/bookstore/book[1]              selects the first book element that is a child of bookstore
/bookstore/book[last()]         selects the last book element that is a child of bookstore
/bookstore/book[last()-1]       selects the next-to-last book element that is a child of bookstore
/bookstore/book[position()<3]   selects the first two book elements that are children of bookstore
//title[@lang]                  selects all title elements that have an attribute named lang
//title[@lang='eng']            selects all title elements whose lang attribute value is eng
/bookstore/book[price>35.00]    selects all book children of bookstore whose price element is greater than 35.00
/bookstore/book[price>35.00]/title   selects the title elements of those book children of bookstore whose price is greater than 35.00
5. Selecting unknown nodes
Wildcard        Description
/bookstore/*    selects all child elements of bookstore
//*             selects all elements in the document
//title[@*]     selects all title elements that have at least one attribute
6. Path expressions
#absolute path
/step/step/
#relative path
step/step/
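To tie the syntax above together, here is a small sketch that runs a few of these expressions against the sample bookstore document with lxml (covered in the next section):
from lxml import etree
xml = '''<bookstore>
<book><title lang="eng">Harry Potter</title><price>29.99</price></book>
<book><title lang="eng">Learning XML</title><price>39.95</price></book>
</bookstore>'''
root = etree.XML(xml)
print(root.xpath('//title/text()'))                              # ['Harry Potter', 'Learning XML']
print(root.xpath('//@lang'))                                     # ['eng', 'eng']
print(root.xpath('/bookstore/book[1]/title/text()'))             # ['Harry Potter']
print(root.xpath('/bookstore/book[price>35.00]/title/text()'))   # ['Learning XML']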

6. Using lxml

#lxml automatically fixes up broken HTML (note the unclosed <li> below)
from lxml import etree
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)
result = etree.tostring(html)
print(result)  
1. Reading from a file
#create a file named hello.html with the following content
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
from lxml import etree   
html = etree.parse('hello.html', etree.HTMLParser())   # etree.parse reads a file; etree.HTML expects an HTML string, not a file name
result = etree.tostring(html,pretty_print=True)  
print(result)
2. Getting all li tags
from lxml import etree
html = etree.parse('hello.html', etree.HTMLParser())
print(type(html))
result = html.xpath('//li')
print(result)   # a list of elements
print(len(result))
print(type(result))
print(type(result[0]))
#get the class attribute of every li tag
result = html.xpath('//li/@class')
print(result)
# ['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']
#get the <a> tag under li whose href attribute is link1.html
result = html.xpath('//li/a[@href="link1.html"]')
print (result[0].text)  
#get all <span> tags under li (double slash, because span is a descendant rather than a direct child)
result = html.xpath('//li//span')
result = html.xpath('//li/a//@class')   # class attributes of <a> tags (and their descendants) under li
#get the href of the <a> inside the last li tag
result = html.xpath('//li[last()]/a/@href')
#get the <a> inside the next-to-last li
result = html.xpath('//li[last()-1]/a')
#get the tag name of the element whose class is bold
result = html.xpath("//*[@class='bold']")
print(result[0].tag)
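One more small sketch, since extracting text is usually the next step: ending an XPath expression with text() returns the text nodes themselves (same hello.html as above; the third item's text sits inside a <span>, so it does not appear here):
result = html.xpath('//li/a/text()')
print(result)   # ['first item', 'second item', 'fourth item', 'fifth item']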

II. Web crawler examples

1. Pexels and Unsplash image crawlers

import requests
from lxml import etree
import os
import time
from multiprocessing import Pool
import sys

headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'}
path = "/mnt/e/desktop/pic/"
def get_pic(url):
    r = requests.get(url, headers=headers)   # named r rather than re, to avoid shadowing the regex module name
    selector = etree.HTML(r.text)
    img_html = selector.xpath("//div[@class='hide-featured-badge  hide-favorite-badge']")
    for img_src in img_html:
        try:
            t = img_src.xpath("article/a[1]/img/@src")[0]
        except IndexError:
            continue
        img_url = t.split("?")[0]
        img_name = t.split("?")[0].split("/")[-1].split("-")[-1]
        print(img_url, img_name)
        print("starting download of {}".format(img_name))
        if not os.path.exists(path):
            os.makedirs(path)
        # switch the working directory to path so the files are saved there
        os.chdir(path)
        # skip images that have already been downloaded
        if os.path.exists(img_name):
            print("already downloaded, skipping")
            continue
        res = requests.get(img_url,headers=headers)
        f = open(img_name,'wb')
        f.write(res.content)
        f.close()
        print('{} downloaded'.format(img_name))
    time.sleep(1)
if __name__ == "__main__":
    urls = ['https://www.pexels.com/{}?page={}'.format(sys.argv[1],str(i)) for i in range(1,50)] 
    pool = Pool(processes=4)
    pool.map(get_pic,urls)
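Assuming the script above is saved as, say, pexels_spider.py (the file name is an assumption), the part of the Pexels URL to crawl is passed as the only command-line argument and substituted into the page URLs, for example a search path such as:
python pexels_spider.py search/cat/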
    
    
import requests
import json
import os
import time
from multiprocessing import Pool
import sys
path= "E://desktop/11/"

def get_pic(url):
    r = requests.get(url)
    # the image links can be found in the JSON responses under Network/XHR in the browser dev tools
    target = json.loads(r.text)
    for i in target['results']:
        img_name = i['id'] + '.jpg'
        full_url = i['urls']['full']
        print(img_name,full_url)
        if not os.path.exists(path):
            os.makedirs(path)
        os.chdir(path)
        if os.path.exists(img_name):
            print("already downloaded, skipping")
            continue
        print("starting download of {}".format(img_name))
        res = requests.get(full_url)
        f = open(img_name,'wb')
        f.write(res.content)
        f.close()
        print("{}下载完成".format(img_name))
    time.sleep(1)
if __name__ == "__main__":
    urls = ['https://unsplash.com/napi/search/photos?query={}&page={}'.format(sys.argv[1], str(i)) for i in range(1, 50)]
    pool = Pool(processes=4)
    pool.map(get_pic,urls)
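As with the Pexels script, the search keyword comes from the command line; assuming this one is saved as unsplash_spider.py (an assumed name), a run might look like:
python unsplash_spider.py cat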