I. Modules Used in Web Crawlers
1. Requests
import requests
url = 'http://www.baidu.com'
response = requests.get(url)
print(response.status_code)  # HTTP status code
print(response.text)
print(response.content)  # raw bytes, useful for binary data such as images
print(response.cookies)
# response.text may come out garbled if the encoding is guessed wrong; either decode the bytes yourself
print(response.content.decode("utf-8"))
# or set the encoding explicitly before reading .text
response.encoding = "utf-8"
print(response.text)
# other request methods
requests.post(" ")
requests.put(" ")
requests.delete(" ")
requests.head(" ")
requests.options(" ")
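As a quick illustration of one of these methods, here is a minimal sketch of a POST request with form data; httpbin.org is used here only as a test endpoint that echoes back what it receives.
import requests
# httpbin.org/post echoes the submitted form data back in its JSON response
data = {'name': 'test', 'age': '22'}
response = requests.post("http://httpbin.org/post", data=data)
print(response.text)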
# parsing JSON responses
import requests
import json
response = requests.get("http://httpbin.org/get")  # use an endpoint that actually returns JSON
print(response.json())
print(json.loads(response.text))  # equivalent to response.json()
print(type(response.json()))  # the parsed JSON is a dict (or list)
#####################################################
# the response object exposes many attributes
response = requests.get("http://www.baidu.com")
print(type(response.status_code),response.status_code)
print(type(response.headers),response.headers)
print(type(response.cookies),response.cookies)
print(type(response.url),response.url)
print(type(response.history),response.history)
####################################################
1. Getting cookies from a response (used for session persistence)
import requests
response = requests.get("http://www.baidu.com")
print(response.cookies)
for key, value in response.cookies.items():
    print(key + "=" + value)
# a Session keeps cookies across requests
import requests
s = requests.Session()
s.get("http://httpbin.org/cookies/set/number/123456")
response = s.get("http://httpbin.org/cookies")
print(response.text)
2. Certificate verification
response = requests.get("https://www.baidu.com", verify=False)  # verify=False skips SSL certificate verification
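With verify=False, urllib3 normally emits an InsecureRequestWarning; a minimal sketch of silencing it (only when skipping verification is acceptable):
import requests
import urllib3
# suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings()
response = requests.get("https://www.baidu.com", verify=False)
print(response.status_code)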
3. Setting proxies
import requests
proxies = {
    "http": "http://127.0.0.1:9999",
    "https": "http://127.0.0.1:8888"
}
response = requests.get("http://www.baidu.com",proxies=proxies)
# if the proxy requires a username and password, change the dict as follows:
proxies = {
    "http": "http://user:password@127.0.0.1:9999"
}
# if the proxy is a SOCKS proxy, first run: pip install "requests[socks]"
proxies = {
    "http": "socks5://127.0.0.1:9999",
    "https": "socks5://127.0.0.1:8888"
}
4. Timeout setting: pass timeout= (in seconds) to any request; a request that exceeds it raises a timeout exception
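A minimal sketch, assuming httpbin.org is reachable; the timeout exception itself is handled in item 7 below.
import requests
# wait at most 1 second for the server to respond
response = requests.get("http://httpbin.org/get", timeout=1)
print(response.status_code)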
5. Adding request headers
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
response = requests.get("http://www.baidu.com", headers=headers)
6. HTTP authentication
import requests
from requests.auth import HTTPBasicAuth
response = requests.get("http://120.27.34.24:9001/",auth=HTTPBasicAuth("user","123"))
print(response.status_code)
# alternatively, pass a (user, password) tuple directly
import requests
response = requests.get("http://120.27.34.24:9001/",auth=("user","123"))
print(response.status_code)
7. Exception handling
import requests
from requests.exceptions import ReadTimeout,ConnectionError,RequestException
try:
    response = requests.get("http://httpbin.org/get", timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("timeout")
except ConnectionError:
    print("connection error")
except RequestException:
    print("error")
2. BeautifulSoup
from bs4 import BeautifulSoup
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
soup = BeautifulSoup(html,'lxml')
print(soup.prettify())
print(soup.title)
print(soup.title.name)
print(soup.title.string)
print(soup.title.parent.name)
print(soup.p)
print(soup.p["class"])
print(soup.a)
print(soup.find_all('a'))
print(soup.find(id='link3'))
for link in soup.find_all('a'):
    print(link.get('href'))
# get all the text content
print(soup.get_text())
# access child and grandchild nodes by nesting attributes
print(soup.head.title.string)
# .contents returns all child nodes as a list
print(soup.p.contents)
# .children is a generator and must be iterated
for i, child in enumerate(soup.p.children):
    print(i, child)
soup.a.next_siblings      # generator of all following sibling nodes
soup.a.previous_siblings  # generator of all preceding sibling nodes
soup.a.next_sibling       # the next sibling node
soup.a.previous_sibling   # the previous sibling node
##############################################
# standard selectors: find_all / find
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.find_all('ul'))
for ul in soup.find_all('ul'):
    for i in ul.find_all("li"):
        print(i)
#attrs
print(soup.find_all(attrs={'id':'list-1'}))
print(soup.find_all(attrs={'name':'elements'}))
#text
print(soup.find_all(text="Foo"))
##############################################
# CSS selectors
# .  selects by class        #  selects by id
# tag1,tag2   selects all tag1 and all tag2 elements
# tag1 tag2   selects all tag2 elements inside tag1
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.select('.panel .panel-heading'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
# get_text() returns the text content
for li in soup.select('li'):
    print(li.get_text())
# get an attribute with [attr] or attrs[attr]
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
3. Regular Expression
Basic rules of Python regular expressions
re.match(pattern,string,flags=0)
import re
content= "hello 123 4567 World_This is a regex Demo"
result = re.match(r'^hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$', content)
print(result)
print(result.group())
# span() returns the (start, end) indices of the match
print(result.span())
# use parentheses () to capture a specific target
import re
content= "hello 1234567 World_This is a regex Demo"
result = re.match(r'^hello\s(\d+)\sWorld.*Demo$', content)
print(result)
print(result.group())
print(result.group(1))
print(result.span())
# special characters in the target must be escaped with \
import re
content= "price is $5.00"
result = re.match(r'price is \$5\.00', content)
print(result)
print(result.group())
# Prefer loose (generic) patterns, use parentheses to capture the target,
# prefer non-greedy matching, and pass re.S when the string contains newlines.
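To see why non-greedy matching matters, here is a small sketch (the string and numbers are made up for illustration):
import re
content = "hello 1234567 World"
# greedy: .* consumes as much as possible, so the group captures only the last digit
print(re.match(r'^hello.*(\d+).*World$', content).group(1))    # 7
# non-greedy: .*? consumes as little as possible, so the group captures all the digits
print(re.match(r'^hello.*?(\d+).*World$', content).group(1))   # 1234567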
# re.search scans the whole string and returns the first successful match
import re
content = "extra things hello 123455 world_this is a Re Extra things"
result = re.search(r"hello.*?(\d+).*?Re", content)
print(result)
print(result.group())
print(result.group(1))
# re.findall returns all matches as a list
html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
results = re.findall(r'<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>', html, re.S)
print(results)
for result in results:
    print(result[1])  # the song title is the second capture group
# \s*? absorbs the optional whitespace/newlines (some entries have them, some do not)
# (<a.*?>)? is optional because some <li> entries contain an <a> tag and some do not
# ? matches zero or one occurrence of the preceding group
# re.sub replaces every match in the string and returns the new string
content = "Extra things hello 123455 World_this is a regex Demo extra things"
content = re.sub(r'\d+', '', content)
print(content)
import re
content = "Extra things hello 123455 World_this is a regex Demo extra things"
content = re.sub(r'(\d+)', r'\1 7890', content)
# \1 refers to the first captured group; the r prefix keeps \1 from being treated as an escape sequence
print(content)
# re.compile compiles the pattern into a pattern object so it can be reused
content= """hello 12345 world_this fan"""
pattern =re.compile("hello.*fan",re.S)
result = re.match(pattern,content)
print(result)
print(result.group())
4. selenium
## basic usage of the selenium module
from selenium import webdriver
url = 'http://www.baidu.com'
driver = webdriver.Firefox()
driver.get(url)
print(driver.page_source)
## locating elements with selenium
name = driver.find_element_by_id('q')
name = driver.find_element_by_css_selector("#q")
name = driver.find_element_by_xpath('//*[@id="q"]')
# the main locating methods
find_element_by_id
find_element_by_xpath
find_element_by_css_selector
find_element_by_tag_name
find_element_by_class_name
find_element_by_name
find_element_by_link_text
## simple element interaction
from selenium import webdriver
import time
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
# find the search input box
input_str= driver.find_element_by_id('q')
input_str.send_keys('外套')
time.sleep(1)
# clear the input box
input_str.clear()
input_str.send_keys('秋裤')
# find the search button and click it
button = driver.find_element_by_class_name('btn-search')
button.click()
# back and forward navigation
driver.get('http://www.baidu.com')
driver.get('http://www.taobao.com')
driver.get('http://www.python.org/')
driver.back()
driver.forward()
## scrolling with selenium
from selenium import webdriver
import time
driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
driver.maximize_window()
# drag the scroll bar to the bottom of the page
js = "var q = document.documentElement.scrollTop=100000"
driver.execute_script(js)
time.sleep(3)
driver.save_screenshot('screenshot.png')  # path where the screenshot is saved
# drag the scroll bar back to the top of the page
js = "var q = document.documentElement.scrollTop=0"
driver.execute_script(js)
# change the number assigned to scrollTop to scroll to any position
## explicit waits vs implicit waits
The difference between Selenium's explicit and implicit waits:
1. Explicit wait
Principle: an explicit wait waits for a specific condition, such as an element appearing
or becoming clickable. It keeps waiting until the condition is met, and raises an
exception if the condition is still not met within the allotted time.
(In short: operate on the element only once it appears; if the wait times out, an exception is raised.)
2. Implicit wait
Principle: an implicit wait sets a global wait time on the driver when it is created.
If an element cannot be found, the driver keeps retrying until the element is located
or the time runs out.
Note: during an implicit wait the browser keeps polling the page for the required
element within the time you set.
Syntax: driver.implicitly_wait(10)
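A minimal sketch of an explicit wait, assuming a Firefox driver and a page that has an element with id 'q' (as in the Taobao example above):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('http://www.taobao.com')
# wait up to 10 seconds for the element with id 'q' to appear;
# a TimeoutException is raised if it never shows up
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'q'))
)
print(element.tag_name)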
5. XPath syntax
1. Example document
<?xml version="1.0" encoding="ISO-8859-1"?>
<bookstore>
<book>
<title lang="eng">Harry Potter</title>
<price>29.99</price>
</book>
<book>
<title lang="eng">Learning XML</title>
<price>39.95</price>
</book>
</bookstore>
2. Selecting nodes
Expression | Description |
---|---|
nodename | Selects all child nodes of the named node |
/ | Selects from the root node |
// | Selects matching nodes anywhere in the document, regardless of their position |
. | Selects the current node |
.. | Selects the parent of the current node |
@ | Selects attributes |
3. Examples
Path expression | Result |
---|---|
bookstore | Selects all child nodes of the bookstore element |
/bookstore | Selects the root element bookstore |
bookstore/book | Selects all book elements that are children of bookstore |
//book | Selects all book elements, regardless of their position in the document |
bookstore//book | Selects all book elements that are descendants of bookstore, wherever they sit below it |
//@lang | Selects all attributes named lang |
4. Predicate examples (used to find a specific node, or a node containing a specific value)
Path expression | Result |
---|---|
/bookstore/book[1] | Selects the first book element that is a child of bookstore |
/bookstore/book[last()] | Selects the last book element that is a child of bookstore |
/bookstore/book[last()-1] | Selects the second-to-last book element that is a child of bookstore |
/bookstore/book[position()<3] | Selects the first two book elements that are children of bookstore |
//title[@lang] | Selects all title elements that have an attribute named lang |
//title[@lang='eng'] | Selects all title elements whose lang attribute has the value eng |
/bookstore/book[price>35.00] | Selects all book elements of bookstore whose price element is greater than 35.00 |
/bookstore/book[price>35.00]/title | Selects all title elements of the book elements of bookstore whose price element is greater than 35.00 |
5. Selecting unknown nodes
Wildcard | Description |
---|---|
/bookstore/* | Selects all child elements of the bookstore element |
//* | Selects all elements in the document |
//title[@*] | Selects all title elements that have at least one attribute |
6. Path expressions
# absolute path
/step/step/
# relative path
step/step/
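A small sketch applying a few of the expressions above to the bookstore document using lxml (lxml itself is covered in the next section):
from lxml import etree

xml = '''<bookstore>
  <book><title lang="eng">Harry Potter</title><price>29.99</price></book>
  <book><title lang="eng">Learning XML</title><price>39.95</price></book>
</bookstore>'''

root = etree.fromstring(xml)
print(root.xpath('//title/text()'))                              # ['Harry Potter', 'Learning XML']
print(root.xpath('//title[@lang="eng"]'))                        # two <title> Element objects
print(root.xpath('/bookstore/book[price>35.00]/title/text()'))   # ['Learning XML']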
6. Using lxml
# lxml automatically fixes malformed HTML (note the missing closing </li> below)
from lxml import etree
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)
result = etree.tostring(html)
print(result)
1. Reading from a file
# create a file hello.html with the following content
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
from lxml import etree
# etree.HTML() parses a string; to parse a file, use etree.parse with an HTML parser
html = etree.parse('hello.html', etree.HTMLParser())
result = etree.tostring(html, pretty_print=True)
print(result)
2. Getting all li tags
from lxml import etree
html = etree.parse('hello.html', etree.HTMLParser())
print(type(html))
result = html.xpath('//li')
print(result)  # xpath() returns a list
print(len(result))
print(type(result))
print(type(result[0]))
# get the class attribute of every li tag
result = html.xpath('//li/@class')
print(result)
# ['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']
# get the <a> tag under li whose href attribute is link1.html
result = html.xpath('//li/a[@href="link1.html"]')
print(result[0].text)
# get all <span> tags under li: use // because span is not a direct child of li
result = html.xpath('//li//span')
# get all class attributes of <a> tags under li
result = html.xpath('//li/a//@class')
# get the href of the <a> in the last li tag
result = html.xpath('//li[last()]/a/@href')
# get the content of the second-to-last li element's <a>
result = html.xpath('//li[last()-1]/a')
# get the tag name of the element whose class is bold
result = html.xpath('//*[@class="bold"]')
print(result[0].tag)
II. Web Crawler Examples
1. Pexels and Unsplash image crawlers
import requests
from lxml import etree
import os
import time
from multiprocessing import Pool
import sys
headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'}
path = "/mnt/e/desktop/pic/"
def get_pic(url):
    resp = requests.get(url, headers=headers)
    selector = etree.HTML(resp.text)
    img_html = selector.xpath("//div[@class='hide-featured-badge hide-favorite-badge']")
    for img_src in img_html:
        try:
            t = img_src.xpath("article/a[1]/img/@src")[0]
        except IndexError:
            continue
        img_url = t.split("?")[0]
        img_name = t.split("?")[0].split("/")[-1].split("-")[-1]
        print(img_url, img_name)
        print("Downloading {}".format(img_name))
        if not os.path.exists(path):
            os.makedirs(path)
        # change the working directory to path so files are saved there
        os.chdir(path)
        # skip images that have already been downloaded
        if os.path.exists(img_name):
            print("Already downloaded, skipping")
            continue
        res = requests.get(img_url, headers=headers)
        f = open(img_name, 'wb')
        f.write(res.content)
        f.close()
        print("{} downloaded".format(img_name))
        time.sleep(1)

if __name__ == "__main__":
    urls = ['https://www.pexels.com/{}?page={}'.format(sys.argv[1], str(i)) for i in range(1, 50)]
    pool = Pool(processes=4)
    pool.map(get_pic, urls)
# --- Unsplash image crawler ---
import requests
import json
import os
import time
from multiprocessing import Pool
import sys
path = "E://desktop/11/"
def get_pic(url):
    resp = requests.get(url)
    # the image links come from the JSON API that appears under Network/XHR in the browser devtools
    target = json.loads(resp.text)
    for i in target['results']:
        img_name = i['id'] + '.jpg'
        full_url = i['urls']['full']
        print(img_name, full_url)
        if not os.path.exists(path):
            os.makedirs(path)
        os.chdir(path)
        if os.path.exists(img_name):
            print("Already downloaded")
            continue
        print("Downloading {}".format(img_name))
        res = requests.get(full_url)
        f = open(img_name, 'wb')
        f.write(res.content)
        f.close()
        print("{} downloaded".format(img_name))
        time.sleep(1)

if __name__ == "__main__":
    urls = ['https://unsplash.com/napi/search/photos?query={}&page={}'.format(sys.argv[1], str(i)) for i in range(1, 50)]
    pool = Pool(processes=4)
    pool.map(get_pic, urls)