# Fetch a page through an opener that sends a browser-like User-Agent header.
import urllib.request

url = "http://blog.csdn.net/weiwei_pig/article/details/51178226"
# A (name, value) tuple; it is installed below as the opener's only header.
headers = ("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Mobile Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Use the response as a context manager so the connection is closed
# as soon as the body has been read.
with opener.open(url) as response:
    data = response.read()
print(data)
方法2、用add_header()添加报头
1 2 3 4 5
# Fetch a page with a User-Agent header attached to the Request object.
import urllib.request

url = "http://blog.csdn.net/weiwei_pig/article/details/51178226"
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Mobile Safari/537.36")
# Use the response as a context manager so the connection is closed
# as soon as the body has been read.
with urllib.request.urlopen(req) as response:
    data = response.read()
1.3、超时设置,比如访问GitHub
1 2 3 4 5 6 7 8
# Request the same page repeatedly with a 1-second timeout and report
# either the size of the body or the error that occurred.
import urllib.request

for i in range(1, 100):  # 99 attempts
    try:
        # The context manager closes each response, so failed or finished
        # attempts do not leave connections open across iterations.
        with urllib.request.urlopen("https://github.com/", timeout=1) as file:
            data = file.read()
        print(len(data))
    except Exception as e:
        # Deliberately broad: the demo just prints whatever went wrong
        # (timeout, connection error, ...) and moves on to the next attempt.
        print("出现异常–>" + str(e))
# NOTE(review): `_compile` is not defined in this snippet; presumably it is
# the `re` module's internal pattern-compiling helper -- confirm.
def match(pattern, string, flags=0):
    """Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found."""
    return _compile(pattern, flags).match(string)
import re

test = 'http://news.163.com/17/0624/10/CNMHVBJP0001899N.html'
# re.match only succeeds when the pattern matches at the start of the string.
# The reprs below are as printed by the author's interpreter; newer Python
# versions show `<re.Match object; ...>` instead.
print(re.match(r'http', test))  # <_sre.SRE_Match object; span=(0, 4), match='http'>
print(re.match(r'news', test))  # None
2、re.search()函数
函数语法:
1
1 re.search(pattern, string[, flags])
1 2 3 4
# NOTE(review): `_compile` is not defined in this snippet; presumably it is
# the `re` module's internal pattern-compiling helper -- confirm.
def search(pattern, string, flags=0):
    """Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found."""
    return _compile(pattern, flags).search(string)
import re

test = 'I am a loving child to learn.'
# Unlike re.match, re.search finds the pattern anywhere in the string.
# The reprs below are as printed by the author's interpreter; newer Python
# versions show `<re.Match object; ...>` instead.
print(re.search(r'I', test))  # <_sre.SRE_Match object; span=(0, 1), match='I'>
print(re.search(r'learn', test))  # <_sre.SRE_Match object; span=(23, 28), match='learn'>
print(re.search(r'alina', test))  # None
3、re.sub()函数
函数语法:
1
1 re.sub(pattern,repl,string,count,flags)
1 2 3 4 5 6 7 8
# NOTE(review): `_compile` is not defined in this snippet; presumably it is
# the `re` module's internal pattern-compiling helper -- confirm.
def sub(pattern, repl, string, count=0, flags=0):
    """Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.  repl can be either a string or a callable;
    if a string, backslash escapes in it are processed.  If it is
    a callable, it's passed the match object and must return
    a replacement string to be used."""
    # count=0 (the default) replaces every match; a positive count caps
    # the number of replacements.
    return _compile(pattern, flags).sub(repl, string, count)
函数参数说明:
pattern:匹配的正则表达式
repl:替换的字符串
string:要被查找替换的原始字符串
count:匹配后替换的最大次数,默认0表示替换所有的匹配
re.sub()函数用于替换字符串中的匹配项。
1 2 3
import re

test = 'I am a loving child to learn.'
print(re.sub(r'child', 'MMMMM', test))  # replace 'child' with 'MMMMM'
4、re.findall()函数
函数语法:
1
1 re.findall(pattern,string,flags)
1 2 3 4 5 6 7 8 9
# NOTE(review): `_compile` is not defined in this snippet; presumably it is
# the `re` module's internal pattern-compiling helper -- confirm.
def findall(pattern, string, flags=0):
    """Return a list of all non-overlapping matches in the string.

    If one or more capturing groups are present in the pattern, return
    a list of groups; this will be a list of tuples if the pattern
    has more than one group.

    Empty matches are included in the result."""
    return _compile(pattern, flags).findall(string)
re.findall()可以获取字符串中所有匹配的字符串
1 2 3
import re

test = '<a href="http://www.educity.cn/zhibo/" target="_blank">直播课堂</a>'
# Two capturing groups, so findall returns a list of (href, link text) tuples.
print(re.findall(r'<a href="(.*)" target="_blank">(.*)</a>', test))  # [('http://www.educity.cn/zhibo/', '直播课堂')]
3.2、常见匹配
email
1 2 3 4 5 6
import re

# Raw string: in a normal string literal `\w` and `\.` are invalid escape
# sequences that newer Python versions warn about. The pattern text itself
# is unchanged.
pattern = r"\w+([.+-]\w+)*@\w+([.-]\w+)*\.\w+([.-]\w+)*"
string = "<a href='http://www.baidu.com'>百度</a><br><a href='w.linkings@gail.com'>电邮</a>"
# search() returns the first e-mail-like substring found in the HTML.
result = re.search(pattern, string)
print(result)
print(result.group(0))  # group(0) is the whole matched address
# Fetch a page, pull an arithmetic expression out of it, evaluate the
# expression and post the result back within the same session.
import re

import requests

url = 'http://120.24.86.145:8002/qiumingshan/'
# Use one Session for both requests so they share cookies.
s = requests.Session()
source = s.get(url)
# Grab a run of "<digits><operator>" pairs ending in digits, e.g. "12+3*45".
# .group() raises AttributeError if the page contains no such expression.
expression = re.search(r'(\d+[+\-*])+(\d+)', source.text).group()
# SECURITY: eval() runs on text taken from a remote response. The regex
# above only admits digits and the operators + - *, which constrains what
# reaches eval, but a dedicated arithmetic evaluator would be safer.
result = eval(expression)
post = {'value': result}
print(s.post(url, data=post).text)
五、更优雅的包,requests
5.1、简单使用
1 2 3 4 5 6 7 8
>>> import requests
然后,尝试获取某个网页。本例子中,我们来获取Github的公共时间线:
>>> r = requests.get('https://github.com/timeline.json')
>>> r = requests.post("http://httpbin.org/post")
>>> r = requests.put("http://httpbin.org/put")
>>> r = requests.delete("http://httpbin.org/delete")
>>> r = requests.head("http://httpbin.org/get")
>>> r = requests.options("http://httpbin.org/get")