1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
| import re import sqlite3 import time import urllib.request from bs4 import BeautifulSoup from prettytable import PrettyTable
# Install a global urllib opener that sends a desktop-browser User-Agent so
# douban.com does not reject the requests as coming from a script.
# BUG FIX: the header name was "User - Agent" (with spaces), which is not a
# valid HTTP header name — the server never saw the spoofed UA at all.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
def save_img_url(data):
    """Append one crawled page URL to img_url.txt (one URL per line).

    BUG FIX: the original opened the file by hand and would leak the handle
    if the write raised; a context manager closes it unconditionally, and an
    explicit encoding avoids depending on the platform default.
    """
    with open("img_url.txt", "a", encoding="utf-8") as fh:
        fh.write(data + "\n")
def save_img():
    """Download the poster of each Top-250 movie into img/&lt;ID&gt;.jpg.

    Walks the 10 list pages (25 movies each), extracts the poster URLs with
    a regex, logs each page URL via save_img_url(), and retries a failed
    download until it succeeds.

    Fixes vs. original: the unused `flag`/`loop` bookkeeping variables are
    gone, the regex is compiled once outside the loops, and the bare
    `except:` (which also swallowed KeyboardInterrupt, making the retry loop
    impossible to interrupt) is narrowed to OSError, the base class of
    urllib's network errors.
    """
    pat = re.compile(r'<img.*src="(.*?).jpg"')  # hoisted: compile once, not per page
    for page in range(10):
        url = "https://movie.douban.com/top250?start=" + str(page * 25)
        html = urllib.request.urlopen(url).read().decode("utf-8")
        save_img_url(url)
        for idx, base in enumerate(pat.findall(html)):
            img_id = idx + 1 + page * 25          # global 1-based movie rank
            this_img_url = base + ".jpg"
            img_path = "img/" + str(img_id) + ".jpg"
            while True:
                try:
                    print(this_img_url)
                    print("正在爬取第", str(img_id), "张图片")
                    urllib.request.urlretrieve(this_img_url, img_path)
                    time.sleep(0.1)               # be polite to the server
                    break
                except OSError:                   # network/HTTP error: retry
                    print("图片爬取错误,正在重试!")
def save_all():
    """Crawl all 10 Top-250 list pages and persist the data to SQLite.

    For every movie the title, genre line, rating, rating count and the
    highlighted one-line review ("inq" quote) are extracted, then written
    to table Movie in Spider.db.

    Fixes vs. original:
    - every page was downloaded TWICE (a second urlopen just for the
      viewer counts); the already-fetched HTML is reused instead;
    - `create table` now uses `if not exists`, so a second run no longer
      crashes with "table Movie already exists";
    - the chain of per-character `re.sub` calls is collapsed into one
      `\\s+` substitution (covers plain, non-breaking and full-width
      spaces as well as newlines);
    - all regexes are compiled once, and the inserts are batched with
      executemany().
    """
    pat_title = re.compile(r'<span class="title">([^&]+)</span>', re.S)
    pat_kind = re.compile(r'<br>(.*?)</p>', re.S)
    pat_rank = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>', re.S)
    pat_viewn = re.compile(r'<span>(.*?)</span>', re.S)
    pat_quote = re.compile(r'<span class="inq">(.*)</span>')

    titles, kinds, ranks, viewns, quotes = [], [], [], [], []
    for p in range(10):
        url = "https://movie.douban.com/top250?start=" + str(p * 25)
        data1 = urllib.request.urlopen(url).read().decode("utf-8")

        # [^&] deliberately stops before the "&nbsp;/ alias" title spans,
        # keeping only the primary (Chinese) title per movie.
        titles += pat_title.findall(data1)

        # Genre/year line: strip every kind of whitespace in one pass.
        kinds += [re.sub(r'\s+', '', k) for k in pat_kind.findall(data1)]

        ranks += pat_rank.findall(data1)

        # Viewer counts come from plain <span> tags; the first 6 matches on
        # each page are navigation chrome, not data — drop them.
        viewn = pat_viewn.findall(data1)
        del viewn[0:6]
        viewns += viewn

        # Quote may be missing for some movies; store " " so the five lists
        # stay index-aligned for the DB insert below.
        soup = BeautifulSoup(data1, "html.parser")
        for item in soup.find_all('div', class_="item"):
            quote = "".join(pat_quote.findall(str(item)))
            quotes.append(quote if quote else " ")
        print("第", p + 1, "页爬取完毕")

    print("---------------------------")
    print("检查数据正确性:")
    print("影片名称个数", len(titles))
    print("影片种类个数", len(kinds))
    print("影片评分个数", len(ranks))
    print("影片评分人数个数", len(viewns))
    print("影片精华简评个数", len(quotes))

    conn = sqlite3.connect('Spider.db')
    print("建立并打开数据库 Spider.db 成功!")
    c = conn.cursor()
    c.execute('''
        create table if not exists Movie(
            ID int primary key not null,
            title varchar(50),
            kind varchar(50),
            ranks varchar(50),
            viewer varchar(50),
            quote varchar(50));
    ''')
    print("表Movie创建成功!")

    print("开始向Movie表插入数据")
    rows = [(i + 1, titles[i], kinds[i], ranks[i], viewns[i], quotes[i])
            for i in range(len(titles))]
    c.executemany('insert into Movie values(?,?,?,?,?,?);', rows)
    print("数据插入Movie成功!")
    conn.commit()
    conn.close()
def select_sql():
    """Pretty-print every row of the Movie table in Spider.db.

    Fixes vs. original: the connection is now closed in a `finally` block
    (it leaked if the query raised), and the pointless `conn.commit()`
    after a read-only SELECT was removed.
    """
    tb = PrettyTable()
    tb.field_names = ["ID", "电影名称", "种类", "评分", "评论人数", "优秀短评"]
    conn = sqlite3.connect('Spider.db')
    try:
        c = conn.cursor()
        print("开始查询SQL数据库的Movie表:")
        for row in c.execute("SELECT * FROM Movie"):
            tb.add_row([row[0], row[1], row[2], row[3], row[4], row[5]])
        print(tb)
        print("SQL数据库查询成功!")
    finally:
        conn.close()
def main():
    """Run the full pipeline: scrape the data, download the posters,
    then dump the Movie table — pausing 4 seconds between stages."""
    separator = "---------------------------"

    print("开始爬取数据!")
    save_all()

    print("数据爬取完毕!4秒后开始爬取图片")
    print(separator)
    time.sleep(4)
    save_img()

    print("图片爬取完毕!请在img目录下查看对应ID图片")
    print("4秒后开始查询数据库Movie表")
    print(separator)
    time.sleep(4)
    select_sql()


if __name__ == "__main__":
    main()
|