import time

import pymongo
import requests
from bs4 import BeautifulSoup
uri = "mongodb://user:pass@127.0.0.1:27017/demo" arr = [] now = time.strftime("%Y-%m-%d", time.localtime()) print('time.localtime=>', now)

client = pymongo.MongoClient(uri)
db = client.demo
collection = db.weather_cn_data

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    'Cookie': ''
}


def detail(detail_url, option):
    # Fetch one regional forecast page and hand the response to the table parser.
    res = requests.get(detail_url, headers=headers)
    paqu_shengfen_table(res)


def paqu_shengfen_table(res):
    # Each province on the page has its own table inside .conMidtab2.
    soup = BeautifulSoup(res.content, 'html.parser')
    table_nums = len(soup.select('.lQCity > ul > li'))
    for i in range(table_nums):
        tr_all = soup.select('.conMidtab2 > table')[i]
        tr = tr_all.find_all('tr')
        # The first two rows are headers; the province name sits in the first data row.
        shengfen = tr[2].get_text().split()[0]
        beautiful(tr, shengfen)


def beautiful(tr, shengfen):
    # Parse every city row of a province table into a document and buffer it.
    for idx, row in enumerate(tr[2:]):
        td = row.find_all('td')
        # The first data row also carries the province cell, so its city/weather
        # columns are shifted right by one compared with the remaining rows.
        offset = 1 if idx == 0 else 0
        city = td[offset].get_text().strip()
        tianqixianxiang_1 = td[offset + 1].get_text().strip()  # daytime weather
        fengxiangfengli_1 = td[offset + 2].get_text().strip()  # daytime wind
        zuigaoqiwen = td[offset + 3].get_text().strip()        # maximum temperature
        tianqixianxiang_2 = td[offset + 4].get_text().strip()  # nighttime weather
        fengxiangfengli_2 = td[offset + 5].get_text().strip()  # nighttime wind
        zuidiqiwen = td[offset + 6].get_text().strip()         # minimum temperature
        db_data = {
            'time': now,
            'province': shengfen,
            'city': city,
            'daytime_weather_conditions': tianqixianxiang_1,
            'daytime_wind': fengxiangfengli_1,
            'maximum_temperature': zuigaoqiwen,
            'nighttime_weather_conditions': tianqixianxiang_2,
            'nighttime_wind': fengxiangfengli_2,
            'minimum_temperature': zuidiqiwen
        }
        arr.append(db_data)


def get_detail_url(url, option):
    detail(url, option)


def start(option):
    # One text-forecast page per region of weather.com.cn.
    urls = [
        'https://www.weather.com.cn/textFC/hb.shtml',
        'https://www.weather.com.cn/textFC/db.shtml',
        'https://www.weather.com.cn/textFC/hd.shtml',
        'https://www.weather.com.cn/textFC/hz.shtml',
        'https://www.weather.com.cn/textFC/hn.shtml',
        'https://www.weather.com.cn/textFC/xb.shtml',
        'https://www.weather.com.cn/textFC/xn.shtml',
        'https://www.weather.com.cn/textFC/gat.shtml'
    ]
    for url in urls:
        get_detail_url(url, option)


def job():
    start(1)
    print('arr', arr)
    if arr:  # insert_many raises on an empty list
        print('db result', collection.insert_many(arr))


if __name__ == '__main__':
    job()
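

# A quick way to check what a run actually wrote to MongoDB -- a minimal sketch,
# assuming the same uri and the demo.weather_cn_data collection used above; run
# it separately, it is not part of the scraper itself:
#
#   import time
#   import pymongo
#   client = pymongo.MongoClient("mongodb://user:pass@127.0.0.1:27017/demo")
#   collection = client.demo.weather_cn_data
#   today = time.strftime("%Y-%m-%d", time.localtime())
#   print('documents for today:', collection.count_documents({'time': today}))
#   for doc in collection.find({'time': today}).limit(3):
#       print(doc['province'], doc['city'], doc['maximum_temperature'])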