| | |
| | |
import datetime
import os
import time

import numpy as np
import pandas as pd

import helium as hm
from scrap_util import getDriver, titleLocInfo, find_key_paragrap, extract_from_driver, table_record_doc
| |
|
# Hard-coded batch of government / recruitment announcement pages to scrape.
print("please input the url list to dealt with")
print("调用 web tool TO get Info and context")

urls = [
    'http://csglw.beijing.gov.cn/zwxx/rsgl/gwygl/202209/t20220909_2812285.html',
    'http://ghzrzyw.beijing.gov.cn/zhengwuxinxi/rsxx/sj/202304/t20230418_3058292.html',
    'http://mzj.beijing.gov.cn/art/2022/6/16/art_383_630732.html',
    'http://www.scrsw.net/zhaokao/2023/zk91559_1.html',
    'https://www.gyct.com.cn/info/1487/112938.htm',
    'http://www.scrsw.net/zhaokao/2023/zk92498_1.html',
    'http://www.hnrs.com.cn/shiye/5148.html',
    'http://www.hnrs.com.cn/shiye/5077.html',
    'http://www.hnrs.com.cn/shiye/5072.html',
    'http://hrss.gd.gov.cn/zwgk/xxgkml/content/post_4148656.html',
    'https://www.gdzz.gov.cn/tzgg/content/post_18310.html',
    'https://www.gdzz.gov.cn/tzgg/content/post_18035.html',
    'https://www.jsdzj.gov.cn/art/2023/2/24/art_28_15933.html',
    'http://www.js.msa.gov.cn/art/2023/2/24/art_11436_1391666.html',
    'http://www.jsrsks.com/index/article/content/id/3315.shtml',
]

num_urls = len(urls)
print(f"需要处理的任务url个数{num_urls}")
| |
|
| |
|
# Wall-clock start of the whole scraping run; reported after the loop.
st = time.time()

# Obtain a Selenium WebDriver via the project helper and register it with
# helium so hm.go_to() below drives this browser instance.
driver = getDriver()
hm.set_driver(driver)
| |
|
| | |
# Visit each announcement URL, gather the visible paragraph text plus the
# structured fields extracted by the project helpers, and build one CSV row
# per page.
task_docs = []   # raw extraction dicts, kept for inspection/debugging
contents = []    # rows: [url, *table_record_doc fields, full paragraph text]
for task_link in urls:
    hm.go_to(task_link)
    time.sleep(1)  # crude render wait — TODO: replace with an explicit wait

    # All non-empty <p> texts, joined as the page's labelling context.
    # Read .text once per element (each access is a WebDriver round trip).
    # NOTE(review): find_elements_by_xpath was removed in Selenium 4; if the
    # driver is Selenium 4+, switch to driver.find_elements(By.XPATH, "//p").
    paragraphs = driver.find_elements_by_xpath("//p")
    texts = [t for t in (p.text for p in paragraphs) if t != ""]
    context_to_label = "\n".join(texts)

    # Structured extraction (title, schedule dates, region codes, ...) from
    # the currently loaded page.
    doc = extract_from_driver(driver)

    doc_row = table_record_doc(doc)
    content = [task_link]
    content.extend(doc_row)
    content.append(context_to_label)
    contents.append(content)
    task_docs.append(doc)
    print(doc)
| |
|
# Shut the browser down completely. quit() — rather than close(), which only
# closes the current window — ends the WebDriver session and reaps the
# browser/driver processes, so nothing is left orphaned after the script.
driver.quit()
print(f"当前任务爬取完成!,总共{num_urls}")
| |
|
# Output CSV schema: one leading URL column, then the structured fields in
# the order table_record_doc() emits them, then the raw page text.
name = (
    ["sub_url", "title"]                                            # page identity
    + ["zwk_year", "zwk_sheng", "zwk_diqu", "zwk_zip", "zwlx"]      # year / region / type
    + ["bm_sj", "fee_sj", "ks_sj", "zkz_sj"]                        # raw schedule fields
    + ["tidy_bm_sj", "tidy_fee_sj", "tidy_ks_sj", "tidy_zkz_sj"]    # cleaned schedule fields
    + ["fn_list", "content"]                                        # attachments + full text
)
| |
|
| |
|
# Report total run time, then persist the scraped rows to a timestamped CSV.
end_time = time.time()
cost = end_time - st
print(f"total cost time:{cost}")

# Timestamp like "2023-04-18_12:34" — same format the old
# str(datetime.datetime.today()).split() + slicing dance produced.
# NOTE(review): the ":" makes this filename invalid on Windows — confirm the
# script only ever runs on POSIX before keeping this format.
time_ckt = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")
fn_name = "./scrap_data/" + time_ckt + ".csv"

df = pd.DataFrame(data=contents, columns=name)
# Blank out any missing cells. A single frame-level fillna replaces the old
# per-column `df[i].fillna("", inplace=True)` loop, which relied on chained
# assignment that pandas has deprecated (and was a no-op on complete columns).
df = df.fillna("")

num_res = df.shape[0]
print(f"抓取结果数{num_res}")

# Don't crash when the output directory hasn't been created yet.
os.makedirs("./scrap_data", exist_ok=True)
df.to_csv(fn_name, index=False)

print(f"# 信息数据库保存! 在文件中:{fn_name}")
| |
|