| | import json |
| | import jieba |
| | import re |
| | import requests |
| | import backoff |
| | import time |
| |
|
| |
|
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def post_url(url, headers, payload, timeout=60):
    """POST ``payload`` to ``url`` and return the ``requests`` response.

    The call is wrapped by ``backoff.on_exception`` with exponential
    waits, so any ``requests.exceptions.RequestException`` triggers a
    retry. No retry limit is configured on the decorator.

    Args:
        url: Target URL.
        headers: Mapping of HTTP headers sent with the request.
        payload: Request body, passed to ``requests`` as ``data``.
        timeout: Seconds to wait for the server before giving up on one
            attempt. Without a timeout a stalled connection would block
            forever and the retry logic would never get a chance to run.

    Returns:
        The ``requests.Response`` object. HTTP error statuses are not
        raised here; the caller is expected to inspect the response.
    """
    # Fixed one-second pause before every attempt (including retries).
    time.sleep(1)
    return requests.request(
        "POST", url, headers=headers, data=payload, timeout=timeout)
| |
|
| |
|
def seg(text):
    """Split ``text`` into a list of non-empty segments.

    Newlines are first turned into spaces. The text is then cut right
    after every character listed in the pattern's character class
    (sentence punctuation, the colon and the space itself); whitespace
    that follows the cut point is consumed. Empty pieces are discarded.
    """
    flattened = " ".join(text.split('\n'))
    pieces = re.split(r'(?<=[。!?.!?: ])\s*', flattened)
    # re.split leaves empty strings around adjacent cut points; drop them.
    return list(filter(None, pieces))
| |
|
| |
|
def clean_text(text):
    """Reduce ``text`` to space-separated alphabetic words.

    Processing order:

    1. Newlines are removed (lines are joined without a separator).
    2. Hyphens become spaces.
    3. ``N/N/N`` date-like and ``H:MM`` time-like tokens are removed.
    4. URL-like tokens (optional scheme, host, dot, path) are removed.
    5. Every character that is neither alphabetic nor a space is dropped.
    6. Words of a single character are discarded.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, words separated by single spaces.
    """
    text = text.replace('\n', "")
    text = text.replace("-", " ")
    text = re.sub(r"\d+/\d+/\d+", "", text)
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    # The URL pattern used to be written as a JavaScript regex literal
    # ("/.../i"). In Python the slashes and the trailing "i" were matched
    # as literal text, so ordinary URLs were never removed. The delimiters
    # are gone and the flag is passed the Python way.
    text = re.sub(
        r"[a-zA-Z]*[:/]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9./%&=?\-_]+",
        "", text, flags=re.IGNORECASE)
    # Keep letters (any script) and spaces only; join avoids rebuilding
    # the string one character at a time.
    pure_text = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    return ' '.join(word for word in pure_text.split() if len(word) > 1)
| |
|
| |
|
def article_to_group(groups, topics):
    """Merge the contents of ``groups`` by their label.

    Each element of ``groups`` is indexed as ``(content, label)``. The
    contents that share a label are concatenated with ``+`` in the order
    they appear. ``topics`` is accepted but not used.

    Returns:
        dict mapping each label to its concatenated content, in order of
        first appearance.
    """
    merged = {}
    for item in groups:
        label = item[1]
        content = item[0]
        merged[label] = merged[label] + content if label in merged else content
    return merged
| |
|
| |
|
def generation(para, max_length, api_key=None, secret_key=None):
    """Prepare per-topic summary requests and return the topic mapping.

    An OAuth access token is requested from the Baidu AIP token endpoint
    and appended to the ``news_summary`` endpoint URL. For every entry of
    ``para`` a JSON payload is built from its text and ``max_length``.

    NOTE(review): in this function the summary request is built but never
    sent, so each returned topic entry carries the original text of the
    group, not a generated summary. Confirm whether the call should be
    restored.

    Args:
        para: Mapping of topic key to the text of that topic.
        max_length: Value placed in the payload as ``max_summary_len``.
        api_key: Optional client id for the token request. Defaults to
            the value embedded in this function.
        secret_key: Optional client secret for the token request.
            Defaults to the value embedded in this function.

    Returns:
        A tuple ``(topic, ai_abstract)``: ``topic`` maps each key of
        ``para`` to ``(key, text)`` and ``ai_abstract`` lists the keys in
        iteration order.
    """
    # NOTE(review): credentials are embedded in source. Callers can now
    # pass their own; the embedded defaults should be rotated and removed.
    if api_key is None:
        api_key = "IZt1uK9PAI0LiqleqT0cE30b"
    if secret_key is None:
        secret_key = "Xv5kHB8eyhNuI1B1G7fRgm2SIPdlxGxs"

    def get_access_token():
        """Request an access token and return it as a string."""
        token_url = "https://aip.baidubce.com/oauth/2.0/token"
        params = {
            "grant_type": "client_credentials",
            "client_id": api_key,
            "client_secret": secret_key,
        }
        # A timeout keeps a stalled token endpoint from blocking forever.
        response = requests.post(token_url, params=params, timeout=60)
        # str() turns a missing token (None) into the text "None".
        return str(response.json().get("access_token"))

    url = ("https://aip.baidubce.com/rpc/2.0/nlp/v1/news_summary"
           "?charset=UTF-8&access_token=" + get_access_token())
    # Headers do not depend on the entry, so build them once.
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    topic = {}
    ai_abstract = []
    for title, content in para.items():
        # Serialising here still rejects content that is not JSON-encodable,
        # even though url/headers/payload are not sent anywhere below.
        payload = json.dumps({
            "content": content,
            "max_summary_len": max_length
        })
        topic[title] = (title, content)
        ai_abstract.append(title)
    return topic, ai_abstract
def formate_text(title_dict, outline_list):
    """Render an outline as a flat list of Markdown-style lines.

    An outline entry that is a key of ``title_dict`` produces a ``## ``
    heading followed by element ``[1]`` of its ``title_dict`` value; any
    other entry produces a single ``# `` heading.

    Returns:
        list of the generated lines, in outline order.
    """
    lines = []
    for heading in outline_list:
        if heading in title_dict:
            lines += [f"## {heading}", title_dict[heading][1]]
        else:
            lines.append(f"# {heading}")
    return lines