Spaces:
Runtime error
Runtime error
| import re | |
| import pandas as pd | |
| import os | |
| from dotenv import load_dotenv | |
| import openai | |
| from datetime import datetime | |
| import httpx | |
| import gradio as gr | |
| from docxtpl import DocxTemplate | |
# Load the .env file
load_dotenv()

# Default value for the "Authorization Token" box in the UI.
# SECURITY: a real bearer token used to be hard-coded here. It is now read
# from the environment (DATAGUIDANCE_AUTHORIZATION, e.g. via .env) so that no
# credential is committed with the source. The previously committed token
# should be treated as leaked and revoked.
authorization = os.getenv('DATAGUIDANCE_AUTHORIZATION', '')

# Browser-like request headers sent with every article request. Per-request
# values (authorization, referer) are filled in by process_urls.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'content-type': 'application/json',
    'dnt': '1',
    'origin': 'https://www.dataguidance.ai',
    'priority': 'u=1, i',
    'referer': 'https://www.dataguidance.ai',
    'sec-ch-ua': '"Not?A_Brand";v="99", "Chromium";v="130"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'cross-site',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
}

# Route traffic through a local proxy only when MODE=dev; otherwise connect
# directly (proxies is None).
if os.getenv('MODE') == "dev":
    proxies = {
        'http://': 'http://127.0.0.1:7890',
        'https://': 'http://127.0.0.1:7890',
    }
else:
    proxies = None
# System prompt used when translating an article title (English -> Chinese).
title_translate_prompt = """
你是一位翻译专家,精通中英文,擅长将复杂的学术论文翻译成易懂的科普文章。
请将输入的标题从英文翻译到中文。
"""

# System prompt used when translating a country / region name.
country_translate_prompt = """
你是一位翻译专家,精通中英文,擅长将复杂的学术论文翻译成易懂的科普文章。
请将输入的国家或地区名称从英文翻译到中文。
"""

# System prompt used when translating the article body. Typos in the
# instructions were corrected (学术论文, 直译, 原文, FLAC) and the missing
# separator in the glossary line was restored so the model receives a
# consistent, unambiguous term list.
prompt = """
# Character
你是一位翻译专家,精通中英文,擅长将复杂的学术论文翻译成易懂的科普文章。你无需编程,而是专注于题解和翻译。
## Skills
- 将英文学术论文翻译成中文科普文章(保持原有格式和专业术语,例如FLAC,JPEG,Microsoft,Amazon等)
- 注意必须准确传达原文的事实和背景
- 在需要的时候,在括号中标记对应的英文单词
### Skill 1: 直接翻译
- 根据英文内容直接翻译,保持原有的格式,尽量不遗漏任何信息
## 策略
策略:
1. 根据Skill 1进行英文内容直译,保持原有格式,不要遗漏任何信息
2. 去除所有的HTML标签
3. 去除所有的Markdown格式
## 限制
- 必须翻译原文的全部内容,包括专业术语(例如 FLAC ,JPEG等)以及公司名词(例如 Microsoft,Amazon等)
- 根据数据保护的相关术语词汇对应表,Controller对应中文"控制者",Processor对应"处理者",Data breach对应"数据泄漏",Sub processor对应"子处理者",Information Subject对应"数据主体",Transfer对应"传输"
- 在应对可能存在多义的英文词汇时,要在括号中标记对应的英文单词
- 回答所有问题时,不能使用"很抱歉,但是"等开头
- 必须遵守道德和法律,不能产生、传播或解释任何非法、有害或歧视性的内容
"""

# OpenRouter API configuration
openrouter_url = 'https://openrouter.ai/api/v1'
# Read the area -> countries mapping from Excel
def load_area_mapping(path='area_mapping.xlsx'):
    """Load the region-to-countries mapping from an Excel sheet.

    The sheet must have an ``area`` column and a ``countries`` column whose
    cells hold a comma-separated list of country names.

    Args:
        path: Excel file to read. Defaults to ``area_mapping.xlsx`` in the
            current working directory.

    Returns:
        dict mapping each area to a list of stripped, non-empty country names.
        If an area appears on several rows, the last row wins.
    """
    mapping_df = pd.read_excel(path)
    area_mapping = {}
    for area, countries in zip(mapping_df['area'], mapping_df['countries']):
        # Blank cells are read as NaN, which has no ``split``; skip them
        # instead of crashing on an incomplete row.
        if pd.isna(area) or pd.isna(countries):
            continue
        names = (name.strip() for name in str(countries).split(','))
        # Drop empty entries (e.g. from a trailing comma): an empty string is
        # a substring of every country name, so keeping it would make this
        # area match every article.
        area_mapping[area] = [name for name in names if name]
    return area_mapping


# Replaces the original hard-coded mapping
area_mapping = load_area_mapping()
def process_urls(urls_text, auth, progress=gr.Progress()):
    """Fetch, translate and record a batch of DataGuidance news articles.

    For every URL the article JSON is downloaded, its country, title and body
    are translated through the module-level ``client`` (created in the
    ``__main__`` block), and the result is stored as one row of
    ``output.xlsx``. URLs that already have translated content in that file
    are skipped. A failure on one URL is reported in the result text and does
    not stop the batch.

    Args:
        urls_text: Newline-separated article URLs.
        auth: Bearer token sent in the ``authorization`` request header.
        progress: Gradio progress tracker used to iterate the URLs.

    Returns:
        Tuple ``(log_text, excel_path)``: one status line per URL joined by
        newlines, and the path of the Excel file (``'output.xlsx'``).
    """
    # Work on a copy: the module-level ``headers`` dict is shared by every
    # call, so writing the token / referer into it would leak one call's
    # values into another call running at the same time.
    base_headers = {**headers, 'authorization': f"Bearer {auth}"}

    # Only pass the ``proxies`` keyword when a proxy is actually configured;
    # recent httpx releases reject the keyword entirely.
    proxy_kwargs = {'proxies': proxies} if proxies else {}

    # Initialize or load existing DataFrame
    if os.path.exists('output.xlsx'):
        df = pd.read_excel('output.xlsx')
    else:
        df = pd.DataFrame(columns=['url', 'html', 'area', 'country', 'date', 'translated_title', 'translated_content', 'comment'])

    # Load area mapping
    area_mapping = load_area_mapping()

    # Split URLs into list, ignoring blank lines
    urls = [url.strip() for url in (urls_text or '').split('\n') if url.strip()]
    results = []

    for url in progress.tqdm(urls):
        # Skip URLs that already have a translation on file
        existing = df['url'] == url
        if existing.any() and pd.notna(df.loc[existing, 'translated_content'].iloc[0]):
            results.append(f"跳过已存在的 URL: {url}")
            continue

        try:
            # Extract the "/news/..." path, stopping before any query string
            # or fragment so no stray "?" ends up in the API request.
            match = re.search(r'/news/[^?#]+', url)
            if not match:
                results.append(f"URL 格式不正确: {url}")
                continue
            path = match.group(0)
            composed_url = f'https://dgcb20-ca-northeurope-dglive.yellowground-c1f17366.northeurope.azurecontainerapps.io/api/v1/content/articles/by_path?path={path}'

            # Get article content
            request_headers = {**base_headers, 'referer': url}
            response = httpx.get(composed_url, headers=request_headers, **proxy_kwargs)
            if response.status_code != 200:
                results.append(f"获取内容失败 ({response.status_code}): {url}")
                continue

            data = response.json()
            html_content = data['contentBody']['html']['en']
            title = data['title']['en']
            # The text before the first ":" of the title is used as the country
            country_en = title.split(':')[0].strip()

            # Translate country
            country_zh = client.chat.completions.create(
                model="google/gemini-flash-1.5-8b",
                messages=[
                    {"role": "system", "content": country_translate_prompt},
                    {"role": "user", "content": country_en},
                ],
            ).choices[0].message.content

            # Determine area: first region with a country name contained in
            # the translated country; empty names are ignored because they
            # would match everything.
            area = '其他'
            for region, countries in area_mapping.items():
                if any(country and country in country_zh for country in countries):
                    area = region
                    break

            # Process date
            published_on = data['publishedOn']
            published_on_zh = datetime.strptime(published_on, '%Y-%m-%dT%H:%M:%S%z').strftime('%Y年%m月%d日')

            # Translate title
            title_zh = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": title_translate_prompt},
                    {"role": "user", "content": title},
                ],
            ).choices[0].message.content

            # Translate content (plain chat completion, same call as above;
            # no structured-output parsing is needed here)
            translation_response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": html_content},
                ],
            )
            translated_content = translation_response.choices[0].message.content.replace('\n', '\r\n')

            # Add or update DataFrame
            new_row = {
                'url': url,
                'html': html_content,
                'area': area,
                'country': country_zh,
                'date': published_on_zh,
                'translated_title': title_zh,
                'translated_content': translated_content,
                'comment': '',
            }
            if existing.any():
                # Columns that were entirely empty in the sheet load as
                # float64; cast to object so text can be written into them.
                df = df.astype(object)
                # Assign column by column so each value lands under its own
                # key (assigning the dict to the row is not reliable).
                for column, value in new_row.items():
                    df.loc[existing, column] = value
            else:
                df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
            results.append(f"成功处理 URL: {url}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next URL
            results.append(f"处理出错 ({str(e)}): {url}")

        # Save after each processed URL so finished work survives a later failure
        df.to_excel('output.xlsx', index=False)

    # Return both the results text and the path to the Excel file
    return '\n'.join(results), 'output.xlsx'
def process_excel_and_generate_docs(excel_file, term, start_year, end_year, start_month, start_day, end_month, end_day):
    """Render the weekly-report and e-mail Word documents from an Excel sheet.

    Rows of the sheet are grouped by their ``area`` column and handed to two
    docx templates together with the issue number and date range.

    Args:
        excel_file: Uploaded file, either an object with a ``name`` attribute
            holding the path or the path string itself.
        term: Issue number.
        start_year, end_year: Years of the reporting period.
        start_month, start_day, end_month, end_day: Bounds of the period.

    Returns:
        List ``[pdf_output_path, email_output_path]`` of the two generated
        .docx files.

    Raises:
        ValueError: If no Excel file was provided.
    """
    if excel_file is None:
        raise ValueError("No Excel file was provided")
    # Accept both a file object exposing ``.name`` and a plain path string
    excel_path = getattr(excel_file, 'name', excel_file)

    # Read the Excel file
    df = pd.read_excel(excel_path)

    def text_or_blank(value):
        # Empty cells are read back as NaN; render them as empty text rather
        # than the literal "nan".
        return '' if pd.isna(value) else value

    # Initialize the news dictionary
    news_dict = {'news': {}}

    # Group by area and convert to the desired format
    for area, group in df.groupby('area'):
        news_dict['news'][str(area).lower()] = [
            {
                'title': text_or_blank(row['translated_title']),
                'date': text_or_blank(row['date']),
                # Normalise every newline flavour to "\r\n"
                'content': str(text_or_blank(row['translated_content'])).replace('\r\n', '\n').replace('\r', '\n').replace('\n', '\r\n'),
                'comment': text_or_blank(row['comment']),
                'country': text_or_blank(row['country']),
            }
            for _, row in group.iterrows()
        ]

    # Create the context dictionary with all required fields
    context = {
        'term': term,
        'start_year': start_year,
        'end_year': end_year,
        "start_month": start_month,
        "start_day": start_day,
        "end_month": end_month,
        "end_day": end_day,
        **news_dict,
    }

    # Render PDF template (the output itself is a .docx file)
    pdf_output_path = "pdf.docx"
    pdf_tpl = DocxTemplate("v1.1 周报模板.docx")
    pdf_tpl.render(context)
    pdf_tpl.save(pdf_output_path)

    # Render Email template
    email_output_path = "email.docx"
    email_tpl = DocxTemplate("v1.1 周报邮件格式调整.docx")
    email_tpl.render(context)
    email_tpl.save(email_output_path)

    return [pdf_output_path, email_output_path]
def create_combined_interface():
    """Build the two-tab Gradio app.

    Tab "URL Processing" feeds the URL list and token to ``process_urls``;
    tab "Report Generation" feeds an Excel file and the period fields to
    ``process_excel_and_generate_docs``.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks() as app:
        gr.Markdown("# News Processing & Report Generation Tool")
        with gr.Tabs():
            # Tab 1: URL Processing
            with gr.Tab("URL Processing"):
                with gr.Row():
                    auth_input = gr.Textbox(
                        label="Authorization Token",
                        value=authorization,
                        type="password",
                        lines=1,
                    )
                with gr.Row():
                    urls_input = gr.Textbox(
                        label="Input URLs (one per line)",
                        placeholder="https://www.dataguidance.ai/news/...\nhttps://www.dataguidance.ai/news/...",
                        lines=20,
                    )
                with gr.Row():
                    process_button = gr.Button("Process URLs")
                with gr.Row():
                    output = gr.Textbox(label="Processing Results", lines=20)
                with gr.Row():
                    file_output = gr.File(label="Download Processed Excel")

            # Tab 2: Report Generation
            with gr.Tab("Report Generation"):
                with gr.Row():
                    excel_file = gr.File(label="Upload Excel File")
                with gr.Row():
                    term = gr.Textbox(label="期数", value="201")
                    start_year = gr.Textbox(label="起始年", value="2024")
                    end_year = gr.Textbox(label="结束年", value="2024")
                with gr.Row():
                    start_month = gr.Textbox(label="起始月份", value="9")
                    start_day = gr.Textbox(label="起始日", value="1")
                    end_month = gr.Textbox(label="结束月份", value="9")
                    end_day = gr.Textbox(label="结束日", value="15")
                with gr.Row():
                    generate_btn = gr.Button("Generate Reports")
                with gr.Row():
                    pdf_output = gr.File(label="Download PDF Report")
                    email_output = gr.File(label="Download Email Template")

        # Connect the buttons to their respective functions.
        # process_urls is registered directly (inputs listed in its parameter
        # order) rather than through a lambda: Gradio looks for the
        # gr.Progress() default on the registered function itself, so a
        # wrapper would hide it and no progress would be reported.
        process_button.click(
            fn=process_urls,
            inputs=[urls_input, auth_input],
            outputs=[output, file_output],
        )
        generate_btn.click(
            fn=process_excel_and_generate_docs,
            inputs=[excel_file, term, start_year, end_year, start_month, start_day, end_month, end_day],
            outputs=[pdf_output, email_output],
        )
    return app
# Login for the Gradio UI, taken from GRADIO_USERNAME / GRADIO_PASSWORD.
# NOTE(review): the fallback values are well-known placeholders and give no
# real protection; they are kept only so this module-level name is unchanged.
auth = (os.environ.get("GRADIO_USERNAME", "admin"),
        os.environ.get("GRADIO_PASSWORD", "password123"))

if __name__ == "__main__":
    # Initialize OpenAI client.
    # A custom httpx client is only built when a proxy is configured; recent
    # httpx releases reject the ``proxies`` keyword, so it is not passed on
    # the proxy-less path (http_client=None lets the SDK use its own client).
    client = openai.Client(
        api_key=os.getenv('OPENROUTER_API_KEY'),
        base_url=openrouter_url,
        http_client=httpx.Client(proxies=proxies) if proxies else None,
    )

    # The app is published through a public share link, so require a login
    # whenever credentials were explicitly configured in the environment.
    credentials_configured = bool(
        os.environ.get("GRADIO_USERNAME") and os.environ.get("GRADIO_PASSWORD")
    )

    # Launch combined Gradio interface
    app = create_combined_interface()
    app.launch(
        auth=auth if credentials_configured else None,
        max_threads=3,  # Limit concurrent processing
        show_error=True,
        share=True,
        server_name="0.0.0.0",
    )