Spaces:
Runtime error
Runtime error
File size: 8,485 Bytes
822c3cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import re
import pandas as pd
import os
from dotenv import load_dotenv
import openai
# from pydantic import BaseModel
# from googletrans import Translator
from datetime import datetime
import httpx
# https://platform.openai.com/docs/guides/structured-outputs/introduction
# Load environment variables from the .env file (OPENROUTER_API_KEY etc.).
load_dotenv()
# Bearer token for the dataguidance.ai backend API.
# NOTE(security): this JWT was previously hard-coded in source, which is unsafe
# (and the token is short-lived — it carries an `exp` claim). Prefer supplying
# it via the DATAGUIDANCE_AUTHORIZATION environment variable / .env file; the
# original literal is kept only as a backward-compatible fallback.
authorization = os.getenv(
    'DATAGUIDANCE_AUTHORIZATION',
    "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsImp0aSI6IjkxMzIxYjY3YWM3ZWZlNTdjMGRmNWJkNmMxMTU2ZTI3OTU3OWI0M2ZjZDhjYWYxMGE1ZjllY2UzNWNjZmVlNTIxZTU5MGJjMzZiYzM5NzNhIn0.eyJhdWQiOiJGRnlIN0VCbTc1aFF4cnJZTWU4NWVVbnNsNWdVQy1aSWVDdnpuN2hwYkJBIiwianRpIjoiOTEzMjFiNjdhYzdlZmU1N2MwZGY1YmQ2YzExNTZlMjc5NTc5YjQzZmNkOGNhZjEwYTVmOWVjZTM1Y2NmZWU1MjFlNTkwYmMzNmJjMzk3M2EiLCJpYXQiOjE3Mjk4NTIzMjIsIm5iZiI6MTcyOTg1MjMyMiwiZXhwIjoxNzI5OTM4NzIyLjExNjMwOCwic3ViIjoiNTU5NTgiLCJzY29wZSI6WyJhdXRoZW50aWNhdGVkIl0sImVtYWlsIjoiamllLndhbmdAa2luZGluZ2xhdy5jb20iLCJzdWJzY3JpcHRpb24iOiJlc3NlbnRpYWwiLCJ0ZW5hbnRfZXhwaXJ5IjpudWxsLCJuYW1lIjoiSmllIFdhbmcifQ.e8bjYP0qebVjdiw8SIJYEVFj9agn_7ZS5EWvEEm_sUuDFSn2IfvIr2U2ExhF6oKlj0TXPatLFLOLZJgXjIyOGn3k2beP1QEsq3jtVrfM8-KG7ZnLXehYl9xp7gRDqNST8_M_tt6m1cLWoFl7-BvpSBJQxFCsD8_uOzK5swB1MHDUegZnvwMKHHP4rm5sHinXcEQ_eyzKsiZ8ZE4Zn6LCa7HWam0Ca61BGPMU4GrNK2kfn19rIb70huJ8tNN3ulqp5x1bJQVfIKUEWTrp0KJmQOsvY7idfi-jWluuJ3g3VULxzZuwU7YN2Gxv5gom9N-eCAdiPyb3IOumLnN2mr3ZT09R8nhGzW8MO2JRai-YgbnVMrkTqTnpFgz9JfOrNOme-Hw1AhLvJN3O2Db8uY6evtljeJqikfjHvWyztOntlCE5RpfCihGHDorFiKhSu2vxA9f4c_Dt0Cm3_HjDMSuqy0jU14F-CQkaJbT6ApCAIUS2xSUCzSpcjSR8BUjjua5KfMh_hM8eFQxOWWXmJBomCX0ZnQeADYJ5USK_NO89DCsSdUkYsBeP9vBbjiD8FS71vu4mfv4Mdz18ZVL1yDjIq8HboLjT7KLPQDHI9PSDzochvxTmHnW6MayTyvuFGPAUvPMDAUL2-kSdTDhdRwYZF1GTk4K2Dd7vsTpLNBZMdDY"
)
# Request headers mimicking a Chrome browser hitting dataguidance.ai, so the
# backend accepts the call as a normal in-app request. `referer` is rewritten
# per-article in the main loop below.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'authorization': authorization,
    'content-type': 'application/json',
    'dnt': '1',
    'origin': 'https://www.dataguidance.ai',
    'priority': 'u=1, i',
    'referer': 'https://www.dataguidance.ai',
    'sec-ch-ua': '"Not?A_Brand";v="99", "Chromium";v="130"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'cross-site',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}
# Route all HTTP(S) traffic through a local proxy on port 7890
# (presumably a Clash/V2Ray-style client — confirm with the operator).
proxies = {
    'http://': 'http://127.0.0.1:7890',
    'https://': 'http://127.0.0.1:7890'
}
# System prompt: translate an article title from English to Chinese.
title_translate_prompt = """
你是一位翻译专家,精通中英文,擅长复杂的术论文翻译成易懂的科普文章。
请将输入的标题从英文翻译到中文。
"""
# System prompt: translate a country/region name from English to Chinese.
country_translate_prompt = """
你是一位翻译专家,精通中英文,擅长复杂的术论文翻译成易懂的科普文章。
请将输入的国家或地区名称从英文翻译到中文。
"""
# Main system prompt: translate the full article body (English -> Chinese),
# strip HTML tags, keep brand names / technical terms, and apply a fixed
# data-protection glossary (Controller -> 控制者, Processor -> 处理者, etc.).
prompt = """
# Character
你是一位翻译专家,精通中英文,擅长复杂的术论文翻译成易懂的科普文章。你无需编程,而是专注于题解和翻译。
## Skills
- 将英文学术论文翻译成中文科普文章(保持原有格式和专业术语,例如FLAC,JPEG,Microsoft,Amazon等)
- 注意必须准确传达原文的事实和背景
- 在需要的时候,在括号中标记对应的英文单词
### Skill 1: 直接翻译
- 根据英文内容直接翻译,保持原有的格式,尽量不遗漏任何信息
## 策略
策略:
1. 根据Skill 1进行英文内容直详,保持原有格式,不要遗漏任何信息
2. 去除所有的HTML标签
## 限制
- 必须翻译原值的全部内容,包括专业术语(例如 FLCA ,JPEG等)以及公司名词(例如 Microsoft,Amazon等)
- 根据数据保护的相关术语词汇对应表,Controller对应中文"控制者",Processor对应"处理者"Data breach对应"数据泄漏",Sub processor对应"子处理者",Information Subject对应"数据主体",Transfer对应"传输"
- 在应对可能存在多义的英文词汇时,要在括号中标记对应的英文单词
- 回答所有问题时,不能使用"很抱歉,但是"等开头语
- 必须遵守道德和法律,不能产生、传播或解释任何非法、有害或歧视性的内容
"""
# OpenRouter API configuration (OpenAI-compatible endpoint).
openrouter_url = 'https://openrouter.ai/api/v1'
# Reload the previous run's output so the job is resumable: rows that already
# have both `html` and `translated_content` are skipped in the main loop.
if os.path.exists('output.xlsx'):
    df = pd.read_excel('output.xlsx')
else:
    # Columns: url html area country date translated_title translated_content comment
    df = pd.DataFrame(columns=['url', 'html', 'area', 'country', 'date', 'translated_title', 'translated_content', 'comment'])
# since googletrans's dependencies conflict with openai, we use openai's translate function instead
# translator = Translator()
# OpenAI SDK client pointed at OpenRouter, with HTTP traffic routed through the
# local proxy defined above.
# NOTE(review): httpx deprecated the `proxies=` kwarg in 0.26 and removed it in
# 0.28 (replaced by `proxy=`/`mounts=`) — confirm the pinned httpx version.
client = openai.Client(api_key=os.getenv('OPENROUTER_API_KEY'), base_url=openrouter_url, http_client=httpx.Client(proxies=proxies))
# 从Excel读取区域映射
def load_area_mapping(path='area_mapping.xlsx'):
    """Load the region -> countries mapping from an Excel file.

    The spreadsheet must have an ``area`` column (region name) and a
    ``countries`` column (comma-separated country names).

    Args:
        path: Excel file to read. Defaults to ``'area_mapping.xlsx'`` to stay
            backward compatible with the previous hard-coded filename.

    Returns:
        dict mapping each region name to a list of stripped country names.
    """
    mapping_df = pd.read_excel(path)
    area_mapping = {}
    for _, row in mapping_df.iterrows():
        # A blank cell comes back as float NaN; skip it instead of crashing
        # on `.split` of a non-string.
        if pd.isna(row['countries']):
            continue
        area_mapping[row['area']] = [
            country.strip() for country in str(row['countries']).split(',')
        ]
    return area_mapping
# Replace the previous hard-coded mapping with the Excel-driven one.
area_mapping = load_area_mapping()
# Process each URL row. Rows that already have both the raw HTML and a
# translation are skipped, making the job resumable after interruption.
for index, row in df.iterrows():
    url = row['url']
    if pd.notna(row['html']) and pd.notna(row['translated_content']):
        print(f"跳过已存在的 URL: {url}")
        continue
    print(f"处理 URL: {url}")
    # Extract the article path from a URL like
    # "https://www.dataguidance.ai/news/south-korea-pipc-fined-online-retailer-pipa-violations"
    match = re.search(r'/news/(.+?)(?:\?|$)', url)
    if match:
        # group(0) keeps the leading "/news/" prefix, which the by_path
        # endpoint receives verbatim in its `path` query parameter.
        path = match.group(0)
        composed_url = f'https://dgcb20-ca-northeurope-dglive.yellowground-c1f17366.northeurope.azurecontainerapps.io/api/v1/content/articles/by_path?path={path}'
        headers['referer'] = url
        response = httpx.get(composed_url, headers=headers, proxies=proxies)
        # print(response.status_code)
        # Parse the JSON body once instead of re-parsing it for every field.
        # NOTE(review): a non-2xx response will surface here as a JSON/KeyError;
        # consider response.raise_for_status() for a clearer failure mode.
        article = response.json()
        html_content = article['contentBody']['html']['en']
        # Title looks like:
        # "South Korea: PIPC fined online retailer for PIPA violations after data breach"
        title = article['title']['en']
        split_title = title.split(':')
        # The text before the first colon is the country/region, e.g. "South Korea".
        country_en = split_title[0].strip()
        # Translate the country name to Chinese via the LLM.
        country_zh = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": country_translate_prompt},
                {"role": "user", "content": country_en}
            ]
        ).choices[0].message.content
        # Map the translated country to a region; unmatched countries fall
        # into the catch-all bucket "其他" ("other").
        area = '其他'
        for region, countries in area_mapping.items():
            if any(country in country_zh for country in countries):
                area = region
                break
        # publishedOn looks like "2024-10-24T11:49:41+01:00"; reformat it to
        # a Chinese date string, e.g. "2024年10月24日".
        published_on = article['publishedOn']
        published_on_zh = datetime.strptime(published_on, '%Y-%m-%dT%H:%M:%S%z').strftime('%Y年%m月%d日')
        print(html_content)
        # Translate the title to Chinese.
        title_zh = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": title_translate_prompt},
                {"role": "user", "content": title}
            ]
        ).choices[0].message.content
        # Translate the article body to Chinese.
        translation_response = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": html_content}
            ]
            # response_format=DGNews, # Assuming DGNews is defined elsewhere
        )
        # Replace soft line breaks (\n) with hard line breaks (\r\n) so Excel
        # renders them as line breaks inside the cell.
        translated_content = translation_response.choices[0].message.content.replace('\n', '\r\n')
        # Write the results back into the corresponding DataFrame row.
        df.at[index, 'area'] = area
        df.at[index, 'country'] = country_zh
        df.at[index, 'date'] = published_on_zh
        df.at[index, 'translated_title'] = title_zh
        df.at[index, 'html'] = html_content
        df.at[index, 'translated_content'] = translated_content
        # print(f"已更新 URL: {url}")
    else:
        print(f"URL 格式不正确: {url}")
# Persist the updated DataFrame back to the Excel file.
df.to_excel('output.xlsx', index=False)
|