Spaces:
Runtime error
Runtime error
File size: 8,485 Bytes
822c3cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import re
import pandas as pd
import os
from dotenv import load_dotenv
import openai
# from pydantic import BaseModel
# from googletrans import Translator
from datetime import datetime
import httpx
# https://platform.openai.com/docs/guides/structured-outputs/introduction
# Load environment variables from the .env file (OPENROUTER_API_KEY etc.).
load_dotenv()
# Bearer token for the dataguidance.ai backend API.
# NOTE(security): this JWT was previously hard-coded in source, which is unsafe
# (and the token is short-lived — it carries an `exp` claim). Prefer supplying
# it via the DATAGUIDANCE_AUTHORIZATION environment variable / .env file; the
# original literal is kept only as a backward-compatible fallback.
authorization = os.getenv(
    'DATAGUIDANCE_AUTHORIZATION',
    "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsImp0aSI6IjkxMzIxYjY3YWM3ZWZlNTdjMGRmNWJkNmMxMTU2ZTI3OTU3OWI0M2ZjZDhjYWYxMGE1ZjllY2UzNWNjZmVlNTIxZTU5MGJjMzZiYzM5NzNhIn0.eyJhdWQiOiJGRnlIN0VCbTc1aFF4cnJZTWU4NWVVbnNsNWdVQy1aSWVDdnpuN2hwYkJBIiwianRpIjoiOTEzMjFiNjdhYzdlZmU1N2MwZGY1YmQ2YzExNTZlMjc5NTc5YjQzZmNkOGNhZjEwYTVmOWVjZTM1Y2NmZWU1MjFlNTkwYmMzNmJjMzk3M2EiLCJpYXQiOjE3Mjk4NTIzMjIsIm5iZiI6MTcyOTg1MjMyMiwiZXhwIjoxNzI5OTM4NzIyLjExNjMwOCwic3ViIjoiNTU5NTgiLCJzY29wZSI6WyJhdXRoZW50aWNhdGVkIl0sImVtYWlsIjoiamllLndhbmdAa2luZGluZ2xhdy5jb20iLCJzdWJzY3JpcHRpb24iOiJlc3NlbnRpYWwiLCJ0ZW5hbnRfZXhwaXJ5IjpudWxsLCJuYW1lIjoiSmllIFdhbmcifQ.e8bjYP0qebVjdiw8SIJYEVFj9agn_7ZS5EWvEEm_sUuDFSn2IfvIr2U2ExhF6oKlj0TXPatLFLOLZJgXjIyOGn3k2beP1QEsq3jtVrfM8-KG7ZnLXehYl9xp7gRDqNST8_M_tt6m1cLWoFl7-BvpSBJQxFCsD8_uOzK5swB1MHDUegZnvwMKHHP4rm5sHinXcEQ_eyzKsiZ8ZE4Zn6LCa7HWam0Ca61BGPMU4GrNK2kfn19rIb70huJ8tNN3ulqp5x1bJQVfIKUEWTrp0KJmQOsvY7idfi-jWluuJ3g3VULxzZuwU7YN2Gxv5gom9N-eCAdiPyb3IOumLnN2mr3ZT09R8nhGzW8MO2JRai-YgbnVMrkTqTnpFgz9JfOrNOme-Hw1AhLvJN3O2Db8uY6evtljeJqikfjHvWyztOntlCE5RpfCihGHDorFiKhSu2vxA9f4c_Dt0Cm3_HjDMSuqy0jU14F-CQkaJbT6ApCAIUS2xSUCzSpcjSR8BUjjua5KfMh_hM8eFQxOWWXmJBomCX0ZnQeADYJ5USK_NO89DCsSdUkYsBeP9vBbjiD8FS71vu4mfv4Mdz18ZVL1yDjIq8HboLjT7KLPQDHI9PSDzochvxTmHnW6MayTyvuFGPAUvPMDAUL2-kSdTDhdRwYZF1GTk4K2Dd7vsTpLNBZMdDY"
)
# Request headers mimicking a Chrome browser hitting dataguidance.ai, so the
# backend accepts the call as a normal in-app request. `referer` is rewritten
# per-article in the main loop below.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'authorization': authorization,
    'content-type': 'application/json',
    'dnt': '1',
    'origin': 'https://www.dataguidance.ai',
    'priority': 'u=1, i',
    'referer': 'https://www.dataguidance.ai',
    'sec-ch-ua': '"Not?A_Brand";v="99", "Chromium";v="130"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'cross-site',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}
# Route all HTTP(S) traffic through a local proxy on port 7890
# (presumably a Clash/V2Ray-style client — confirm with the operator).
proxies = {
    'http://': 'http://127.0.0.1:7890',
    'https://': 'http://127.0.0.1:7890'
}
# System prompt: translate an article title from English to Chinese.
title_translate_prompt = """
你是一位翻译专家,精通中英文,擅长复杂的术论文翻译成易懂的科普文章。
请将输入的标题从英文翻译到中文。
"""
# System prompt: translate a country/region name from English to Chinese.
country_translate_prompt = """
你是一位翻译专家,精通中英文,擅长复杂的术论文翻译成易懂的科普文章。
请将输入的国家或地区名称从英文翻译到中文。
"""
# Main system prompt: translate the full article body (English -> Chinese),
# strip HTML tags, keep brand names / technical terms, and apply a fixed
# data-protection glossary (Controller -> 控制者, Processor -> 处理者, etc.).
prompt = """
# Character
你是一位翻译专家,精通中英文,擅长复杂的术论文翻译成易懂的科普文章。你无需编程,而是专注于题解和翻译。
## Skills
- 将英文学术论文翻译成中文科普文章(保持原有格式和专业术语,例如FLAC,JPEG,Microsoft,Amazon等)
- 注意必须准确传达原文的事实和背景
- 在需要的时候,在括号中标记对应的英文单词
### Skill 1: 直接翻译
- 根据英文内容直接翻译,保持原有的格式,尽量不遗漏任何信息
## 策略
策略:
1. 根据Skill 1进行英文内容直详,保持原有格式,不要遗漏任何信息
2. 去除所有的HTML标签
## 限制
- 必须翻译原值的全部内容,包括专业术语(例如 FLCA ,JPEG等)以及公司名词(例如 Microsoft,Amazon等)
- 根据数据保护的相关术语词汇对应表,Controller对应中文"控制者",Processor对应"处理者"Data breach对应"数据泄漏",Sub processor对应"子处理者",Information Subject对应"数据主体",Transfer对应"传输"
- 在应对可能存在多义的英文词汇时,要在括号中标记对应的英文单词
- 回答所有问题时,不能使用"很抱歉,但是"等开头语
- 必须遵守道德和法律,不能产生、传播或解释任何非法、有害或歧视性的内容
"""
# OpenRouter API configuration (OpenAI-compatible endpoint).
openrouter_url = 'https://openrouter.ai/api/v1'
# Reload the previous run's output so the job is resumable: rows that already
# have both `html` and `translated_content` are skipped in the main loop.
if os.path.exists('output.xlsx'):
    df = pd.read_excel('output.xlsx')
else:
    # Columns: url html area country date translated_title translated_content comment
    df = pd.DataFrame(columns=['url', 'html', 'area', 'country', 'date', 'translated_title', 'translated_content', 'comment'])
# since googletrans's dependencies conflict with openai, we use openai's translate function instead
# translator = Translator()
# OpenAI SDK client pointed at OpenRouter, with HTTP traffic routed through the
# local proxy defined above.
# NOTE(review): httpx deprecated the `proxies=` kwarg in 0.26 and removed it in
# 0.28 (replaced by `proxy=`/`mounts=`) — confirm the pinned httpx version.
client = openai.Client(api_key=os.getenv('OPENROUTER_API_KEY'), base_url=openrouter_url, http_client=httpx.Client(proxies=proxies))
# 从Excel读取区域映射
def load_area_mapping(path='area_mapping.xlsx'):
    """Load the region -> countries mapping from an Excel file.

    The spreadsheet must have an ``area`` column (region name) and a
    ``countries`` column (comma-separated country names).

    Args:
        path: Excel file to read. Defaults to ``'area_mapping.xlsx'`` to stay
            backward compatible with the previous hard-coded filename.

    Returns:
        dict mapping each region name to a list of stripped country names.
    """
    mapping_df = pd.read_excel(path)
    area_mapping = {}
    for _, row in mapping_df.iterrows():
        # A blank cell comes back as float NaN; skip it instead of crashing
        # on `.split` of a non-string.
        if pd.isna(row['countries']):
            continue
        area_mapping[row['area']] = [
            country.strip() for country in str(row['countries']).split(',')
        ]
    return area_mapping
# Replace the previous hard-coded mapping with the Excel-driven one.
area_mapping = load_area_mapping()
# Process each URL row. Rows that already have both the raw HTML and a
# translation are skipped, making the job resumable after interruption.
for index, row in df.iterrows():
    url = row['url']
    if pd.notna(row['html']) and pd.notna(row['translated_content']):
        print(f"跳过已存在的 URL: {url}")
        continue
    print(f"处理 URL: {url}")
    # Extract the article path from a URL like
    # "https://www.dataguidance.ai/news/south-korea-pipc-fined-online-retailer-pipa-violations"
    match = re.search(r'/news/(.+?)(?:\?|$)', url)
    if match:
        # group(0) keeps the leading "/news/" prefix, which the by_path
        # endpoint receives verbatim in its `path` query parameter.
        path = match.group(0)
        composed_url = f'https://dgcb20-ca-northeurope-dglive.yellowground-c1f17366.northeurope.azurecontainerapps.io/api/v1/content/articles/by_path?path={path}'
        headers['referer'] = url
        response = httpx.get(composed_url, headers=headers, proxies=proxies)
        # print(response.status_code)
        # Parse the JSON body once instead of re-parsing it for every field.
        # NOTE(review): a non-2xx response will surface here as a JSON/KeyError;
        # consider response.raise_for_status() for a clearer failure mode.
        article = response.json()
        html_content = article['contentBody']['html']['en']
        # Title looks like:
        # "South Korea: PIPC fined online retailer for PIPA violations after data breach"
        title = article['title']['en']
        split_title = title.split(':')
        # The text before the first colon is the country/region, e.g. "South Korea".
        country_en = split_title[0].strip()
        # Translate the country name to Chinese via the LLM.
        country_zh = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": country_translate_prompt},
                {"role": "user", "content": country_en}
            ]
        ).choices[0].message.content
        # Map the translated country to a region; unmatched countries fall
        # into the catch-all bucket "其他" ("other").
        area = '其他'
        for region, countries in area_mapping.items():
            if any(country in country_zh for country in countries):
                area = region
                break
        # publishedOn looks like "2024-10-24T11:49:41+01:00"; reformat it to
        # a Chinese date string, e.g. "2024年10月24日".
        published_on = article['publishedOn']
        published_on_zh = datetime.strptime(published_on, '%Y-%m-%dT%H:%M:%S%z').strftime('%Y年%m月%d日')
        print(html_content)
        # Translate the title to Chinese.
        title_zh = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": title_translate_prompt},
                {"role": "user", "content": title}
            ]
        ).choices[0].message.content
        # Translate the article body to Chinese.
        translation_response = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": html_content}
            ]
            # response_format=DGNews, # Assuming DGNews is defined elsewhere
        )
        # Replace soft line breaks (\n) with hard line breaks (\r\n) so Excel
        # renders them as line breaks inside the cell.
        translated_content = translation_response.choices[0].message.content.replace('\n', '\r\n')
        # Write the results back into the corresponding DataFrame row.
        df.at[index, 'area'] = area
        df.at[index, 'country'] = country_zh
        df.at[index, 'date'] = published_on_zh
        df.at[index, 'translated_title'] = title_zh
        df.at[index, 'html'] = html_content
        df.at[index, 'translated_content'] = translated_content
        # print(f"已更新 URL: {url}")
    else:
        print(f"URL 格式不正确: {url}")
# Persist the updated DataFrame back to the Excel file.
df.to_excel('output.xlsx', index=False)
|