import re
import pandas as pd
import os
from dotenv import load_dotenv
import openai
from datetime import datetime
import httpx
import gradio as gr
from docxtpl import DocxTemplate

# Load the .env file so the os.getenv()/os.environ lookups below can see its values.
load_dotenv()

# Default bearer token pre-filled into the "Authorization Token" field of the UI.
# It is read from the environment (or the .env file loaded above) instead of being
# hard-coded: a JWT committed to source control leaks the account it belongs to
# and expires anyway. Empty when DATAGUIDANCE_AUTH_TOKEN is not set, in which case
# the user pastes a token into the UI.
authorization = os.getenv("DATAGUIDANCE_AUTH_TOKEN", "")

# Browser-like request headers sent with every article request.
# process_urls() supplies the 'authorization' and per-article 'referer' values.
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/json",
    "dnt": "1",
    "origin": "https://www.dataguidance.ai",
    "priority": "u=1, i",
    "referer": "https://www.dataguidance.ai",
    # Client-hint headers
    "sec-ch-ua": '"Not?A_Brand";v="99", "Chromium";v="130"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    # Fetch-metadata headers
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "cross-site",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/130.0.0.0 Safari/537.36"
    ),
}

# Proxy mounts handed to httpx: a local proxy when MODE=dev, otherwise no proxy.
_DEV_PROXY_URL = 'http://127.0.0.1:7890'
proxies = (
    {'http://': _DEV_PROXY_URL, 'https://': _DEV_PROXY_URL}
    if os.getenv('MODE') == "dev"
    else None
)

# System prompt used when translating an article title from English to Chinese.
title_translate_prompt = """
你是一位翻译专家，精通中英文，擅长将复杂的学术论文翻译成易懂的科普文章。
请将输入的标题从英文翻译到中文。
"""

# System prompt used when translating the country/region name (the part of the
# title before the first colon) from English to Chinese.
country_translate_prompt = """
你是一位翻译专家，精通中英文，擅长将复杂的学术论文翻译成易懂的科普文章。
请将输入的国家或地区名称从英文翻译到中文。
"""

# System prompt used when translating the article body. It also asks the model
# to strip HTML/Markdown and fixes the Chinese wording of data-protection terms.
prompt = """
# Character
你是一位翻译专家，精通中英文，擅长将复杂的学术论文翻译成易懂的科普文章。你无需编程，而是专注于题解和翻译。

## Skills
- 将英文学术论文翻译成中文科普文章（保持原有格式和专业术语，例如FLAC，JPEG，Microsoft，Amazon等）
- 注意必须准确传达原文的事实和背景
- 在需要的时候，在括号中标记对应的英文单词

### Skill 1: 直接翻译
- 根据英文内容直接翻译，保持原有的格式，尽量不遗漏任何信息

## 策略
策略：
1. 根据Skill 1进行英文内容直译，保持原有格式，不要遗漏任何信息
2. 去除所有的HTML标签
3. 去除所有的Markdown格式

## 限制
- 必须翻译原文的全部内容，包括专业术语（例如 FLAC ，JPEG等）以及公司名词（例如 Microsoft，Amazon等）
- 根据数据保护的相关术语词汇对应表，Controller对应中文"控制者"，Processor对应"处理者"，Data breach对应"数据泄漏"，Sub processor对应"子处理者"，Information Subject对应"数据主体"，Transfer对应"传输"
- 在应对可能存在多义的英文词汇时，要在括号中标记对应的英文单词
- 回答所有问题时，不能使用"很抱歉，但是"等开头
- 必须遵守道德和法律，不能产生、传播或解释任何非法、有害或歧视性的内容
"""

# OpenRouter API configuration: base URL handed to the OpenAI client at startup.
openrouter_url = 'https://openrouter.ai/api/v1'

# Read the area -> countries mapping from Excel
def load_area_mapping(path='area_mapping.xlsx'):
    """Load the region-to-countries mapping from an Excel workbook.

    The sheet must have an ``area`` column and a ``countries`` column whose
    cells hold a comma-separated list of country names.

    Args:
        path: Workbook to read. Defaults to ``area_mapping.xlsx`` in the
            current working directory.

    Returns:
        dict mapping each area to a list of stripped, non-empty country names.
        Rows without an area are skipped; an empty ``countries`` cell yields
        an empty list. When an area appears on several rows the last one wins.
    """
    mapping_df = pd.read_excel(path)
    area_mapping = {}
    for area, raw_countries in zip(mapping_df['area'], mapping_df['countries']):
        if pd.isna(area):
            continue
        if pd.isna(raw_countries):
            area_mapping[area] = []
            continue
        # Accept both the ASCII and the full-width comma, since the names are
        # typically typed with a Chinese input method.
        names = (name.strip() for name in re.split(r'[,，]', str(raw_countries)))
        # Drop empty entries (e.g. from a trailing comma): callers match by
        # substring, and '' is a substring of every string.
        area_mapping[area] = [name for name in names if name]
    return area_mapping

# Replaces the former hard-coded mapping. Loaded at import time, so
# area_mapping.xlsx must be readable from the working directory.
# Note: process_urls() loads its own fresh copy into a local of the same name.
area_mapping = load_area_mapping()

def _extract_article_path(url):
    """Return the ``/news/...`` part of *url* without query string or fragment, or None."""
    match = re.search(r'/news/[^?#]+', url)
    return match.group(0) if match else None


def _translate_short_text(model, system_prompt, text):
    """Translate *text* with *model* under *system_prompt* and return the stripped reply."""
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text}
        ]
    )
    return completion.choices[0].message.content.strip()


def _find_area(country_zh, mapping):
    """Return the first region with a country name contained in *country_zh*, else '其他'."""
    for region, countries in mapping.items():
        # Skip empty names: '' is a substring of everything and would match any country.
        if any(country and country in country_zh for country in countries):
            return region
    return '其他'


def _upsert_row(df, row):
    """Overwrite the rows of *df* whose 'url' equals ``row['url']``, or append *row*; return the frame."""
    mask = df['url'] == row['url']
    if not mask.any():
        return pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    for column, value in row.items():
        # Columns read back from Excel as all-empty are float64; widen them so
        # that storing text is not an incompatible-dtype assignment.
        if column in df.columns and df[column].dtype != object:
            df[column] = df[column].astype(object)
        # Assign column by column: the intent is explicit and does not rely on
        # how pandas interprets a dict assigned to a row selection.
        df.loc[mask, column] = value
    return df


def process_urls(urls_text, auth, progress=gr.Progress()):
    """Fetch, translate and store every article URL listed in *urls_text*.

    For each URL the article JSON is downloaded, the country, title and body
    are translated through the module-level ``client`` (created in the
    ``__main__`` block), and the result is upserted into ``output.xlsx``.
    URLs that already have a translation in ``output.xlsx`` are skipped.

    Args:
        urls_text: Article URLs, one per line; blank lines are ignored.
        auth: Bearer token sent in the ``authorization`` header.
        progress: Gradio progress tracker used to iterate over the URLs.

    Returns:
        A tuple ``(log, path)``: ``log`` is one status line per URL joined by
        newlines, ``path`` is ``'output.xlsx'`` or ``None`` when that file
        does not exist (nothing was ever saved).
    """
    output_path = 'output.xlsx'

    # Build the headers on a private copy: the module-level dict is shared by
    # concurrently running requests and must not be mutated here.
    request_headers = {**headers, 'authorization': f"Bearer {(auth or '').strip()}"}

    # Initialize or load the existing DataFrame
    if os.path.exists(output_path):
        df = pd.read_excel(output_path)
    else:
        df = pd.DataFrame(columns=['url', 'html', 'area', 'country', 'date', 'translated_title', 'translated_content', 'comment'])

    # Reload the mapping on every run so edits to the workbook are picked up
    area_mapping = load_area_mapping()

    urls = [line.strip() for line in (urls_text or '').splitlines() if line.strip()]

    results = []
    for url in progress.tqdm(urls):
        # Skip URLs that already have a translation stored
        previous = df.loc[df['url'] == url, 'translated_content']
        if not previous.empty and pd.notna(previous.iloc[0]):
            results.append(f"跳过已存在的 URL: {url}")
            continue

        try:
            path = _extract_article_path(url)
            if path is None:
                results.append(f"URL 格式不正确: {url}")
                continue

            composed_url = f'https://dgcb20-ca-northeurope-dglive.yellowground-c1f17366.northeurope.azurecontainerapps.io/api/v1/content/articles/by_path?path={path}'

            # NOTE(review): httpx deprecated ``proxies=`` in favour of ``proxy=``/``mounts=``
            # in newer releases -- confirm against the pinned httpx version.
            response = httpx.get(
                composed_url,
                headers={**request_headers, 'referer': url},
                proxies=proxies,
            )

            if response.status_code != 200:
                results.append(f"获取内容失败 ({response.status_code}): {url}")
                continue

            data = response.json()
            html_content = data['contentBody']['html']['en']
            title = data['title']['en']
            # The text before the first colon of the title is used as the country name
            country_en = title.split(':')[0].strip()

            country_zh = _translate_short_text(
                "google/gemini-flash-1.5-8b", country_translate_prompt, country_en
            )
            area = _find_area(country_zh, area_mapping)

            published_on_zh = datetime.strptime(
                data['publishedOn'], '%Y-%m-%dT%H:%M:%S%z'
            ).strftime('%Y年%m月%d日')

            title_zh = _translate_short_text("gpt-4o-mini", title_translate_prompt, title)

            # NOTE(review): parse() is the structured-output helper but no response
            # format is passed here; kept as-is to preserve its behaviour.
            translation_response = client.beta.chat.completions.parse(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": html_content}
                ]
            )
            translated_content = translation_response.choices[0].message.content.replace('\n', '\r\n')

            df = _upsert_row(df, {
                'url': url,
                'html': html_content,
                'area': area,
                'country': country_zh,
                'date': published_on_zh,
                'translated_title': title_zh,
                'translated_content': translated_content,
                'comment': ''
            })
        except Exception as e:
            # Per-URL boundary: report the failure and carry on with the next URL
            results.append(f"处理出错 ({str(e)}): {url}")
        else:
            # Runs only when the URL was fully processed (not after ``continue``):
            # persist immediately so finished work survives a later failure.
            df.to_excel(output_path, index=False)
            results.append(f"成功处理 URL: {url}")

    # Only hand a file to the UI when one actually exists
    saved_file = output_path if os.path.exists(output_path) else None
    return '\n'.join(results), saved_file

def _cell_text(value):
    """Return *value* unchanged, or '' when the Excel cell was empty (NaN/None)."""
    return '' if pd.isna(value) else value


def process_excel_and_generate_docs(excel_file, term, start_year, end_year, start_month, start_day, end_month, end_day):
    """Render the report and e-mail Word documents from a processed Excel file.

    Rows are grouped by their ``area`` column; each group becomes a list of
    news items under ``news[<area lower-cased>]`` in the template context.
    Empty cells are passed to the templates as empty strings rather than NaN,
    so they do not show up as the text "nan".

    Args:
        excel_file: Uploaded file from Gradio; either an object exposing the
            path as ``.name`` or the path itself.
        term: Issue number shown in the report.
        start_year, end_year, start_month, start_day, end_month, end_day:
            Reporting period fields, passed to the templates unchanged.

    Returns:
        ``["pdf.docx", "email.docx"]`` -- the paths of the two rendered documents.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if excel_file is None:
        raise gr.Error("Please upload an Excel file first.")

    # Read the Excel file
    excel_path = getattr(excel_file, 'name', excel_file)
    df = pd.read_excel(excel_path)

    # Group by area and convert each row to the shape the templates expect
    news_by_area = {}
    for area, group in df.groupby('area'):
        items = []
        for _, row in group.iterrows():
            content = str(_cell_text(row['translated_content']))
            # Normalize every line-ending flavour to CRLF
            content = content.replace('\r\n', '\n').replace('\r', '\n').replace('\n', '\r\n')
            items.append({
                'title': _cell_text(row['translated_title']),
                'date': _cell_text(row['date']),
                'content': content,
                'comment': _cell_text(row['comment']),
                'country': _cell_text(row['country'])
            })
        news_by_area[str(area).lower()] = items

    # Context shared by both templates
    context = {
        'term': term,
        'start_year': start_year,
        'end_year': end_year,
        "start_month": start_month,
        "start_day": start_day,
        "end_month": end_month,
        "end_day": end_day,
        'news': news_by_area
    }

    # (template, output) pairs: the report document first, then the e-mail layout
    jobs = (
        ("v1.1 周报模板.docx", "pdf.docx"),
        ("v1.1 周报邮件格式调整.docx", "email.docx"),
    )
    output_paths = []
    for template_path, output_path in jobs:
        template = DocxTemplate(template_path)
        template.render(context)
        template.save(output_path)
        output_paths.append(output_path)

    return output_paths
    

def create_combined_interface():
    """Build the two-tab Gradio app (URL processing and report generation).

    Returns:
        The ``gr.Blocks`` application, wired up but not yet launched.
    """
    with gr.Blocks() as app:
        gr.Markdown("# News Processing & Report Generation Tool")

        with gr.Tabs():
            # Tab 1: URL Processing
            with gr.Tab("URL Processing"):
                with gr.Row():
                    auth_input = gr.Textbox(
                        label="Authorization Token",
                        value=authorization,
                        type="password",
                        lines=1,
                    )

                with gr.Row():
                    urls_input = gr.Textbox(
                        label="Input URLs (one per line)",
                        placeholder="https://www.dataguidance.ai/news/...\nhttps://www.dataguidance.ai/news/...",
                        lines=20
                    )
                with gr.Row():
                    process_button = gr.Button("Process URLs")
                with gr.Row():
                    output = gr.Textbox(label="Processing Results", lines=20)
                with gr.Row():
                    file_output = gr.File(label="Download Processed Excel")

            # Tab 2: Report Generation
            with gr.Tab("Report Generation"):
                with gr.Row():
                    excel_file = gr.File(label="Upload Excel File")

                with gr.Row():
                    term = gr.Textbox(label="期数", value="201")
                    start_year = gr.Textbox(label="起始年", value="2024")
                    end_year = gr.Textbox(label="结束年", value="2024")

                with gr.Row():
                    start_month = gr.Textbox(label="起始月份", value="9")
                    start_day = gr.Textbox(label="起始日", value="1")
                    end_month = gr.Textbox(label="结束月份", value="9")
                    end_day = gr.Textbox(label="结束日", value="15")

                with gr.Row():
                    generate_btn = gr.Button("Generate Reports")

                with gr.Row():
                    pdf_output = gr.File(label="Download PDF Report")
                    email_output = gr.File(label="Download Email Template")

        # Connect the buttons to their respective functions.
        # process_urls is passed directly (inputs ordered to match its
        # signature) rather than through a lambda, so that Gradio can see its
        # gr.Progress default argument and supply the progress tracker.
        process_button.click(
            fn=process_urls,
            inputs=[urls_input, auth_input],
            outputs=[output, file_output]
        )

        generate_btn.click(
            fn=process_excel_and_generate_docs,
            inputs=[excel_file, term, start_year, end_year, start_month, start_day, end_month, end_day],
            outputs=[pdf_output, email_output]
        )

    return app

# (username, password) pair for Gradio's login screen, taken from the environment.
# NOTE(review): the fallback credentials are well-known defaults -- set
# GRADIO_USERNAME / GRADIO_PASSWORD before relying on this for access control.
_gradio_username = os.environ.get("GRADIO_USERNAME", "admin")
_gradio_password = os.environ.get("GRADIO_PASSWORD", "password123")
auth = (_gradio_username, _gradio_password)

if __name__ == "__main__":
    # Initialize the OpenAI-compatible client pointed at OpenRouter.
    # ``client`` is a module-level global that process_urls() reads.
    # NOTE(review): httpx deprecated ``proxies=`` in favour of ``proxy=``/``mounts=``
    # in newer releases -- confirm against the pinned httpx version.
    client = openai.Client(
        api_key=os.getenv('OPENROUTER_API_KEY'),
        base_url=openrouter_url,
        http_client=httpx.Client(proxies=proxies)
    )

    # Launch the combined Gradio interface. The app is shared publicly and
    # bound to all interfaces, so the login screen is enabled whenever a
    # password has been configured; without GRADIO_PASSWORD it starts without
    # authentication, as before.
    app = create_combined_interface()
    app.launch(
        auth=auth if os.environ.get("GRADIO_PASSWORD") else None,
        max_threads=3,  # Limit concurrent processing
        show_error=True,
        share=True,
        server_name="0.0.0.0"
    )