"""
作者：曾浩龙（独立开发）
创建时间：2024 年 10 月 25 日
第三方依赖库：Gradio (https://www.gradio.app/) 与 OpenAI Python API library (https://github.com/openai/openai-python)
其他说明：本项目声明仅供学习和研究使用。
"""
import os
import platform
print(platform.python_version())

import Levenshtein
import gradio as gr
from openai import OpenAI


def find_closest_string(user_input, valid_strings):
    """Return the candidate with the smallest Levenshtein distance to the input.

    Args:
        user_input: The string to match against the candidates.
        valid_strings: An iterable of candidate strings (list, set, dict keys, ...).

    Returns:
        The closest candidate, or ``None`` when ``valid_strings`` is empty.
        When several candidates are equally close, the lexicographically
        smallest one is returned, so the result does not depend on the
        iteration order of ``valid_strings`` (which is arbitrary for a set).
    """
    return min(
        valid_strings,
        # Sort key: edit distance first, then the candidate itself as a
        # deterministic tie-breaker.
        key=lambda candidate: (
            Levenshtein.distance(user_input, candidate),
            candidate,
        ),
        default=None,
    )


def process_input(user_input):
    """Resolve free-form user input to one of the entries of ``valid_strings``.

    Lookup order:
      1 - an exact member of ``valid_strings`` is returned unchanged;
      2 - otherwise the input is compared ignoring surrounding whitespace and
          letter case, and the matching valid string is returned;
      3 - otherwise the valid string with the smallest edit distance to the
          normalized input is returned.

    Args:
        user_input: The text to resolve (expected to be a ``str``).

    Returns:
        A member of ``valid_strings``, or ``None`` when ``valid_strings`` is
        empty.
    """
    if user_input in valid_strings:
        return user_input

    # Map each case-folded candidate back to its canonical spelling. Without
    # this, an input that differs only in letter case (e.g. all lower-case)
    # is equally far from every candidate and the "closest" one is arbitrary.
    canonical_by_folded = {
        candidate.casefold(): candidate for candidate in valid_strings
    }
    folded_input = user_input.strip().casefold()
    if folded_input in canonical_by_folded:
        return canonical_by_folded[folded_input]

    # Fuzzy match on the case-folded forms (iterating the dict yields its
    # keys), then translate the winner back to its canonical spelling.
    closest_folded = find_closest_string(folded_input, canonical_by_folded)
    return canonical_by_folded.get(closest_folded)


def demo(project_TCGA, output_language="Chinese"):
    """Describe a TCGA project and ask a chat model for a summary of its cancer type.

    The project name is first resolved through ``process_input`` and its
    English/Chinese full names are looked up in ``project_name_TCGA``. A short
    info card and a prompt are then built in the requested language and the
    prompt is sent to the OpenAI chat-completions endpoint.

    Args:
        project_TCGA: The TCGA project name entered by the user.
        output_language: ``"Chinese"`` selects the Chinese card and prompt;
            any other value selects the English ones.

    Returns:
        A ``(card, text)`` tuple. ``card`` holds the abbreviation, both full
        names and the GDC portal link; ``text`` is the model's reply, or the
        literal ``"Response Error"`` if anything in the request step raised.
    """
    project_id = process_input(project_TCGA)
    name_English, name_Chinese = project_name_TCGA[project_id]
    tcga_link = f"https://portal.gdc.cancer.gov/projects/{project_id}"
    use_chinese = output_language == "Chinese"

    if use_chinese:
        info_card = f"✍️ 简称：{project_id}\n❤️ 中文全称：{name_Chinese}\n💛 英文全称：{name_English}\n🔗 链接：{tcga_link}"
        system_instruction = f"您是公共卫生、流行病学、癌症研究和精准医学领域的专家，对{name_Chinese}有着深入的理解。"
        prompt_template = f"""
您的任务是深入分析并撰写关于{name_Chinese}这种复杂疾病的摘要，内容必须准确、详实、逻辑清晰、可读性强，这对普通公众了解这种复杂疾病非常重要。
具体内容需要包括：1 - {name_Chinese}的基本定义和概述，临床病理特征；2 - {name_Chinese}的病因和风险因素；3 - {name_Chinese}的流行病学调查结果，患病率和死亡率；4 - {name_Chinese}的临床症状与早期识别；5 - {name_Chinese}的疾病进展与转移及其密切相关的生物标志物和异常基因改变；6 - {name_Chinese}的生存率与预后；7 - {name_Chinese}的诊断、治疗方法和未来研究。
""".strip()
    else:
        info_card = f"✍️ Abbreviation: {project_id}\n❤️ Full name in Chinese: {name_Chinese}\n💛 Full Name in English: {name_English}\n🔗 Link: {tcga_link}"
        system_instruction = f"You are an expert in the fields of public health, epidemiology, cancer research, and precision medicine, with a deep comprehension of {name_English}."
        prompt_template = f"""
Your task is to analyze and write an in-depth summary about the complex disease of {name_English} that must be accurate, informative, logical, and readable, which is very important for the general public to understand this complex disease.
Specific content needs to include: 1 - Basic definition and overview of {name_English}, clinicopathologic features; 2 - Etiology and risk factors of {name_English}; 3 - Epidemiologic findings, prevalence, and mortality rates of {name_English}; 4 - Clinical signs and early recognition of {name_English}; 5 - Disease progression and metastasis of {name_English} and its closely related biomarkers and aberrant gene alterations; 6 - Survival and prognosis of {name_English}; and 7 - Diagnostics, therapeutic approaches, and future research of {name_English}.
""".strip()

    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": prompt_template},
    ]

    try:
        # The API key and base URL come from the environment; a missing
        # variable raises inside this try and is reported like any other
        # request failure.
        client = OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
            base_url=os.environ["API_BASE"],
            max_retries=3,
            timeout=60,
        )
        chat_completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=conversation,
            n=1,
            seed=42,
            temperature=0.50,
            # The Chinese prompt is given a larger completion budget.
            max_tokens=3600 if use_chinese else 2048,
            logprobs=False,
            presence_penalty=0.20,
            frequency_penalty=0.20,
        )
        # The reply is returned as-is (Markdown markers such as "**" are kept).
        generated_text = chat_completion.choices[0].message.content.strip()
    except Exception as e:
        print(str(e), "Response Error")
        return info_card, "Response Error"

    return info_card, generated_text


# The 33 TCGA cancer types: project ID -> [English full name, Chinese full name].
project_name_TCGA = {
    "TCGA-ACC": ["adrenocortical carcinoma", "肾上腺皮质癌"],
    "TCGA-BLCA": ["bladder urothelial carcinoma", "膀胱尿路上皮癌"],
    "TCGA-BRCA": ["breast invasive carcinoma", "浸润性乳腺癌"],
    "TCGA-CESC": [
        "cervical squamous cell carcinoma and endocervical adenocarcinoma",
        "宫颈鳞状细胞癌与宫颈内膜腺癌",
    ],
    "TCGA-CHOL": ["cholangiocarcinoma", "胆管癌"],
    "TCGA-COAD": ["colon adenocarcinoma", "结肠腺癌"],
    "TCGA-DLBC": [
        "lymphoid neoplasm diffuse large B-cell lymphoma",
        "弥漫性大 B 细胞淋巴瘤",
    ],
    "TCGA-ESCA": ["esophageal carcinoma", "食道癌"],
    "TCGA-GBM": ["glioblastoma multiforme", "多形性胶质母细胞瘤"],
    "TCGA-HNSC": ["head and neck squamous cell carcinoma", "头颈部鳞状细胞癌"],
    "TCGA-KICH": ["kidney chromophobe", "肾嫌色细胞癌"],
    "TCGA-KIRC": ["kidney renal clear cell carcinoma", "肾透明细胞癌"],
    "TCGA-KIRP": ["kidney renal papillary cell carcinoma", "乳头状肾细胞癌"],
    "TCGA-LAML": ["acute myeloid leukemia", "急性髓系白血病"],
    "TCGA-LGG": ["brain lower grade glioma", "低级别脑胶质瘤"],
    "TCGA-LIHC": ["liver hepatocellular carcinoma", "肝细胞癌"],
    "TCGA-LUAD": ["lung adenocarcinoma", "肺腺癌"],
    "TCGA-LUSC": ["lung squamous cell carcinoma", "肺鳞状细胞癌"],
    "TCGA-MESO": ["mesothelioma", "间皮瘤"],
    "TCGA-OV": ["ovarian serous cystadenocarcinoma", "卵巢浆液性囊腺癌"],
    "TCGA-PAAD": ["pancreatic adenocarcinoma", "胰腺腺癌"],
    "TCGA-PCPG": ["pheochromocytoma and paraganglioma", "嗜铬细胞瘤和副神经节瘤"],
    "TCGA-PRAD": ["prostate adenocarcinoma", "前列腺腺癌"],
    "TCGA-READ": ["rectum adenocarcinoma", "直肠腺癌"],
    "TCGA-SARC": ["sarcoma", "肉瘤"],
    "TCGA-SKCM": ["skin cutaneous melanoma", "皮肤黑色素瘤"],
    "TCGA-STAD": ["stomach adenocarcinoma", "胃腺癌"],
    "TCGA-TGCT": ["testicular germ cell tumors", "睾丸生殖细胞肿瘤"],
    "TCGA-THCA": ["thyroid carcinoma", "甲状腺癌"],
    "TCGA-THYM": ["thymoma", "胸腺瘤"],
    "TCGA-UCEC": ["uterine corpus endometrial carcinoma", "子宫体子宫内膜癌"],
    "TCGA-UCS": ["uterine carcinosarcoma", "子宫癌肉瘤"],
    "TCGA-UVM": ["uveal melanoma", "眼内（葡萄膜）黑色素瘤"],
}
# The set of accepted project IDs, used to validate and fuzzy-match user input.
valid_strings = set(project_name_TCGA)

# Page heading shown above the form; the description accepts Markdown and HTML.
# English alternative for the heading, kept for reference:
# desc = """<h1 align="center" style="font-family: Latin Modern Math, sans-serif; font-size: 22px; color: #00FF7F;">🎉 Abbreviations, Full Names and Descriptions of All Cancer Types Covered by TCGA Project 🧬</h1>"""

desc = """<h1 align="center" style="font-family: KaiTi, sans-serif; font-size: 22px; color: #00FF7F;">🎉 TCGA 项目涉及的所有癌症类型的缩写、中英文全称和描述 🧬</h1>"""

# Two text outputs, matching the (card, text) tuple returned by `demo`.
outputs = [
    # 1. The cancer type of the queried TCGA project.
    gr.Textbox(label="🔎 1. 您查询的 TCGA 项目的癌症类型", show_copy_button=True),
    # 2. A quick overview of that cancer type.
    gr.Textbox(label="👩‍⚕️ 2. 迅速了解这种癌症类型的信息", show_copy_button=True),
]

my_demo = gr.Interface(
    fn=demo,
    inputs=[
        # Project selector: "Please enter the name of the TCGA project you
        # want to query, such as TCGA-READ." Custom values are allowed here.
        gr.Dropdown(
            choices=list(project_name_TCGA),
            value="TCGA-READ",
            allow_custom_value=True,
            filterable=True,
            interactive=True,
            label="⌨️ 请输入您要查询的 TCGA 项目名称，如 TCGA-READ",
        ),
        # Output language selector: only Chinese and English are offered.
        gr.Dropdown(
            choices=["Chinese", "English"],
            value="Chinese",
            allow_custom_value=False,
            label="👨‍💻 输出语言目前仅支持中文和英文",
        ),
    ],
    outputs=outputs,
    submit_btn=gr.Button("提交", variant="primary"),  # "Submit"
    clear_btn=gr.Button("清除", variant="secondary"),  # "Clear"
    cache_examples=True,
    examples=[["TCGA-READ", "Chinese"], ["TCGA-COAD", "English"]],
    description=desc,
    theme="JohnSmith9982/small_and_pretty",
)
my_demo.launch(show_api=False, show_error=True)