"""Build a SMILES → property-description fine-tuning dataset.

Pipeline (each step writes files consumed by the next):
  1. get_smi_text()         — export the PubChem-extended HF dataset to JSONL.
  2. get_all_jsonl()        — merge three JSONL corpora, dedup, write train.jsonl.
  3. get_fineturn_dataset() — wrap each record in a chat-style conversation and
                              split 90/10 into train.json / test.json.
"""
import json
import random
from copy import deepcopy

import pandas as pd


def get_smi_text():
    """Export SMILES + description columns of PubChem-extended to a JSONL file.

    Loads every split of the dataset, concatenates them into one DataFrame,
    and writes ``PubChem-extended.jsonl`` (one JSON object per line).
    """
    # Local import: `datasets` is heavy and only needed here, so the rest of
    # the module stays importable without it.
    from datasets import load_dataset

    ds = load_dataset(
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/PubChem-extended",
        revision="main",
    )

    # Columns to keep (adjust if the dataset schema differs).
    selected_fields = ["SMILES", "description"]

    # Collect every split (train / validation / ...) into one frame.
    dfs = [
        ds[split].select_columns(selected_fields).to_pandas()
        for split in ds.keys()
    ]
    combined_df = pd.concat(dfs, ignore_index=True)  # reset the index
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    output_file = "PubChem-extended.jsonl"
    combined_df.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII chemical names intact
    )
    print(f"数据已保存到 {output_file}")


def get_all_jsonl():
    """Merge three JSONL corpora, drop exact duplicate rows, write train.jsonl."""
    source_files = [
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/chebi.jsonl",
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/LPM-24-extra-extended.jsonl",
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/PubChem-extended.jsonl",
    ]
    dfs = []
    for path in source_files:
        df = pd.read_json(path, lines=True)
        print(df.head())
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)  # reset the index
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    # Drop rows that are exact duplicates across all columns.
    df_unique = combined_df.drop_duplicates()
    print(f"去重后的DataFrame形状: {df_unique.shape}")

    output_file = "train.jsonl"
    df_unique.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII chemical names intact
    )
    print(f"数据已保存到 {output_file}")


# Conversation-turn templates; get_conversations() deep-copies them so the
# originals are never mutated.
human_message = {
    'from': 'human',
    'value': ''
}
gpt_message = {
    'from': 'gpt',
    'value': ''
}

system_prompt = """ You are a specialized tool for predicting molecular properties based on SMILES strings. 
Your core function is to take a molecule's SMILES (Simplified Molecular-Input Line-Entry System) notation as input and output a comprehensive, accurate description of its key properties.
When processing the input SMILES, focus on describing the following properties (as applicable and relevant to the molecule):
- Chemical class or family (e.g., alkane, aromatic compound, steroid, nucleotide)
- Physical properties: melting point range, boiling point range, solubility (in water and common organic solvents), state of matter at room temperature (solid, liquid, gas)
- Chemical reactivity: key functional groups and their typical reactions (e.g., ester hydrolysis, amine protonation, alkene addition)
- Biological activity (if applicable): therapeutic category, target biomolecules (e.g., enzyme inhibitors, receptor agonists), toxicity profile highlights
- Spectroscopic features: characteristic peaks in NMR (¹H, ¹³C), IR, or mass spectrometry
- Other notable properties: chirality, stability under ambient conditions, flammability, hygroscopicity
Ensure your descriptions are concise yet informative, using precise chemical terminology. Avoid speculation; base properties on well-established chemical knowledge. If a property is highly variable or not well-defined for the given molecule, state this clearly. 
Input: SMILES string of a molecule Output: Structured description of the molecule's properties as outlined above """


def get_conversations(SMILES: str, des: str) -> list:
    """Return a two-turn conversation: human supplies SMILES, gpt answers des.

    Args:
        SMILES: the molecule's SMILES string (human turn).
        des: the property description (gpt turn).

    Returns:
        A list of two message dicts (deep copies of the module templates, so
        the shared templates are never mutated).
    """
    conversations = [deepcopy(human_message), deepcopy(gpt_message)]
    conversations[0]['value'] = SMILES
    conversations[1]['value'] = des
    return conversations


def get_fineturn_dataset(train_ratio: float = 0.9):
    """Convert train.jsonl into chat-format train/test JSON files.

    Reads the merged JSONL corpus, wraps each record in a conversation with
    the module-level system prompt, shuffles, and writes a train/test split.

    NOTE: the name keeps the historical "fineturn" spelling ("finetune" was
    intended) because callers reference it.

    Args:
        train_ratio: fraction of records assigned to the train split
            (default 0.9, matching the original behavior).
    """
    dataset_list = []
    with open('/mmu_nlp_ssd/wanghuiyang/myPharmHGT/train.jsonl', 'r', encoding='utf-8') as file:
        # Stream the file (no readlines() materialization); skip blank lines
        # so a trailing newline cannot crash json.loads.
        for line in file:
            if line.strip():
                dataset_list.append(json.loads(line))
    print(len(dataset_list))

    output_list = [
        {
            'conversations': get_conversations(data['SMILES'], data['description']),
            'system': system_prompt,
        }
        for data in dataset_list
    ]

    train_len = int(train_ratio * len(output_list))
    random.shuffle(output_list)
    train_dataset = output_list[:train_len]
    test_dataset = output_list[train_len:]

    # encoding='utf-8' so ensure_ascii=False output round-trips on any platform.
    output_path = '/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/train.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=2, ensure_ascii=False)
    output_path = '/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/test.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(test_dataset, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    # Guard the entry point so importing this module has no side effects.
    get_fineturn_dataset()