"""Build a SMILES → property-description fine-tuning dataset.

Pipeline (each step writes files consumed by the next):
  1. get_smi_text()         — export the PubChem-extended HF dataset to JSONL.
  2. get_all_jsonl()        — merge three JSONL corpora, dedup, write train.jsonl.
  3. get_fineturn_dataset() — wrap each record in a chat-style conversation and
                              split 90/10 into train.json / test.json.
"""
import json
import random
from copy import deepcopy

import pandas as pd


def get_smi_text():
    """Export SMILES + description columns of PubChem-extended to a JSONL file.

    Loads every split of the dataset, concatenates them into one DataFrame,
    and writes ``PubChem-extended.jsonl`` (one JSON object per line).
    """
    # Local import: `datasets` is heavy and only needed here, so the rest of
    # the module stays importable without it.
    from datasets import load_dataset

    ds = load_dataset(
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/PubChem-extended",
        revision="main",
    )

    # Columns to keep (adjust if the dataset schema differs).
    selected_fields = ["SMILES", "description"]

    # Collect every split (train / validation / ...) into one frame.
    dfs = [
        ds[split].select_columns(selected_fields).to_pandas()
        for split in ds.keys()
    ]
    combined_df = pd.concat(dfs, ignore_index=True)  # reset the index
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    output_file = "PubChem-extended.jsonl"
    combined_df.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII chemical names intact
    )
    print(f"数据已保存到 {output_file}")


def get_all_jsonl():
    """Merge three JSONL corpora, drop exact duplicate rows, write train.jsonl."""
    source_files = [
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/chebi.jsonl",
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/LPM-24-extra-extended.jsonl",
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/PubChem-extended.jsonl",
    ]
    dfs = []
    for path in source_files:
        df = pd.read_json(path, lines=True)
        print(df.head())
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)  # reset the index
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    # Drop rows that are exact duplicates across all columns.
    df_unique = combined_df.drop_duplicates()
    print(f"去重后的DataFrame形状: {df_unique.shape}")

    output_file = "train.jsonl"
    df_unique.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII chemical names intact
    )
    print(f"数据已保存到 {output_file}")


# Conversation-turn templates; get_conversations() deep-copies them so the
# originals are never mutated.
human_message = {
    'from': 'human',
    'value': ''
}
gpt_message = {
    'from': 'gpt',
    'value': ''
}

system_prompt = """ You are a specialized tool for predicting molecular properties based on SMILES strings. 
Your core function is to take a molecule's SMILES (Simplified Molecular-Input Line-Entry System) notation as input and output a comprehensive, accurate description of its key properties.
When processing the input SMILES, focus on describing the following properties (as applicable and relevant to the molecule):
- Chemical class or family (e.g., alkane, aromatic compound, steroid, nucleotide)
- Physical properties: melting point range, boiling point range, solubility (in water and common organic solvents), state of matter at room temperature (solid, liquid, gas)
- Chemical reactivity: key functional groups and their typical reactions (e.g., ester hydrolysis, amine protonation, alkene addition)
- Biological activity (if applicable): therapeutic category, target biomolecules (e.g., enzyme inhibitors, receptor agonists), toxicity profile highlights
- Spectroscopic features: characteristic peaks in NMR (¹H, ¹³C), IR, or mass spectrometry
- Other notable properties: chirality, stability under ambient conditions, flammability, hygroscopicity
Ensure your descriptions are concise yet informative, using precise chemical terminology. Avoid speculation; base properties on well-established chemical knowledge. If a property is highly variable or not well-defined for the given molecule, state this clearly. 
Input: SMILES string of a molecule Output: Structured description of the molecule's properties as outlined above """


def get_conversations(SMILES: str, des: str) -> list:
    """Return a two-turn conversation: human supplies SMILES, gpt answers des.

    Args:
        SMILES: the molecule's SMILES string (human turn).
        des: the property description (gpt turn).

    Returns:
        A list of two message dicts (deep copies of the module templates, so
        the shared templates are never mutated).
    """
    conversations = [deepcopy(human_message), deepcopy(gpt_message)]
    conversations[0]['value'] = SMILES
    conversations[1]['value'] = des
    return conversations


def get_fineturn_dataset(train_ratio: float = 0.9):
    """Convert train.jsonl into chat-format train/test JSON files.

    Reads the merged JSONL corpus, wraps each record in a conversation with
    the module-level system prompt, shuffles, and writes a train/test split.

    NOTE: the name keeps the historical "fineturn" spelling ("finetune" was
    intended) because callers reference it.

    Args:
        train_ratio: fraction of records assigned to the train split
            (default 0.9, matching the original behavior).
    """
    dataset_list = []
    with open('/mmu_nlp_ssd/wanghuiyang/myPharmHGT/train.jsonl', 'r', encoding='utf-8') as file:
        # Stream the file (no readlines() materialization); skip blank lines
        # so a trailing newline cannot crash json.loads.
        for line in file:
            if line.strip():
                dataset_list.append(json.loads(line))
    print(len(dataset_list))

    output_list = [
        {
            'conversations': get_conversations(data['SMILES'], data['description']),
            'system': system_prompt,
        }
        for data in dataset_list
    ]

    train_len = int(train_ratio * len(output_list))
    random.shuffle(output_list)
    train_dataset = output_list[:train_len]
    test_dataset = output_list[train_len:]

    # encoding='utf-8' so ensure_ascii=False output round-trips on any platform.
    output_path = '/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/train.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=2, ensure_ascii=False)
    output_path = '/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/test.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(test_dataset, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    # Guard the entry point so importing this module has no side effects.
    get_fineturn_dataset()