|
|
import pandas as pd |
|
|
from datasets import load_dataset |
|
|
import json |
|
|
|
|
|
def get_smi_text():
    """Fetch the PubChem-extended dataset, keep only the SMILES and
    description columns, merge all splits, and dump them to a JSONL file.
    """
    ds = load_dataset("/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/PubChem-extended", revision="main")

    selected_fields = ["SMILES", "description"]

    # One DataFrame per split, restricted to the two columns of interest.
    dfs = [ds[split].select_columns(selected_fields).to_pandas() for split in ds.keys()]

    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    # orient="records" + lines=True == JSON Lines; force_ascii=False keeps
    # any non-ASCII description text human-readable in the output file.
    output_file = "PubChem-extended.jsonl"
    combined_df.to_json(output_file, orient="records", lines=True, force_ascii=False)

    print(f"数据已保存到 {output_file}")
|
|
|
|
|
def get_all_jsonl():
    """Concatenate the three source JSONL corpora, drop duplicate rows,
    and write the merged result to ``train.jsonl``.
    """
    source_paths = (
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/chebi.jsonl",
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/LPM-24-extra-extended.jsonl",
        "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/PubChem-extended.jsonl",
    )

    # Load each corpus in turn, echoing a preview of every frame.
    dfs = []
    for path in source_paths:
        frame = pd.read_json(path, lines=True)
        print(frame.head())
        dfs.append(frame)

    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())

    df_unique = combined_df.drop_duplicates()
    print(f"去重后的DataFrame形状: {df_unique.shape}")

    # force_ascii=False preserves non-ASCII description text verbatim.
    output_file = "train.jsonl"
    df_unique.to_json(output_file, orient="records", lines=True, force_ascii=False)

    print(f"数据已保存到 {output_file}")
|
|
|
|
|
|
|
|
# Conversation-turn templates (ShareGPT layout). The empty 'value' field is
# filled per sample; consumers deep-copy these before mutating.
human_message = {'from': 'human', 'value': ''}

gpt_message = {'from': 'gpt', 'value': ''}
|
|
|
|
|
# System prompt attached to every fine-tuning sample: frames the model as a
# SMILES -> molecular-property-description tool. The string body (including
# its blank lines) is runtime data and is left exactly as authored.
system_prompt = """


You are a specialized tool for predicting molecular properties based on SMILES strings. Your core function is to take a molecule's SMILES (Simplified Molecular-Input Line-Entry System) notation as input and output a comprehensive, accurate description of its key properties.




When processing the input SMILES, focus on describing the following properties (as applicable and relevant to the molecule):


- Chemical class or family (e.g., alkane, aromatic compound, steroid, nucleotide)


- Physical properties: melting point range, boiling point range, solubility (in water and common organic solvents), state of matter at room temperature (solid, liquid, gas)


- Chemical reactivity: key functional groups and their typical reactions (e.g., ester hydrolysis, amine protonation, alkene addition)


- Biological activity (if applicable): therapeutic category, target biomolecules (e.g., enzyme inhibitors, receptor agonists), toxicity profile highlights


- Spectroscopic features: characteristic peaks in NMR (¹H, ¹³C), IR, or mass spectrometry


- Other notable properties: chirality, stability under ambient conditions, flammability, hygroscopicity




Ensure your descriptions are concise yet informative, using precise chemical terminology. Avoid speculation; base properties on well-established chemical knowledge. If a property is highly variable or not well-defined for the given molecule, state this clearly.




Input: SMILES string of a molecule


Output: Structured description of the molecule's properties as outlined above
"""
|
|
|
|
|
from copy import deepcopy |
|
|
|
|
|
def get_conversations(SMILES: str, des: str) -> list:
    """Build a two-turn ShareGPT-style conversation for one molecule.

    Args:
        SMILES: SMILES string, used as the human turn.
        des: Molecule description, used as the assistant (gpt) turn.

    Returns:
        A list of two fresh dicts:
        [{'from': 'human', 'value': SMILES}, {'from': 'gpt', 'value': des}].
    """
    # Construct the turns directly instead of deep-copying the module-level
    # template dicts: identical result, no hidden coupling to mutable
    # globals, and no deepcopy overhead per sample.
    return [
        {'from': 'human', 'value': SMILES},
        {'from': 'gpt', 'value': des},
    ]
|
|
|
|
|
def get_fineturn_dataset():
    """Convert train.jsonl into shuffled ShareGPT-style train/test JSON files.

    Reads one JSON object per line (expects keys ``SMILES`` and
    ``description`` — confirm against the upstream merge step), wraps each in
    a two-turn conversation plus the shared ``system_prompt``, shuffles, and
    writes a 90/10 train/test split.

    NOTE(review): the original "fineturn" spelling is kept because the
    module-level call site depends on this name.
    """
    import random

    dataset_list = []
    # Iterate the file object directly; readlines() would materialize the
    # entire file as a list first for no benefit.
    with open('/mmu_nlp_ssd/wanghuiyang/myPharmHGT/train.jsonl', 'r', encoding='utf-8') as file:
        for line in file:
            dataset_list.append(json.loads(line))
    print(len(dataset_list))

    output_list = []
    for data in dataset_list:
        conversations = get_conversations(data['SMILES'], data['description'])
        output_list.append({'conversations': conversations, 'system': system_prompt})

    # Unseeded shuffle (non-reproducible split) kept as in the original.
    train_len = int(0.9 * len(output_list))
    random.shuffle(output_list)
    train_dataset = output_list[:train_len]
    test_dataset = output_list[train_len:]

    # BUGFIX: encoding='utf-8' is required on these writers — ensure_ascii=False
    # emits raw non-ASCII text, which raises UnicodeEncodeError on platforms
    # whose default locale encoding is not UTF-8.
    output_path = '/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/train.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=2, ensure_ascii=False)

    output_path = '/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/test.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(test_dataset, f, indent=2, ensure_ascii=False)
|
|
|
|
|
get_fineturn_dataset() |