# 0814output/generate_finetune_dataset.py
# Uploaded with huggingface_hub by GSheep (commit 0ab9189, verified)
import pandas as pd
from datasets import load_dataset
import json
def get_smi_text(
    dataset_path="/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/PubChem-extended",
    output_file="PubChem-extended.jsonl",
):
    """Export the SMILES/description columns of a dataset to a JSONL file.

    Loads every split of the dataset, keeps only the ``SMILES`` and
    ``description`` columns, concatenates the splits and writes one JSON
    object per line to ``output_file``.

    Args:
        dataset_path: Local path (or hub id) passed to ``load_dataset``.
        output_file: Destination JSON Lines file.
    """
    # 1. Load the dataset (all splits).
    ds = load_dataset(dataset_path, revision="main")
    # Columns to keep (adjust if the dataset schema differs).
    selected_fields = ["SMILES", "description"]
    # Convert every split (train/validation/...) to a DataFrame.
    dfs = [ds[split].select_columns(selected_fields).to_pandas() for split in ds.keys()]
    # 2. Concatenate the splits; ignore_index=True resets the row index.
    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())
    # 3. Save as JSON Lines.
    combined_df.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII characters (e.g. chemical names)
    )
    print(f"数据已保存到 {output_file}")
def get_all_jsonl(input_files=None, output_file="train.jsonl"):
    """Merge several JSONL datasets, drop duplicate rows, and write one file.

    Args:
        input_files: Iterable of JSONL paths to merge. Defaults to the three
            project source files (chebi, LPM-24-extra-extended, PubChem-extended).
        output_file: Destination JSON Lines file.
    """
    if input_files is None:
        # Original hard-coded sources, kept as the default for compatibility.
        input_files = [
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/chebi.jsonl",
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/LPM-24-extra-extended.jsonl",
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/PubChem-extended.jsonl",
        ]
    # Read each source file; the original repeated this block three times.
    dfs = []
    for path in input_files:
        df = pd.read_json(path, lines=True)
        print(df.head())
        dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)  # ignore_index resets the row index
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())
    # Drop exact duplicate rows (first occurrence wins).
    df_unique = combined_df.drop_duplicates()
    print(f"去重后的DataFrame形状: {df_unique.shape}")
    df_unique.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII characters (e.g. chemical names)
    )
    print(f"数据已保存到 {output_file}")
# Message templates for the two chat roles in the ShareGPT-style format.
# get_conversations() deep-copies these and fills in 'value', so the
# templates themselves are never mutated.
human_message = {
    'from': 'human',
    'value': ''
}
gpt_message = {
    'from': 'gpt',
    'value': ''
}
# System prompt attached to every fine-tuning sample: instructs the model to
# describe molecular properties given a SMILES string.
system_prompt = """
You are a specialized tool for predicting molecular properties based on SMILES strings. Your core function is to take a molecule's SMILES (Simplified Molecular-Input Line-Entry System) notation as input and output a comprehensive, accurate description of its key properties.
When processing the input SMILES, focus on describing the following properties (as applicable and relevant to the molecule):
- Chemical class or family (e.g., alkane, aromatic compound, steroid, nucleotide)
- Physical properties: melting point range, boiling point range, solubility (in water and common organic solvents), state of matter at room temperature (solid, liquid, gas)
- Chemical reactivity: key functional groups and their typical reactions (e.g., ester hydrolysis, amine protonation, alkene addition)
- Biological activity (if applicable): therapeutic category, target biomolecules (e.g., enzyme inhibitors, receptor agonists), toxicity profile highlights
- Spectroscopic features: characteristic peaks in NMR (¹H, ¹³C), IR, or mass spectrometry
- Other notable properties: chirality, stability under ambient conditions, flammability, hygroscopicity
Ensure your descriptions are concise yet informative, using precise chemical terminology. Avoid speculation; base properties on well-established chemical knowledge. If a property is highly variable or not well-defined for the given molecule, state this clearly.
Input: SMILES string of a molecule
Output: Structured description of the molecule's properties as outlined above
"""
from copy import deepcopy
def get_conversations(SMILES: str, des: str):
    """Build a two-turn chat: the human sends the SMILES, gpt answers with des.

    Returns a list of two ShareGPT-style message dicts.
    """
    user_turn = {'from': 'human', 'value': SMILES}
    model_turn = {'from': 'gpt', 'value': des}
    return [user_turn, model_turn]
def get_fineturn_dataset(
    input_path='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/train.jsonl',
    train_path='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/train.json',
    test_path='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/test.json',
    train_ratio=0.9,
    seed=None,
):
    """Convert the merged SMILES/description JSONL into chat-format SFT files.

    Reads one JSON object per line from ``input_path``, wraps each record as a
    two-turn conversation plus the shared ``system_prompt``, shuffles, and
    writes a ``train_ratio`` / ``1 - train_ratio`` split to ``train_path`` and
    ``test_path`` as pretty-printed JSON arrays.

    NOTE: the name keeps the original spelling ("fineturn") so existing
    callers are not broken.

    Args:
        input_path: Source JSONL with 'SMILES' and 'description' fields.
        train_path: Output path for the training split.
        test_path: Output path for the held-out split.
        train_ratio: Fraction of samples placed in the training split.
        seed: Optional RNG seed for a reproducible shuffle (default: random).
    """
    import random
    dataset_list = []
    with open(input_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of readlines() so the raw file is not
        # duplicated in memory as a list of lines.
        for line in file:
            if line.strip():  # tolerate trailing blank lines
                dataset_list.append(json.loads(line))
    print(len(dataset_list))
    output_list = [
        {'conversations': get_conversations(data['SMILES'], data['description']),
         'system': system_prompt}
        for data in dataset_list
    ]
    # Shuffle before splitting so train/test are drawn from the same mix.
    random.Random(seed).shuffle(output_list)
    train_len = int(train_ratio * len(output_list))
    train_dataset = output_list[:train_len]
    test_dataset = output_list[train_len:]
    # encoding='utf-8' is required here: ensure_ascii=False emits raw Unicode,
    # which can raise under a non-UTF-8 default locale.
    with open(train_path, 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=2, ensure_ascii=False)
    with open(test_path, 'w', encoding='utf-8') as f:
        json.dump(test_dataset, f, indent=2, ensure_ascii=False)
get_fineturn_dataset()