# 0814output/generate_finetune_dataset.py
# Uploaded with huggingface_hub by GSheep (commit 0ab9189, verified)
import pandas as pd
from datasets import load_dataset
import json
def get_smi_text(
    dataset_path="/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/PubChem-extended",
    output_file="PubChem-extended.jsonl",
):
    """Export the SMILES/description columns of a dataset to a JSONL file.

    Loads every split of the dataset, keeps only the ``SMILES`` and
    ``description`` columns, concatenates the splits and writes one JSON
    object per line to ``output_file``.

    Args:
        dataset_path: Local path (or hub id) passed to ``load_dataset``.
        output_file: Destination JSON Lines file.
    """
    # 1. Load the dataset (all splits).
    ds = load_dataset(dataset_path, revision="main")
    # Columns to keep (adjust if the dataset schema differs).
    selected_fields = ["SMILES", "description"]
    # Convert every split (train/validation/...) to a DataFrame.
    dfs = [ds[split].select_columns(selected_fields).to_pandas() for split in ds.keys()]
    # 2. Concatenate the splits; ignore_index=True resets the row index.
    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())
    # 3. Save as JSON Lines.
    combined_df.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII characters (e.g. chemical names)
    )
    print(f"数据已保存到 {output_file}")
def get_all_jsonl(input_files=None, output_file="train.jsonl"):
    """Merge several JSONL datasets, drop duplicate rows, and write one file.

    Args:
        input_files: Iterable of JSONL paths to merge. Defaults to the three
            project source files (chebi, LPM-24-extra-extended, PubChem-extended).
        output_file: Destination JSON Lines file.
    """
    if input_files is None:
        # Original hard-coded sources, kept as the default for compatibility.
        input_files = [
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/chebi.jsonl",
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/LPM-24-extra-extended.jsonl",
            "/mmu_nlp_ssd/wanghuiyang/myPharmHGT/PubChem-extended.jsonl",
        ]
    # Read each source file; the original repeated this block three times.
    dfs = []
    for path in input_files:
        df = pd.read_json(path, lines=True)
        print(df.head())
        dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)  # ignore_index resets the row index
    print(f"拼接后的DataFrame形状: {combined_df.shape}")
    print(combined_df.head())
    # Drop exact duplicate rows (first occurrence wins).
    df_unique = combined_df.drop_duplicates()
    print(f"去重后的DataFrame形状: {df_unique.shape}")
    df_unique.to_json(
        output_file,
        orient="records",   # one JSON object per record
        lines=True,         # JSON Lines: one object per line
        force_ascii=False,  # keep non-ASCII characters (e.g. chemical names)
    )
    print(f"数据已保存到 {output_file}")
# Message templates for the two chat roles in the ShareGPT-style format.
# get_conversations() deep-copies these and fills in 'value', so the
# templates themselves are never mutated.
human_message = {
    'from': 'human',
    'value': ''
}
gpt_message = {
    'from': 'gpt',
    'value': ''
}
# System prompt attached to every fine-tuning sample: instructs the model to
# describe molecular properties given a SMILES string.
system_prompt = """
You are a specialized tool for predicting molecular properties based on SMILES strings. Your core function is to take a molecule's SMILES (Simplified Molecular-Input Line-Entry System) notation as input and output a comprehensive, accurate description of its key properties.
When processing the input SMILES, focus on describing the following properties (as applicable and relevant to the molecule):
- Chemical class or family (e.g., alkane, aromatic compound, steroid, nucleotide)
- Physical properties: melting point range, boiling point range, solubility (in water and common organic solvents), state of matter at room temperature (solid, liquid, gas)
- Chemical reactivity: key functional groups and their typical reactions (e.g., ester hydrolysis, amine protonation, alkene addition)
- Biological activity (if applicable): therapeutic category, target biomolecules (e.g., enzyme inhibitors, receptor agonists), toxicity profile highlights
- Spectroscopic features: characteristic peaks in NMR (¹H, ¹³C), IR, or mass spectrometry
- Other notable properties: chirality, stability under ambient conditions, flammability, hygroscopicity
Ensure your descriptions are concise yet informative, using precise chemical terminology. Avoid speculation; base properties on well-established chemical knowledge. If a property is highly variable or not well-defined for the given molecule, state this clearly.
Input: SMILES string of a molecule
Output: Structured description of the molecule's properties as outlined above
"""
from copy import deepcopy
def get_conversations(SMILES: str, des: str):
    """Build a two-turn chat: the human sends the SMILES, gpt answers with des.

    Returns a list of two ShareGPT-style message dicts.
    """
    user_turn = {'from': 'human', 'value': SMILES}
    model_turn = {'from': 'gpt', 'value': des}
    return [user_turn, model_turn]
def get_fineturn_dataset(
    input_path='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/train.jsonl',
    train_path='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/train.json',
    test_path='/mmu_nlp_ssd/wanghuiyang/myPharmHGT/dataset/generated/test.json',
    train_ratio=0.9,
    seed=None,
):
    """Convert the merged SMILES/description JSONL into chat-format SFT files.

    Reads one JSON object per line from ``input_path``, wraps each record as a
    two-turn conversation plus the shared ``system_prompt``, shuffles, and
    writes a ``train_ratio`` / ``1 - train_ratio`` split to ``train_path`` and
    ``test_path`` as pretty-printed JSON arrays.

    NOTE: the name keeps the original spelling ("fineturn") so existing
    callers are not broken.

    Args:
        input_path: Source JSONL with 'SMILES' and 'description' fields.
        train_path: Output path for the training split.
        test_path: Output path for the held-out split.
        train_ratio: Fraction of samples placed in the training split.
        seed: Optional RNG seed for a reproducible shuffle (default: random).
    """
    import random
    dataset_list = []
    with open(input_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of readlines() so the raw file is not
        # duplicated in memory as a list of lines.
        for line in file:
            if line.strip():  # tolerate trailing blank lines
                dataset_list.append(json.loads(line))
    print(len(dataset_list))
    output_list = [
        {'conversations': get_conversations(data['SMILES'], data['description']),
         'system': system_prompt}
        for data in dataset_list
    ]
    # Shuffle before splitting so train/test are drawn from the same mix.
    random.Random(seed).shuffle(output_list)
    train_len = int(train_ratio * len(output_list))
    train_dataset = output_list[:train_len]
    test_dataset = output_list[train_len:]
    # encoding='utf-8' is required here: ensure_ascii=False emits raw Unicode,
    # which can raise under a non-UTF-8 default locale.
    with open(train_path, 'w', encoding='utf-8') as f:
        json.dump(train_dataset, f, indent=2, ensure_ascii=False)
    with open(test_path, 'w', encoding='utf-8') as f:
        json.dump(test_dataset, f, indent=2, ensure_ascii=False)
get_fineturn_dataset()