Spaces:

toecm
/

PureVersation

Running

App Files Files Community

PureVersation / src /prepare_training_data.py

toecm

Create prepare_training_data.py

0529047 verified 4 months ago

raw

history blame contribute delete

2.49 kB

	import pandas as pd
	import glob
	import os

	# ⚙️ CONFIGURATION
	# Module-level settings read by create_instruction_prompt() and main().
	# Directory whose top-level "*.csv" files main() reads (no recursion).
	DATASET_DIR = "/app/iuuy_datasets" # Where your dialect CSVs live
	# Path main() writes the single-column ("text") training CSV to; being
	# relative, it resolves against the current working directory.
	OUTPUT_FILE = "train.csv" # The file AutoTrain needs
	# Minimum stripped length, in characters, that both the utterance and its
	# clarification must reach for a row to become a training example.
	MIN_LENGTH = 3 # Filter out tiny, meaningless utterances

	def create_instruction_prompt(row):
	    """Convert one raw dataset row into an Alpaca-style training example.

	    Args:
	        row: Mapping-like record exposing ``.get(key, default)`` (a pandas
	            Series from ``DataFrame.iterrows()`` or a plain dict). Fields
	            read: ``Dialect``, ``Utterance``, ``Clarification``,
	            ``Linguistic_Context`` and ``Tone_Category``.

	    Returns:
	        The formatted prompt string, or ``None`` when the utterance or its
	        clarification is missing or shorter than ``MIN_LENGTH`` characters.
	    """

	    def _clean(key, default=""):
	        # Empty CSV cells arrive as NaN (or None / pd.NA). str() would turn
	        # them into the literal text "nan", which is long enough to pass the
	        # MIN_LENGTH filter and would leak into the training set.
	        value = row.get(key, default)
	        if pd.api.types.is_scalar(value) and pd.isna(value):
	            return default
	        # A whitespace-only cell is as good as missing: use the default.
	        return str(value).strip() or default

	    dialect = _clean("Dialect", "Unknown Dialect")
	    utterance = _clean("Utterance")
	    clarification = _clean("Clarification")
	    context = _clean("Linguistic_Context", "General conversation")
	    tone = _clean("Tone_Category", "Neutral")

	    # 🟢 Skip bad data (rows without a usable utterance/clarification pair)
	    if len(utterance) < MIN_LENGTH or len(clarification) < MIN_LENGTH:
	        return None

	    # 📝 The Prompt Template
	    # This teaches the model: Given a Dialect + Context + Utterance -> Provide Meaning + Tone.
	    # Assembled from explicit lines so that source-code indentation can never
	    # end up inside the prompt text.
	    template_lines = (
	        "### Instruction:",
	        f"You are an expert sociolinguist specializing in {dialect}.",
	        f"Analyze the following utterance found in a {context} setting.",
	        "Provide the Standard English interpretation and the emotional tone.",
	        "",
	        "### Input:",
	        utterance,
	        "",
	        "### Response:",
	        f"Meaning: {clarification}",
	        f"Tone: {tone}",
	        "",  # keeps the trailing newline of the original template
	    )
	    return "\n".join(template_lines)

	def main():
	    """Compile every dialect CSV in ``DATASET_DIR`` into one training file.

	    Each ``*.csv`` directly inside ``DATASET_DIR`` is loaded with pandas and
	    every row is passed through ``create_instruction_prompt``; rows it
	    rejects (returns ``None`` for) are dropped. The surviving prompts are
	    written to ``OUTPUT_FILE`` as a CSV with a single ``text`` column.

	    A file that cannot be read or processed is reported and skipped as a
	    whole, so it never contributes a partial set of rows. When no examples
	    are produced at all, nothing is written and a warning is printed
	    instead of the success banner.
	    """
	    print(f"📂 Scanning {DATASET_DIR} for research data...")

	    # glob() yields files in arbitrary, OS-dependent order; sorting makes
	    # repeated runs produce an identical output file.
	    all_files = sorted(glob.glob(os.path.join(DATASET_DIR, "*.csv")))
	    training_data = []

	    for filename in all_files:
	        try:
	            df = pd.read_csv(filename)
	            print(f" - Processing {os.path.basename(filename)} ({len(df)} rows)...")

	            # Apply the template to every row
	            prompts = [create_instruction_prompt(row) for _, row in df.iterrows()]
	        except Exception as e:
	            # Best effort: one malformed file must not abort the whole run.
	            print(f" ⚠️ Error reading {filename}: {e}")
	            continue

	        # Only reached when the whole file was processed without error.
	        training_data.extend(prompt for prompt in prompts if prompt)

	    if not training_data:
	        # Writing an empty file could clobber a good train.csv from an earlier
	        # run and would give AutoTrain nothing to learn from.
	        print(f"\n⚠️ No usable training examples found in {DATASET_DIR}; {OUTPUT_FILE} was not written.")
	        return

	    # 🟢 Create the final Training DataFrame
	    train_df = pd.DataFrame(training_data, columns=["text"])

	    # Save to root (so AutoTrain can find it easily)
	    train_df.to_csv(OUTPUT_FILE, index=False)

	    print("\n✅ SUCCESS!")
	    print(f"📊 compiled {len(train_df)} training examples.")
	    print(f"💾 Saved to: {os.path.abspath(OUTPUT_FILE)}")
	    print("🚀 You can now run 'autotrain llm' using this file.")

	# Run the pipeline only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()