PureVersation / src /prepare_training_data.py
toecm's picture
Create prepare_training_data.py
0529047 verified
import pandas as pd
import glob
import os
# ⚙️ CONFIGURATION
# Module-level settings read by main() and create_instruction_prompt().

# Directory that main() scans (non-recursively) for "*.csv" dialect datasets.
DATASET_DIR = "/app/iuuy_datasets"
# Path of the combined training CSV that main() writes; the file AutoTrain needs.
OUTPUT_FILE = "train.csv"
# Minimum character count for both the utterance and its clarification;
# rows where either is shorter are skipped as tiny, meaningless utterances.
MIN_LENGTH = 3
def _clean_field(row, key, default=""):
    """Return ``row[key]`` as a stripped string, or ``default`` if missing/blank."""
    value = row.get(key, default)
    # Empty CSV cells arrive from pandas as NaN (and plain dicts may hold None).
    # Calling str() on those yields the literal text "nan"/"None", which is
    # long enough to slip past the length filter and pollute the prompts.
    if pd.isna(value):
        return default
    return str(value).strip() or default


def create_instruction_prompt(row, min_length=None):
    """
    Convert a raw data row into an Alpaca-style training example for the LLM.

    Args:
        row: Mapping-like record exposing ``.get(key, default)`` (a pandas
            Series from ``DataFrame.iterrows()`` or a plain dict). Columns
            read: ``Dialect``, ``Utterance``, ``Clarification``,
            ``Linguistic_Context`` and ``Tone_Category``.
        min_length: Minimum number of characters required for both the
            utterance and the clarification. Defaults to the module-level
            ``MIN_LENGTH`` when omitted.

    Returns:
        The formatted prompt string, or ``None`` when the utterance or the
        clarification is missing (absent column, None, NaN) or shorter than
        ``min_length``. Missing or blank dialect, context and tone values
        fall back to 'Unknown Dialect', 'General conversation' and 'Neutral'.
    """
    if min_length is None:
        min_length = MIN_LENGTH

    utterance = _clean_field(row, 'Utterance')
    clarification = _clean_field(row, 'Clarification')

    # 🟢 Skip bad data (empty or too-short rows) before doing any more work.
    if len(utterance) < min_length or len(clarification) < min_length:
        return None

    dialect = _clean_field(row, 'Dialect', 'Unknown Dialect')
    context = _clean_field(row, 'Linguistic_Context', 'General conversation')
    tone = _clean_field(row, 'Tone_Category', 'Neutral')

    # 📝 The Prompt Template
    # Teaches the model: Dialect + Context + Utterance -> Meaning + Tone.
    # The template lines sit at column 0 because everything inside the
    # triple-quoted string, leading whitespace included, is emitted verbatim.
    return f"""### Instruction:
You are an expert sociolinguist specializing in {dialect}.
Analyze the following utterance found in a {context} setting.
Provide the Standard English interpretation and the emotional tone.
### Input:
{utterance}
### Response:
Meaning: {clarification}
Tone: {tone}
"""
def main():
    """
    Compile every dialect CSV in DATASET_DIR into one AutoTrain-ready CSV.

    Each "*.csv" file directly inside DATASET_DIR is read with pandas, every
    row is passed through create_instruction_prompt(), and the resulting
    prompts are written to OUTPUT_FILE as a single "text" column.

    Files that fail to load or process are reported and skipped so that one
    bad dataset does not abort the whole run. When no CSV files are found, or
    no row yields a usable prompt, a warning is printed and OUTPUT_FILE is
    left untouched instead of reporting success for an empty training set.
    """
    print(f"📂 Scanning {DATASET_DIR} for research data...")

    # glob returns matches in arbitrary order; sort so the row order of the
    # output file is reproducible from run to run.
    all_files = sorted(glob.glob(os.path.join(DATASET_DIR, "*.csv")))
    if not all_files:
        print(f" ⚠️ No CSV files found in {DATASET_DIR}; nothing was written.")
        return

    training_data = []
    for filename in all_files:
        try:
            df = pd.read_csv(filename)
            print(f" - Processing {os.path.basename(filename)} ({len(df)} rows)...")
            # Apply the template to every row; rejected rows come back as None.
            for _, row in df.iterrows():
                prompt = create_instruction_prompt(row)
                if prompt:
                    training_data.append(prompt)
        except Exception as e:
            # Deliberately best-effort: report the problem file and carry on.
            print(f" ⚠️ Error reading {filename}: {e}")

    if not training_data:
        print(" ⚠️ No usable training examples were produced; nothing was written.")
        return

    # 🟢 Create the final Training DataFrame
    train_df = pd.DataFrame(training_data, columns=["text"])

    # Save to root (so AutoTrain can find it easily)
    train_df.to_csv(OUTPUT_FILE, index=False)

    print("\n✅ SUCCESS!")
    print(f"📊 compiled {len(train_df)} training examples.")
    print(f"💾 Saved to: {os.path.abspath(OUTPUT_FILE)}")
    print("🚀 You can now run 'autotrain llm' using this file.")
# Run the data-preparation pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()