"""Compile per-dialect CSV datasets into a single AutoTrain ``train.csv``.

Every ``*.csv`` file found in ``DATASET_DIR`` is read, each usable row is
rendered as an Alpaca-style instruction prompt, and all prompts are written
to ``OUTPUT_FILE`` as a one-column (``text``) CSV.
"""
import glob
import os
from typing import Mapping, Optional, Union

import pandas as pd

# ⚙️ CONFIGURATION
DATASET_DIR = "/app/iuuy_datasets"  # Where your dialect CSVs live
OUTPUT_FILE = "train.csv"  # The file AutoTrain needs
MIN_LENGTH = 3  # Filter out tiny, meaningless utterances


def _clean_field(row, column: str, default: str = "") -> str:
    """Return ``row[column]`` as stripped text, or ``default`` if it is missing.

    A value counts as missing when the key is absent, when it is ``None``/NaN
    (how pandas represents an empty CSV cell), or when it is blank after
    stripping. Without the NaN check, ``str(float("nan"))`` would yield the
    literal text ``"nan"``.
    """
    value = row.get(column, default)
    try:
        is_missing = value is None or bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna() returns an array for list-like values, which has no single
        # truth value; such a value is present, just not a scalar.
        is_missing = False
    if is_missing:
        return default
    return str(value).strip() or default


def create_instruction_prompt(
    row: Union[pd.Series, Mapping[str, object]],
) -> Optional[str]:
    """Convert a raw data row into a training example for the LLM.

    Format: Alpaca-style instruction.

    Args:
        row: Any object with a dict-like ``.get`` (a ``pandas.Series`` or a
            plain ``dict``). Columns read: ``Dialect``, ``Utterance``,
            ``Clarification``, ``Linguistic_Context`` and ``Tone_Category``.

    Returns:
        The formatted prompt text, or ``None`` when the utterance or its
        clarification is missing or shorter than ``MIN_LENGTH`` characters.
    """
    dialect = _clean_field(row, "Dialect", "Unknown Dialect")
    utterance = _clean_field(row, "Utterance")
    clarification = _clean_field(row, "Clarification")
    context = _clean_field(row, "Linguistic_Context", "General conversation")
    tone = _clean_field(row, "Tone_Category", "Neutral")

    # 🟢 Skip bad data (empty rows)
    if len(utterance) < MIN_LENGTH or len(clarification) < MIN_LENGTH:
        return None

    # 📝 The Prompt Template
    # This teaches the model:
    # Given a Dialect + Context + Utterance -> Provide Meaning + Tone.
    return f"""### Instruction:
You are an expert sociolinguist specializing in {dialect}.
Analyze the following utterance found in a {context} setting.
Provide the Standard English interpretation and the emotional tone.

### Input:
{utterance}

### Response:
Meaning: {clarification}
Tone: {tone}
"""


def main() -> None:
    """Build ``OUTPUT_FILE`` from every CSV file in ``DATASET_DIR``.

    Files are processed in sorted order so the output is reproducible. A file
    that cannot be read or processed is reported and skipped. If no usable
    example is found, a warning is printed and nothing is written, so an
    existing output file is not replaced by an empty one.
    """
    print(f"📂 Scanning {DATASET_DIR} for research data...")

    output_path = os.path.abspath(OUTPUT_FILE)
    # glob returns files in arbitrary order; sort for a deterministic dataset.
    all_files = sorted(glob.glob(os.path.join(DATASET_DIR, "*.csv")))

    training_data = []
    for filename in all_files:
        # Never feed a previously generated output file back in as input.
        if os.path.abspath(filename) == output_path:
            continue
        try:
            # dtype=str keeps cells as written ("007" stays "007");
            # keep_default_na=False keeps words such as "NA" or "None" as
            # text and turns empty cells into "" instead of NaN.
            df = pd.read_csv(filename, dtype=str, keep_default_na=False)
            print(f"   - Processing {os.path.basename(filename)} ({len(df)} rows)...")

            # Apply the template to every row, dropping rejected rows.
            prompts = (
                create_instruction_prompt(record)
                for record in df.to_dict(orient="records")
            )
            training_data.extend(prompt for prompt in prompts if prompt)
        except Exception as e:
            # Best-effort by design: one bad file must not abort the run.
            print(f"   ⚠️ Error reading {filename}: {e}")

    if not training_data:
        print(
            f"⚠️ No usable training examples found in {DATASET_DIR}; "
            f"{OUTPUT_FILE} was not written."
        )
        return

    # 🟢 Create the final Training DataFrame
    train_df = pd.DataFrame(training_data, columns=["text"])

    # Save to root (so AutoTrain can find it easily)
    train_df.to_csv(OUTPUT_FILE, index=False)

    print("\n✅ SUCCESS!")
    print(f"📊 compiled {len(train_df)} training examples.")
    print(f"💾 Saved to: {os.path.abspath(OUTPUT_FILE)}")
    print("🚀 You can now run 'autotrain llm' using this file.")


if __name__ == "__main__":
    main()