Spaces:
Running
Running
| import pandas as pd | |
| import glob | |
| import os | |
| # ⚙️ CONFIGURATION | |
| DATASET_DIR = "/app/iuuy_datasets" # Where your dialect CSVs live | |
| OUTPUT_FILE = "train.csv" # The file AutoTrain needs | |
| MIN_LENGTH = 3 # Filter out tiny, meaningless utterances | |
| def create_instruction_prompt(row): | |
| """ | |
| Converts a raw data row into a rich training example for the LLM. | |
| Format: Alpaca-Style Instruction | |
| """ | |
| dialect = str(row.get('Dialect', 'Unknown Dialect')).strip() | |
| utterance = str(row.get('Utterance', '')).strip() | |
| clarification = str(row.get('Clarification', '')).strip() | |
| context = str(row.get('Linguistic_Context', 'General conversation')).strip() | |
| tone = str(row.get('Tone_Category', 'Neutral')).strip() | |
| # 🟢 Skip bad data (empty rows) | |
| if len(utterance) < MIN_LENGTH or len(clarification) < MIN_LENGTH: | |
| return None | |
| # 📝 The Prompt Template | |
| # This teaches the model: Given a Dialect + Context + Utterance -> Provide Meaning + Tone. | |
| text = f"""### Instruction: | |
| You are an expert sociolinguist specializing in {dialect}. | |
| Analyze the following utterance found in a {context} setting. | |
| Provide the Standard English interpretation and the emotional tone. | |
| ### Input: | |
| {utterance} | |
| ### Response: | |
| Meaning: {clarification} | |
| Tone: {tone} | |
| """ | |
| return text | |
| def main(): | |
| print(f"📂 Scanning {DATASET_DIR} for research data...") | |
| all_files = glob.glob(os.path.join(DATASET_DIR, "*.csv")) | |
| training_data = [] | |
| for filename in all_files: | |
| try: | |
| df = pd.read_csv(filename) | |
| print(f" - Processing {os.path.basename(filename)} ({len(df)} rows)...") | |
| # Apply the template to every row | |
| for _, row in df.iterrows(): | |
| prompt = create_instruction_prompt(row) | |
| if prompt: | |
| training_data.append(prompt) | |
| except Exception as e: | |
| print(f" ⚠️ Error reading {filename}: {e}") | |
| # 🟢 Create the final Training DataFrame | |
| train_df = pd.DataFrame(training_data, columns=["text"]) | |
| # Save to root (so AutoTrain can find it easily) | |
| train_df.to_csv(OUTPUT_FILE, index=False) | |
| print("\n✅ SUCCESS!") | |
| print(f"📊 compiled {len(train_df)} training examples.") | |
| print(f"💾 Saved to: {os.path.abspath(OUTPUT_FILE)}") | |
| print("🚀 You can now run 'autotrain llm' using this file.") | |
| if __name__ == "__main__": | |
| main() |