# Spaces: toecm / PureVersation (Running)
# File size: 2,493 Bytes
import pandas as pd
import glob
import os

# ⚙️ SETTINGS
# Directory that holds the dialect CSV files to be compiled.
DATASET_DIR: str = "/app/iuuy_datasets"
# Name of the CSV file written for AutoTrain.
OUTPUT_FILE: str = "train.csv"
# Minimum character count an utterance/clarification needs to be kept.
MIN_LENGTH: int = 3

def _clean_field(row, key, default=''):
    """Return ``row[key]`` as a stripped string, or *default* when it is absent.

    A value counts as absent when the key is not in *row*, when it is a
    missing-value marker (``None``, NaN, ``pd.NA`` ...), or when it is blank
    after stripping surrounding whitespace.
    """
    value = row.get(key)
    # Detect missing markers BEFORE stringifying: str(float('nan')) is the
    # three-character string "nan", which would otherwise look like real text.
    # The is_scalar guard keeps pd.isna from returning an array for list-likes.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return default
    return str(value).strip() or default


def create_instruction_prompt(row):
    """
    Converts a raw data row into a rich training example for the LLM.
    Format: Alpaca-Style Instruction

    Args:
        row: Any mapping-like object exposing ``.get(key, default)`` (for
            example a pandas Series or a dict). The keys read are
            ``Dialect``, ``Utterance``, ``Clarification``,
            ``Linguistic_Context`` and ``Tone_Category``.

    Returns:
        The formatted prompt text, or ``None`` when the utterance or the
        clarification is missing or shorter than ``MIN_LENGTH`` characters.
    """
    dialect = _clean_field(row, 'Dialect', 'Unknown Dialect')
    utterance = _clean_field(row, 'Utterance')
    clarification = _clean_field(row, 'Clarification')
    context = _clean_field(row, 'Linguistic_Context', 'General conversation')
    tone = _clean_field(row, 'Tone_Category', 'Neutral')

    # 🟢 Skip bad data (empty rows). Missing cells are already mapped to ''
    # above, so they are rejected here instead of slipping through as "nan".
    if len(utterance) < MIN_LENGTH or len(clarification) < MIN_LENGTH:
        return None

    # 📝 The Prompt Template
    # This teaches the model: Given a Dialect + Context + Utterance -> Provide Meaning + Tone.
    # Written with explicit newlines so the trailing space that follows the
    # dialect sentence stays visible and is not lost to whitespace trimming.
    return (
        "### Instruction:\n"
        f"You are an expert sociolinguist specializing in {dialect}. \n"
        f"Analyze the following utterance found in a {context} setting.\n"
        "Provide the Standard English interpretation and the emotional tone.\n"
        "\n"
        "### Input:\n"
        f"{utterance}\n"
        "\n"
        "### Response:\n"
        f"Meaning: {clarification}\n"
        f"Tone: {tone}\n"
    )

def main():
    """Compile every CSV in ``DATASET_DIR`` into a single training CSV.

    Each ``*.csv`` file directly inside ``DATASET_DIR`` is read with pandas,
    every row is passed to ``create_instruction_prompt``, and the prompts it
    returns are collected into a one-column (``text``) DataFrame that is
    written to ``OUTPUT_FILE``.

    A file that fails to load or process is reported and skipped so that one
    bad file does not abort the run. When no CSV files are found, or no row
    yields a prompt, a warning is printed and ``OUTPUT_FILE`` is left
    untouched rather than being overwritten with an empty dataset.
    """
    print(f"📂 Scanning {DATASET_DIR} for research data...")

    # glob yields paths in arbitrary order; sorting makes the row order of the
    # output file reproducible from run to run.
    all_files = sorted(glob.glob(os.path.join(DATASET_DIR, "*.csv")))
    if not all_files:
        print(f"   ⚠️ No CSV files found in {DATASET_DIR}; nothing to compile.")
        return

    training_data = []

    for filename in all_files:
        # Best-effort per file: report the problem and move on to the next one.
        try:
            df = pd.read_csv(filename)
            print(f"   - Processing {os.path.basename(filename)} ({len(df)} rows)...")

            # Apply the template to every row; rows rejected by the template
            # function come back as None and are dropped.
            for _, row in df.iterrows():
                prompt = create_instruction_prompt(row)
                if prompt:
                    training_data.append(prompt)

        except Exception as e:
            print(f"   ⚠️ Error reading {filename}: {e}")

    if not training_data:
        # Do not claim success or clobber an existing file with an empty one.
        print(f"\n⚠️ No usable training examples were produced; {OUTPUT_FILE} was not written.")
        return

    # 🟢 Create the final Training DataFrame
    train_df = pd.DataFrame(training_data, columns=["text"])

    # Save to root (so AutoTrain can find it easily)
    train_df.to_csv(OUTPUT_FILE, index=False)

    print("\n✅ SUCCESS!")
    print(f"📊 compiled {len(train_df)} training examples.")
    print(f"💾 Saved to: {os.path.abspath(OUTPUT_FILE)}")
    print("🚀 You can now run 'autotrain llm' using this file.")

# Script entry point: run the compilation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()