Spaces:

Kahrhoff
/

Model-Training-V2

Sleeping

App Files Files Community

Kahrhoff committed on Oct 31, 2025

Commit

6f5c468

verified ·

1 Parent(s): 137c491

Update app.py

Browse files

Files changed (1) hide show

app.py +282 -282

app.py CHANGED Viewed

@@ -1,283 +1,283 @@
-#!/usr/bin/env python3
-"""
-OpenFinancial Chatbot - Hugging Face Space Trainer
-==================================================
-This script is designed to run directly in a Hugging Face Space.
-Upload this file along with your training data to a HF Space and it will:
-1. Load your training data automatically
-2. Train the model using available hardware (GPU/CPU)
-3. Save the trained model to the space's file system
-4. Provide a simple interface to monitor progress
-Instructions:
-1. Create a new HF Space (Gradio SDK)
-2. Upload this file as app.py
-3. Upload your training CSV files to the space
-4. Open the space and click 'Start Training' to begin training
-"""
-import os
-import json
-import time
-import pandas as pd
-from datasets import Dataset
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    Trainer,
-    TrainingArguments,
-    DataCollatorForLanguageModeling
-)
-import torch
-from huggingface_hub import login
-import gradio as gr
-# Configuration
-# Model identifier passed to from_pretrained() for both tokenizer and model.
-BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-# Directory the trained model and tokenizer are saved into.
-OUTPUT_MODEL_DIR = "./trained_model"
-# Preferred CSV file names; find_training_data() checks them in list order.
-TRAINING_DATA_FILES = ["trainingData.csv", "training_data.csv", "data.csv"]  # Try multiple names
-def find_training_data():
-    """Find the training CSV in the current working directory.
-
-    The names in TRAINING_DATA_FILES are tried first, in list order. If none
-    exists, the first ``.csv`` entry returned by os.listdir('.') is used.
-
-    Returns:
-        The chosen file name, or None when no CSV file is found.
-    """
-    print("🔍 Looking for training data files...")
-    # Check for CSV files
-    for filename in TRAINING_DATA_FILES:
-        if os.path.exists(filename):
-            print(f"✅ Found training data: {filename}")
-            return filename
-    # Check all CSV files in current directory
-    # NOTE(review): os.listdir() order is arbitrary, so with several CSV
-    # files the one picked below may differ between runs.
-    csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
-    if csv_files:
-        print(f"✅ Found CSV files: {csv_files}")
-        return csv_files[0]  # Use the first one
-    print("❌ No training data found. Please upload a CSV file with 'Question' and 'Answer' columns.")
-    return None
-def load_training_data(filename):
-    """Load a question/answer CSV and format each row as one training text.
-
-    Args:
-        filename: Path of the CSV file to read.
-
-    Returns:
-        A list of ``{"text": ...}`` dicts, one per usable row, or None if
-        reading or processing the file raised an exception (including the
-        case where no question or answer column is found).
-    """
-    print(f"📊 Loading training data from {filename}...")
-    try:
-        # Read CSV file
-        df = pd.read_csv(filename)
-        print(f"Raw data shape: {df.shape}")
-        # Check for required columns (flexible naming)
-        # A column qualifies when its lower-cased name contains one of the keywords.
-        question_cols = [col for col in df.columns if 'question' in col.lower() or 'prompt' in col.lower() or 'input' in col.lower()]
-        answer_cols = [col for col in df.columns if 'answer' in col.lower() or 'response' in col.lower() or 'output' in col.lower()]
-        if not question_cols or not answer_cols:
-            print(f"Available columns: {list(df.columns)}")
-            raise ValueError("Could not find Question/Answer columns")
-        # Use the first matching column of each kind
-        question_col = question_cols[0]
-        answer_col = answer_cols[0]
-        print(f"Using columns: {question_col} -> {answer_col}")
-        # Create training format
-        training_data = []
-        for _, row in df.iterrows():
-            question = str(row[question_col]).strip()
-            answer = str(row[answer_col]).strip()
-            # Skip rows where either side is empty or stringifies to 'nan'
-            if question and answer and question != 'nan' and answer != 'nan':
-                # Format as conversation
-                text = f"### Question: {question}\n### Answer: {answer}<|endoftext|>"
-                training_data.append({"text": text})
-        print(f"✅ Processed {len(training_data)} valid training examples")
-        return training_data
-    except Exception as e:
-        # Any failure is reported and signalled to the caller as None
-        print(f"❌ Error loading data: {e}")
-        return None
-def train_model(training_data):
-    """Train BASE_MODEL on the provided examples and save the result.
-
-    Args:
-        training_data: List of dicts with a "text" key, one per example.
-
-    Returns:
-        A status message: a success summary with the training duration, or
-        "❌ Training failed: ..." if training or saving raised an exception.
-
-    Side effects:
-        Saves the model and tokenizer to OUTPUT_MODEL_DIR and writes either
-        training_complete.txt or training_error.txt to the working directory.
-    """
-    print("🚀 Starting model training...")
-    # Check hardware
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"💻 Using device: {device}")
-    if torch.cuda.is_available():
-        print(f"🔥 GPU: {torch.cuda.get_device_name(0)}")
-    # Create dataset
-    dataset = Dataset.from_list(training_data)
-    print(f"📊 Dataset size: {len(dataset)} examples")
-    # Load tokenizer and model
-    print("🔧 Loading model and tokenizer...")
-    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-    # Fall back to the EOS token when the tokenizer defines no pad token
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    # NOTE(review): on GPU the weights are loaded in float16 while fp16=True
-    # is also set below; confirm this combination trains without a
-    # gradient-scaler error.
-    model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto" if torch.cuda.is_available() else None
-    )
-    # Tokenize dataset
-    print("🔄 Tokenizing dataset...")
-    def tokenize_function(examples):
-        # Truncate to 512 tokens; no padding is applied at this stage
-        return tokenizer(
-            examples["text"],
-            truncation=True,
-            padding=False,
-            max_length=512
-        )
-    tokenized_dataset = dataset.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["text"]
-    )
-    # Training arguments
-    # Smaller per-device batches with more accumulation steps on CPU
-    batch_size = 4 if torch.cuda.is_available() else 2
-    gradient_steps = 4 if torch.cuda.is_available() else 8
-    training_args = TrainingArguments(
-        output_dir="./results",
-        num_train_epochs=3,
-        per_device_train_batch_size=batch_size,
-        gradient_accumulation_steps=gradient_steps,
-        warmup_steps=50,
-        learning_rate=2e-5,
-        logging_steps=10,
-        save_steps=500,
-        save_total_limit=2,
-        remove_unused_columns=False,
-        dataloader_num_workers=0,  # Avoid multiprocessing issues
-        fp16=torch.cuda.is_available(),
-        # NOTE(review): confirm that None really disables reporting; the
-        # transformers docs describe the string "none" for that purpose.
-        report_to=None,  # Disable wandb
-    )
-    # Data collator
-    data_collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm=False,
-    )
-    # Create trainer
-    print("⚙️ Initializing trainer...")
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset,
-        data_collator=data_collator,
-        tokenizer=tokenizer,
-    )
-    # Train the model
-    print("🔥 Starting training...")
-    start_time = time.time()
-    try:
-        trainer.train()
-        end_time = time.time()
-        # Elapsed training time in minutes
-        training_duration = (end_time - start_time) / 60
-        # Save the model
-        print("💾 Saving trained model...")
-        trainer.save_model(OUTPUT_MODEL_DIR)
-        tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
-        # Create a completion marker
-        with open("training_complete.txt", "w") as f:
-            f.write(f"Training completed successfully!\nDuration: {training_duration:.1f} minutes\nModel saved to: {OUTPUT_MODEL_DIR}")
-        return f"✅ Training completed in {training_duration:.1f} minutes!\n\nModel saved to: {OUTPUT_MODEL_DIR}\n\nYou can now download the trained_model folder."
-    except Exception as e:
-        error_msg = f"❌ Training failed: {str(e)}"
-        print(error_msg)
-        # Create error marker
-        # NOTE(review): an existing training_complete.txt is not removed here.
-        with open("training_error.txt", "w") as f:
-            f.write(error_msg)
-        return error_msg
-def create_interface():
-    """Create the Gradio interface for starting training and viewing status.
-
-    Returns:
-        The gr.Blocks app; the caller is responsible for launching it.
-    """
-    # Check for existing status
-    # The completion marker is checked before the error marker
-    initial_status = "🚀 Ready to start training..."
-    if os.path.exists("training_complete.txt"):
-        with open("training_complete.txt", "r") as f:
-            initial_status = f.read()
-    elif os.path.exists("training_error.txt"):
-        with open("training_error.txt", "r") as f:
-            initial_status = f.read()
-    with gr.Blocks(title="OpenFinancial Chatbot Trainer") as demo:
-        gr.Markdown("# 🤖 OpenFinancial Chatbot - Cloud Trainer")
-        gr.Markdown("Upload your training CSV file and click 'Start Training' to begin.")
-        status_output = gr.Textbox(
-            label="Training Status",
-            value=initial_status,
-            lines=10,
-            max_lines=20
-        )
-        with gr.Row():
-            start_btn = gr.Button("🚀 Start Training", variant="primary")
-            refresh_btn = gr.Button("🔄 Refresh Status", variant="secondary")
-        # File download section
-        # Only explanatory text is rendered; no download component is created here.
-        gr.Markdown("## 📥 Download Trained Model")
-        download_info = gr.Markdown("After training completes, download the files below:")
-        def start_training():
-            """Find the CSV, load it and run training; return the status text."""
-            # Find and load data
-            data_file = find_training_data()
-            if not data_file:
-                return "❌ No training data found. Please upload a CSV file with Question and Answer columns."
-            training_data = load_training_data(data_file)
-            # None (load error) and an empty list (no usable rows) both end up here
-            if not training_data:
-                return "❌ Failed to load training data. Check the CSV format."
-            # Start training
-            return train_model(training_data)
-        def refresh_status():
-            """Return the text of the marker file, or the default ready message."""
-            if os.path.exists("training_complete.txt"):
-                with open("training_complete.txt", "r") as f:
-                    return f.read()
-            elif os.path.exists("training_error.txt"):
-                with open("training_error.txt", "r") as f:
-                    return f.read()
-            else:
-                return "🚀 Ready to start training..."
-        # Both buttons write their return value into the status textbox
-        start_btn.click(start_training, outputs=status_output)
-        refresh_btn.click(refresh_status, outputs=status_output)
-    return demo
-# Script entry point: optional Hub login, then build and launch the UI.
-if __name__ == "__main__":
-    print("🤖 OpenFinancial Chatbot - HF Space Trainer")
-    print("=" * 50)
-    # Auto-login if token is available
-    if "HF_TOKEN" in os.environ:
-        try:
-            login(token=os.environ["HF_TOKEN"])
-            print("✅ Hugging Face authentication successful")
-        # NOTE(review): the bare except also catches KeyboardInterrupt and
-        # SystemExit, and the failure reason is not reported.
-        except:
-            print("⚠️ HF authentication failed (optional)")
-    # Launch interface
-    interface = create_interface()
     interface.launch()

+#!/usr/bin/env python3
+"""
+OpenFinancial Chatbot - Hugging Face Space Trainer
+==================================================
+This script is designed to run directly in a Hugging Face Space.
+Upload this file along with your training data to a HF Space and it will:
+1. Load your training data automatically
+2. Train the model using available hardware (GPU/CPU)
+3. Save the trained model to the space's file system
+4. Provide a simple interface to monitor progress
+Instructions:
+1. Create a new HF Space (Gradio SDK)
+2. Upload this file as app.py
+3. Upload your training CSV files to the space
+4. Open the space and click 'Start Training' to begin training
+"""
+import os
+import json
+import time
+import pandas as pd
+from datasets import Dataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+    DataCollatorForLanguageModeling
+)
+import torch
+from huggingface_hub import login
+import gradio as gr
+# Configuration
+# Model identifier passed to from_pretrained() for both tokenizer and model.
+BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# Directory the trained model and tokenizer are saved into.
+OUTPUT_MODEL_DIR = "./trained_model"
+# Preferred CSV file names; find_training_data() checks them in list order.
+TRAINING_DATA_FILES = [
+    "customer_service_conversations.csv",
+    "financial_conversations.csv",
+    "financial_qa_conversations.csv",
+    "trainingData.csv",
+]
+def find_training_data():
+    """Locate the CSV file to train on in the current working directory.
+
+    The names in TRAINING_DATA_FILES are tried first, in list order. If none
+    of them exists, the alphabetically first ``.csv`` file (extension matched
+    case-insensitively) in the directory is used instead.
+
+    Returns:
+        The file name of the chosen CSV, or None when no CSV is available.
+    """
+    print("🔍 Looking for training data files...")
+    # Check the preferred, well-known names first
+    for filename in TRAINING_DATA_FILES:
+        if os.path.exists(filename):
+            print(f"Found training data: {filename}")
+            return filename
+    # Fall back to any CSV in the current directory. os.listdir() returns
+    # entries in arbitrary order, so sort to pick the same file on every run.
+    csv_files = sorted(f for f in os.listdir('.') if f.lower().endswith('.csv'))
+    if csv_files:
+        print(f"Found CSV files: {csv_files}")
+        return csv_files[0]  # Use the alphabetically first one
+    print("No training data found. Please upload a CSV file with 'Question' and 'Answer' columns.")
+    return None
+def load_training_data(filename):
+    """Read a question/answer CSV and turn each usable row into a training text.
+
+    Args:
+        filename: Path of the CSV file to read.
+
+    Returns:
+        A list of ``{"text": ...}`` dicts, one per usable row, or None if
+        reading or processing the file raised an exception (including the
+        case where no question or answer column is found).
+    """
+    print(f"📊 Loading training data from {filename}...")
+
+    def matching_columns(frame, keywords):
+        # Columns whose lower-cased name contains any of the keywords
+        return [name for name in frame.columns if any(key in name.lower() for key in keywords)]
+
+    try:
+        df = pd.read_csv(filename)
+        print(f"Raw data shape: {df.shape}")
+        # Flexible column naming: accept several spellings for each side
+        question_cols = matching_columns(df, ('question', 'prompt', 'input'))
+        answer_cols = matching_columns(df, ('answer', 'response', 'output'))
+        if not (question_cols and answer_cols):
+            print(f"Available columns: {list(df.columns)}")
+            raise ValueError("Could not find Question/Answer columns")
+        # The first match of each kind wins
+        question_col, answer_col = question_cols[0], answer_cols[0]
+        print(f"Using columns: {question_col} -> {answer_col}")
+        examples = []
+        for _, record in df.iterrows():
+            prompt = str(record[question_col]).strip()
+            reply = str(record[answer_col]).strip()
+            # Skip rows where either side is blank or stringifies to 'nan'
+            if not prompt or not reply or 'nan' in (prompt, reply):
+                continue
+            # Format as conversation
+            examples.append({"text": f"### Question: {prompt}\n### Answer: {reply}<|endoftext|>"})
+        print(f"Processed {len(examples)} valid training examples")
+        return examples
+    except Exception as e:
+        # Any failure is reported and signalled to the caller as None
+        print(f"Error loading data: {e}")
+        return None
+def train_model(training_data):
+    """Fine-tune BASE_MODEL on the prepared examples and save the result.
+
+    Args:
+        training_data: List of dicts with a "text" key, one per example.
+
+    Returns:
+        A status message: a success summary with the training duration, or
+        "Training failed: ..." if training or saving raised an exception.
+
+    Side effects:
+        Writes checkpoints to ./results, saves the model and tokenizer to
+        OUTPUT_MODEL_DIR, and writes exactly one status marker file
+        (training_complete.txt or training_error.txt) to the working
+        directory, removing the other marker if it is left over from an
+        earlier run.
+    """
+    print("Starting model training...")
+    # Check hardware
+    use_gpu = torch.cuda.is_available()
+    device = "cuda" if use_gpu else "cpu"
+    print(f"Using device: {device}")
+    if use_gpu:
+        print(f"GPU: {torch.cuda.get_device_name(0)}")
+    # Create dataset
+    dataset = Dataset.from_list(training_data)
+    print(f"Dataset size: {len(dataset)} examples")
+    # Load tokenizer and model
+    print("Loading model and tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+    # Fall back to the EOS token when the tokenizer defines no pad token
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # Always load the weights in float32. With fp16=True the Trainer applies
+    # mixed precision itself; weights that are already float16 make the
+    # gradient scaler fail ("Attempting to unscale FP16 gradients").
+    model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        torch_dtype=torch.float32,
+        device_map="auto" if use_gpu else None
+    )
+    # Tokenize dataset
+    print("Tokenizing dataset...")
+    def tokenize_function(examples):
+        # Truncate to 512 tokens; no padding is applied at this stage
+        return tokenizer(
+            examples["text"],
+            truncation=True,
+            padding=False,
+            max_length=512
+        )
+    tokenized_dataset = dataset.map(
+        tokenize_function,
+        batched=True,
+        remove_columns=["text"]
+    )
+    # Training arguments
+    # Smaller per-device batches with more accumulation steps on CPU
+    batch_size = 4 if use_gpu else 2
+    gradient_steps = 4 if use_gpu else 8
+    training_args = TrainingArguments(
+        output_dir="./results",
+        num_train_epochs=3,
+        per_device_train_batch_size=batch_size,
+        gradient_accumulation_steps=gradient_steps,
+        warmup_steps=50,
+        learning_rate=2e-5,
+        logging_steps=10,
+        save_steps=500,
+        save_total_limit=2,
+        remove_unused_columns=False,
+        dataloader_num_workers=0,  # Avoid multiprocessing issues
+        fp16=use_gpu,
+        # The string "none" turns off all reporting integrations (wandb
+        # etc.); the Python value None does not.
+        report_to="none",
+    )
+    # Data collator
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False,
+    )
+    # Create trainer
+    print("Initializing trainer...")
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset,
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+    )
+    # Train the model
+    print("Starting training...")
+    start_time = time.time()
+    try:
+        trainer.train()
+        end_time = time.time()
+        # Elapsed training time in minutes
+        training_duration = (end_time - start_time) / 60
+        # Save the model
+        print("Saving trained model...")
+        trainer.save_model(OUTPUT_MODEL_DIR)
+        tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
+        # Create a completion marker
+        with open("training_complete.txt", "w") as f:
+            f.write(f"Training completed successfully!\nDuration: {training_duration:.1f} minutes\nModel saved to: {OUTPUT_MODEL_DIR}")
+        # Drop an error marker left by an earlier failed run so that only
+        # the latest outcome remains on disk
+        if os.path.exists("training_error.txt"):
+            os.remove("training_error.txt")
+        return f"Training completed in {training_duration:.1f} minutes!\n\nModel saved to: {OUTPUT_MODEL_DIR}\n\nYou can now download the trained_model folder."
+    except Exception as e:
+        error_msg = f"Training failed: {str(e)}"
+        print(error_msg)
+        # Create error marker
+        with open("training_error.txt", "w") as f:
+            f.write(error_msg)
+        # The status display checks the completion marker first, so a stale
+        # one would hide this failure
+        if os.path.exists("training_complete.txt"):
+            os.remove("training_complete.txt")
+        return error_msg
+def create_interface():
+    """Build the Gradio Blocks UI used to start training and view its status.
+
+    Returns:
+        The gr.Blocks app; the caller is responsible for launching it.
+    """
+    # Marker files in priority order: completion is checked before error
+    marker_files = ("training_complete.txt", "training_error.txt")
+
+    def refresh_status():
+        """Return the text of the first existing marker file, else the ready message."""
+        for marker in marker_files:
+            if os.path.exists(marker):
+                with open(marker, "r") as f:
+                    return f.read()
+        return "Ready to start training..."
+
+    def start_training():
+        """Find the CSV, load it and run training; return the status text."""
+        data_file = find_training_data()
+        if not data_file:
+            return "No training data found. Please upload a CSV file with Question and Answer columns."
+        training_data = load_training_data(data_file)
+        # None (load error) and an empty list (no usable rows) both end up here
+        if not training_data:
+            return "Failed to load training data. Check the CSV format."
+        return train_model(training_data)
+
+    # Show the outcome of a previous run, if any, when the page first loads
+    initial_status = refresh_status()
+    with gr.Blocks(title="OpenFinancial Chatbot Trainer") as demo:
+        gr.Markdown("# OpenFinancial Chatbot - Cloud Trainer")
+        gr.Markdown("Upload your training CSV file and click 'Start Training' to begin.")
+        status_output = gr.Textbox(
+            label="Training Status",
+            value=initial_status,
+            lines=10,
+            max_lines=20
+        )
+        with gr.Row():
+            start_btn = gr.Button("Start Training", variant="primary")
+            refresh_btn = gr.Button("Refresh Status", variant="secondary")
+        # File download section (explanatory text only)
+        gr.Markdown("## Download Trained Model")
+        gr.Markdown("After training completes, download the files below:")
+        # Both buttons write their return value into the status textbox
+        start_btn.click(start_training, outputs=status_output)
+        refresh_btn.click(refresh_status, outputs=status_output)
+    return demo
+# Script entry point: optional Hub login, then build and launch the UI.
+if __name__ == "__main__":
+    print("OpenFinancial Chatbot - HF Space Trainer")
+    print("=" * 50)
+    # Auto-login if a non-empty token is available; authentication is optional
+    hf_token = os.environ.get("HF_TOKEN")
+    if hf_token:
+        try:
+            login(token=hf_token)
+            print("Hugging Face authentication successful")
+        except Exception as e:
+            # Catch Exception rather than using a bare except so that
+            # KeyboardInterrupt/SystemExit still stop the program, and
+            # report why the login failed
+            print(f"HF authentication failed (optional): {e}")
+    # Launch interface
+    interface = create_interface()
     interface.launch()