Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| OpenFinancial Chatbot - Hugging Face Space Trainer | |
| ================================================== | |
| This script is designed to run directly in a Hugging Face Space. | |
| Upload this file along with your training data to a HF Space and it will: | |
| 1. Load your training data automatically | |
| 2. Train the model using available hardware (GPU/CPU) | |
| 3. Save the trained model to the space's file system | |
| 4. Provide a simple interface to monitor progress | |
| Instructions: | |
| 1. Create a new HF Space (Gradio SDK) | |
| 2. Upload this file as app.py | |
| 3. Upload your training CSV files to the space | |
| 4. Open the space and click 'Start Training' to begin training | |
| """ | |
| import os | |
| import json | |
| import time | |
| import pandas as pd | |
| from datasets import Dataset | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| Trainer, | |
| TrainingArguments, | |
| DataCollatorForLanguageModeling | |
| ) | |
| import torch | |
| from huggingface_hub import login | |
| import gradio as gr | |
# Configuration
# Hub id of the checkpoint that is loaded as the starting point for training.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Directory the trained model and its tokenizer are saved to.
OUTPUT_MODEL_DIR = "./trained_model"
# CSV file names that are probed first, in this order, when looking for
# training data.
TRAINING_DATA_FILES = [
    "customer_service_conversations.csv",
    "financial_conversations.csv",
    "financial_qa_conversations.csv",
    "trainingData.csv",
]
def find_training_data():
    """Locate the CSV file to train on in the current working directory.

    The names listed in ``TRAINING_DATA_FILES`` are tried first, in order.
    If none of them exists, the alphabetically first ``*.csv`` file in the
    directory is used. The extension is matched case-insensitively and
    directories are ignored.

    Returns:
        The file name relative to the current directory, or ``None`` when
        no candidate exists (a hint is printed in that case).
    """
    # Preferred, well-known file names win over anything else.
    for filename in TRAINING_DATA_FILES:
        if os.path.isfile(filename):
            return filename

    # os.listdir() returns entries in arbitrary order, so sort to make the
    # fallback choice reproducible between runs and machines.
    csv_files = sorted(
        name
        for name in os.listdir(".")
        if name.lower().endswith(".csv") and os.path.isfile(name)
    )
    if csv_files:
        return csv_files[0]

    print("No training data found. Please upload a CSV file with 'Question' and 'Answer' columns.")
    return None
def load_training_data(filename):
    """Read a question/answer CSV and convert it into training examples.

    The question column is the first column whose name contains
    "question" (case-insensitive); the answer column is the first whose
    name contains "answer". Rows where either cell is missing, or blank
    after stripping whitespace, are skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``{"text": ...}`` dicts, one per usable row, or ``None``
        when the file cannot be read or the required columns are absent.
        The reason for a failure is printed so it is not lost.
    """
    try:
        df = pd.read_csv(filename)

        # Flexible naming: any header containing the keyword qualifies.
        question_col = next(
            (col for col in df.columns if "question" in str(col).lower()), None
        )
        answer_col = next(
            (col for col in df.columns if "answer" in str(col).lower()), None
        )
        if question_col is None or answer_col is None:
            raise ValueError("Could not find Question/Answer columns")

        training_data = []
        for raw_question, raw_answer in zip(df[question_col], df[answer_col]):
            # Detect missing cells on the raw value instead of comparing
            # the stringified value against 'nan'.
            if pd.isna(raw_question) or pd.isna(raw_answer):
                continue
            question = str(raw_question).strip()
            answer = str(raw_answer).strip()
            if question and answer:
                # Format as conversation
                text = f"### Question: {question}\n### Answer: {answer}<|endoftext|>"
                training_data.append({"text": text})

        print(f"Processed {len(training_data)} valid training examples")
        return training_data
    except Exception as e:
        # Boundary for the UI: callers expect None on any failure, but the
        # cause must be reported rather than silently discarded.
        print(f"Failed to load training data from {filename}: {e}")
        return None
def _write_status_marker(marker_path, stale_path, message):
    """Write *message* to *marker_path* and delete the opposite marker file."""
    with open(marker_path, "w") as f:
        f.write(message)
    # A marker left over from an earlier run would otherwise contradict
    # the outcome that was just recorded.
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        pass


def _run_training(training_data):
    """Fine-tune and save the model; return minutes spent in ``trainer.train()``."""
    if not training_data:
        raise ValueError("no training examples were provided")

    # Check hardware
    use_cuda = torch.cuda.is_available()
    print(f"Using device: {'cuda' if use_cuda else 'cpu'}")
    if use_cuda:
        print(f"GPU: {torch.cuda.get_device_name(0)}")

    # Create dataset
    dataset = Dataset.from_list(training_data)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Keep the weights in float32. With fp16=True below, gradient scaling
    # cannot unscale gradients of float16 parameters ("Attempting to
    # unscale FP16 gradients"). The Trainer moves the model to the device.
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
    )

    # NOTE(review): the examples end with the literal "<|endoftext|>"
    # marker; confirm that this matches the base tokenizer's EOS token.
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=512,
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )

    # Smaller batches with more accumulation steps on CPU; the effective
    # batch size is 16 on either kind of hardware.
    batch_size = 4 if use_cuda else 2
    gradient_steps = 4 if use_cuda else 8
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_steps,
        warmup_steps=50,
        learning_rate=2e-5,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        remove_unused_columns=False,
        dataloader_num_workers=0,  # Avoid multiprocessing issues
        fp16=use_cuda,
        # The string "none" disables wandb and other integrations;
        # the Python value None does not.
        report_to="none",
    )

    # Causal-LM collator (mlm=False): pads each batch and builds labels.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # NOTE(review): recent transformers releases appear to deprecate
    # `tokenizer=` in favour of `processing_class=` - confirm against the
    # version installed in the Space.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Only the optimisation loop is timed, not model download or setup.
    start_time = time.time()
    trainer.train()
    training_duration = (time.time() - start_time) / 60

    # Save the model together with its tokenizer.
    trainer.save_model(OUTPUT_MODEL_DIR)
    tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
    return training_duration


def train_model(training_data):
    """Train the model with the provided data and report the outcome.

    Args:
        training_data: List of ``{"text": ...}`` dicts to train on.

    Returns:
        A status message for the UI: a success summary including the
        duration, or a string starting with "Training failed:". Failures
        during setup (model download, tokenisation) are reported the same
        way as failures during training.

    Side effects:
        Writes checkpoints to ``./results``, the final model to
        ``OUTPUT_MODEL_DIR``, and exactly one of ``training_complete.txt``
        or ``training_error.txt`` (the other one is removed).
    """
    print("Starting model training...")
    try:
        training_duration = _run_training(training_data)
        # Create a completion marker
        _write_status_marker(
            "training_complete.txt",
            "training_error.txt",
            f"Training completed successfully!\nDuration: {training_duration:.1f} minutes\nModel saved to: {OUTPUT_MODEL_DIR}",
        )
    except Exception as e:
        error_msg = f"Training failed: {str(e)}"
        print(error_msg)
        # Create error marker
        _write_status_marker("training_error.txt", "training_complete.txt", error_msg)
        return error_msg
    return f"Training completed in {training_duration:.1f} minutes!\n\nModel saved to: {OUTPUT_MODEL_DIR}\n\nYou can now download the trained_model folder."
def _read_training_status():
    """Return the text of the most recently written training marker file.

    Looks at ``training_complete.txt`` and ``training_error.txt``. When
    both exist, the newer one (by modification time) is returned so that a
    failed run is not hidden behind the marker of an earlier successful
    run; on a tie the completion marker is used. With no marker present
    the idle message is returned.
    """
    newest_path = None
    newest_mtime = None
    for path in ("training_complete.txt", "training_error.txt"):
        try:
            mtime = os.path.getmtime(path)
        except OSError:
            # Marker does not exist (or vanished in the meantime).
            continue
        if newest_mtime is None or mtime > newest_mtime:
            newest_path, newest_mtime = path, mtime

    if newest_path is None:
        return "Ready to start training..."
    with open(newest_path, "r") as f:
        return f.read()


def create_interface():
    """Create the Gradio interface for starting and monitoring training.

    The status box is pre-filled from the marker files of a previous run.
    "Start Training" locates the CSV, loads it and trains, then shows the
    resulting message; "Refresh Status" re-reads the marker files.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    with gr.Blocks(title="OpenFinancial Chatbot Trainer") as demo:
        gr.Markdown("# OpenFinancial Chatbot - Cloud Trainer")
        gr.Markdown("Upload your training CSV file and click 'Start Training' to begin.")

        status_output = gr.Textbox(
            label="Training Status",
            value=_read_training_status(),
            lines=10,
            max_lines=20,
        )

        with gr.Row():
            start_btn = gr.Button("Start Training", variant="primary")
            refresh_btn = gr.Button("Refresh Status", variant="secondary")

        # File download section
        gr.Markdown("## Download Trained Model")
        gr.Markdown("After training completes, download the files below:")

        def start_training():
            # Find and load data
            data_file = find_training_data()
            if not data_file:
                return "No training data found. Please upload a CSV file with Question and Answer columns."
            training_data = load_training_data(data_file)
            if not training_data:
                return "Failed to load training data. Check the CSV format."
            # Start training; the call returns once training has finished.
            return train_model(training_data)

        def refresh_status():
            return _read_training_status()

        start_btn.click(start_training, outputs=status_output)
        refresh_btn.click(refresh_status, outputs=status_output)

    return demo
if __name__ == "__main__":
    # Build the Gradio app and start serving it when run as a script.
    create_interface().launch()