| """ | |
| Fine-tuning utilities for language models. | |
| """ | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| Trainer, | |
| TrainingArguments, | |
| DataCollatorForLanguageModeling | |
| ) | |
| from datasets import Dataset | |
| import json | |
| import torch | |
| import os | |
| from typing import List, Dict | |


class FinancialQATrainer:
    def __init__(
        self,
        model_name: str = "distilgpt2",
        output_dir: str = "models/fine_tuned_model",
        device: str = "cuda" if torch.cuda.is_available() else "cpu"
    ):
        """
        Initialize trainer for financial QA fine-tuning.

        Args:
            model_name: Base model to fine-tune
            output_dir: Directory to save fine-tuned model
            device: Device to use for training
        """
        self.model_name = model_name
        self.output_dir = output_dir
        self.device = device

        # Initialize tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

        # Add special tokens if needed
        special_tokens = ["<|question|>", "<|answer|>", "<|endoftext|>"]
        self.tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

        # GPT-2-style tokenizers have no pad token by default; padding in
        # prepare_dataset would fail without one, so reuse the EOS token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model.resize_token_embeddings(len(self.tokenizer))

    def prepare_dataset(self, qa_pairs: List[Dict]) -> Dataset:
        """
        Prepare dataset for fine-tuning.

        Args:
            qa_pairs: List of question-answer pairs, each a dict with
                "question" and "answer" keys

        Returns:
            Tokenized HuggingFace Dataset
        """
        # Format each pair as a single prompt string
        formatted_data = []
        for pair in qa_pairs:
            text = f"<|question|>{pair['question']}<|answer|>{pair['answer']}<|endoftext|>"
            formatted_data.append({"text": text})

        # Create dataset
        dataset = Dataset.from_list(formatted_data)

        # Tokenize dataset
        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                truncation=True,
                max_length=512,
                padding="max_length"
            )

        tokenized_dataset = dataset.map(
            tokenize_function,
            remove_columns=dataset.column_names,
            batched=True
        )
        return tokenized_dataset

    def train(
        self,
        train_dataset: Dataset,
        eval_dataset: Dataset = None,
        num_epochs: int = 3,
        batch_size: int = 8,
        learning_rate: float = 2e-5,
        warmup_steps: int = 500,
        logging_steps: int = 100
    ):
        """
        Fine-tune the model.

        Args:
            train_dataset: Training dataset
            eval_dataset: Evaluation dataset (optional)
            num_epochs: Number of training epochs
            batch_size: Batch size for training and evaluation
            learning_rate: Learning rate
            warmup_steps: Number of warmup steps
            logging_steps: Number of steps between logging
        """
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=warmup_steps,
            learning_rate=learning_rate,
            logging_steps=logging_steps,
            evaluation_strategy="epoch" if eval_dataset else "no",
            save_strategy="epoch",
            load_best_model_at_end=True if eval_dataset else False,
            remove_unused_columns=False,
        )

        # Causal LM objective: with mlm=False the collator builds labels from input_ids
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        # Train the model
        trainer.train()

        # Save the model and tokenizer
        trainer.save_model()
        self.tokenizer.save_pretrained(self.output_dir)

    @classmethod
    def from_qa_file(cls, qa_file_path: str, **kwargs):
        """
        Create trainer instance and prepare datasets from a QA file.

        Args:
            qa_file_path: Path to JSON file containing QA pairs
            **kwargs: Additional arguments for trainer initialization

        Returns:
            Tuple of (trainer, train_dataset, eval_dataset)
        """
        # Load QA pairs
        with open(qa_file_path, 'r') as f:
            qa_pairs = json.load(f)

        # Create trainer
        trainer = cls(**kwargs)

        # Split data into train/eval (80/20)
        train_size = int(0.8 * len(qa_pairs))
        train_pairs = qa_pairs[:train_size]
        eval_pairs = qa_pairs[train_size:]

        # Prepare datasets
        train_dataset = trainer.prepare_dataset(train_pairs)
        eval_dataset = trainer.prepare_dataset(eval_pairs)

        return trainer, train_dataset, eval_dataset
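

# Expected layout of the QA JSON file read by from_qa_file (a sketch inferred
# from prepare_dataset, which looks up the "question" and "answer" keys; the
# placeholder values below are illustrative, not real data):
#
#   [
#     {"question": "<question text>", "answer": "<answer text>"},
#     ...
#   ]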
def fine_tune_model(
    qa_file_path: str,
    model_name: str = "distilgpt2",
    output_dir: str = "models/fine_tuned_model",
    **training_kwargs
):
    """
    Convenience function to fine-tune a model on QA pairs.

    Args:
        qa_file_path: Path to JSON file containing QA pairs
        model_name: Base model to fine-tune
        output_dir: Directory to save fine-tuned model
        **training_kwargs: Additional arguments for training
    """
    # Initialize trainer and prepare datasets
    trainer, train_dataset, eval_dataset = FinancialQATrainer.from_qa_file(
        qa_file_path,
        model_name=model_name,
        output_dir=output_dir
    )

    # Train the model
    trainer.train(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        **training_kwargs
    )
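

# A minimal inference sketch, not part of the original training pipeline: it
# loads whatever FinancialQATrainer saved to output_dir and reuses the
# "<|question|>...<|answer|>" prompt format from prepare_dataset. The function
# name, default path, and generation settings are illustrative assumptions.
def generate_answer(
    question: str,
    model_dir: str = "models/fine_tuned_model",
    max_new_tokens: int = 64
) -> str:
    """Load the fine-tuned model and generate an answer for a single question."""
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)
    model.eval()

    # Build the prompt in the same format used during fine-tuning
    prompt = f"<|question|>{question}<|answer|>"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Drop the prompt tokens and return only the newly generated answer text
    answer_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()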


if __name__ == "__main__":
    # Example usage
    qa_file = "qa_pairs/qa_dataset.json"
    fine_tune_model(
        qa_file,
        num_epochs=3,
        batch_size=8,
        learning_rate=2e-5
    )