Spaces:
Running
Running
| import os | |
| import sys | |
| import json | |
| import torch | |
| from datasets import Dataset | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments | |
# Add project root to sys.path so the Flask app factory and the DB models
# (which live one directory above this script) become importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.dirname(current_dir)
sys.path.append(root_dir)
from app import create_app
from models import Feedback
# Base checkpoint to fine-tune: CardiffNLP 3-class (neg/neu/pos) RoBERTa sentiment model.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Directory where the fine-tuned model + tokenizer are exported for the app to load.
CUSTOM_MODEL_DIR = os.path.join(root_dir, "custom_model")
# JSON status file written by update_status(); presumably polled by the web UI — TODO confirm.
STATUS_FILE = os.path.join(root_dir, "training_status.json")
def update_status(status, progress=0, message=""):
    """Overwrite the shared status file with the current pipeline state.

    Args:
        status: short stage name (e.g. "Training", "Error", "Completed").
        progress: integer percentage from 0 to 100.
        message: human-readable detail for the current stage.
    """
    payload = {"status": status, "progress": progress, "message": message}
    with open(STATUS_FILE, "w") as status_file:
        json.dump(payload, status_file)
def get_training_data():
    """Collect labeled (texts, labels) training pairs from the Feedback table.

    Only rows whose sentiment is 'Positive' or 'Negative' and whose
    cleaned_text is non-empty are used. Labels follow the CardiffNLP
    convention (0=Negative, 1=Neutral, 2=Positive); Neutral is simply
    never produced here.

    Returns:
        Tuple of (texts, labels) — parallel lists of strings and ints.
    """
    app = create_app()
    with app.app_context():
        # Restrict to rows with a definite polarity; neutral/empty rows are excluded.
        rows = Feedback.query.filter(
            Feedback.sentiment.in_(['Positive', 'Negative'])
        ).all()
        # Label mapping for the CardiffNLP model head:
        # 0: Negative, 1: Neutral, 2: Positive
        sentiment_to_label = {'Negative': 0, 'Positive': 2}
        usable = [row for row in rows if row.cleaned_text]
        texts = [row.cleaned_text for row in usable]
        labels = [sentiment_to_label[row.sentiment] for row in usable]
        return texts, labels
def main(min_samples=50):
    """Fine-tune the CardiffNLP sentiment model on feedback from the app DB.

    Stages (each reported through update_status so the UI can track progress):
    extract labeled data, tokenize and split it, fine-tune the 3-class RoBERTa
    head, then export the model and tokenizer to CUSTOM_MODEL_DIR.

    Args:
        min_samples: minimum number of labeled feedback entries required
            before training is attempted (default 50, the original threshold).
    """
    update_status("Starting", 5, "Extracting data from database...")
    texts, labels = get_training_data()
    if len(texts) < min_samples:
        update_status("Error", 0, f"Insufficient data for training. Need at least {min_samples} positive/negative feedback entries.")
        return
    update_status("Processing", 20, f"Preparing dataset of {len(texts)} entries...")
    # Load tokenizer matching the base checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Create HuggingFace dataset from the parallel text/label lists.
    raw_dataset = Dataset.from_dict({
        "text": texts,
        "label": labels,
    })

    def tokenize_function(examples):
        # Fixed-length padding keeps batches uniform; 128 tokens is enough for short feedback text.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
    # Hold out 10% for per-epoch evaluation (needed by load_best_model_at_end).
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
    update_status("Training", 40, "Downloading weights and initializing neural network...")
    # num_labels=3 matches the base checkpoint's head (0=Negative, 1=Neutral, 2=Positive)
    # even though the training data contains only Negative/Positive examples.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    training_args = TrainingArguments(
        output_dir="./trainer_logs",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        # NOTE(review): this kwarg was renamed to `eval_strategy` in recent
        # transformers releases — confirm against the pinned version.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    update_status("Training", 60, "Fine-tuning model weights... This may take a few minutes.")
    trainer.train()
    update_status("Saving", 90, "Saving local custom model...")
    # Ensure the export directory exists; exist_ok avoids the check-then-create
    # race of the original os.path.exists + os.makedirs pair.
    os.makedirs(CUSTOM_MODEL_DIR, exist_ok=True)
    model.save_pretrained(CUSTOM_MODEL_DIR)
    tokenizer.save_pretrained(CUSTOM_MODEL_DIR)
    update_status("Completed", 100, "Successfully trained and exported custom AI model. Application is now using the enhanced AI.")
# Script entry point: run the full training pipeline. Any uncaught failure is
# reported through the shared status file (broad Exception catch is deliberate
# here — this is the top-level boundary, and the UI reads errors from the file).
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        update_status("Error", 0, str(e))