"""Fine-tune the base sentiment model on stored feedback and export it locally.

Progress is reported by overwriting a small JSON status file (``STATUS_FILE``)
at each stage of the run.
"""

import json
import os
import sys
import traceback

import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Add root path to access Flask app and db.
# This must run before the project-local imports below.
current_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.dirname(current_dir)
sys.path.append(root_dir)

from app import create_app
from models import Feedback

MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
CUSTOM_MODEL_DIR = os.path.join(root_dir, "custom_model")
STATUS_FILE = os.path.join(root_dir, "training_status.json")


def update_status(status: str, progress: int = 0, message: str = "") -> None:
    """Overwrite the status file with the current training state.

    Args:
        status: Short stage label (e.g. "Training", "Error").
        progress: Progress value stored under the "progress" key.
        message: Human-readable detail stored under the "message" key.
    """
    payload = {"status": status, "progress": progress, "message": message}
    with open(STATUS_FILE, "w", encoding="utf-8") as f:
        json.dump(payload, f)


def get_training_data():
    """Load labelled feedback texts from the database.

    Only rows whose sentiment is 'Positive' or 'Negative' are queried, and
    rows with an empty ``cleaned_text`` are skipped.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists, where each label is
        the integer class id for the corresponding text.
    """
    # Label mapping for CardiffNLP model
    # 0: Negative, 1: Neutral, 2: Positive
    label_map = {'Negative': 0, 'Positive': 2}

    texts = []
    labels = []

    app = create_app()
    with app.app_context():
        # Fetch feedbacks that aren't purely neutral/empty
        feedbacks = Feedback.query.filter(
            Feedback.sentiment.in_(['Positive', 'Negative'])
        ).all()

        for f in feedbacks:
            if f.cleaned_text:
                texts.append(f.cleaned_text)
                labels.append(label_map[f.sentiment])

    return texts, labels


def main() -> None:
    """Run the full pipeline: extract data, tokenize, fine-tune, and export.

    Returns early (after writing an "Error" status) when fewer than 50
    usable feedback entries are available.
    """
    update_status("Starting", 5, "Extracting data from database...")
    texts, labels = get_training_data()

    if len(texts) < 50:
        update_status(
            "Error",
            0,
            "Insufficient data for training. "
            "Need at least 50 positive/negative feedback entries.",
        )
        return

    update_status("Processing", 20, f"Preparing dataset of {len(texts)} entries...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Create HuggingFace dataset
    dataset_dict = {
        "text": texts,
        "label": labels,
    }
    raw_dataset = Dataset.from_dict(dataset_dict)

    def tokenize_function(examples):
        # Pad/truncate every example to a fixed 128-token length.
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

    # Split into train/eval
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    update_status("Training", 40, "Downloading weights and initializing neural network...")

    # We use num_labels=3 because the base model expects 3
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the pinned library version.
    training_args = TrainingArguments(
        output_dir="./trainer_logs",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    update_status("Training", 60, "Fine-tuning model weights... This may take a few minutes.")
    trainer.train()

    update_status("Saving", 90, "Saving local custom model...")

    # Make sure the export directory exists; an existing directory is reused
    # and its files are overwritten by the save calls below.
    os.makedirs(CUSTOM_MODEL_DIR, exist_ok=True)

    model.save_pretrained(CUSTOM_MODEL_DIR)
    tokenizer.save_pretrained(CUSTOM_MODEL_DIR)

    update_status("Completed", 100, "Successfully trained and exported custom AI model. Application is now using the enhanced AI.")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: keep the traceback visible on stderr, then
        # surface the failure through the status file.
        traceback.print_exc()
        update_status("Error", 0, str(e))