File size: 3,864 Bytes
e45ddff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import sys
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# Add root path to access Flask app and db
current_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.dirname(current_dir)
sys.path.append(root_dir)

from app import create_app
from models import Feedback

MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
CUSTOM_MODEL_DIR = os.path.join(root_dir, "custom_model")
STATUS_FILE = os.path.join(root_dir, "training_status.json")

def update_status(status, progress=0, message=""):
    """Persist the current training state to STATUS_FILE as JSON.

    The web application polls this file to render training progress,
    so it is overwritten in full on every call.
    """
    payload = {"status": status, "progress": progress, "message": message}
    with open(STATUS_FILE, "w") as status_file:
        json.dump(payload, status_file)

def get_training_data():
    """Load labeled training samples from the feedback database.

    Only rows whose sentiment is 'Positive' or 'Negative' are used, and
    rows without cleaned text are skipped. Labels follow the CardiffNLP
    convention (0 = Negative, 1 = Neutral, 2 = Positive); neutral is never
    produced here since neutral feedback is excluded from training.

    Returns:
        tuple[list[str], list[int]]: parallel lists of texts and labels.
    """
    # CardiffNLP label ids: 0 Negative, 1 Neutral, 2 Positive.
    label_map = {'Negative': 0, 'Positive': 2}

    app = create_app()
    with app.app_context():
        rows = Feedback.query.filter(
            Feedback.sentiment.in_(['Positive', 'Negative'])
        ).all()
        # Extract plain values inside the app context so no lazy-loaded
        # ORM attributes are touched after the session is gone.
        samples = [
            (row.cleaned_text, label_map[row.sentiment])
            for row in rows
            if row.cleaned_text
        ]

    texts = [text for text, _ in samples]
    labels = [label for _, label in samples]
    return texts, labels

def main():
    """Fine-tune the CardiffNLP sentiment model on locally collected feedback.

    Pipeline: extract labeled feedback from the DB, tokenize, split into
    train/eval, fine-tune, then export the model + tokenizer to
    CUSTOM_MODEL_DIR. Progress is reported through the status file
    (via update_status) so the web UI can poll it.
    """
    update_status("Starting", 5, "Extracting data from database...")

    texts, labels = get_training_data()

    # Guard: with too little data fine-tuning degrades the pretrained base.
    if len(texts) < 50:
        update_status("Error", 0, "Insufficient data for training. Need at least 50 positive/negative feedback entries.")
        return

    update_status("Processing", 20, f"Preparing dataset of {len(texts)} entries...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Create HuggingFace dataset
    raw_dataset = Dataset.from_dict({
        "text": texts,
        "label": labels,
    })

    def tokenize_function(examples):
        # max_length=128 keeps memory bounded; feedback entries are short.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

    # Split into train/eval. A fixed seed makes the split reproducible
    # between runs, so eval numbers are comparable across retrainings.
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    update_status("Training", 40, "Downloading weights and initializing neural network...")

    # num_labels=3 because the base model head is 3-way (Neg/Neutral/Pos);
    # training data only contains labels 0 and 2, but the head shape must match.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

    training_args = TrainingArguments(
        output_dir="./trainer_logs",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Reload the checkpoint with the best eval loss after training.
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    update_status("Training", 60, "Fine-tuning model weights... This may take a few minutes.")
    trainer.train()

    update_status("Saving", 90, "Saving local custom model...")
    # Ensure the export directory exists (no-op if it already does).
    os.makedirs(CUSTOM_MODEL_DIR, exist_ok=True)

    # trainer.save_model exports trainer.model, which — thanks to
    # load_best_model_at_end=True — is the best checkpoint rather than
    # simply the last epoch's weights, and it unwraps any accelerator
    # wrapping before saving.
    trainer.save_model(CUSTOM_MODEL_DIR)
    tokenizer.save_pretrained(CUSTOM_MODEL_DIR)

    update_status("Completed", 100, "Successfully trained and exported custom AI model. Application is now using the enhanced AI.")

if __name__ == "__main__":
    # Top-level boundary: any uncaught failure is surfaced to the UI
    # through the status file instead of dying silently in the background.
    try:
        main()
    except Exception as exc:
        update_status("Error", 0, str(exc))