import os
import sys
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# Add root path to access Flask app and db
# (this script lives one directory below the project root, so the parent
# directory must be on sys.path before `app` / `models` can be imported)
current_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.dirname(current_dir)
sys.path.append(root_dir)
from app import create_app
from models import Feedback

# Pretrained base model to fine-tune (3-class sentiment: neg/neutral/pos).
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Where the fine-tuned model + tokenizer are exported for the app to load.
CUSTOM_MODEL_DIR = os.path.join(root_dir, "custom_model")
# JSON file polled by the web UI to display training progress.
STATUS_FILE = os.path.join(root_dir, "training_status.json")
def update_status(status, progress=0, message=""):
    """Persist the current training status to STATUS_FILE as JSON.

    Args:
        status: Short state label, e.g. "Training" or "Error".
        progress: Integer percentage (0-100) of overall completion.
        message: Human-readable detail shown to the user.
    """
    # Write to a temp file and atomically replace, so a concurrent reader
    # (the web app polling STATUS_FILE) never observes a partial file.
    tmp_path = STATUS_FILE + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump({"status": status, "progress": progress, "message": message}, f)
    os.replace(tmp_path, STATUS_FILE)
def get_training_data():
    """Collect labelled feedback texts from the database.

    Returns:
        A (texts, labels) pair of parallel lists. Labels follow the
        CardiffNLP convention: 0 = Negative, 1 = Neutral, 2 = Positive
        (only 0 and 2 can appear here, since neutral rows are excluded).
    """
    label_map = {'Negative': 0, 'Positive': 2}
    app = create_app()
    with app.app_context():
        # Only Positive/Negative rows are useful for fine-tuning; skip rows
        # with no cleaned text.
        rows = Feedback.query.filter(Feedback.sentiment.in_(['Positive', 'Negative'])).all()
        pairs = [
            (row.cleaned_text, label_map[row.sentiment])
            for row in rows
            if row.cleaned_text
        ]
    if not pairs:
        return [], []
    texts, labels = (list(column) for column in zip(*pairs))
    return texts, labels
def main():
    """End-to-end fine-tuning pipeline: extract data, train, export model.

    Progress is reported through update_status() at each stage so the web
    UI can display it. Exits early (with an Error status) when too little
    labelled data is available.
    """
    update_status("Starting", 5, "Extracting data from database...")
    texts, labels = get_training_data()
    # Fine-tuning on too few examples produces a model worse than the
    # pretrained baseline, so bail out early.
    if len(texts) < 50:
        update_status("Error", 0, "Insufficient data for training. Need at least 50 positive/negative feedback entries.")
        return
    update_status("Processing", 20, f"Preparing dataset of {len(texts)} entries...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset, eval_dataset = _build_datasets(texts, labels, tokenizer)
    update_status("Training", 40, "Downloading weights and initializing neural network...")
    trainer, model = _build_trainer(train_dataset, eval_dataset)
    update_status("Training", 60, "Fine-tuning model weights... This may take a few minutes.")
    trainer.train()
    update_status("Saving", 90, "Saving local custom model...")
    _export_model(model, tokenizer)
    update_status("Completed", 100, "Successfully trained and exported custom AI model. Application is now using the enhanced AI.")


def _build_datasets(texts, labels, tokenizer, test_size=0.1, max_length=128):
    """Tokenize the texts and return a (train, eval) HuggingFace dataset split.

    Args:
        texts: List of input strings.
        labels: Parallel list of integer class labels.
        tokenizer: Tokenizer matching the base model.
        test_size: Fraction of data held out for evaluation.
        max_length: Token truncation/padding length.
    """
    raw_dataset = Dataset.from_dict({"text": texts, "label": labels})

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
    split_dataset = tokenized_dataset.train_test_split(test_size=test_size)
    return split_dataset["train"], split_dataset["test"]


def _build_trainer(train_dataset, eval_dataset):
    """Load the base model and return a configured (trainer, model) pair."""
    # num_labels=3 matches the pretrained CardiffNLP head (neg/neutral/pos)
    # even though only two classes appear in the fine-tuning data.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    training_args = TrainingArguments(
        output_dir="./trainer_logs",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    return trainer, model


def _export_model(model, tokenizer):
    """Save the fine-tuned model and tokenizer to CUSTOM_MODEL_DIR."""
    # exist_ok=True replaces the original's racy check-then-create; note the
    # directory is NOT cleaned first — save_pretrained overwrites in place.
    os.makedirs(CUSTOM_MODEL_DIR, exist_ok=True)
    model.save_pretrained(CUSTOM_MODEL_DIR)
    tokenizer.save_pretrained(CUSTOM_MODEL_DIR)
if __name__ == "__main__":
    # Top-level boundary: any failure is recorded in the status file so the
    # web UI can surface it, AND printed with its traceback to stderr so the
    # operator can actually debug it (str(e) alone loses the stack).
    try:
        main()
    except Exception as e:
        import traceback
        traceback.print_exc()
        update_status("Error", 0, str(e))