# NOTE: the three lines that followed ("Spaces: / Sleeping / Sleeping") were a
# scraped Hugging Face Spaces status header, not part of the script.
import json

import torch
from datasets import Dataset
import evaluate
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Load the labeled examples from disk. Each record is expected to carry a
# "text" field and an integer "label" in {0, 1, 2} -- TODO confirm against
# the dataset file.
with open("ai_training_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Coerce labels to int so the downstream Dataset column has a consistent
# integer type even if the JSON stored them as strings.
for item in data:
    item["label"] = int(item["label"])

# Convert to a HuggingFace Dataset and carve out a 20% evaluation split.
# FIX: pass a fixed seed so the split is reproducible across runs --
# without it, eval metrics and best-model selection are not comparable
# between training sessions.
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Load the tokenizer matching the base checkpoint.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
def tokenize(example):
    """Tokenize a batch of examples into fixed-length RoBERTa inputs.

    Every sequence is truncated/padded to exactly 512 tokens, so batches
    are rectangular without needing a dynamic-padding collator.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
# Tokenize both splits in batched mode (faster than per-example calls).
train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

# Expose only the fields the model consumes, as torch tensors. The "label"
# column is presumably mapped to the model's `labels` input by the Trainer's
# default collator -- confirm with the installed transformers version.
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, eval_dataset):
    split.set_format(type="torch", columns=model_columns)

# Sequence-classification head with 3 output classes on top of roberta-base.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Optional: accuracy metric used during evaluation.
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Return eval accuracy from the (logits, labels) pair the Trainer passes.

    Logits arrive as a numpy array; they are wrapped in a torch tensor so the
    class prediction is the argmax over the label dimension.
    """
    logits, labels = eval_pred
    predictions = torch.tensor(logits).argmax(dim=1)
    return accuracy.compute(predictions=predictions, references=labels)
# Training configuration.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases -- confirm the installed version still accepts the
# old spelling before upgrading.
training_args = TrainingArguments(
    output_dir="./models/roberta-detector",
    evaluation_strategy="epoch",  # must match save_strategy for best-model tracking
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_total_limit=1,  # keep disk usage down: retain a single checkpoint
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",  # disable WandB and other trackers
)
# Wire everything into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded later with from_pretrained().
save_dir = "./models/roberta-detector"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("✅ Model trained and saved.")