Spaces:

Divyonko
/

LivePulse

Running

LivePulse / ml /train_muril.py

DivYonko

first commit

11a0fc5 about 1 month ago

3.45 kB

	import inspect

	from datasets import load_dataset, Dataset
	from transformers import (
	    AutoTokenizer, AutoModelForSequenceClassification,
	    TrainingArguments, Trainer
	)
	import torch
	import numpy as np
	from sklearn.metrics import accuracy_score, f1_score
	import pandas as pd

	# Base checkpoint to fine-tune and the directory the result is written to.
	MODEL_BASE = "google/muril-base-cased"
	OUTPUT_DIR = "./muril-sentimix"

	# Lower-case sentiment name -> integer class id (used to encode the dataset).
	LABEL2ID = {"negative": 0, "neutral": 1, "positive": 2}
	# Integer class id -> capitalised display name, derived from LABEL2ID so the
	# two tables always agree on ids and ordering.
	ID2LABEL = {class_id: name.capitalize() for name, class_id in LABEL2ID.items()}
	# Number of output classes for the classification head.
	NUM_LABELS = len(LABEL2ID)

	# ── Load dataset ──────────────────────────────────────────────
	# Only the "train" split of the hub dataset is read; the evaluation set is
	# carved out of it further down.
	raw = load_dataset("AmaanP314/youtube-comment-sentiment")["train"]

	# Keep the comment text and its sentiment, drop rows missing either one,
	# and expose the text under the column name "text".
	df = (
	    raw.to_pandas()[["CommentText", "Sentiment"]]
	    .dropna()
	    .rename(columns={"CommentText": "text"})
	)

	# Encode sentiment names case-insensitively. Names that are not keys of
	# LABEL2ID map to NaN, so those rows are removed before the integer cast.
	df["labels"] = df["Sentiment"].str.lower().map(LABEL2ID)
	df = df.dropna(subset=["labels"])
	df["labels"] = df["labels"].astype(int)

	# Cap the data at 100k rows to keep training fast (remove the cap if you
	# have GPU + time). Sampling with a fixed seed also shuffles the rows.
	sample_size = min(100_000, len(df))
	df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

	# 90/10 train/test split with a fixed seed for reproducibility.
	dataset = Dataset.from_pandas(df[["text", "labels"]])
	split = dataset.train_test_split(test_size=0.1, seed=42)

	# ── Tokenizer ─────────────────────────────────────────────────
	tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)


	def tokenize(batch):
	    """Tokenize the "text" entries of a batch.

	    Every sequence is truncated and padded to exactly 128 tokens, so all
	    examples come out with the same length.
	    """
	    texts = batch["text"]
	    encoded = tokenizer(
	        texts,
	        max_length=128,
	        truncation=True,
	        padding="max_length",
	    )
	    return encoded


	# Tokenize both splits in batches, then expose only the model inputs and
	# the labels as torch tensors.
	split = split.map(tokenize, batched=True)
	split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

	# ── Model ─────────────────────────────────────────────────────
	# Load the base checkpoint with a NUM_LABELS-way classification head.
	# The label2id mapping passed to the model is the inverse of ID2LABEL, so
	# it uses the same capitalised names as id2label.
	model = AutoModelForSequenceClassification.from_pretrained(
	    MODEL_BASE,
	    num_labels=NUM_LABELS,
	    id2label=ID2LABEL,
	    label2id={name: class_id for class_id, name in ID2LABEL.items()},
	)

	# ── Metrics ───────────────────────────────────────────────────
	def compute_metrics(eval_pred):
	    """Return accuracy and weighted F1 for one evaluation pass.

	    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
	    the argmax over the last axis of ``logits`` (presumably one score per
	    class; confirm against the Trainer output).
	    """
	    logits, labels = eval_pred
	    predicted = np.argmax(logits, axis=-1)
	    accuracy = accuracy_score(labels, predicted)
	    weighted_f1 = f1_score(labels, predicted, average="weighted")
	    return {"accuracy": accuracy, "f1": weighted_f1}

	# ── Training args ─────────────────────────────────────────────
	# Newer transformers releases renamed the `evaluation_strategy` keyword to
	# `eval_strategy`. Use whichever one the installed TrainingArguments accepts
	# so the script does not fail with an unexpected-keyword TypeError.
	_eval_strategy_kwarg = (
	    "eval_strategy"
	    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
	    else "evaluation_strategy"
	)

	args = TrainingArguments(
	    output_dir=OUTPUT_DIR,
	    num_train_epochs=4,
	    per_device_train_batch_size=16,
	    per_device_eval_batch_size=32,
	    # Checkpoint once per epoch, the same schedule as evaluation (set via
	    # the version-dependent keyword at the end of this call).
	    save_strategy="epoch",
	    # Reload the checkpoint with the best "f1" (the key returned by
	    # compute_metrics) once training finishes.
	    load_best_model_at_end=True,
	    metric_for_best_model="f1",
	    learning_rate=2e-5,
	    warmup_ratio=0.1,
	    weight_decay=0.01,
	    # Mixed precision only when a CUDA device is available.
	    fp16=torch.cuda.is_available(),
	    logging_steps=50,
	    report_to="none",
	    **{_eval_strategy_kwarg: "epoch"},
	)

	# ── Train ─────────────────────────────────────────────────────
	# Train on the 90% portion and evaluate on the held-out 10%.
	train_dataset = split["train"]
	eval_dataset = split["test"]

	trainer = Trainer(
	    model=model,
	    args=args,
	    train_dataset=train_dataset,
	    eval_dataset=eval_dataset,
	    compute_metrics=compute_metrics,
	)
	trainer.train()

	# Save the model and the tokenizer into the same directory.
	trainer.save_model(OUTPUT_DIR)
	tokenizer.save_pretrained(OUTPUT_DIR)
	print(f"Model saved to {OUTPUT_DIR}")