# cmeneses99's picture
# Initial deploy: SMS Classifier API
# f9ac587
import json
import sys
import unicodedata
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
import numpy as np
import evaluate
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
DataCollatorWithPadding,
)
from config import (
BASE_MODEL,
DATASET_CSV,
MODEL_OUTPUT_DIR,
NUM_TRAIN_EPOCHS,
PER_DEVICE_TRAIN_BATCH_SIZE,
PER_DEVICE_EVAL_BATCH_SIZE,
LEARNING_RATE,
WEIGHT_DECAY,
WARMUP_RATIO,
MAX_LENGTH,
LOGGING_STEPS,
)
# Accuracy metric object from the `evaluate` library. It is loaded once when
# the module is imported and reused by compute_metrics() on every evaluation.
accuracy_metric = evaluate.load("accuracy")
def normalize(text: str) -> str:
    """Return *text* lowercased with combining accent marks removed.

    The input is lowercased and decomposed with Unicode NFD, so an accented
    letter becomes its base letter followed by combining marks. Every
    character whose Unicode category is ``Mn`` (nonspacing mark) is then
    dropped; all other characters are kept in order.
    """
    decomposed = unicodedata.normalize("NFD", text.lower())
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept_chars)
def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    ``eval_pred`` is unpacked into a ``(logits, labels)`` pair. The predicted
    class for each example is the index of the largest logit along the last
    axis. Returns the result of ``accuracy_metric.compute`` for those
    predictions against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return accuracy_metric.compute(
        predictions=predicted_ids,
        references=gold_labels,
    )
def main():
    """Fine-tune ``BASE_MODEL`` as a sequence classifier and save the result.

    Pipeline:
      1. Read the CSV at ``DATASET_CSV`` (resolved relative to this file). It
         must contain ``text`` and ``category`` columns. Rows missing either
         value are discarded, and the number discarded is printed.
      2. Build the label mapping from the sorted unique categories, normalize
         the text, and make a stratified 85/15 train/validation split.
      3. Tokenize both splits, truncating to ``MAX_LENGTH`` tokens.
      4. Train with the Hugging Face ``Trainer``, evaluating and saving a
         checkpoint every epoch, and keeping the checkpoint with the best
         validation accuracy.
      5. Save the model, the tokenizer and a ``label_map.json`` file into
         ``MODEL_OUTPUT_DIR`` (resolved relative to this file).

    Raises:
        KeyError: if the CSV lacks a ``text`` or ``category`` column.
    """
    csv_path = Path(__file__).parent / DATASET_CSV
    print(f"Cargando dataset desde {csv_path}...")
    df = pd.read_csv(csv_path)

    # Incomplete rows cannot be used: a NaN text breaks normalize() and a NaN
    # category breaks sorting of the label names. Drop them up front and
    # report how many were removed. `.copy()` gives an independent frame so
    # the column assignments below do not write into a view.
    total_rows = len(df)
    df = df.dropna(subset=["text", "category"]).copy()
    dropped_rows = total_rows - len(df)
    if dropped_rows:
        print(f"Filas descartadas por texto o categoria vacios: {dropped_rows}")

    # Sorting makes the label ids deterministic for a given set of categories.
    categories = sorted(df["category"].unique().tolist())
    num_labels = len(categories)
    label2id = {name: i for i, name in enumerate(categories)}
    id2label = {i: name for i, name in enumerate(categories)}
    print(f"Categorias ({num_labels}): {categories}")

    # Cast to str first so a non-string cell (e.g. an all-digit message parsed
    # as a number) is normalized instead of raising inside normalize().
    df["text"] = df["text"].astype(str).apply(normalize)
    df["labels"] = df["category"].map(label2id)

    # Stratify on the label so both splits keep the class proportions.
    train_df, val_df = train_test_split(
        df, test_size=0.15, stratify=df["labels"], random_state=42
    )
    print(f"Train: {len(train_df)} | Val: {len(val_df)}")

    train_dataset = Dataset.from_pandas(
        train_df[["text", "labels"]].reset_index(drop=True)
    )
    val_dataset = Dataset.from_pandas(
        val_df[["text", "labels"]].reset_index(drop=True)
    )

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    def tokenize(batch):
        """Tokenize a batch of texts, truncating to MAX_LENGTH tokens."""
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

    # The raw text column is removed once it has been tokenized.
    train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["text"])
    val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=["text"])

    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    output_dir = str(Path(__file__).parent / MODEL_OUTPUT_DIR)
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        # Evaluation and checkpointing use the same schedule so the best
        # checkpoint (by accuracy) can be reloaded when training ends.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_steps=LOGGING_STEPS,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        # No padding is requested at tokenization time; the collator pads
        # each batch when it is assembled.
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
    )

    print("Iniciando fine-tuning...")
    trainer.train()

    print(f"Guardando modelo en {output_dir}...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Write the id -> category mapping next to the model. JSON object keys
    # are always strings, so the integer ids are written as "0", "1", ...
    label_map_path = Path(output_dir) / "label_map.json"
    with open(label_map_path, "w", encoding="utf-8") as f:
        json.dump(id2label, f, ensure_ascii=False, indent=2)

    print(f"Listo! Modelo guardado en {output_dir}")
    print(f"Mapa de etiquetas: {label_map_path}")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()