# Spaces: cmeneses99/sms-classifier-api (Running)
# File size: 3,877 Bytes
# f9ac587

import json
import sys
import unicodedata
from pathlib import Path

# Put this file's own directory first on the import search path
# (presumably so the local `config` module imported below resolves even when
# this file is not the entry script -- confirm against how it is launched).
sys.path.insert(0, str(Path(__file__).parent))

import numpy as np
import evaluate
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from config import (
    BASE_MODEL,
    DATASET_CSV,
    MODEL_OUTPUT_DIR,
    NUM_TRAIN_EPOCHS,
    PER_DEVICE_TRAIN_BATCH_SIZE,
    PER_DEVICE_EVAL_BATCH_SIZE,
    LEARNING_RATE,
    WEIGHT_DECAY,
    WARMUP_RATIO,
    MAX_LENGTH,
    LOGGING_STEPS,
)

# Accuracy metric object, loaded once at import time and reused by
# compute_metrics() on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")


def normalize(text: str) -> str:
    """Return *text* lower-cased with combining marks removed.

    The string is lower-cased, decomposed with Unicode NFD so that accented
    letters split into a base character plus combining marks, and every
    character in category ``Mn`` (nonspacing mark) is then dropped.
    The result is left in decomposed form; it is not recomposed.
    """
    decomposed = unicodedata.normalize("NFD", text.lower())
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A two-item iterable that unpacks into ``(logits, labels)``.
            Presumably the ``EvalPrediction`` the ``Trainer`` passes to its
            ``compute_metrics`` callback -- confirm against the caller.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted class
        indices versus the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(
        predictions=predicted_ids,
        references=references,
    )


def main():
    """Fine-tune ``BASE_MODEL`` as a text classifier and save the result.

    Pipeline:
      1. Read ``DATASET_CSV`` (resolved relative to this file) and discard
         rows whose ``text`` or ``category`` is missing.
      2. Build the label mappings from the sorted unique categories.
      3. Normalize the text and make a stratified 85/15 train/validation split.
      4. Tokenize both splits and fine-tune with ``Trainer``, evaluating and
         checkpointing every epoch and reloading the best-accuracy checkpoint.
      5. Save the model, the tokenizer and ``label_map.json`` (id -> category)
         under ``MODEL_OUTPUT_DIR``.

    Raises:
        KeyError: If the CSV lacks a ``text`` or ``category`` column.
    """
    # Local import: only needed for the Trainer keyword check further down.
    import inspect

    csv_path = Path(__file__).parent / DATASET_CSV
    print(f"Cargando dataset desde {csv_path}...")
    df = pd.read_csv(csv_path)

    # Rows without text or category are unusable: normalize() would call
    # .lower() on a float NaN, and sorting categories that mix str and NaN
    # raises TypeError. Drop them up front and say how many were removed.
    rows_before = len(df)
    df = df.dropna(subset=["text", "category"]).reset_index(drop=True)
    dropped = rows_before - len(df)
    if dropped:
        print(f"Aviso: se descartaron {dropped} filas sin texto o categoria")

    # Sorting keeps the label ids stable across runs on the same category set.
    categories = sorted(df["category"].unique().tolist())
    num_labels = len(categories)
    label2id = {name: i for i, name in enumerate(categories)}
    id2label = {i: name for i, name in enumerate(categories)}
    print(f"Categorias ({num_labels}): {categories}")

    # astype(str) guards against non-string cells (e.g. purely numeric
    # messages parsed as numbers); it is a no-op for values already str.
    df["text"] = df["text"].astype(str).apply(normalize)
    df["labels"] = df["category"].map(label2id)

    # Stratify so each category keeps its proportion in both splits.
    train_df, val_df = train_test_split(
        df, test_size=0.15, stratify=df["labels"], random_state=42
    )
    print(f"Train: {len(train_df)} | Val: {len(val_df)}")

    train_dataset = Dataset.from_pandas(train_df[["text", "labels"]].reset_index(drop=True))
    val_dataset = Dataset.from_pandas(val_df[["text", "labels"]].reset_index(drop=True))

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    def tokenize(batch):
        # No padding here: DataCollatorWithPadding pads each batch later.
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

    train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["text"])
    val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=["text"])

    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    output_dir = str(Path(__file__).parent / MODEL_OUTPUT_DIR)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_steps=LOGGING_STEPS,
        report_to="none",
    )

    # Trainer's `tokenizer=` keyword is deprecated in favour of
    # `processing_class=` and slated for removal in transformers 5.0.
    # Pick whichever name the installed release accepts so the script runs
    # on both sides of that change.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = (
        "processing_class" if "processing_class" in trainer_params else "tokenizer"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    print("Iniciando fine-tuning...")
    trainer.train()

    print(f"Guardando modelo en {output_dir}...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # JSON object keys are always strings, so the integer ids are written as
    # "0", "1", ...; consumers must convert them back when loading.
    label_map_path = Path(output_dir) / "label_map.json"
    with open(label_map_path, "w", encoding="utf-8") as f:
        json.dump(id2label, f, ensure_ascii=False, indent=2)

    print(f"Listo! Modelo guardado en {output_dir}")
    print(f"Mapa de etiquetas: {label_map_path}")


# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()