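"""Fine-tune a Hugging Face sequence-classification model on the labeled CSV defined in config.py.

The script normalizes the text, builds a stratified train/validation split, trains with the
Trainer API, and saves the model, tokenizer, and an id -> label map to MODEL_OUTPUT_DIR.
"""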
import json
import sys
import unicodedata
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

import numpy as np
import evaluate
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from config import (
    BASE_MODEL,
    DATASET_CSV,
    MODEL_OUTPUT_DIR,
    NUM_TRAIN_EPOCHS,
    PER_DEVICE_TRAIN_BATCH_SIZE,
    PER_DEVICE_EVAL_BATCH_SIZE,
    LEARNING_RATE,
    WEIGHT_DECAY,
    WARMUP_RATIO,
    MAX_LENGTH,
    LOGGING_STEPS,
)

accuracy_metric = evaluate.load("accuracy")
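# normalize() lowercases the text and strips diacritics (NFD decomposition, then dropping
# combining marks) so accented and unaccented spellings map to the same tokens.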
def normalize(text: str) -> str:
    text = text.lower()
    text = unicodedata.normalize("NFD", text)
    text = "".join(c for c in text if unicodedata.category(c) != "Mn")
    return text
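# Validation metric: accuracy of the argmax class over the logits returned at evaluation time.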
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)
def main():
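    # Load the CSV and derive the label set from the "category" column; label2id/id2label are
    # stored in the model config and exported as label_map.json after training.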
    csv_path = Path(__file__).parent / DATASET_CSV
    print(f"Loading dataset from {csv_path}...")
    df = pd.read_csv(csv_path)

    categories = sorted(df["category"].unique().tolist())
    num_labels = len(categories)
    label2id = {name: i for i, name in enumerate(categories)}
    id2label = {i: name for i, name in enumerate(categories)}
    print(f"Categories ({num_labels}): {categories}")

    df["text"] = df["text"].apply(normalize)
    df["labels"] = df["category"].map(label2id)
    train_df, val_df = train_test_split(
        df, test_size=0.15, stratify=df["labels"], random_state=42
    )
    print(f"Train: {len(train_df)} | Val: {len(val_df)}")

    train_dataset = Dataset.from_pandas(train_df[["text", "labels"]].reset_index(drop=True))
    val_dataset = Dataset.from_pandas(val_df[["text", "labels"]].reset_index(drop=True))
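    # Tokenize with truncation only; padding is left to DataCollatorWithPadding, which pads
    # each batch dynamically to its longest sequence.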
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

    train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["text"])
    val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=["text"])
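    # Load the base checkpoint with a freshly initialized classification head sized to the
    # number of categories, with the label mappings baked into the model config.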
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    output_dir = str(Path(__file__).parent / MODEL_OUTPUT_DIR)
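    # Evaluate and checkpoint once per epoch, and reload the checkpoint with the best
    # validation accuracy when training finishes.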
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_RATIO,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_steps=LOGGING_STEPS,
        report_to="none",
    )
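    # The Trainer wires together the model, datasets, dynamic-padding collator, and the
    # accuracy metric for per-epoch evaluation.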
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
    )
| print("Iniciando fine-tuning...") | |
| trainer.train() | |
| print(f"Guardando modelo en {output_dir}...") | |
| trainer.save_model(output_dir) | |
| tokenizer.save_pretrained(output_dir) | |
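    # Export the id -> label mapping next to the model so inference code can decode predicted
    # class indices (note: json.dump writes the integer keys as strings).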
    label_map_path = Path(output_dir) / "label_map.json"
    with open(label_map_path, "w", encoding="utf-8") as f:
        json.dump(id2label, f, ensure_ascii=False, indent=2)

    print(f"Done! Model saved to {output_dir}")
    print(f"Label map: {label_map_path}")
if __name__ == "__main__":
    main()
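# Minimal inference sketch (illustrative only; `model_dir` is a placeholder for the saved
# MODEL_OUTPUT_DIR path, not a name defined in this project). Inputs should pass through the
# same normalize() used at training time so casing and accents match what the model saw:
#
#     from transformers import pipeline
#     clf = pipeline("text-classification", model=model_dir)
#     clf(normalize("Texto de ejemplo"))  # -> [{"label": "<category>", "score": ...}]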