Spaces:

DelaliScratchwerk
/

TextPeriod_Summarization

Sleeping

App Files Files Community

DelaliScratchwerk committed on Nov 10, 2025

Commit

7936f59

verified ·

1 Parent(s): d25d051

Update train_hf_classifier.py

Browse files

Files changed (1) hide show

train_hf_classifier.py +69 -49

train_hf_classifier.py CHANGED Viewed

@@ -1,18 +1,16 @@
-# train_hf_classifier.py
 import json
 from datasets import load_dataset
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
-    Trainer,
     TrainingArguments,
 )
-from huggingface_hub import HfApi
-MODEL_NAME = "distilbert-base-uncased"   # backbone
-REPO_ID = "DelaliScratchwerk/text-period-bert"  # <- choose a new model repo name
 LABELS = [
     "pre-1900",
     "1900–1945",
@@ -23,87 +21,109 @@ LABELS = [
     "2019–2022",
     "2023–present",
 ]
-label2id = {l: i for i, l in enumerate(LABELS)}
-id2label = {i: l for l, i in label2id.items()}
-# 1) Load your jsonl data (same files you used for SetFit)
-ds = load_dataset("json", data_files={"train": "train.jsonl", "val": "val.jsonl"})
-# Check columns: assume {"text": "...", "label": "1946–1990"}
 def encode_label(example):
-    example["labels"] = label2id[example["label"]]
-    return example
 ds = ds.map(encode_label)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-def tokenize(examples):
     return tokenizer(
-        examples["text"],
-        padding="max_length",
         truncation=True,
         max_length=256,
     )
-tokenized = ds.map(tokenize, batched=True)
-# HF Trainer expects these columns
-tokenized = tokenized.remove_columns(["text", "label"])
 tokenized.set_format("torch")
 model = AutoModelForSequenceClassification.from_pretrained(
-    MODEL_NAME,
     num_labels=len(LABELS),
     id2label=id2label,
-    label2id=label2id,
 )
 args = TrainingArguments(
     output_dir="./checkpoints-bert",
-    evaluation_strategy="epoch",
-    save_strategy="epoch",
     learning_rate=2e-5,
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=16,
-    num_train_epochs=3,
     weight_decay=0.01,
-    load_best_model_at_end=True,
-    metric_for_best_model="accuracy",
 )
-from datasets import load_metric
-metric = load_metric("accuracy")
-def compute_metrics(eval_pred):
-    logits, labels = eval_pred
-    preds = logits.argmax(axis=-1)
-    return metric.compute(predictions=preds, references=labels)
 trainer = Trainer(
     model=model,
     args=args,
     train_dataset=tokenized["train"],
     eval_dataset=tokenized["val"],
     compute_metrics=compute_metrics,
 )
 trainer.train()
 print("Eval:", trainer.evaluate())
-# 2) Push model to Hub
-trainer.push_to_hub(REPO_ID)
-# 3) Also upload labels list as labels.json (handy but optional)
-with open("labels.json", "w") as f:
-    json.dump(LABELS, f, ensure_ascii=False, indent=2)
-api = HfApi()
-api.upload_file(
-    path_or_fileobj="labels.json",
     path_in_repo="labels.json",
-    repo_id=REPO_ID,
     repo_type="model",
 )
-print("Pushed model to:", REPO_ID)

 import json
+import numpy as np
 from datasets import load_dataset
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
     TrainingArguments,
+    Trainer,
 )
+import evaluate
+from huggingface_hub import upload_file
+# ---------- LABELS ----------
 LABELS = [
     "pre-1900",
     "1900–1945",
     "2019–2022",
     "2023–present",
 ]
+# Lookup tables between label names and integer class ids; an id is the
+# label's position in LABELS.
+id2label = dict(enumerate(LABELS))
+name2id = {name: idx for idx, name in id2label.items()}
+# ---------- DATA ----------
+# expects train.jsonl / val.jsonl with fields: "text", "label" (label is one of LABELS)
+ds = load_dataset(
+    "json",
+    data_files={"train": "train.jsonl", "val": "val.jsonl"},
+)
+# Validate label names up front, in every split. Reading the "label" column
+# directly avoids materializing each full row (text included) just to look at
+# one field.
+known = set(LABELS)
+for split_name, split in ds.items():
+    unknown = set(split["label"]) - known
+    if unknown:
+        # Without this check an unexpected label only surfaces later as a bare
+        # KeyError raised from inside ds.map().
+        raise ValueError(f"Split {split_name!r} has unknown labels: {unknown}")
+# make sure all label names are present in train
+seen = set(ds["train"]["label"])
+missing = known - seen
+if missing:
+    raise ValueError(f"Train set missing labels: {missing}")
+# map string labels -> ids
 def encode_label(example):
+    """Return the example's label name replaced by its integer class id."""
+    return {"label": name2id[example["label"]]}
 ds = ds.map(encode_label)
+# ---------- TOKENIZATION ----------
+model_ckpt = "distilbert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+def tokenize_batch(batch):
+    """Tokenize a batch of texts, truncating/padding each one to 256 tokens."""
+    texts = batch["text"]
+    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
+    return encoded
+tokenized = ds.map(tokenize_batch, batched=True)
+# Keep only the columns handed to the Trainer; every other column is dropped.
+keep_columns = ("input_ids", "attention_mask", "label")
+drop_columns = [
+    name for name in tokenized["train"].column_names if name not in keep_columns
+]
+tokenized = tokenized.remove_columns(drop_columns)
 tokenized.set_format("torch")
+# ---------- MODEL ----------
+# Sequence-classification model with one output per entry in LABELS; the
+# id <-> name maps are passed along with the label count.
+label_config = {
+    "num_labels": len(LABELS),
+    "id2label": id2label,
+    "label2id": name2id,
+}
+model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, **label_config)
+# ---------- METRICS ----------
+accuracy_metric = evaluate.load("accuracy")
+def compute_metrics(eval_pred):
+    """Compute accuracy from the (logits, labels) pair in ``eval_pred``."""
+    scores, references = eval_pred
+    # Predicted class = index of the largest logit along the last axis.
+    predicted_ids = np.argmax(scores, axis=-1)
+    return accuracy_metric.compute(predictions=predicted_ids, references=references)
+# ---------- TRAINING ARGUMENTS (no evaluation_strategy etc.) ----------
+# Target Hub repo. It must reach the Trainer through hub_model_id: the first
+# positional parameter of Trainer.push_to_hub() is the commit message, not the
+# repo name, so passing the repo id there pushes to a repo named after
+# output_dir instead.
+repo_id = "DelaliScratchwerk/text-period-bert"  # pick the name you want
 args = TrainingArguments(
     output_dir="./checkpoints-bert",
     learning_rate=2e-5,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    num_train_epochs=4,
     weight_decay=0.01,
+    logging_steps=10,
+    save_total_limit=2,
+    # Only names the destination repo; push_to_hub is left at its default, so
+    # the upload happens solely through the explicit call after training.
+    hub_model_id=repo_id,
 )
+# ---------- TRAINER ----------
 trainer = Trainer(
     model=model,
     args=args,
     train_dataset=tokenized["train"],
     eval_dataset=tokenized["val"],
+    # NOTE(review): recent transformers releases deprecate `tokenizer=` in
+    # favor of `processing_class=` -- confirm against the installed version.
+    tokenizer=tokenizer,
     compute_metrics=compute_metrics,
 )
+# ---------- TRAIN + EVAL ----------
 trainer.train()
 print("Eval:", trainer.evaluate())
+# ---------- PUSH TO HUB ----------
+trainer.push_to_hub()  # destination comes from args.hub_model_id
+print("Pushed model to:", repo_id)
+# also push labels.json so your Space / client can load the label names
+# The label names contain en dashes and ensure_ascii=False writes them raw, so
+# pin the file encoding rather than relying on the platform default.
+with open("labels_bert.json", "w", encoding="utf-8") as f:
+    json.dump(LABELS, f, ensure_ascii=False)
+upload_file(
+    path_or_fileobj="labels_bert.json",
     path_in_repo="labels.json",
+    repo_id=repo_id,
     repo_type="model",
 )
+print("Uploaded labels.json")