Spaces:

DelaliScratchwerk
/

TextPeriod_Summarization

Sleeping

App Files Files Community

DelaliScratchwerk committed on Nov 10, 2025

Commit

d25d051

verified ·

1 Parent(s): 8ac19c7

Create train_hf_classifier.py

Browse files

Files changed (1) hide show

train_hf_classifier.py +109 -0

train_hf_classifier.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# train_hf_classifier.py
+import json
+from datasets import load_dataset
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    Trainer,
+    TrainingArguments,
+)
+from huggingface_hub import HfApi
+# Pretrained checkpoint used as the backbone.
+MODEL_NAME = "distilbert-base-uncased"
+# Hub model repo that receives the fine-tuned classifier (choose a new name).
+REPO_ID = "DelaliScratchwerk/text-period-bert"
+# Time-period classes; a label's position in this list is its integer class id.
+LABELS = [
+    "pre-1900",
+    "1900–1945",
+    "1946–1990",
+    "1991–2008",
+    "2009–2015",
+    "2016–2018",
+    "2019–2022",
+    "2023–present",
+]
+# Two-way lookups between label strings and class ids (also stored in the model config).
+id2label = dict(enumerate(LABELS))
+label2id = {name: idx for idx, name in id2label.items()}
+# 1) Load the JSON Lines splits (the same files that were used for SetFit).
+#    Each record is assumed to look like {"text": "...", "label": "1946–1990"};
+#    check the columns before training.
+ds = load_dataset(
+    "json",
+    data_files=dict(train="train.jsonl", val="val.jsonl"),
+)
+def encode_label(example):
+    """Attach the integer class id for a record's period label.
+
+    Looks up the string stored under ``example["label"]`` in ``label2id`` and
+    writes the resulting id to ``example["labels"]``. A label that is not a
+    key of ``label2id`` raises ``KeyError``. The same record is returned after
+    being updated in place.
+    """
+    period_name = example["label"]
+    example["labels"] = label2id[period_name]
+    return example
+# Tokenizer that matches the backbone checkpoint.
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# Add the integer "labels" column to every split.
+ds = ds.map(function=encode_label)
+def tokenize(examples):
+    """Tokenize a batch of records with the module-level ``tokenizer``.
+
+    ``examples["text"]`` is passed to the tokenizer with truncation enabled
+    and padding to a fixed ``max_length`` of 256, so every encoded sequence
+    has the same length. Returns whatever the tokenizer call returns.
+    """
+    encoding_options = {
+        "padding": "max_length",
+        "truncation": True,
+        "max_length": 256,
+    }
+    return tokenizer(examples["text"], **encoding_options)
+# Tokenize every split in batches, then drop the raw string columns so only the
+# tokenizer outputs and the integer "labels" column remain for the Trainer.
+tokenized = ds.map(tokenize, batched=True).remove_columns(["text", "label"])
+tokenized.set_format("torch")
+# Classification head sized to the label set; the label maps are passed in so
+# they are stored with the model.
+label_config = {
+    "num_labels": len(LABELS),
+    "id2label": id2label,
+    "label2id": label2id,
+}
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)
+import dataclasses
+# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
+# and later dropped the old keyword, while older releases only know the old
+# one. Use whichever name the installed TrainingArguments actually declares so
+# the script runs on both.
+_training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
+_eval_strategy_key = (
+    "eval_strategy"
+    if "eval_strategy" in _training_arg_names
+    else "evaluation_strategy"
+)
+args = TrainingArguments(
+    output_dir="./checkpoints-bert",
+    # Evaluate and checkpoint on the same per-epoch schedule; this is what lets
+    # load_best_model_at_end pick among the saved checkpoints.
+    save_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    # Key returned by compute_metrics; used to select the best checkpoint.
+    metric_for_best_model="accuracy",
+    **{_eval_strategy_key: "epoch"},
+)
+# `datasets.load_metric` no longer exists in current `datasets` releases, so
+# accuracy is computed directly from the predictions instead of through a
+# loaded metric object.
+def compute_metrics(eval_pred):
+    """Return classification accuracy for one evaluation pass.
+
+    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
+    the arg-max of ``logits`` over its last axis (presumably shaped
+    ``(n_examples, n_classes)`` - confirm against the Trainer), and accuracy
+    is the fraction of predictions equal to ``labels``.
+
+    Returns:
+        ``{"accuracy": <float>}`` - the key matches ``metric_for_best_model``.
+    """
+    logits, labels = eval_pred
+    preds = logits.argmax(axis=-1)
+    return {"accuracy": float((preds == labels).mean())}
+# Wire the model, training arguments, both splits and the metric callback
+# into a Trainer, then fine-tune and report the validation metrics.
+trainer = Trainer(
+    args=args,
+    model=model,
+    compute_metrics=compute_metrics,
+    eval_dataset=tokenized["val"],
+    train_dataset=tokenized["train"],
+)
+trainer.train()
+eval_results = trainer.evaluate()
+print("Eval:", eval_results)
+# 2) Push model to Hub
+# Trainer.push_to_hub() treats its first positional argument as the commit
+# message and uploads to the repo configured on TrainingArguments, so passing
+# REPO_ID there never targeted REPO_ID. Push the trained model straight to
+# REPO_ID instead, together with the tokenizer: it was never handed to the
+# Trainer, and the repo needs it to be loadable on its own.
+trainer.model.push_to_hub(REPO_ID)
+tokenizer.push_to_hub(REPO_ID)
+# 3) Also upload labels list as labels.json (handy but optional)
+# The labels contain en dashes and ensure_ascii=False keeps them as raw
+# characters, so write the file as UTF-8 explicitly rather than relying on the
+# platform's default encoding.
+with open("labels.json", "w", encoding="utf-8") as f:
+    json.dump(LABELS, f, ensure_ascii=False, indent=2)
+api = HfApi()
+api.upload_file(
+    path_or_fileobj="labels.json",
+    path_in_repo="labels.json",
+    repo_id=REPO_ID,
+    repo_type="model",
+)
+print("Pushed model to:", REPO_ID)