Spaces:

DelaliScratchwerk
/

TextPeriod_Summarization

Sleeping

App Files Community

DelaliScratchwerk committed on Nov 15, 2025

Commit

467cb44

verified ·

1 Parent(s): 7936f59

Delete train_hf_classifier.py

Browse files

Files changed (1) hide show

train_hf_classifier.py +0 -129

train_hf_classifier.py DELETED Viewed

@@ -1,129 +0,0 @@
-import json
-import numpy as np
-from datasets import load_dataset
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-    TrainingArguments,
-    Trainer,
-)
-import evaluate
-from huggingface_hub import upload_file
-# ---------- LABELS ----------
-# Time-period classes used by the classifier; a label's position in this
-# list is its integer class id.
-LABELS = [
-    "pre-1900",
-    "1900–1945",
-    "1946–1990",
-    "1991–2008",
-    "2009–2015",
-    "2016–2018",
-    "2019–2022",
-    "2023–present",
-]
-# Two-way lookup between label names and class ids.
-name2id = dict(zip(LABELS, range(len(LABELS))))
-id2label = dict(enumerate(LABELS))
-# ---------- DATA ----------
-# expects train.jsonl / val.jsonl with fields: "text", "label" (label is one of LABELS)
-ds = load_dataset(
-    "json",
-    data_files={"train": "train.jsonl", "val": "val.jsonl"},
-)
-# Validate labels up front. An unknown name (for example "1900-1945" typed
-# with a hyphen instead of the en dash used in LABELS) would otherwise only
-# surface later as a bare KeyError from inside ds.map(encode_label).
-known = set(LABELS)
-labels_by_split = {
-    split: {row["label"] for row in rows} for split, rows in ds.items()
-}
-for split, found in labels_by_split.items():
-    unknown = found - known
-    if unknown:
-        raise ValueError(f"Unknown labels in {split} split: {unknown}")
-# make sure all label names are present in train
-seen = labels_by_split["train"]
-missing = known - seen
-if missing:
-    raise ValueError(f"Train set missing labels: {missing}")
-# map string labels -> ids
-def encode_label(example):
-    """Return a {"label": id} dict for the example's label name.
-
-    Raises KeyError when the label name is not a key of name2id.
-    """
-    label_id = name2id[example["label"]]
-    return {"label": label_id}
-# replace the string labels with their integer ids
-ds = ds.map(encode_label)
-# ---------- TOKENIZATION ----------
-model_ckpt = "distilbert-base-uncased"
-tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
-def tokenize_batch(batch):
-    """Tokenize batch["text"] with truncation and max_length padding at 256."""
-    texts = batch["text"]
-    return tokenizer(texts, truncation=True, padding="max_length", max_length=256)
-# batched=True: tokenize_batch is called with batches of rows
-tokenized = ds.map(tokenize_batch, batched=True)
-# set format for Trainer: keep only the model inputs and the label column,
-# then switch the dataset format to "torch"
-keep_columns = ("input_ids", "attention_mask", "label")
-drop_columns = [
-    name for name in tokenized["train"].column_names if name not in keep_columns
-]
-tokenized = tokenized.remove_columns(drop_columns)
-tokenized.set_format("torch")
-# ---------- MODEL ----------
-# Sequence-classification model loaded from model_ckpt with one class per
-# entry in LABELS; both label mappings are passed in alongside.
-label_maps = {"id2label": id2label, "label2id": name2id}
-model = AutoModelForSequenceClassification.from_pretrained(
-    model_ckpt, num_labels=len(LABELS), **label_maps
-)
-# ---------- METRICS ----------
-accuracy_metric = evaluate.load("accuracy")
-def compute_metrics(eval_pred):
-    """Score a (logits, labels) pair with accuracy_metric.
-
-    The predicted class for each row is the index of its largest logit.
-    """
-    logits, labels = eval_pred
-    predicted_ids = np.argmax(logits, axis=-1)
-    return accuracy_metric.compute(references=labels, predictions=predicted_ids)
-# ---------- TRAINING ARGUMENTS (no evaluation_strategy etc.) ----------
-# Hub repository that receives both the trained model and labels.json.
-repo_id = "DelaliScratchwerk/text-period-bert"  # pick the name you want
-args = TrainingArguments(
-    output_dir="./checkpoints-bert",
-    learning_rate=2e-5,
-    per_device_train_batch_size=8,
-    per_device_eval_batch_size=8,
-    num_train_epochs=4,
-    weight_decay=0.01,
-    logging_steps=10,
-    save_total_limit=2,
-    # Trainer.push_to_hub() uploads to hub_model_id; when it is left unset the
-    # repository name is derived from output_dir ("checkpoints-bert") instead.
-    hub_model_id=repo_id,
-)
-# ---------- TRAINER ----------
-trainer = Trainer(
-    model=model,
-    args=args,
-    train_dataset=tokenized["train"],
-    eval_dataset=tokenized["val"],
-    tokenizer=tokenizer,
-    compute_metrics=compute_metrics,
-)
-# ---------- TRAIN + EVAL ----------
-trainer.train()
-print("Eval:", trainer.evaluate())
-# ---------- PUSH TO HUB ----------
-# push_to_hub's first positional parameter is the commit message, not the
-# repository, so the target repo is selected through hub_model_id above and
-# no positional argument is passed here.
-trainer.push_to_hub()
-print("Pushed model to:", repo_id)
-# also push labels.json so your Space / client can load the label names
-# The labels contain en dashes and are written with ensure_ascii=False, so the
-# file encoding is pinned to UTF-8 rather than the platform default.
-with open("labels_bert.json", "w", encoding="utf-8") as f:
-    json.dump(LABELS, f, ensure_ascii=False)
-upload_file(
-    path_or_fileobj="labels_bert.json",
-    path_in_repo="labels.json",
-    repo_id=repo_id,
-    repo_type="model",
-)
-print("Uploaded labels.json")