# NYCourtOrderClass / train100.py
# Uploaded by szlevi via huggingface_hub (commit 813dc3a, verified)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import json
import os
def load_dataset_from_dir(folder_path):
    """Load ``*.json`` files from *folder_path* into a ``datasets.Dataset``.

    Each JSON file is expected to hold an object with a ``"text"`` string and
    an integer ``"label"`` in the range 0-3. Files with a missing/invalid
    label or a missing text field are skipped with a console warning instead
    of aborting the whole load.

    Returns:
        datasets.Dataset: records of the form ``{"text": ..., "label": ...}``.
    """
    records = []
    # sorted(): deterministic record order across platforms/filesystems
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(".json"):
            continue
        # explicit encoding: don't depend on the platform's default codec
        with open(os.path.join(folder_path, fname), "r", encoding="utf-8") as f:
            item = json.load(f)
        label = item.get("label", -1)
        # bool is a subclass of int (True would pass "0 <= label < 4"),
        # so reject it explicitly; valid labels are the ints 0-3.
        if not (isinstance(label, int) and not isinstance(label, bool) and 0 <= label < 4):
            print(f"Skipping {fname}: invalid label '{label}'")
            continue
        if "text" not in item:
            # previously a missing "text" key raised KeyError mid-load
            print(f"Skipping {fname}: missing 'text' field")
            continue
        records.append({"text": item["text"], "label": label})
    return Dataset.from_list(records)
def tokenize(examples, tokenizer):
    """Tokenize one example's "text" field, padded/truncated to 512 tokens."""
    text = examples["text"]
    return tokenizer(text, padding="max_length", truncation=True, max_length=512)
def train():
    """Fine-tune a legal XLM-R classifier on the 4-class court-order dataset.

    Loads JSON examples from ``clean_dataset``, tokenizes them, and runs a
    short Hugging Face ``Trainer`` loop, saving the result under
    ``finetuned_model/final``. A training failure is reported but not
    re-raised, so the (untrained) model can still be used for demo purposes.
    """
    # Human-readable names for the four defect classes.
    id2label = {
        0: "Fully Compliant",
        1: "Procedurally Defective",
        2: "Defective under State Law",
        3: "Constitutionally Defective",
    }
    label2id = {name: idx for idx, name in id2label.items()}

    checkpoint = "Stern5497/sbert-legal-xlm-roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=4,
        id2label=id2label,
        label2id=label2id,
        torch_dtype=torch.float32,  # keep full precision; no FP16 for stability
    )

    # Load and sanity-check the dataset before tokenizing.
    dataset = load_dataset_from_dir("clean_dataset")
    print(f"Loaded {len(dataset)} samples. Labels: {set([x['label'] for x in dataset])}")

    # batched=False is acceptable for a dataset this small.
    tokenized_dataset = dataset.map(
        lambda example: tokenize(example, tokenizer),
        batched=False,
    )

    # Conservative hyper-parameters for a tiny corpus; no checkpointing/eval.
    training_args = TrainingArguments(
        output_dir="finetuned_model",
        per_device_train_batch_size=2,
        num_train_epochs=3,
        logging_steps=1,
        save_strategy="no",
        fp16=False,  # disabled for stability
        gradient_accumulation_steps=1,
        report_to="none",
    )

    trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

    try:
        print("Starting training...")
        trainer.train()
        trainer.save_model("finetuned_model/final")
        tokenizer.save_pretrained("finetuned_model/final")
        print("🎉 Training completed successfully!")
    except Exception as e:  # deliberate best-effort: report and continue for the demo
        print(f"Training failed: {e}")
        print("Proceeding with untrained model for demo purposes")
if __name__ == "__main__":
    # Force synchronous CUDA kernel launches so any GPU-side error surfaces
    # at the offending call instead of a later, unrelated one (slower, but
    # far easier to debug).
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    train()