# NOTE(review): the original file began with "Spaces:" / "Runtime error" —
# apparently log lines from a hosting UI pasted in by accident, not Python.
# Converted to this comment so the file parses; confirm nothing else was lost.
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments | |
| from datasets import Dataset | |
| import torch | |
| import json | |
| import os | |
def load_dataset_from_dir(folder_path):
    """Load per-file JSON records from *folder_path* into a ``datasets.Dataset``.

    Each ``*.json`` file is expected to hold one object with a ``"text"``
    string and an integer ``"label"`` in the range 0-3. Files with a
    missing/invalid label or a missing ``"text"`` field are skipped with a
    console message instead of aborting the whole load.

    Args:
        folder_path: Directory containing the ``*.json`` sample files.

    Returns:
        ``Dataset`` with ``text`` and ``label`` columns.
    """
    records = []
    # sorted() makes dataset order deterministic across filesystems/runs.
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(".json"):
            continue
        with open(os.path.join(folder_path, fname), "r", encoding="utf-8") as f:
            item = json.load(f)
        label = item.get("label", -1)
        # Exclude bools explicitly: isinstance(True, int) is True in Python,
        # so a JSON `true` label would otherwise be accepted as label 1.
        if isinstance(label, int) and not isinstance(label, bool) and 0 <= label < 4:
            if "text" in item:
                records.append({"text": item["text"], "label": label})
            else:
                print(f"Skipping {fname}: missing 'text' field")
        else:
            print(f"Skipping {fname}: invalid label '{label}'")
    return Dataset.from_list(records)
def tokenize(examples, tokenizer):
    """Tokenize the ``text`` field of *examples* to a fixed 512-token length.

    Args:
        examples: Mapping with a ``"text"`` entry (one sample, batched=False).
        tokenizer: Hugging Face tokenizer callable.

    Returns:
        The tokenizer's encoding dict (input_ids, attention_mask, ...).
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
def train():
    """Fine-tune a legal XLM-R checkpoint as a 4-way compliance classifier.

    Loads JSON samples from ``clean_dataset/``, tokenizes them, trains for
    three epochs with conservative settings (FP32, batch size 2, no
    checkpointing), and saves model + tokenizer to ``finetuned_model/final``.
    Any exception during training is caught and reported so the surrounding
    demo can continue with the untrained model.
    """
    checkpoint = "Stern5497/sbert-legal-xlm-roberta-base"

    # Human-readable names for the four defect classes.
    id2label = {
        0: "Fully Compliant",
        1: "Procedurally Defective",
        2: "Defective under State Law",
        3: "Constitutionally Defective",
    }
    label2id = {name: idx for idx, name in id2label.items()}

    # Initialize tokenizer and classification head with the label mapping.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=4,
        id2label=id2label,
        label2id=label2id,
        torch_dtype=torch.float32,  # no FP16 for stability
    )

    # Load the dataset and print a quick sanity check of size and labels.
    dataset = load_dataset_from_dir("clean_dataset")
    print(f"Loaded {len(dataset)} samples. Labels: {set([x['label'] for x in dataset])}")

    # batched=False keeps the mapping simple for a small dataset.
    tokenized_dataset = dataset.map(lambda x: tokenize(x, tokenizer), batched=False)

    training_args = TrainingArguments(
        output_dir="finetuned_model",
        per_device_train_batch_size=2,
        num_train_epochs=3,
        logging_steps=1,
        save_strategy="no",
        fp16=False,  # disabled for stability
        gradient_accumulation_steps=1,
        report_to="none",
    )

    # No eval dataset: training only, evaluation is disabled.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    try:
        print("Starting training...")
        trainer.train()
        trainer.save_model("finetuned_model/final")
        tokenizer.save_pretrained("finetuned_model/final")
        print("🎉 Training completed successfully!")
    except Exception as e:
        # Deliberately broad: the demo should proceed even if training
        # fails (OOM, missing GPU, bad sample, ...).
        print(f"Training failed: {e}")
        print("Proceeding with untrained model for demo purposes")
| if __name__ == "__main__": | |
| os.environ["CUDA_LAUNCH_BLOCKING"] = "1" | |
| train() |