Spaces:
Runtime error
Runtime error
import json
import os
import re

import gradio as gr
import pytesseract
import requests
import torch
from huggingface_hub import whoami
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# HF login
# Best-effort identity check: look up the account behind the current Hugging
# Face credentials and print its name. Any exception (including a missing
# 'name' key) is caught and printed, so startup continues either way.
try:
    user = whoami()
    print(f"Authenticated as: {user['name']}")
except Exception as e:
    print(f"HFace authentication failed: {e}")
# Human-readable class names. The position in this list is the integer class
# id: it is what the labeling UI stores and what the model config maps back to
# a name.
labels = ["Fully Compliant", "Procedural Defect", "Statutory Defect", "Constitutional Violation"]
NVIDIA_API_KEY = os.environ.get("NVDGX_API_KEY")

# loading model
# Use the local fine-tuned checkpoint when it exists, otherwise the named
# pretrained checkpoint.
path = "finetuned_model/final" if os.path.exists("finetuned_model/final") else "Stern5497/sbert-legal-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(
    path,
    # Derive the head size and the id<->name maps from `labels` so that
    # predictions come back with the same names the UI offers, instead of
    # generic LABEL_<n> strings.
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
)
# A hard-coded device=0 fails on hosts without CUDA; fall back to CPU (-1).
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
# anonymizer
# (regex, replacement) pairs applied in order by anonymize().
ANON_REPLACEMENTS = [
    (r"\b\d{3}-\d{2}-\d{4}\b", "XXX-XX-XXXX"),  # SSN
    (r"\b\d{3}-\d{3}-\d{4}\b", "XXX-XXX-XXXX"),  # phone
    (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "EMAIL"),
    (r"\bDOB: \d{2}/\d{2}/\d{4}\b", "DOB: XX/XX/XXXX"),
    (r"\bChildren?\b", "Child"),
    (r"\b\d{5}(?:-\d{4})?\b", "ZIP"),  # ZIP
]

# A run of two OR MORE capitalised words. Matching the whole run (rather than
# exactly two words) keeps a third name token from surviving, e.g.
# "John Quincy Adams" previously became "Defendant Adams".
_NAME_RUN = re.compile(r"\b[A-Z][a-z]+(?: [A-Z][a-z]+)+\b")


def _party_placeholder(match):
    """Return 'Plaintiff' if the matched run contains that word, else 'Defendant'."""
    return "Plaintiff" if "Plaintiff" in match.group() else "Defendant"


def anonymize(text):
    """Mask identifiers in *text* and return the redacted string.

    Applies every (pattern, replacement) pair in ANON_REPLACEMENTS in order,
    then replaces each run of capitalised words with a party placeholder
    ("Plaintiff" when the run contains that word, "Defendant" otherwise).

    Args:
        text: The raw text to redact. ``None`` or an empty string yields "".

    Returns:
        The redacted text.
    """
    if not text:
        return ""
    for pattern, repl in ANON_REPLACEMENTS:
        text = re.sub(pattern, repl, text)
    return _NAME_RUN.sub(_party_placeholder, text)
# OCR
def textract(file):
    """Run OCR on an uploaded PDF or image and return the anonymized text.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object whose ``.name`` attribute holds the path of the upload.

    Returns:
        The OCR text after it has been passed through ``anonymize``. For a
        PDF, the text of each page is joined with newlines.
    """
    # Accept plain paths as well as upload wrappers exposing `.name`.
    path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
    # Compare case-insensitively so "SCAN.PDF" is not mistaken for an image.
    if path.lower().endswith(".pdf"):
        pages = convert_from_path(path)
        text = "\n".join(pytesseract.image_to_string(page) for page in pages)
    else:
        # Context manager closes the underlying file handle after OCR.
        with Image.open(path) as img:
            text = pytesseract.image_to_string(img)
    return anonymize(text)
# classifying
def classify(text):
    """Classify *text* with the module-level pipeline.

    The text is truncated by the tokenizer to 512 tokens. Slicing the string
    to 512 characters instead would discard most of the usable context and
    would not guarantee the token limit.

    Args:
        text: The document text to classify.

    Returns:
        A ``(label, score)`` tuple taken from the pipeline's first result,
        with the score converted to a Python float.
    """
    result = classifier(text, truncation=True, max_length=512)[0]
    return result["label"], float(result["score"])
# GUI
with gr.Blocks(title="NY Court Order Classifier") as app:
    gr.Markdown("## NY Court Order Classifier")
    gr.Markdown("Upload a court order and classify it. Optionally assign a label.")
    filein = gr.File(label="Upload PDF or Image")
    textracted = gr.Textbox(lines=20, label="Extracted Text Preview")
    predicted = gr.Textbox(label="Predicted Label")
    confidence = gr.Textbox(label="Confidence")
    labelc = gr.Radio(choices=labels, label="Assign Manual Label")
    savebtn = gr.Button("Save Label")
    status = gr.Textbox(label="Status")

    def upload(file):
        """OCR and classify the uploaded file, refreshing every output widget.

        Returns a 5-tuple for (text preview, predicted label, confidence,
        manual-label radio, status). The radio is reset with ``None`` because
        an empty string is not one of its choices.
        """
        if not file:
            return "", "", "", None, ""
        text = textract(file)
        label, score = classify(text)
        # Only the first 3000 characters are shown in the preview box.
        return text[:3000], label, str(round(score, 3)), None, ""

    filein.change(
        fn=upload,
        inputs=filein,
        outputs=[textracted, predicted, confidence, labelc, status],
    )

    def savelabel(choice, file, text):
        """Write the preview text and chosen label index to labeled_dataset/.

        Args:
            choice: The label selected in the radio, or ``None``.
            file: The current upload (path or object with ``.name``), or ``None``.
            text: The content of the preview textbox.

        Returns:
            A status message for the UI. Missing input produces a warning
            message instead of an exception.
        """
        # Guard against clicking "Save" before uploading or choosing a label;
        # labels.index(None) and None.name would otherwise raise.
        if not file:
            return "⚠️ Upload a document before saving a label."
        if choice not in labels:
            return "⚠️ Select a label before saving."
        os.makedirs("labeled_dataset", exist_ok=True)
        labelidx = labels.index(choice)
        path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
        base = os.path.basename(path)
        # NOTE(review): `text` comes from the preview box, which upload() caps
        # at 3000 characters, so the saved sample is the truncated preview.
        with open(f"labeled_dataset/{base}.json", "w", encoding="utf-8") as f:
            json.dump({"text": text, "label": labelidx}, f, indent=2)
        return f"✅ Saved label: {choice}"

    savebtn.click(fn=savelabel, inputs=[labelc, filein, textracted], outputs=status)
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    app.launch()