"""NY Court Order Classifier.

Gradio app that OCRs an uploaded court order (PDF or image), anonymizes the
extracted text, classifies it with a sequence-classification model, and lets
the user store a manually assigned label as a JSON training example.
"""

import json
import os
import re

import gradio as gr
import pytesseract
import requests
import torch
from huggingface_hub import whoami
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# HF login
# Best-effort check only: the app keeps running when no token is configured.
try:
    user = whoami()
    print(f"Authenticated as: {user['name']}")
except Exception as e:
    print(f"HFace authentication failed: {e}")

labels = ["Fully Compliant", "Procedural Defect", "Statutory Defect", "Constitutional Violation"]
NVIDIA_API_KEY = os.environ.get("NVDGX_API_KEY")

# loading model
LOCAL_MODEL_DIR = "finetuned_model/final"
FALLBACK_MODEL = "Stern5497/sbert-legal-xlm-roberta-base"
MAX_TOKENS = 512  # upper bound on tokens fed to the classifier per document

# Prefer the locally fine-tuned checkpoint; otherwise fall back to the hub model.
path = LOCAL_MODEL_DIR if os.path.exists(LOCAL_MODEL_DIR) else FALLBACK_MODEL
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=len(labels))
# Use the first GPU when one is available, otherwise run on CPU (-1) instead of
# failing at start-up on machines without CUDA.
_DEVICE = 0 if torch.cuda.is_available() else -1
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=_DEVICE)

# anonymizer
# (pattern, replacement) pairs, applied in order.
ANON_REPLACEMENTS = [
    (r"\b\d{3}-\d{2}-\d{4}\b", "XXX-XX-XXXX"),  # SSN
    (r"\b\d{3}-\d{3}-\d{4}\b", "XXX-XXX-XXXX"),  # phone
    (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "EMAIL"),
    (r"\bDOB: \d{2}/\d{2}/\d{4}\b", "DOB: XX/XX/XXXX"),
    (r"\bChildren?\b", "Child"),
    (r"\b\d{5}(?:-\d{4})?\b", "ZIP"),  # ZIP
]

# Two consecutive capitalized words, treated as a personal name.
_NAME_PATTERN = re.compile(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b")

# Generic label names ("LABEL_0", "LABEL_1", ...) as emitted by a model whose
# config carries no human-readable label names.
_GENERIC_LABEL = re.compile(r"LABEL_(\d+)")


def _replace_name(match):
    """Return the placeholder for one capitalized word pair."""
    return "Plaintiff" if "Plaintiff" in match.group() else "Defendant"


def anonymize(text: str) -> str:
    """Mask personal identifiers in *text* and return the result.

    The ``ANON_REPLACEMENTS`` patterns are applied first, in order; afterwards
    every pair of capitalized words is replaced with "Plaintiff" when the pair
    contains that word, and with "Defendant" otherwise.
    """
    for pattern, repl in ANON_REPLACEMENTS:
        text = re.sub(pattern, repl, text)
    return _NAME_PATTERN.sub(_replace_name, text)


def _file_path(file) -> str:
    """Return the filesystem path of an upload.

    Accepts either an object exposing the path as ``.name`` or a plain path
    string, so both upload representations work.
    """
    return getattr(file, "name", file)


# OCR
def textract(file) -> str:
    """OCR an uploaded PDF or image and return the anonymized text.

    PDFs are rendered page by page and the page texts are joined with
    newlines; any other file is opened as a single image.
    """
    source = _file_path(file)
    # Case-insensitive so that ".PDF" uploads are not mistaken for images.
    if source.lower().endswith(".pdf"):
        pages = convert_from_path(source)
        text = "\n".join(pytesseract.image_to_string(page) for page in pages)
    else:
        # Context manager closes the underlying file handle after OCR.
        with Image.open(source) as img:
            text = pytesseract.image_to_string(img)
    return anonymize(text)


def _readable_label(raw_label: str) -> str:
    """Map a generic ``LABEL_<n>`` name to the matching entry of ``labels``.

    Any other label (for example one already named in the model config) is
    returned unchanged.
    """
    match = _GENERIC_LABEL.fullmatch(raw_label)
    if match and int(match.group(1)) < len(labels):
        return labels[int(match.group(1))]
    return raw_label


# classifying
def classify(text: str):
    """Classify *text* and return ``(label, score)``.

    Truncation is done by the tokenizer at ``MAX_TOKENS`` tokens rather than
    by slicing characters, so the model sees as much of the document as it can
    accept and never receives an over-long sequence.
    """
    result = classifier(text, truncation=True, max_length=MAX_TOKENS)[0]
    return _readable_label(result["label"]), float(result["score"])


# GUI
with gr.Blocks(title="NY Court Order Classifier") as app:
    gr.Markdown("## NY Court Order Classifier")
    gr.Markdown("Upload a court order and classify it. Optionally assign a label.")

    filein = gr.File(label="Upload PDF or Image")
    textracted = gr.Textbox(lines=20, label="Extracted Text Preview")
    predicted = gr.Textbox(label="Predicted Label")
    confidence = gr.Textbox(label="Confidence")
    labelc = gr.Radio(choices=labels, label="Assign Manual Label")
    savebtn = gr.Button("Save Label")
    status = gr.Textbox(label="Status")

    def upload(file):
        """Handle a new upload: extract text, classify it, reset label and status.

        Returns values for (preview, predicted label, confidence, manual
        label, status). The preview is limited to the first 3000 characters.
        """
        if not file:
            # None clears the radio selection; "" is not one of its choices.
            return "", "", "", None, ""
        text = textract(file)
        label, score = classify(text)
        return text[:3000], label, str(round(score, 3)), None, ""

    filein.change(
        fn=upload,
        inputs=filein,
        outputs=[textracted, predicted, confidence, labelc, status],
    )

    def savelabel(choice, file, text):
        """Write the previewed text and the chosen label index to a JSON file.

        The file is stored as ``labeled_dataset/<uploaded file name>.json``.
        Returns a status message; a missing upload or label is reported to the
        user instead of raising.
        """
        if not file:
            return "⚠️ Upload a file before saving a label."
        if choice not in labels:
            return "⚠️ Select a label before saving."
        os.makedirs("labeled_dataset", exist_ok=True)
        labelidx = labels.index(choice)
        base = os.path.basename(_file_path(file))
        with open(f"labeled_dataset/{base}.json", "w", encoding="utf-8") as f:
            json.dump({"text": text, "label": labelidx}, f, indent=2)
        return f"✅ Saved label: {choice}"

    savebtn.click(fn=savelabel, inputs=[labelc, filein, textracted], outputs=status)

if __name__ == "__main__":
    app.launch()