File size: 3,354 Bytes
813dc3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import os
import re

import gradio as gr
import pytesseract
import requests
import torch
from huggingface_hub import whoami
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


# HF login
# Best-effort check only: a failure is reported on stdout and start-up
# continues, so the app can still run without Hugging Face credentials.
try:
    user = whoami()
    print(f"Authenticated as: {user['name']}")
except Exception as e:
    print(f"HFace authentication failed: {e}")

# Class names. A label's position in this list is its integer class id
# (savelabel stores labels.index(choice) in the dataset files).
labels = ["Fully Compliant", "Procedural Defect", "Statutory Defect", "Constitutional Violation"]
NVIDIA_API_KEY = os.environ.get("NVDGX_API_KEY")

# loading model
# Prefer a locally fine-tuned checkpoint; otherwise fall back to the hub model.
path = "finetuned_model/final" if os.path.exists("finetuned_model/final") else "Stern5497/sbert-legal-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(
    path,
    # Derive the head size and the id <-> name mapping from `labels` so the
    # pipeline reports the class names above instead of generic "LABEL_n".
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
)
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    # A hard-coded device=0 fails on hosts without CUDA; -1 selects the CPU.
    device=0 if torch.cuda.is_available() else -1,
)

# anonymizer
# (regex, replacement) pairs; anonymize() applies them in list order.
ANON_REPLACEMENTS = [
    (r"\b\d{3}-\d{2}-\d{4}\b", "XXX-XX-XXXX"),  # SSN
    (r"\b\d{3}-\d{3}-\d{4}\b", "XXX-XXX-XXXX"),  # phone
    (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "EMAIL"),
    (r"\bDOB: \d{2}/\d{2}/\d{4}\b", "DOB: XX/XX/XXXX"),
    (r"\bChildren?\b", "Child"),
    (r"\b\d{5}(?:-\d{4})?\b", "ZIP"),  # ZIP
]


def anonymize(text):
    """Return ``text`` with identifying details masked.

    Every pattern in ``ANON_REPLACEMENTS`` is substituted first, in order.
    Afterwards each pair of adjacent capitalised words ("Xxxx Yyyy") is
    collapsed to a party role: "Plaintiff" when the pair itself contains
    that word, "Defendant" for every other pair.
    """

    def _party_role(match):
        # The role is decided purely by the matched text.
        if "Plaintiff" in match.group():
            return "Plaintiff"
        return "Defendant"

    masked = text
    for regex, substitute in ANON_REPLACEMENTS:
        masked = re.compile(regex).sub(substitute, masked)
    return re.sub(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b", _party_role, masked)

# OCR
def textract(file):
    """Run OCR on an uploaded PDF or image and return the anonymized text.

    Args:
        file: A filesystem path (``str`` / ``os.PathLike``) or an upload
            object that exposes the path of the stored file as ``.name``.

    Returns:
        The OCR output passed through ``anonymize``. For a PDF, the text of
        all pages is joined with newlines.
    """
    # Accept a bare path as well as an object carrying the path in ``.name``.
    path = file if isinstance(file, (str, os.PathLike)) else file.name

    # Compare the extension case-insensitively so "SCAN.PDF" is rasterized
    # as a PDF instead of being handed to PIL as an image.
    if str(path).lower().endswith(".pdf"):
        pages = convert_from_path(path)
        text = "\n".join(pytesseract.image_to_string(page) for page in pages)
    else:
        # The context manager closes the image's file handle after OCR.
        with Image.open(path) as img:
            text = pytesseract.image_to_string(img)
    return anonymize(text)

# classifying
def classify(text):
    """Classify ``text`` with the module-level ``classifier`` pipeline.

    Args:
        text: Document text to classify.

    Returns:
        A ``(label, score)`` tuple: the top prediction's label and its score
        as a ``float``.
    """
    # Let the tokenizer truncate to 512 tokens. Slicing the string to 512
    # characters threw away most of the document and still did not bound the
    # number of tokens the model receives.
    result = classifier(text, truncation=True, max_length=512)[0]
    return result["label"], float(result["score"])

# GUI
# Layout: file upload -> OCR preview + prediction -> optional manual label
# that can be saved to disk as a training example.
with gr.Blocks(title="NY Court Order Classifier") as app:
    gr.Markdown("## NY Court Order Classifier")
    gr.Markdown("Upload a court order and classify it. Optionally assign a label.")

    filein = gr.File(label="Upload PDF or Image")
    textracted = gr.Textbox(lines=20, label="Extracted Text Preview")
    predicted = gr.Textbox(label="Predicted Label")
    confidence = gr.Textbox(label="Confidence")

    labelc = gr.Radio(choices=labels, label="Assign Manual Label")
    savebtn = gr.Button("Save Label")
    status = gr.Textbox(label="Status")

    def upload(file):
        """Handle a change of the uploaded file.

        Returns values for (preview text, predicted label, confidence,
        manual-label radio, status). The radio and status are always reset.
        """
        # The radio is reset with None ("nothing selected"); an empty string
        # is not one of its choices.
        if not file:
            return "", "", "", None, ""
        text = textract(file)
        label, score = classify(text)
        # Only the first 3000 characters are shown in the preview box.
        return text[:3000], label, str(round(score, 3)), None, ""

    filein.change(
        fn=upload,
        inputs=filein,
        outputs=[textracted, predicted, confidence, labelc, status]
    )

    def savelabel(choice, file, text):
        """Write the preview text and the chosen label to ``labeled_dataset/``.

        Args:
            choice: Label selected in the radio, or ``None`` if nothing is selected.
            file: The current upload (path or object with ``.name``), or ``None``.
            text: Contents of the preview textbox.

        Returns:
            A status message for the UI.
        """
        # Report missing input in the status box instead of raising
        # AttributeError (no file) or ValueError (no/unknown label).
        if not file:
            return "Upload a document before saving a label."
        if choice not in labels:
            return "Select a label before saving."

        os.makedirs("labeled_dataset", exist_ok=True)
        # The label is stored as its index in `labels`.
        labelidx = labels.index(choice)
        source = file if isinstance(file, (str, os.PathLike)) else file.name
        base = os.path.basename(source)
        with open(f"labeled_dataset/{base}.json", "w", encoding="utf-8") as f:
            json.dump({"text": text, "label": labelidx}, f, indent=2)
        return f"✅ Saved label: {choice}"

    savebtn.click(fn=savelabel, inputs=[labelc, filein, textracted], outputs=status)

# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    app.launch()