szlevi's picture
Upload folder using huggingface_hub
813dc3a verified
import json
import os
import re

import gradio as gr
import pytesseract
import requests
import torch
from huggingface_hub import whoami
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# HF login
# Best-effort identity check: a failure is reported on stdout and start-up
# continues, so the broad ``except`` is deliberate.
try:
    user = whoami()
    print(f"Authenticated as: {user['name']}")
except Exception as e:
    print(f"HFace authentication failed: {e}")

# Class names ordered by class index; the position of a name in this list is
# the integer stored as "label" when a manual label is saved.
labels = ["Fully Compliant", "Procedural Defect", "Statutory Defect", "Constitutional Violation"]
# Read from the environment; not referenced anywhere else in the visible code.
NVIDIA_API_KEY = os.environ.get("NVDGX_API_KEY")

# loading model
# Prefer the locally fine-tuned checkpoint when the directory exists,
# otherwise fall back to the base model identifier.
path = "finetuned_model/final" if os.path.exists("finetuned_model/final") else "Stern5497/sbert-legal-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(path)
# Derive the head size from ``labels`` and attach the index <-> name mapping
# so the pipeline reports the names in ``labels`` instead of generic
# "LABEL_n" placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    path,
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
)
# Use the first CUDA device when one is available; -1 selects the CPU, so the
# app also starts on hosts without a GPU.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
# anonymizer
# (regex, replacement) pairs; anonymize() applies them in list order.
ANON_REPLACEMENTS = [
    (r"\b\d{3}-\d{2}-\d{4}\b", "XXX-XX-XXXX"),  # SSN
    (r"\b\d{3}-\d{3}-\d{4}\b", "XXX-XXX-XXXX"),  # phone
    (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "EMAIL"),
    (r"\bDOB: \d{2}/\d{2}/\d{4}\b", "DOB: XX/XX/XXXX"),
    (r"\bChildren?\b", "Child"),
    (r"\b\d{5}(?:-\d{4})?\b", "ZIP"),  # ZIP
]

# Two consecutive capitalised words separated by exactly one space.
_NAME_PAIR = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b"


def _party_role(match):
    """Return the role word that replaces one capitalised word pair."""
    if "Plaintiff" in match.group():
        return "Plaintiff"
    return "Defendant"


def anonymize(text):
    """Mask identifying details in *text* and return the result.

    Every entry of ``ANON_REPLACEMENTS`` is applied in order, then each pair
    of capitalised words is replaced by "Plaintiff" (when the pair contains
    that word) or "Defendant" (otherwise).

    NOTE: the word-pair rule is purely lexical, so any two capitalised words
    in a row (e.g. "New York") are replaced as well.
    """
    scrubbed = text
    for regex, replacement in ANON_REPLACEMENTS:
        scrubbed = re.sub(regex, replacement, scrubbed)
    return re.sub(_NAME_PAIR, _party_role, scrubbed)
# OCR
def textract(file):
    """Run OCR on an uploaded PDF or image and return the anonymized text.

    Args:
        file: A filesystem path (``str`` or ``os.PathLike``), or an object
            that exposes the path of the uploaded file as ``.name``.

    Returns:
        The recognised text after it has been passed through ``anonymize``.
        For a PDF, the text of each rendered page is joined with newlines.
    """
    # Accept a plain path as well as an upload object carrying ``.name``.
    source = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
    # Compare the extension case-insensitively so "SCAN.PDF" is still
    # rendered as a PDF instead of being handed to the image loader.
    if source.lower().endswith(".pdf"):
        pages = convert_from_path(source)
        text = "\n".join(pytesseract.image_to_string(page) for page in pages)
    else:
        # Context manager releases the underlying file handle after OCR.
        with Image.open(source) as image:
            text = pytesseract.image_to_string(image)
    return anonymize(text)
# classifying
def classify(text, max_tokens=512):
    """Classify *text* with the module-level ``classifier`` pipeline.

    The input is truncated by the tokenizer to at most ``max_tokens`` tokens.
    Slicing the string to 512 characters, as was done before, cuts the input
    far below the token budget and hides most of the document from the model.

    Args:
        text: The document text to classify.
        max_tokens: Upper bound on the tokenized input length.

    Returns:
        A ``(label, score)`` tuple taken from the first pipeline result, with
        the score converted to ``float``.
    """
    result = classifier(text, truncation=True, max_length=max_tokens)[0]
    return result["label"], float(result["score"])
# GUI
with gr.Blocks(title="NY Court Order Classifier") as app:
    gr.Markdown("## NY Court Order Classifier")
    gr.Markdown("Upload a court order and classify it. Optionally assign a label.")
    filein = gr.File(label="Upload PDF or Image")
    textracted = gr.Textbox(lines=20, label="Extracted Text Preview")
    predicted = gr.Textbox(label="Predicted Label")
    confidence = gr.Textbox(label="Confidence")
    labelc = gr.Radio(choices=labels, label="Assign Manual Label")
    savebtn = gr.Button("Save Label")
    status = gr.Textbox(label="Status")

    def upload(file):
        """Extract and classify an uploaded file.

        Returns one value per output component, in the order
        (text preview, predicted label, confidence, manual label, status).
        The manual label is reset to ``None`` because an empty string is not
        one of the radio's choices.
        """
        if not file:
            return "", "", "", None, ""
        text = textract(file)
        label, score = classify(text)
        # Only the first 3000 characters are shown in the preview box.
        return text[:3000], label, str(round(score, 3)), None, ""

    filein.change(
        fn=upload,
        inputs=filein,
        outputs=[textracted, predicted, confidence, labelc, status],
    )

    def savelabel(choice, file, text):
        """Write the preview text and chosen label index to a JSON file.

        The record is stored as ``labeled_dataset/<uploaded file name>.json``
        with the keys "text" and "label" (index of *choice* in ``labels``).

        Returns:
            A status message for the UI. When no file is uploaded or no valid
            label is selected, nothing is written and a warning is returned
            instead of raising.
        """
        if not file:
            return "⚠️ Upload a document before saving a label."
        if choice not in labels:
            return "⚠️ Select a label before saving."
        os.makedirs("labeled_dataset", exist_ok=True)
        labelidx = labels.index(choice)
        # Accept a plain path as well as an upload object carrying ``.name``.
        source = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
        base = os.path.basename(source)
        with open(f"labeled_dataset/{base}.json", "w", encoding="utf-8") as f:
            json.dump({"text": text, "label": labelidx}, f, indent=2)
        return f"✅ Saved label: {choice}"

    savebtn.click(fn=savelabel, inputs=[labelc, filein, textracted], outputs=status)

if __name__ == "__main__":
    app.launch()