Spaces:

szlevi
/

NYCourtOrderClass

Runtime error

App Files Files Community

NYCourtOrderClass / gui.py

szlevi

Upload folder using huggingface_hub

813dc3a verified 7 months ago

raw

history blame contribute delete

3.35 kB

	import json
	import os
	import re

	import gradio as gr
	import pytesseract
	import requests
	import torch
	from huggingface_hub import whoami
	from pdf2image import convert_from_path
	from PIL import Image
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


	# HF login
	# Best-effort: a failed lookup is reported on stdout and start-up continues.
	try:
	    user = whoami()
	    print(f"Authenticated as: {user['name']}")
	except Exception as e:
	    print(f"HFace authentication failed: {e}")

	# Class names shown in the UI; a label's position in this list is the
	# integer id written to disk when a manual label is saved.
	labels = ["Fully Compliant", "Procedural Defect", "Statutory Defect", "Constitutional Violation"]
	# Read from the NVDGX_API_KEY environment variable (None when unset);
	# not referenced by the code visible in this file.
	NVIDIA_API_KEY = os.environ.get("NVDGX_API_KEY")

	# loading model
	# Prefer a locally fine-tuned checkpoint and fall back to the Hub base model.
	path = "finetuned_model/final" if os.path.exists("finetuned_model/final") else "Stern5497/sbert-legal-xlm-roberta-base"
	tokenizer = AutoTokenizer.from_pretrained(path)
	# Size the classification head from the label list so the two cannot drift apart.
	model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=len(labels))
	# device=0 selects the first CUDA GPU and -1 selects the CPU. Hard-coding 0
	# makes start-up fail on hosts without a GPU, so pick at runtime.
	device = 0 if torch.cuda.is_available() else -1
	classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

	# anonymizer
	# Ordered (regex, replacement) pairs; anonymize() applies them top to bottom.
	ANON_REPLACEMENTS = [
	    # Social security numbers: 3-2-4 digit groups.
	    (r"\b\d{3}-\d{2}-\d{4}\b", "XXX-XX-XXXX"),
	    # Phone numbers: 3-3-4 digit groups.
	    (r"\b\d{3}-\d{3}-\d{4}\b", "XXX-XXX-XXXX"),
	    # E-mail addresses.
	    (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "EMAIL"),
	    # Dates of birth written as "DOB: dd/dd/dddd".
	    (r"\bDOB: \d{2}/\d{2}/\d{4}\b", "DOB: XX/XX/XXXX"),
	    # The word "Children" is collapsed to "Child".
	    (r"\bChildren?\b", "Child"),
	    # Five-digit ZIP codes, optionally followed by a four-digit extension.
	    (r"\b\d{5}(?:-\d{4})?\b", "ZIP")
	]


	def anonymize(text):
	    """Mask identifying details in *text* and return the scrubbed string.

	    Every rule in ``ANON_REPLACEMENTS`` is applied in order. Afterwards each
	    pair of adjacent capitalised words (``Xxxx Yyyy``) is replaced by
	    "Plaintiff" when the pair contains that word, and by "Defendant"
	    otherwise.
	    """

	    def _party(match):
	        # Keep the role when the matched pair already names the plaintiff.
	        if "Plaintiff" in match.group():
	            return "Plaintiff"
	        return "Defendant"

	    scrubbed = text
	    for regex, placeholder in ANON_REPLACEMENTS:
	        scrubbed = re.sub(regex, placeholder, scrubbed)
	    return re.sub(r"\b[A-Z][a-z]+ [A-Z][a-z]+\b", _party, scrubbed)

	# OCR
	def textract(file):
	    """Run OCR on an uploaded PDF or image and return the anonymized text.

	    Args:
	        file: The upload. Either an object whose ``name`` attribute is the
	            path on disk, or the path itself (``str`` / ``os.PathLike``).

	    Returns:
	        The recognised text after it has been passed through ``anonymize``.
	        For PDFs the per-page results are joined with newlines.
	    """
	    path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
	    # Compare the extension case-insensitively so "SCAN.PDF" is not treated
	    # as an image.
	    if path.lower().endswith(".pdf"):
	        pages = convert_from_path(path)
	        text = "\n".join(pytesseract.image_to_string(page) for page in pages)
	    else:
	        # Release the image file handle as soon as OCR has finished.
	        with Image.open(path) as img:
	            text = pytesseract.image_to_string(img)
	    return anonymize(text)

	# classifying
	def classify(text):
	    """Classify *text* with the module-level ``classifier`` pipeline.

	    Args:
	        text: The document text to classify.

	    Returns:
	        A ``(label, score)`` tuple taken from the first prediction, with the
	        score converted to a plain ``float``.
	    """
	    # Let the tokenizer truncate to 512 tokens. Slicing to 512 characters
	    # discards most of the usable input and still does not strictly
	    # guarantee that the token limit is respected.
	    result = classifier(text, truncation=True, max_length=512)[0]
	    return result["label"], float(result["score"])

	# GUI
	with gr.Blocks(title="NY Court Order Classifier") as app:
	    gr.Markdown("## NY Court Order Classifier")
	    gr.Markdown("Upload a court order and classify it. Optionally assign a label.")

	    # Upload and prediction widgets.
	    filein = gr.File(label="Upload PDF or Image")
	    textracted = gr.Textbox(lines=20, label="Extracted Text Preview")
	    predicted = gr.Textbox(label="Predicted Label")
	    confidence = gr.Textbox(label="Confidence")

	    # Manual labelling widgets.
	    labelc = gr.Radio(choices=labels, label="Assign Manual Label")
	    savebtn = gr.Button("Save Label")
	    status = gr.Textbox(label="Status")

	    def upload(file):
	        """OCR and classify a new upload.

	        Returns values for, in order: text preview, predicted label,
	        confidence, manual-label radio, and status box. The radio and status
	        are reset on every call.
	        """
	        # None clears the radio selection; an empty string is not one of
	        # its choices.
	        if not file:
	            return "", "", "", None, ""
	        text = textract(file)
	        label, score = classify(text)
	        # The preview is capped at 3000 characters.
	        return text[:3000], label, str(round(score, 3)), None, ""

	    filein.change(
	        fn=upload,
	        inputs=filein,
	        outputs=[textracted, predicted, confidence, labelc, status],
	    )

	    def savelabel(choice, file, text):
	        """Write the manually chosen label and preview text to a JSON file.

	        Args:
	            choice: Selected entry of ``labels`` (``None`` when nothing is
	                selected).
	            file: The current upload (``None`` when nothing is uploaded).
	            text: Current contents of the text preview box.

	        Returns:
	            A status message for the UI. Missing input produces a warning
	            message instead of an exception.
	        """
	        if not file:
	            return "⚠️ Upload a file before saving a label."
	        if choice not in labels:
	            return "⚠️ Select a label before saving."
	        os.makedirs("labeled_dataset", exist_ok=True)
	        labelidx = labels.index(choice)
	        source = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
	        base = os.path.basename(source)
	        # One JSON file per upload, named after the uploaded file.
	        with open(f"labeled_dataset/{base}.json", "w", encoding="utf-8") as f:
	            json.dump({"text": text, "label": labelidx}, f, indent=2)
	        return f"✅ Saved label: {choice}"

	    savebtn.click(fn=savelabel, inputs=[labelc, filein, textracted], outputs=status)

	# Start the web server only when the file is executed as a script.
	if __name__ == "__main__":
	    app.launch()