pradoth commited on
Commit
a6b677f
·
verified ·
1 Parent(s): 99b161e

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. app/__init__.py +1 -0
  2. app/bert_model.py +22 -0
  3. app/main_bert.py +85 -0
  4. app/preprocess.py +20 -0
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Innocence Claim API application package."""
app/bert_model.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch, pathlib, nltk
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from app.preprocess import split_sentences

# Ensure the NLTK 'punkt' sentence-tokenizer data is available (no-op once cached).
nltk.download('punkt', quiet=True)

# Run inference on GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tuned DistilBERT checkpoint expected at <repo root>/models/distilbert_innocence.
MODEL_PATH = pathlib.Path(__file__).parent.parent / "models/distilbert_innocence"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
# .eval() disables dropout; this module only ever runs the model for inference.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH).to(device).eval()
@torch.no_grad()
def predict_sentences(text: str, cutoff: float = 0.70):
    """Score every sentence of *text* and return those at or above *cutoff*.

    Each returned item is a dict with the sentence text and its positive-class
    confidence (softmax probability of label 1, rounded to 3 decimal places).
    Returns an empty list when *text* yields no usable sentences.
    """
    sentences = split_sentences(text)
    if not sentences:
        return []
    # Batch-encode all sentences at once; pad/truncate to the training length.
    batch = tokenizer(
        sentences,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)
    positive = torch.softmax(model(**batch).logits, dim=1)[:, 1]
    flagged = []
    for sentence, score in zip(sentences, positive.cpu().tolist()):
        if score >= cutoff:
            flagged.append({'sentence': sentence, 'confidence': round(score, 3)})
    return flagged
app/main_bert.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI, File, Form, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import pathlib, pickle, pdfplumber, spacy, torch, tempfile, re, io

app = FastAPI(title="Innocence-Claim API", version="1.0")

# ---------- CORS Configuration ----------
# Configured for Hugging Face Spaces - allows all origins for API accessibility
# Hugging Face Spaces URLs follow pattern: https://{username}-{space-name}.hf.space
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],       # Allow all origins for public API access
    allow_credentials=False,   # Must be False when allow_origins is ["*"]
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)

# ---------- load pipeline ----------
# Pickled training artifact; expected to contain the keys "tokenizer" and "model".
pkl_path = pathlib.Path(__file__).parent.parent / "models" / "innocence_pipeline.pkl"
# Prefer GPU when available; also decides which unpickler variant is used below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces torch tensor storages onto the CPU.

    A checkpoint pickled on a CUDA machine embeds a call to
    ``torch.storage._load_from_bytes``; intercepting that lookup lets us
    remap every tensor with ``map_location='cpu'`` on CPU-only hosts.
    """

    def find_class(self, module, name):
        # Redirect only the torch storage loader; everything else goes
        # through the standard pickle resolution.
        if (module, name) == ('torch.storage', '_load_from_bytes'):
            return lambda raw: torch.load(io.BytesIO(raw), map_location='cpu')
        return super().find_class(module, name)
with open(pkl_path, "rb") as f:
    # NOTE(review): pickle.load executes arbitrary code from the file — this
    # artifact must remain trusted (bundled with the Space, never user-supplied).
    if device.type == "cpu":
        # Checkpoint may have been pickled on a CUDA box; remap storages to CPU.
        bundle = CPU_Unpickler(f).load()
    else:
        bundle = pickle.load(f)

tokenizer, model = bundle["tokenizer"], bundle["model"]
model.to(device)
# spaCy pipeline is used only for sentence segmentation in analyse_pdf.
nlp = spacy.load("en_core_web_sm")
39
+
def predict(text: str) -> float:
    """Return the probability (rounded to 3 dp) that *text* is an innocence claim."""
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        positive = torch.softmax(logits, dim=1)[0, 1].item()
    return round(positive, 3)
45
+
def analyse_pdf(pdf_path: pathlib.Path, cutoff: float):
    """Score every sentence of a PDF and summarise its reliability.

    Args:
        pdf_path: path to the PDF file to analyse.
        cutoff: minimum confidence for a sentence to be reported as a claim.

    Returns:
        dict with:
            reliability_percent: mean sentence score * 100, rounded to 1 dp
                (0.0 when no sentence qualified),
            tier: "High" / "Medium" / "Low" bucket of that percentage,
            claims: list of {sentence, confidence, page, sent_id} for each
                sentence scoring at or above *cutoff*.
    """
    rows, total, score_sum = [], 0, 0.0
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages, 1):
            text = page.extract_text() or ""  # pages without a text layer yield None
            for sent_idx, sent in enumerate(nlp(text).sents, 1):
                s = sent.text.strip()
                # Skip fragments (<= 10 chars) and run-on extraction noise (>= 500).
                if 10 < len(s) < 500:
                    total += 1
                    score = predict(s)
                    score_sum += score
                    if score >= cutoff:
                        rows.append({
                            "sentence": s,
                            "confidence": score,
                            "page": page_idx,
                            "sent_id": sent_idx,
                        })
    reliability = round((score_sum / total) * 100, 1) if total else 0.0
    tier = "High" if reliability >= 80 else "Medium" if reliability >= 50 else "Low"
    # BUG FIX: `rows` was collected but never returned, so the per-sentence
    # results were discarded and the `cutoff` parameter had no effect on the
    # response. Expose them under a new (backward-compatible) "claims" key.
    return {"reliability_percent": reliability, "tier": tier, "claims": rows}
67
+
@app.post("/predict")
async def predict_pdf(
    file: UploadFile = File(...),
    cutoff: float = Form(0.7),
):
    """Accept an uploaded PDF, run the sentence analysis, return the summary."""
    if file.content_type != "application/pdf":
        raise HTTPException(400, "PDF required")
    # Spool the upload to disk because pdfplumber needs a real file path.
    payload = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(payload)
        tmp_path = pathlib.Path(tmp.name)
    try:
        return analyse_pdf(tmp_path, cutoff)
    finally:
        # Always remove the temp file, even if analysis raised.
        tmp_path.unlink(missing_ok=True)
82
+
@app.get("/health")
def health():
    """Liveness probe endpoint used by the hosting platform."""
    return dict(status="ok")
app/preprocess.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re, pdfplumber, pathlib, nltk, pandas as pd
from nltk.tokenize import sent_tokenize

# Ensure the NLTK 'punkt' sentence-tokenizer data is available (no-op once cached).
nltk.download('punkt', quiet=True)

# Boilerplate to strip from extracted PDF text: header/footer markup,
# "1 | P a g e"-style page footers, and "Dictate Express" transcription marks.
JUNK = re.compile(r'<footer>.*?</footer>|<header>.*?</header>|^\s*\d+\s*\|\s*P\s*a\s*g\s*e.*|Dictate Express.*', flags=re.I)
7
def extract_inmate(pdf_path: pathlib.Path, last_name: str) -> str:
    """Pull the inmate's own statements out of a transcript PDF.

    Extracts all page text, strips boilerplate via JUNK, keeps only the lines
    attributed to the inmate — lines beginning with "<last_name>, X." — and
    returns them joined into one string with the speaker prefix removed.

    Args:
        pdf_path: transcript PDF to read.
        last_name: inmate's last name as it appears in the speaker labels
            (matched case-insensitively).

    Returns:
        The inmate's lines joined with single spaces; "" if none matched.
    """
    text = ""
    with pdfplumber.open(pdf_path) as doc:
        for p in doc.pages:
            text += " " + (p.extract_text() or "")  # pages without text yield None
    text = JUNK.sub(" ", text)
    # BUG FIX: escape last_name before embedding it in the regex so names
    # containing regex metacharacters (e.g. "O(Brien", "St. John") cannot
    # break or hijack the pattern; also compile once instead of per line.
    prefix = re.compile(rf"^{re.escape(last_name)},?\s+[A-Z]\.\s*", flags=re.I)
    # keep only inmate lines, with the "<Name>, X." speaker prefix removed
    lines = [prefix.sub("", l)
             for l in text.splitlines()
             if prefix.match(l)]
    return " ".join(lines)
18
+
19
def split_sentences(text: str):
    """Tokenize *text* into sentences, dropping any shorter than 6 words."""
    kept = []
    for candidate in sent_tokenize(text):
        # Very short fragments carry too little signal for classification.
        if len(candidate.split()) >= 6:
            kept.append(candidate.strip())
    return kept