Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- app/__init__.py +1 -0
- app/bert_model.py +22 -0
- app/main_bert.py +85 -0
- app/preprocess.py +20 -0
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Innocence Claim API application package."""
|
app/bert_model.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Standalone DistilBERT sentence classifier used by app.bert_model."""
import torch, pathlib, nltk
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from app.preprocess import split_sentences

# Sentence-tokenizer data for nltk; quiet=True keeps startup logs clean.
nltk.download('punkt', quiet=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tuned checkpoint lives at <repo-root>/models/distilbert_innocence.
MODEL_PATH = pathlib.Path(__file__).parent.parent / "models/distilbert_innocence"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
# .eval() disables dropout so repeated inference is deterministic.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH).to(device).eval()
|
| 11 |
+
|
| 12 |
+
@torch.no_grad()
def predict_sentences(text: str, cutoff: float = 0.70):
    """Score each sentence of *text* and return those at or above *cutoff*.

    Returns a list of ``{'sentence': str, 'confidence': float}`` dicts,
    where confidence is the positive-class softmax probability rounded
    to three decimal places. An empty list is returned when *text*
    yields no sentences.
    """
    sentences = split_sentences(text)
    if not sentences:
        return []
    batch = tokenizer(
        sentences,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)
    positive_probs = torch.softmax(model(**batch).logits, dim=1)[:, 1]
    results = []
    for sentence, prob in zip(sentences, positive_probs.cpu().tolist()):
        if prob >= cutoff:
            results.append({'sentence': sentence, 'confidence': round(prob, 3)})
    return results
|
app/main_bert.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI entry point serving the innocence-claim PDF analysis API."""
from fastapi import FastAPI, File, Form, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import pathlib, pickle, pdfplumber, spacy, torch, tempfile, re, io

app = FastAPI(title="Innocence-Claim API", version="1.0")

# ---------- CORS Configuration ----------
# Configured for Hugging Face Spaces - allows all origins for API accessibility
# Hugging Face Spaces URLs follow pattern: https://{username}-{space-name}.hf.space
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins for public API access
    allow_credentials=False,  # Must be False when allow_origins is ["*"]
    # Only the verbs the API actually exposes (POST /predict, GET /health).
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)
|
| 17 |
+
|
| 18 |
+
# ---------- load pipeline ----------
# Pickled training artifact containing the tokenizer and fine-tuned model.
pkl_path = pathlib.Path(__file__).parent.parent / "models" / "innocence_pipeline.pkl"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 21 |
+
|
| 22 |
+
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that remaps CUDA-saved torch storages onto the CPU.

    Torch serializes tensor storages through ``torch.storage._load_from_bytes``;
    intercepting that one symbol lets us force ``map_location='cpu'`` so a
    pickle produced on a GPU machine still loads on a CPU-only host. All
    other lookups fall through to the stock pickle behaviour.
    """

    def find_class(self, module, name):
        is_storage_loader = (
            module == 'torch.storage' and name == '_load_from_bytes'
        )
        if not is_storage_loader:
            return super().find_class(module, name)

        def _load_on_cpu(raw_bytes):
            return torch.load(io.BytesIO(raw_bytes), map_location='cpu')

        return _load_on_cpu
|
| 29 |
+
|
| 30 |
+
# NOTE(review): pickle is only acceptable here because the bundle ships with
# the repo itself — never point this at an untrusted file.
with open(pkl_path, "rb") as f:
    if device.type == "cpu":
        # Remap CUDA-saved tensor storages onto the CPU (see CPU_Unpickler).
        bundle = CPU_Unpickler(f).load()
    else:
        bundle = pickle.load(f)

tokenizer, model = bundle["tokenizer"], bundle["model"]
model.to(device)
# spaCy model used only for sentence segmentation of extracted PDF text.
nlp = spacy.load("en_core_web_sm")
|
| 39 |
+
|
| 40 |
+
def predict(text: str) -> float:
    """Return the positive-class probability for *text*, rounded to 3 dp."""
    batch = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=128
    ).to(device)
    with torch.no_grad():
        logits = model(**batch).logits
        probabilities = torch.softmax(logits, dim=1)
    return round(probabilities[0, 1].item(), 3)
|
| 45 |
+
|
| 46 |
+
def analyse_pdf(pdf_path: pathlib.Path, cutoff: float):
    """Score every usable sentence of a PDF and summarise its reliability.

    Args:
        pdf_path: Path to the PDF file to analyse.
        cutoff: Minimum confidence for a sentence to be reported as a claim.

    Returns:
        dict with:
            reliability_percent: mean sentence confidence as a percentage
                (0.0 when the PDF yields no scorable sentences);
            tier: "High" (>= 80), "Medium" (>= 50) or "Low";
            claims: sentences scoring at/above *cutoff*, each with its
                confidence, 1-based page number and per-page sentence index.
    """
    rows, total, score_sum = [], 0, 0.0
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages, 1):
            text = page.extract_text() or ""
            for sent_idx, sent in enumerate(nlp(text).sents, 1):
                s = sent.text.strip()
                # Skip tiny fragments and run-on extraction artifacts.
                if 10 < len(s) < 500:
                    total += 1
                    score = predict(s)
                    score_sum += score
                    if score >= cutoff:
                        rows.append({
                            "sentence": s,
                            "confidence": score,
                            "page": page_idx,
                            "sent_id": sent_idx,
                        })
    # Guard against PDFs that produce no scorable text.
    reliability = round((score_sum / total) * 100, 1) if total else 0.0
    tier = "High" if reliability >= 80 else "Medium" if reliability >= 50 else "Low"
    # Bug fix: `rows` was collected but previously dropped from the response;
    # the matching sentences are now returned under "claims".
    return {"reliability_percent": reliability, "tier": tier, "claims": rows}
|
| 67 |
+
|
| 68 |
+
@app.post("/predict")
async def predict_pdf(
    file: UploadFile = File(...),
    cutoff: float = Form(0.7),
):
    """Accept an uploaded PDF and return its claim analysis.

    Rejects non-PDF uploads with a 400; spools the upload to a temporary
    file for pdfplumber, and always removes that file afterwards.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(400, "PDF required")
    payload = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as handle:
        handle.write(payload)
        temp_path = pathlib.Path(handle.name)
    try:
        return analyse_pdf(temp_path, cutoff)
    finally:
        temp_path.unlink(missing_ok=True)
|
| 82 |
+
|
| 83 |
+
@app.get("/health")
def health():
    """Simple liveness check endpoint."""
    return {"status": "ok"}
|
app/preprocess.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Text extraction and sentence-splitting helpers for PDF transcripts."""
import re, pdfplumber, pathlib, nltk, pandas as pd
from nltk.tokenize import sent_tokenize
nltk.download('punkt', quiet=True)

# Boilerplate stripped from extracted PDF text: header/footer tags,
# "N | P a g e" page footers, and "Dictate Express" transcription stamps.
# NOTE(review): the '^\s*\d+...P a g e' alternative is anchored to the
# string start and the pattern is compiled without re.M, so mid-text page
# footers are likely NOT removed — confirm intent.
JUNK = re.compile(r'<footer>.*?</footer>|<header>.*?</header>|^\s*\d+\s*\|\s*P\s*a\s*g\s*e.*|Dictate Express.*', flags=re.I)
|
| 6 |
+
|
| 7 |
+
def extract_inmate(pdf_path: pathlib.Path, last_name: str) -> str:
    """Extract and join the lines spoken by the inmate from a PDF transcript.

    Pulls text from every page, strips boilerplate via JUNK, then keeps only
    lines beginning with "<last_name>, X." (a speaker tag with an initial),
    removing that tag prefix from each kept line.

    Args:
        pdf_path: Path to the transcript PDF.
        last_name: The inmate's last name used in the speaker tags.

    Returns:
        The inmate's lines joined with single spaces ("" when none match).
    """
    text = ""
    with pdfplumber.open(pdf_path) as doc:
        for page in doc.pages:
            text += " " + (page.extract_text() or "")
    text = JUNK.sub(" ", text)
    # Bug fix: escape the name so regex metacharacters in it (e.g. ".")
    # cannot break or alter the pattern; compile once instead of per line.
    name = re.escape(last_name)
    speaker_tag = re.compile(rf"^{name},?\s+[A-Z]\.", re.I)
    tag_prefix = re.compile(rf"^{name},?\s+[A-Z]\.\s*", re.I)
    # Keep only inmate lines, with the speaker tag removed.
    lines = [tag_prefix.sub("", line)
             for line in text.splitlines()
             if speaker_tag.match(line)]
    return " ".join(lines)
|
| 18 |
+
|
| 19 |
+
def split_sentences(text: str):
    """Split *text* into sentences, keeping only those of 6+ words.

    Sentences are produced by nltk's ``sent_tokenize`` and returned
    whitespace-stripped.
    """
    kept = []
    for raw in sent_tokenize(text):
        if len(raw.split()) >= 6:
            kept.append(raw.strip())
    return kept
|