pradoth commited on
Commit
a6b677f
·
verified ·
1 Parent(s): 99b161e

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. app/__init__.py +1 -0
  2. app/bert_model.py +22 -0
  3. app/main_bert.py +85 -0
  4. app/preprocess.py +20 -0
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Innocence Claim API application package."""
app/bert_model.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch, pathlib, nltk
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from app.preprocess import split_sentences

# Ensure the NLTK 'punkt' sentence-tokenizer data is available (no-op once cached).
nltk.download('punkt', quiet=True)

# Run inference on GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tuned DistilBERT checkpoint expected at <repo root>/models/distilbert_innocence.
MODEL_PATH = pathlib.Path(__file__).parent.parent / "models/distilbert_innocence"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
# .eval() disables dropout; this module only ever runs the model for inference.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH).to(device).eval()
@torch.no_grad()
def predict_sentences(text: str, cutoff: float = 0.70):
    """Score every sentence of *text* and return those at or above *cutoff*.

    Each returned item is a dict with the sentence text and its positive-class
    confidence (softmax probability of label 1, rounded to 3 decimal places).
    Returns an empty list when *text* yields no usable sentences.
    """
    sentences = split_sentences(text)
    if not sentences:
        return []
    # Batch-encode all sentences at once; pad/truncate to the training length.
    batch = tokenizer(
        sentences,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)
    positive = torch.softmax(model(**batch).logits, dim=1)[:, 1]
    flagged = []
    for sentence, score in zip(sentences, positive.cpu().tolist()):
        if score >= cutoff:
            flagged.append({'sentence': sentence, 'confidence': round(score, 3)})
    return flagged
app/main_bert.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI, File, Form, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import pathlib, pickle, pdfplumber, spacy, torch, tempfile, re, io

app = FastAPI(title="Innocence-Claim API", version="1.0")

# ---------- CORS Configuration ----------
# Configured for Hugging Face Spaces - allows all origins for API accessibility
# Hugging Face Spaces URLs follow pattern: https://{username}-{space-name}.hf.space
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],       # Allow all origins for public API access
    allow_credentials=False,   # Must be False when allow_origins is ["*"]
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)

# ---------- load pipeline ----------
# Pickled training artifact; expected to contain the keys "tokenizer" and "model".
pkl_path = pathlib.Path(__file__).parent.parent / "models" / "innocence_pipeline.pkl"
# Prefer GPU when available; also decides which unpickler variant is used below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces torch tensor storages onto the CPU.

    A checkpoint pickled on a CUDA machine embeds a call to
    ``torch.storage._load_from_bytes``; intercepting that lookup lets us
    remap every tensor with ``map_location='cpu'`` on CPU-only hosts.
    """

    def find_class(self, module, name):
        # Redirect only the torch storage loader; everything else goes
        # through the standard pickle resolution.
        if (module, name) == ('torch.storage', '_load_from_bytes'):
            return lambda raw: torch.load(io.BytesIO(raw), map_location='cpu')
        return super().find_class(module, name)
with open(pkl_path, "rb") as f:
    # NOTE(review): pickle.load executes arbitrary code from the file — this
    # artifact must remain trusted (bundled with the Space, never user-supplied).
    if device.type == "cpu":
        # Checkpoint may have been pickled on a CUDA box; remap storages to CPU.
        bundle = CPU_Unpickler(f).load()
    else:
        bundle = pickle.load(f)

tokenizer, model = bundle["tokenizer"], bundle["model"]
model.to(device)
# spaCy pipeline is used only for sentence segmentation in analyse_pdf.
nlp = spacy.load("en_core_web_sm")
39
+
def predict(text: str) -> float:
    """Return the probability (rounded to 3 dp) that *text* is an innocence claim."""
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        positive = torch.softmax(logits, dim=1)[0, 1].item()
    return round(positive, 3)
45
+
def analyse_pdf(pdf_path: pathlib.Path, cutoff: float):
    """Score every sentence of a PDF and summarise its reliability.

    Args:
        pdf_path: path to the PDF file to analyse.
        cutoff: minimum confidence for a sentence to be reported as a claim.

    Returns:
        dict with:
            reliability_percent: mean sentence score * 100, rounded to 1 dp
                (0.0 when no sentence qualified),
            tier: "High" / "Medium" / "Low" bucket of that percentage,
            claims: list of {sentence, confidence, page, sent_id} for each
                sentence scoring at or above *cutoff*.
    """
    rows, total, score_sum = [], 0, 0.0
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages, 1):
            text = page.extract_text() or ""  # pages without a text layer yield None
            for sent_idx, sent in enumerate(nlp(text).sents, 1):
                s = sent.text.strip()
                # Skip fragments (<= 10 chars) and run-on extraction noise (>= 500).
                if 10 < len(s) < 500:
                    total += 1
                    score = predict(s)
                    score_sum += score
                    if score >= cutoff:
                        rows.append({
                            "sentence": s,
                            "confidence": score,
                            "page": page_idx,
                            "sent_id": sent_idx,
                        })
    reliability = round((score_sum / total) * 100, 1) if total else 0.0
    tier = "High" if reliability >= 80 else "Medium" if reliability >= 50 else "Low"
    # BUG FIX: `rows` was collected but never returned, so the per-sentence
    # results were discarded and the `cutoff` parameter had no effect on the
    # response. Expose them under a new (backward-compatible) "claims" key.
    return {"reliability_percent": reliability, "tier": tier, "claims": rows}
67
+
@app.post("/predict")
async def predict_pdf(
    file: UploadFile = File(...),
    cutoff: float = Form(0.7),
):
    """Accept an uploaded PDF, run the sentence analysis, return the summary."""
    if file.content_type != "application/pdf":
        raise HTTPException(400, "PDF required")
    # Spool the upload to disk because pdfplumber needs a real file path.
    payload = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(payload)
        tmp_path = pathlib.Path(tmp.name)
    try:
        return analyse_pdf(tmp_path, cutoff)
    finally:
        # Always remove the temp file, even if analysis raised.
        tmp_path.unlink(missing_ok=True)
82
+
@app.get("/health")
def health():
    """Liveness probe endpoint used by the hosting platform."""
    return dict(status="ok")
app/preprocess.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re, pdfplumber, pathlib, nltk, pandas as pd
from nltk.tokenize import sent_tokenize

# Ensure the NLTK 'punkt' sentence-tokenizer data is available (no-op once cached).
nltk.download('punkt', quiet=True)

# Boilerplate to strip from extracted PDF text: header/footer markup,
# "1 | P a g e"-style page footers, and "Dictate Express" transcription marks.
JUNK = re.compile(r'<footer>.*?</footer>|<header>.*?</header>|^\s*\d+\s*\|\s*P\s*a\s*g\s*e.*|Dictate Express.*', flags=re.I)
7
def extract_inmate(pdf_path: pathlib.Path, last_name: str) -> str:
    """Pull the inmate's own statements out of a transcript PDF.

    Extracts all page text, strips boilerplate via JUNK, keeps only the lines
    attributed to the inmate — lines beginning with "<last_name>, X." — and
    returns them joined into one string with the speaker prefix removed.

    Args:
        pdf_path: transcript PDF to read.
        last_name: inmate's last name as it appears in the speaker labels
            (matched case-insensitively).

    Returns:
        The inmate's lines joined with single spaces; "" if none matched.
    """
    text = ""
    with pdfplumber.open(pdf_path) as doc:
        for p in doc.pages:
            text += " " + (p.extract_text() or "")  # pages without text yield None
    text = JUNK.sub(" ", text)
    # BUG FIX: escape last_name before embedding it in the regex so names
    # containing regex metacharacters (e.g. "O(Brien", "St. John") cannot
    # break or hijack the pattern; also compile once instead of per line.
    prefix = re.compile(rf"^{re.escape(last_name)},?\s+[A-Z]\.\s*", flags=re.I)
    # keep only inmate lines, with the "<Name>, X." speaker prefix removed
    lines = [prefix.sub("", l)
             for l in text.splitlines()
             if prefix.match(l)]
    return " ".join(lines)
18
+
19
def split_sentences(text: str):
    """Tokenize *text* into sentences, dropping any shorter than 6 words."""
    kept = []
    for candidate in sent_tokenize(text):
        # Very short fragments carry too little signal for classification.
        if len(candidate.split()) >= 6:
            kept.append(candidate.strip())
    return kept