File size: 852 Bytes
a6b677f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import re, pdfplumber, pathlib, nltk, pandas as pd
from nltk.tokenize import sent_tokenize
# Fetch the 'punkt' NLTK resource when this module is imported; quiet=True
# suppresses the downloader's console output.
# NOTE(review): presumably this is the model sent_tokenize needs; newer NLTK
# releases may look for 'punkt_tab' instead — confirm against the installed
# version.
nltk.download('punkt', quiet=True)

# Boilerplate that is blanked out of the extracted text before any line-based
# filtering happens:
#   * <footer>...</footer> and <header>...</header> spans (single-line only,
#     because "." does not cross newlines under these flags),
#   * page-number lines of the form "12 | P a g e ..." (letters optionally
#     spaced out), removed through to the end of the line,
#   * everything from "Dictate Express" to the end of that line.
# re.M is required so that "^" anchors at the start of every line. Without it
# the page-number alternative could only match at the very beginning of the
# whole document, so page footers in the middle of the text were never removed.
JUNK = re.compile(
    r'<footer>.*?</footer>|<header>.*?</header>|^\s*\d+\s*\|\s*P\s*a\s*g\s*e.*|Dictate Express.*',
    flags=re.I | re.M,
)

def extract_inmate(pdf_path: pathlib.Path, last_name: str) -> str:
    """Return the inmate's lines from a PDF as one space-joined string.

    The text of every page is extracted, boilerplate matching ``JUNK`` is
    replaced by a space, and only lines that begin with a speaker label of
    the form ``<last_name>[,] <initial>.`` (case-insensitive) are kept. The
    label and any whitespace after it are stripped from each kept line.

    Args:
        pdf_path: Path of the PDF to read.
        last_name: Last name used in the speaker label. It is matched
            literally; regex metacharacters in the name are escaped.

    Returns:
        The remainders of all matching lines joined by single spaces, or an
        empty string when no line matches.
    """
    with pdfplumber.open(pdf_path) as doc:
        # Pages are joined with a newline, not a space: the filtering below is
        # line-based, so a page boundary must also be a line boundary.
        # Otherwise the first line of a page fuses with the last line of the
        # previous page, and a leading separator would stop the very first
        # line from ever matching the label.
        text = "\n".join(page.extract_text() or "" for page in doc.pages)
    text = JUNK.sub(" ", text)

    # Built once per call; re.escape keeps names containing characters such
    # as "." or "(" from being interpreted as regex syntax.
    label = re.compile(
        rf"^{re.escape(last_name)},?\s+[A-Z]\.\s*", flags=re.I
    )

    # keep only inmate lines, dropping the speaker label from each
    hits = (label.match(line) for line in text.splitlines())
    return " ".join(hit.string[hit.end():] for hit in hits if hit)

def split_sentences(text: str, min_words: int = 6) -> list:
    """Split *text* into sentences and drop the short ones.

    Args:
        text: Text to tokenize with ``sent_tokenize``.
        min_words: Minimum number of whitespace-separated words a sentence
            must contain to be kept. Defaults to 6.

    Returns:
        A list of the kept sentences, each stripped of surrounding
        whitespace, in the order the tokenizer produced them.
    """
    sentences = sent_tokenize(text)
    return [s.strip() for s in sentences if len(s.split()) >= min_words]