# RE_Extractor / app.py
# Hugging Face Space by SarahXia0405 — commit 57c627b (verified), "Update app.py"
import gradio as gr
import regex as re
from dataclasses import dataclass
from typing import Dict, List, Tuple, Any
from pypdf import PdfReader
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import json
import csv
import os
# ----------------------------
# 1) Underwriting keyword dictionary
# ----------------------------
def build_keyword_dict() -> Dict[str, Dict[str, Any]]:
    """Return the underwriting keyword dictionary.

    Each category maps to a config with:
      - "weight": float multiplier applied to every hit in the category,
      - "terms": plain phrases matched case-insensitively on word boundaries,
      - "regex": raw regex fragments compiled with an (?i) prefix downstream.

    Fix vs. original: the two price-per-SF / price-per-unit patterns started
    with ``\\b\\$``, but ``\\b`` before ``$`` requires a *word* character
    immediately preceding the dollar sign, so prices after a space (the normal
    case, e.g. "at $25 psf") never matched. The spurious ``\\b`` is removed.
    """
    return {
        "Pricing_Valuation": {
            "weight": 3.0,
            "terms": [
                "purchase price", "asking price", "offer price",
                "price per unit", "price per sf", "price per square foot",
                "cap rate", "going-in cap", "exit cap", "terminal cap",
                "valuation", "appraisal",
                "irr", "levered irr", "unlevered irr",
                "equity multiple", "cash-on-cash", "cash on cash",
                "yield on cost", "break-even occupancy", "breakeven occupancy",
            ],
            "regex": [
                r"\bcap\s*rate\b",
                r"\bgoing[-\s]*in\s+cap\b",
                r"\bexit\s+cap\b|\bterminal\s+cap\b",
                r"\bIRR\b",
                r"\bequity\s+multiple\b",
                r"\bcash[-\s]*on[-\s]*cash\b",
                r"\byield\s+on\s+cost\b",
                r"\bDSCR\b",
                r"\bLTV\b|\bLTC\b",
                # No \b before \$: a word boundary cannot sit between a space
                # and '$', which made the original pattern unmatchable in
                # ordinary prose like "priced at $25 psf".
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/sf|/SF|per\s*sf|per\s*SF|psf|PSF)\b",
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/unit|per\s*unit)\b",
            ],
        },
        "NOI_CashFlow": {
            "weight": 3.0,
            "terms": [
                "noi", "net operating income",
                "t-12", "t12", "trailing 12", "ttm",
                "ytd", "annualized", "run rate", "pro forma",
                "stabilized noi", "underwritten noi",
                "cash flow", "net cash flow", "ebitda",
                "effective gross income", "egi",
                "gross potential rent", "gpr", "scheduled rent",
                "other income", "ancillary income",
            ],
            "regex": [
                r"\bNOI\b|\bNet\s+Operating\s+Income\b",
                r"\bT-?12\b|\bTrailing\s*12\b|\bTTM\b|\bYTD\b",
                r"\bPro\s*Forma\b|\bUnderwritten\b|\bStabilized\b",
                r"\bEBITDA\b",
                r"\bEGI\b|\bEffective\s+Gross\s+Income\b",
            ],
        },
        "Occupancy_Rents": {
            "weight": 2.5,
            "terms": [
                "occupancy", "physical occupancy", "economic occupancy",
                "vacancy", "vacancy rate",
                "market rent", "in-place rent", "in place rent",
                "effective rent", "asking rent",
                "rent growth", "rental rate growth",
                "concessions", "free rent",
                "loss to lease", "mark-to-market", "mark to market",
                "renewal rate", "retention", "turnover",
                "absorption",
                "bad debt", "credit loss", "delinquency",
            ],
            "regex": [
                r"\boccupanc(?:y|ies)\b",
                r"\bvacanc(?:y|ies)\b",
                r"\bloss\s+to\s+lease\b",
                r"\bmark[-\s]*to[-\s]*market\b",
                r"\bconcession(?:s)?\b|\bfree\s+rent\b",
            ],
        },
        "Leases_Tenants": {
            "weight": 3.0,
            "terms": [
                "rent roll", "tenant", "tenant mix", "top tenants",
                "lease abstract", "lease term", "remaining term",
                "walt", "wale", "weighted average lease term",
                "commencement", "expiration", "lease expiration",
                "options", "renewal options",
                "escalations", "steps", "bumps", "rent schedule",
                "base rent", "minimum rent",
                "cam", "nnn", "triple net", "reimbursements",
                "expense stop", "base year", "gross-up", "gross up",
                "ti", "tenant improvements",
                "leasing commission", "lc",
                "security deposit", "letter of credit", "loc",
                "guaranty", "guarantee",
                "assignment", "sublease",
                "credit rating", "tenant financials",
            ],
            "regex": [
                r"\bRent\s+Roll\b",
                r"\bWALT\b|\bWALE\b|\bWeighted\s+Average\s+Lease\s+Term\b",
                r"\bNNN\b|\bTriple\s+Net\b|\bCAM\b",
                r"\bTI\b|\bTenant\s+Improvements?\b",
                r"\bLeasing\s+Commission\b|\bLC\b",
                r"\bLetter\s+of\s+Credit\b|\bLOC\b",
                r"\bLease\s+(?:Abstract|Term|Expiration|Commencement)\b",
            ],
        },
        "Expenses": {
            "weight": 2.3,
            "terms": [
                "operating expenses", "opex",
                "property tax", "real estate taxes", "taxes",
                "insurance",
                "utilities", "water", "sewer", "electric", "gas",
                "repairs and maintenance", "r&m", "maintenance",
                "payroll", "personnel",
                "management fee",
                "contract services",
                "landscaping", "trash", "janitorial",
                "marketing", "admin",
                "hoa", "coa",
                "reserves", "replacement reserves",
                "recoverable", "non-recoverable",
                "reassessment", "tax appeal", "protest",
            ],
            "regex": [
                r"\bOpEx\b|\bOperating\s+Expenses\b",
                r"\bReal\s+Estate\s+Taxes?\b|\bProperty\s+Taxes?\b|\bTaxes?\b",
                r"\bInsurance\b",
                r"\bUtilities?\b",
                r"\bManagement\s+Fee\b",
                r"\breassessment\b|\btax\s+appeal\b|\bprotest\b",
                r"\brecoverable\b|\bnon[-\s]*recoverable\b",
            ],
        },
        "CapEx_ValueAdd": {
            "weight": 2.7,
            "terms": [
                "capex", "capital expenditures",
                "renovation", "repositioning", "value-add", "value add",
                "deferred maintenance",
                "replacement reserves",
                "budget", "scope", "timeline", "phasing",
                "rent premium", "upgrade",
            ],
            "regex": [
                r"\bCapEx\b|\bCapital\s+Expenditures?\b",
                r"\bValue[-\s]*Add\b",
                r"\bDeferred\s+Maintenance\b",
                r"\bRent\s+Premium\b",
            ],
        },
        "Debt_Financing": {
            "weight": 2.8,
            "terms": [
                "loan", "debt", "financing",
                "ltv", "ltc", "dscr",
                "interest rate", "coupon", "sofr", "spread",
                "fixed", "floating",
                "amortization", "interest only", "io",
                "maturity", "term",
                "prepayment", "yield maintenance", "defeasance",
                "covenants",
                "recourse", "non-recourse", "nonrecourse",
                "refinance",
            ],
            "regex": [
                r"\bLTV\b|\bLTC\b|\bDSCR\b",
                r"\bSOFR\b",
                r"\bInterest\s+Only\b|\bIO\b",
                r"\bYield\s+Maintenance\b|\bDefeasance\b",
                r"\bNon[-\s]*Recourse\b",
            ],
        },
        "Market_Demographics": {
            "weight": 1.8,
            "terms": [
                "market", "submarket", "trade area",
                "demographics", "population", "households",
                "median household income", "mhi",
                "employment", "job growth",
                "major employers",
                "supply pipeline", "under construction", "deliveries",
                "comparable", "comp set",
                "traffic counts",
            ],
            "regex": [
                r"\bDemographics\b",
                r"\bPopulation\b|\bHouseholds\b",
                r"\bMedian\s+Household\s+Income\b|\bMHI\b",
                r"\bUnder\s+Construction\b|\bDeliveries\b|\bPipeline\b",
                r"\bTraffic\s+Counts?\b",
            ],
        },
        "Risk_Legal_DD": {
            "weight": 2.0,
            "terms": [
                "risk factors", "assumptions", "underwriting assumptions",
                "forward-looking", "disclaimer", "disclosures",
                "environmental", "phase i", "phase ii",
                "zoning", "entitlements",
                "survey", "alta",
                "title", "easement", "encumbrance",
                "ada", "flood zone", "fema",
                "litigation", "property condition assessment", "pca",
            ],
            "regex": [
                r"\bRisk\s+Factors\b|\bDisclosures?\b|\bDisclaimer\b",
                r"\bForward[-\s]*Looking\b",
                r"\bPhase\s*I\b|\bPhase\s*II\b|\bEnvironmental\b",
                r"\bZoning\b|\bEntitlements?\b",
                r"\bFlood\s+Zone\b|\bFEMA\b",
                r"\bLitigation\b",
            ],
        },
    }
# ----------------------------
# 2) PDF extraction
# ----------------------------
@dataclass
class PageText:
    """Extracted text for a single PDF page, plus where it came from."""
    page: int        # 1-based page number
    text: str        # stripped page text (embedded layer or OCR output)
    source: str      # "text" or "ocr"
    text_chars: int  # len(text) after stripping
def extract_text_layer(pdf_path: str) -> List[str]:
    """Return the embedded text layer of every page ("" for pages without one)."""
    return [page.extract_text() or "" for page in PdfReader(pdf_path).pages]
def ocr_page_tesseract(img: Image.Image) -> str:
    """Run Tesseract OCR on a rendered page image (LSTM engine, single uniform block)."""
    tess_config = "--oem 1 --psm 6"
    recognized = pytesseract.image_to_string(img, lang="eng", config=tess_config)
    return recognized or ""
def extract_pdf_pages(pdf_path: str, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int) -> List[PageText]:
    """Extract per-page text, OCRing pages whose embedded text layer is sparse.

    A page is OCR'd when *use_ocr* is set and its stripped text layer has fewer
    than *ocr_min_chars* characters; the OCR result is kept only if it yields
    more characters than the text layer did.
    """
    results: List[PageText] = []
    for page_no, raw in enumerate(extract_text_layer(pdf_path), start=1):
        stripped = (raw or "").strip()
        n_chars = len(stripped)
        if use_ocr and n_chars < int(ocr_min_chars):
            # Render just this one page and try OCR as a fallback.
            rendered = convert_from_path(pdf_path, dpi=int(ocr_dpi), first_page=page_no, last_page=page_no)
            ocr_text = (ocr_page_tesseract(rendered[0]) or "").strip()
            if len(ocr_text) > n_chars:
                results.append(PageText(page=page_no, text=ocr_text, source="ocr", text_chars=len(ocr_text)))
                continue
        results.append(PageText(page=page_no, text=stripped, source="text", text_chars=n_chars))
    return results
# ----------------------------
# 3) Matching & scoring (no pandas)
# ----------------------------
def normalize_text(s: str) -> str:
    """Lower-case *s*; treat None/empty as the empty string."""
    if not s:
        return ""
    return s.lower()
def compile_patterns(kw: Dict[str, Dict[str, Any]]) -> Dict[str, List[re.Pattern]]:
    """Compile each category's terms and raw regexes into case-insensitive patterns.

    Plain terms are escaped and wrapped in word boundaries; whitespace inside a
    multi-word term is generalized to ``\\s+`` so a phrase still matches across
    line wraps in extracted PDF text.

    Robustness fix: the original only replaced the escaped form ``\\ `` with
    ``\\s+``. The file aliases the third-party ``regex`` module as ``re``, and
    ``regex.escape`` (``special_only=True`` by default) may leave spaces
    unescaped, which silently broke every multi-word term. Both the escaped and
    bare-space forms are now normalized.
    """
    compiled: Dict[str, List[re.Pattern]] = {}
    for cat, cfg in kw.items():
        pats: List[re.Pattern] = []
        for term in cfg.get("terms", []):
            term = (term or "").strip().lower()
            if not term:
                continue  # skip blank entries defensively
            # Normalize spaces whether or not escape() backslashed them.
            pat = re.escape(term).replace(r"\ ", " ").replace(" ", r"\s+")
            pats.append(re.compile(rf"(?i)\b{pat}\b"))
        for rp in cfg.get("regex", []):
            # Raw fragments already carry their own \b anchors where needed.
            pats.append(re.compile(rf"(?i){rp}"))
        compiled[cat] = pats
    return compiled
def find_snippets(text: str, patterns: List[re.Pattern], window: int = 90, max_snippets: int = 4) -> List[str]:
    """Collect up to *max_snippets* whitespace-normalized context windows.

    For each pattern match, *window* characters on either side are kept,
    stripped, and collapsed to single spaces. Stops as soon as the cap is hit.
    """
    collected: List[str] = []
    for pattern in patterns:
        for match in pattern.finditer(text):
            lo = max(0, match.start() - window)
            hi = min(len(text), match.end() + window)
            cleaned = re.sub(r"\s+", " ", text[lo:hi].strip())
            collected.append(cleaned)
            if len(collected) >= max_snippets:
                return collected
    return collected
def score_pages(pages: List[PageText], kw: Dict[str, Dict[str, Any]]):
    """Score every page against the keyword dictionary.

    Returns a 3-tuple:
      - page records sorted by (weighted score, total hits) descending,
      - per-category summary sorted by (weighted hits, hits) descending,
      - meta dict with document-level totals and text/OCR page counts.
    """
    compiled = compile_patterns(kw)
    overall_hits = {cat: 0 for cat in kw}
    overall_weighted = 0.0
    page_records = []
    for pg in pages:
        lowered = normalize_text(pg.text)
        cat_hits: Dict[str, int] = {}
        total_hits = 0
        weighted = 0.0
        # Count matches per category on this page.
        for cat, cfg in kw.items():
            n = sum(1 for pat in compiled[cat] for _ in pat.finditer(lowered))
            cat_hits[cat] = n
            overall_hits[cat] += n
            if n:
                total_hits += n
                weighted += n * float(cfg["weight"])
        overall_weighted += weighted
        # Pull a couple of example snippets from the three strongest categories.
        snippet_lines: List[str] = []
        ranked = sorted(cat_hits.items(), key=lambda kv: kv[1], reverse=True)
        for cat, n in ranked[:3]:
            if n > 0:
                for snip in find_snippets(lowered, compiled[cat], window=90, max_snippets=2):
                    snippet_lines.append(f"[{cat}] {snip}")
        page_records.append({
            "page": pg.page,
            "source": pg.source,
            "text_chars": pg.text_chars,
            "hits_total": int(total_hits),
            "score_weighted": round(weighted, 2),
            "top_snippets": "\n".join(snippet_lines[:6]),
        })
    cat_summary = [
        {
            "category": cat,
            "weight": cfg["weight"],
            "hits": int(overall_hits[cat]),
            "weighted_hits": round(overall_hits[cat] * float(cfg["weight"]), 2),
        }
        for cat, cfg in kw.items()
    ]
    # Stable in-place sorts produce the same ordering as sorted() copies.
    page_records.sort(key=lambda r: (r["score_weighted"], r["hits_total"]), reverse=True)
    cat_summary.sort(key=lambda r: (r["weighted_hits"], r["hits"]), reverse=True)
    meta = {
        "total_pages": len(pages),
        "total_hits": int(sum(overall_hits.values())),
        "total_weighted_score": round(float(overall_weighted), 2),
        "sources": {
            "text": sum(1 for p in pages if p.source == "text"),
            "ocr": sum(1 for p in pages if p.source == "ocr"),
        }
    }
    return page_records, cat_summary, meta
def write_csv(path: str, rows: List[dict], headers: List[str]):
    """Write *rows* (dicts) to *path* as CSV, columns ordered by *headers*.

    Missing keys are written as empty cells.
    """
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(headers)
        writer.writerows([row.get(h, "") for h in headers] for row in rows)
# ----------------------------
# 4) Gradio app
# ----------------------------
def run_extract(pdf_file, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int, topk_pages: int):
    """Gradio click handler: extract underwriting keyword signals from the upload.

    Returns a 5-tuple matching the app's output components:
    (category table, top-pages table, summary string, downloadable file paths,
    status message). Writes one JSON and two CSV files into the working
    directory as a side effect.
    """
    if pdf_file is None:
        # Nothing uploaded yet — leave tables empty and prompt the user.
        return None, None, "", None, "Please upload a PDF."
    kw = build_keyword_dict()
    # NOTE(review): pdf_file.name assumes a file-like upload object — newer
    # Gradio versions may pass a plain path string; confirm against the UI.
    pages = extract_pdf_pages(
        pdf_path=pdf_file.name,
        use_ocr=use_ocr,
        ocr_min_chars=int(ocr_min_chars),
        ocr_dpi=int(ocr_dpi),
    )
    page_ranking, cat_summary, meta = score_pages(pages, kw)
    topk = int(topk_pages)
    top_pages = page_ranking[:topk]
    # Persist the full results (not just the displayed Top-K) for download.
    payload = {
        "meta": meta,
        "category_summary": cat_summary,
        "page_ranking": page_ranking,
    }
    json_path = "underwriting_keywords_output.json"
    csv_pages_path = "page_ranking.csv"
    csv_cats_path = "category_summary.csv"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    write_csv(
        csv_pages_path,
        page_ranking,
        headers=["page", "source", "text_chars", "hits_total", "score_weighted", "top_snippets"]
    )
    write_csv(
        csv_cats_path,
        cat_summary,
        headers=["category", "weight", "hits", "weighted_hits"]
    )
    summary = (
        f"Total pages: {meta['total_pages']} | "
        f"Total hits: {meta['total_hits']} | "
        f"Weighted score: {meta['total_weighted_score']} | "
        f"Sources: {meta['sources']}"
    )
    # Gradio Dataframe expects list-of-lists with headers
    cats_headers = ["category", "weight", "hits", "weighted_hits"]
    cats_table = [cats_headers] + [[r[h] for h in cats_headers] for r in cat_summary]
    pages_headers = ["page", "source", "text_chars", "hits_total", "score_weighted", "top_snippets"]
    pages_table = [pages_headers] + [[r[h] for h in pages_headers] for r in top_pages]
    return cats_table, pages_table, summary, [json_path, csv_pages_path, csv_cats_path], "Done."
# Build the Gradio UI: upload + OCR controls on top, result tables and
# downloadable outputs below. run_extract's return tuple maps 1:1 onto the
# five components listed in `outputs`.
with gr.Blocks(title="OM Underwriting Keyword Extractor") as demo:
    gr.Markdown(
        "# OM Underwriting Keyword Extractor\n"
        "Upload a real estate OM PDF and extract underwriting keyword signals.\n\n"
        "**This build uses minimal deps (no pandas/numpy/torch).** OCR fallback uses Tesseract."
    )
    with gr.Row():
        pdf = gr.File(label="Upload OM PDF", file_types=[".pdf"])
        with gr.Column():
            # OCR tuning knobs: when to trigger OCR and at what render quality.
            use_ocr = gr.Checkbox(value=True, label="Enable OCR fallback (recommended for OM)")
            ocr_min_chars = gr.Slider(0, 3000, value=350, step=50, label="OCR trigger: if text chars on page <")
            ocr_dpi = gr.Slider(120, 300, value=200, step=10, label="OCR render DPI")
            topk_pages = gr.Slider(5, 60, value=15, step=1, label="Show Top-K pages")
    run_btn = gr.Button("Extract Keywords")
    gr.Markdown("## Category Summary (sorted by weighted hits)")
    out_cats = gr.Dataframe(interactive=False)
    gr.Markdown("## Top Pages (highest underwriting signal)")
    out_pages = gr.Dataframe(interactive=False)
    out_summary = gr.Textbox(label="Run Summary", interactive=False)
    out_files = gr.File(label="Download Outputs (JSON + CSVs)", file_count="multiple")
    out_status = gr.Textbox(label="Status", interactive=False)
    run_btn.click(
        fn=run_extract,
        inputs=[pdf, use_ocr, ocr_min_chars, ocr_dpi, topk_pages],
        outputs=[out_cats, out_pages, out_summary, out_files, out_status],
    )
demo.launch()