Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import regex as re | |
| from dataclasses import dataclass | |
| from typing import Dict, List, Tuple, Any | |
| from pypdf import PdfReader | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import pytesseract | |
| import json | |
| import csv | |
| import os | |
| # ---------------------------- | |
| # 1) Underwriting keyword dictionary | |
| # ---------------------------- | |
def build_keyword_dict() -> Dict[str, Dict[str, Any]]:
    """Return the underwriting keyword dictionary, keyed by category name.

    Each category maps to a dict with three entries:

    * ``"weight"`` - float multiplier applied to every hit in the category.
    * ``"terms"``  - literal phrases (escaped and word-bounded by the caller).
    * ``"regex"``  - raw regular-expression strings compiled as-is.

    A fresh dict is built on every call, so callers may mutate the result.
    """
    return {
        "Pricing_Valuation": {
            "weight": 3.0,
            "terms": [
                "purchase price", "asking price", "offer price",
                "price per unit", "price per sf", "price per square foot",
                "cap rate", "going-in cap", "exit cap", "terminal cap",
                "valuation", "appraisal",
                "irr", "levered irr", "unlevered irr",
                "equity multiple", "cash-on-cash", "cash on cash",
                "yield on cost", "break-even occupancy", "breakeven occupancy",
            ],
            "regex": [
                r"\bcap\s*rate\b",
                r"\bgoing[-\s]*in\s+cap\b",
                r"\bexit\s+cap\b|\bterminal\s+cap\b",
                r"\bIRR\b",
                r"\bequity\s+multiple\b",
                r"\bcash[-\s]*on[-\s]*cash\b",
                r"\byield\s+on\s+cost\b",
                r"\bDSCR\b",
                r"\bLTV\b|\bLTC\b",
                # No leading \b on the two dollar patterns: "$" is a non-word
                # character, so "\b\$" only matches when a word character
                # directly precedes the "$" and would miss " $150 psf".
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/sf|/SF|per\s*sf|per\s*SF|psf|PSF)\b",
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/unit|per\s*unit)\b",
            ],
        },
        "NOI_CashFlow": {
            "weight": 3.0,
            "terms": [
                "noi", "net operating income",
                "t-12", "t12", "trailing 12", "ttm",
                "ytd", "annualized", "run rate", "pro forma",
                "stabilized noi", "underwritten noi",
                "cash flow", "net cash flow", "ebitda",
                "effective gross income", "egi",
                "gross potential rent", "gpr", "scheduled rent",
                "other income", "ancillary income",
            ],
            "regex": [
                r"\bNOI\b|\bNet\s+Operating\s+Income\b",
                r"\bT-?12\b|\bTrailing\s*12\b|\bTTM\b|\bYTD\b",
                r"\bPro\s*Forma\b|\bUnderwritten\b|\bStabilized\b",
                r"\bEBITDA\b",
                r"\bEGI\b|\bEffective\s+Gross\s+Income\b",
            ],
        },
        "Occupancy_Rents": {
            "weight": 2.5,
            "terms": [
                "occupancy", "physical occupancy", "economic occupancy",
                "vacancy", "vacancy rate",
                "market rent", "in-place rent", "in place rent",
                "effective rent", "asking rent",
                "rent growth", "rental rate growth",
                "concessions", "free rent",
                "loss to lease", "mark-to-market", "mark to market",
                "renewal rate", "retention", "turnover",
                "absorption",
                "bad debt", "credit loss", "delinquency",
            ],
            "regex": [
                r"\boccupanc(?:y|ies)\b",
                r"\bvacanc(?:y|ies)\b",
                r"\bloss\s+to\s+lease\b",
                r"\bmark[-\s]*to[-\s]*market\b",
                r"\bconcession(?:s)?\b|\bfree\s+rent\b",
            ],
        },
        "Leases_Tenants": {
            "weight": 3.0,
            "terms": [
                "rent roll", "tenant", "tenant mix", "top tenants",
                "lease abstract", "lease term", "remaining term",
                "walt", "wale", "weighted average lease term",
                "commencement", "expiration", "lease expiration",
                "options", "renewal options",
                "escalations", "steps", "bumps", "rent schedule",
                "base rent", "minimum rent",
                "cam", "nnn", "triple net", "reimbursements",
                "expense stop", "base year", "gross-up", "gross up",
                "ti", "tenant improvements",
                "leasing commission", "lc",
                "security deposit", "letter of credit", "loc",
                "guaranty", "guarantee",
                "assignment", "sublease",
                "credit rating", "tenant financials",
            ],
            "regex": [
                r"\bRent\s+Roll\b",
                r"\bWALT\b|\bWALE\b|\bWeighted\s+Average\s+Lease\s+Term\b",
                r"\bNNN\b|\bTriple\s+Net\b|\bCAM\b",
                r"\bTI\b|\bTenant\s+Improvements?\b",
                r"\bLeasing\s+Commission\b|\bLC\b",
                r"\bLetter\s+of\s+Credit\b|\bLOC\b",
                r"\bLease\s+(?:Abstract|Term|Expiration|Commencement)\b",
            ],
        },
        "Expenses": {
            "weight": 2.3,
            "terms": [
                "operating expenses", "opex",
                "property tax", "real estate taxes", "taxes",
                "insurance",
                "utilities", "water", "sewer", "electric", "gas",
                "repairs and maintenance", "r&m", "maintenance",
                "payroll", "personnel",
                "management fee",
                "contract services",
                "landscaping", "trash", "janitorial",
                "marketing", "admin",
                "hoa", "coa",
                "reserves", "replacement reserves",
                "recoverable", "non-recoverable",
                "reassessment", "tax appeal", "protest",
            ],
            "regex": [
                r"\bOpEx\b|\bOperating\s+Expenses\b",
                r"\bReal\s+Estate\s+Taxes?\b|\bProperty\s+Taxes?\b|\bTaxes?\b",
                r"\bInsurance\b",
                r"\bUtilities?\b",
                r"\bManagement\s+Fee\b",
                r"\breassessment\b|\btax\s+appeal\b|\bprotest\b",
                r"\brecoverable\b|\bnon[-\s]*recoverable\b",
            ],
        },
        "CapEx_ValueAdd": {
            "weight": 2.7,
            "terms": [
                "capex", "capital expenditures",
                "renovation", "repositioning", "value-add", "value add",
                "deferred maintenance",
                "replacement reserves",
                "budget", "scope", "timeline", "phasing",
                "rent premium", "upgrade",
            ],
            "regex": [
                r"\bCapEx\b|\bCapital\s+Expenditures?\b",
                r"\bValue[-\s]*Add\b",
                r"\bDeferred\s+Maintenance\b",
                r"\bRent\s+Premium\b",
            ],
        },
        "Debt_Financing": {
            "weight": 2.8,
            "terms": [
                "loan", "debt", "financing",
                "ltv", "ltc", "dscr",
                "interest rate", "coupon", "sofr", "spread",
                "fixed", "floating",
                "amortization", "interest only", "io",
                "maturity", "term",
                "prepayment", "yield maintenance", "defeasance",
                "covenants",
                "recourse", "non-recourse", "nonrecourse",
                "refinance",
            ],
            "regex": [
                r"\bLTV\b|\bLTC\b|\bDSCR\b",
                r"\bSOFR\b",
                r"\bInterest\s+Only\b|\bIO\b",
                r"\bYield\s+Maintenance\b|\bDefeasance\b",
                r"\bNon[-\s]*Recourse\b",
            ],
        },
        "Market_Demographics": {
            "weight": 1.8,
            "terms": [
                "market", "submarket", "trade area",
                "demographics", "population", "households",
                "median household income", "mhi",
                "employment", "job growth",
                "major employers",
                "supply pipeline", "under construction", "deliveries",
                "comparable", "comp set",
                "traffic counts",
            ],
            "regex": [
                r"\bDemographics\b",
                r"\bPopulation\b|\bHouseholds\b",
                r"\bMedian\s+Household\s+Income\b|\bMHI\b",
                r"\bUnder\s+Construction\b|\bDeliveries\b|\bPipeline\b",
                r"\bTraffic\s+Counts?\b",
            ],
        },
        "Risk_Legal_DD": {
            "weight": 2.0,
            "terms": [
                "risk factors", "assumptions", "underwriting assumptions",
                "forward-looking", "disclaimer", "disclosures",
                "environmental", "phase i", "phase ii",
                "zoning", "entitlements",
                "survey", "alta",
                "title", "easement", "encumbrance",
                "ada", "flood zone", "fema",
                "litigation", "property condition assessment", "pca",
            ],
            "regex": [
                r"\bRisk\s+Factors\b|\bDisclosures?\b|\bDisclaimer\b",
                r"\bForward[-\s]*Looking\b",
                r"\bPhase\s*I\b|\bPhase\s*II\b|\bEnvironmental\b",
                r"\bZoning\b|\bEntitlements?\b",
                r"\bFlood\s+Zone\b|\bFEMA\b",
                r"\bLitigation\b",
            ],
        },
    }
| # ---------------------------- | |
| # 2) PDF extraction | |
| # ---------------------------- | |
@dataclass
class PageText:
    """Extracted text for one PDF page plus how it was obtained.

    Declared as a dataclass so instances can be built with keyword
    arguments (``PageText(page=..., text=..., source=..., text_chars=...)``).
    """

    page: int  # page number as assigned by the caller
    text: str  # extracted page text
    source: str  # "text" or "ocr"
    text_chars: int  # character count recorded alongside ``text``
def extract_text_layer(pdf_path: str) -> List[str]:
    """Return the embedded text of every page in the PDF, in page order.

    A page whose extraction yields a falsy value contributes an empty string.
    """
    return [pg.extract_text() or "" for pg in PdfReader(pdf_path).pages]
def ocr_page_tesseract(img: Image.Image) -> str:
    """Run Tesseract (English, ``--oem 1 --psm 6``) on an image and return its text.

    Returns an empty string when Tesseract yields a falsy result.
    """
    recognized = pytesseract.image_to_string(img, lang="eng", config="--oem 1 --psm 6")
    return recognized if recognized else ""
def extract_pdf_pages(pdf_path: str, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int) -> List[PageText]:
    """Extract per-page text from a PDF, optionally falling back to OCR.

    For each page the embedded text layer is read first. When ``use_ocr`` is
    true and the stripped text layer is shorter than ``ocr_min_chars``, the
    page is rendered at ``ocr_dpi`` and passed through Tesseract; the OCR
    result is kept only if it is longer than the text layer.

    Returns one ``PageText`` per page, numbered from 1, with ``source`` set to
    ``"ocr"`` or ``"text"`` accordingly.
    """
    results: List[PageText] = []
    for page_no, raw in enumerate(extract_text_layer(pdf_path), start=1):
        layer_text = (raw or "").strip()
        layer_len = len(layer_text)
        record = None

        if use_ocr and layer_len < int(ocr_min_chars):
            # Render only this one page to keep memory bounded.
            rendered = convert_from_path(pdf_path, dpi=int(ocr_dpi), first_page=page_no, last_page=page_no)
            recognized = (ocr_page_tesseract(rendered[0]) or "").strip()
            if len(recognized) > layer_len:
                record = PageText(page=page_no, text=recognized, source="ocr", text_chars=len(recognized))

        if record is None:
            # OCR disabled, not triggered, or no better than the text layer.
            record = PageText(page=page_no, text=layer_text, source="text", text_chars=layer_len)

        results.append(record)
    return results
| # ---------------------------- | |
| # 3) Matching & scoring (no pandas) | |
| # ---------------------------- | |
def normalize_text(s: str) -> str:
    """Lower-case ``s``; any falsy input (``None``, ``""``) becomes ``""``."""
    if not s:
        return "".lower()
    return s.lower()
def compile_patterns(kw: Dict[str, Dict[str, Any]]) -> Dict[str, List[re.Pattern]]:
    """Compile every category's terms and regexes into case-insensitive patterns.

    Literal terms are stripped, lower-cased, escaped, made tolerant of any
    run of whitespace between words, and wrapped in word boundaries. Blank
    terms are skipped. Raw regex strings are compiled unchanged apart from the
    case-insensitive flag. Within a category, term patterns precede regex
    patterns.
    """

    def _term_pattern(raw_term):
        # Build the word-bounded pattern for one literal term, or None if blank.
        cleaned = (raw_term or "").strip().lower()
        if not cleaned:
            return None
        body = re.escape(cleaned).replace(r"\ ", r"\s+")
        return re.compile(rf"(?i)\b{body}\b")

    result: Dict[str, List[re.Pattern]] = {}
    for category, spec in kw.items():
        from_terms = [rx for rx in map(_term_pattern, spec.get("terms", [])) if rx is not None]
        from_regex = [re.compile(rf"(?i){raw}") for raw in spec.get("regex", [])]
        result[category] = from_terms + from_regex
    return result
def find_snippets(text: str, patterns: List[re.Pattern], window: int = 90, max_snippets: int = 4) -> List[str]:
    """Collect context excerpts around pattern matches in ``text``.

    Patterns are tried in order, and each one's matches are taken left to
    right. Every excerpt spans ``window`` characters either side of the match
    (clamped to the text), is stripped, and has whitespace runs collapsed to
    single spaces. Collection stops as soon as ``max_snippets`` excerpts have
    been gathered.
    """
    text_len = len(text)

    def _excerpts():
        # Lazily yield one cleaned excerpt per match, pattern by pattern.
        for pattern in patterns:
            for hit in pattern.finditer(text):
                lo = max(0, hit.start() - window)
                hi = min(text_len, hit.end() + window)
                yield re.sub(r"\s+", " ", text[lo:hi].strip())

    collected: List[str] = []
    for excerpt in _excerpts():
        collected.append(excerpt)
        if len(collected) >= max_snippets:
            break
    return collected
def score_pages(pages: List[PageText], kw: Dict[str, Dict[str, Any]]):
    """Score every page against the keyword dictionary.

    Each page's text is lower-cased and every compiled pattern's matches are
    counted per category. A page's weighted score is the sum of
    ``hits * weight`` over categories. Up to two snippets are collected for
    each of the three highest-hit categories on the page.

    Returns a 3-tuple:

    * page records sorted by (weighted score, total hits), descending;
    * category summaries sorted by (weighted hits, hits), descending;
    * a ``meta`` dict with page count, total hits, total weighted score and a
      breakdown of pages by extraction source.
    """
    patterns_by_cat = compile_patterns(kw)
    totals_by_cat = dict.fromkeys(kw, 0)
    grand_weighted = 0.0
    records = []

    for pg in pages:
        lowered = normalize_text(pg.text)
        per_cat = {}
        hit_count = 0
        weighted = 0.0

        for name, spec in kw.items():
            n = sum(1 for pattern in patterns_by_cat[name] for _ in pattern.finditer(lowered))
            per_cat[name] = n
            totals_by_cat[name] += n
            if n:
                hit_count += n
                weighted += n * float(spec["weight"])
        grand_weighted += weighted

        # Stable sort: categories with equal hits keep dictionary order.
        ranked = sorted(per_cat.items(), key=lambda kv: kv[1], reverse=True)
        labelled = [
            f"[{name}] {snip}"
            for name, n in ranked[:3]
            if n > 0
            for snip in find_snippets(lowered, patterns_by_cat[name], window=90, max_snippets=2)
        ]

        records.append({
            "page": pg.page,
            "source": pg.source,
            "text_chars": pg.text_chars,
            "hits_total": int(hit_count),
            "score_weighted": round(weighted, 2),
            "top_snippets": "\n".join(labelled[:6]),
        })

    summaries = [
        {
            "category": name,
            "weight": spec["weight"],
            "hits": int(totals_by_cat[name]),
            "weighted_hits": round(totals_by_cat[name] * float(spec["weight"]), 2),
        }
        for name, spec in kw.items()
    ]

    records_ranked = sorted(
        records,
        key=lambda rec: (rec["score_weighted"], rec["hits_total"]),
        reverse=True,
    )
    summaries_ranked = sorted(
        summaries,
        key=lambda rec: (rec["weighted_hits"], rec["hits"]),
        reverse=True,
    )

    def _count_source(label):
        # Number of pages whose text came from the given extraction source.
        return sum(1 for pg in pages if pg.source == label)

    meta = {
        "total_pages": len(pages),
        "total_hits": int(sum(totals_by_cat.values())),
        "total_weighted_score": round(float(grand_weighted), 2),
        "sources": {
            "text": _count_source("text"),
            "ocr": _count_source("ocr"),
        },
    }
    return records_ranked, summaries_ranked, meta
def write_csv(path: str, rows: List[dict], headers: List[str]):
    """Write ``rows`` to a UTF-8 CSV file at ``path`` with a header line.

    Columns follow the order of ``headers``; a key missing from a row is
    written as an empty cell. An existing file is overwritten.
    """
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(headers)
        writer.writerows([row.get(col, "") for col in headers] for row in rows)
| # ---------------------------- | |
| # 4) Gradio app | |
| # ---------------------------- | |
def run_extract(pdf_file, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int, topk_pages: int):
    """Gradio callback: extract, score and export keyword signals for one PDF.

    Args:
        pdf_file: The uploaded file. Either a path (``str`` / ``os.PathLike``)
            or an object exposing the path as ``.name``; ``None`` when nothing
            was uploaded.
        use_ocr: Whether to fall back to OCR on pages with little text.
        ocr_min_chars: Text-layer length below which OCR is attempted.
        ocr_dpi: Render resolution used for OCR.
        topk_pages: Number of top-ranked pages shown in the pages table.

    Returns:
        A 5-tuple ``(cats_table, pages_table, summary, files, status)`` where
        the two tables are list-of-lists whose first row is the header row,
        ``summary`` is a one-line text digest, ``files`` lists the JSON and
        CSV output paths, and ``status`` is a short message. When no file was
        uploaded the tuple is ``(None, None, "", None, "Please upload a PDF.")``.
    """
    import tempfile  # function-scope import; only this callback needs it

    if pdf_file is None:
        return None, None, "", None, "Please upload a PDF."

    # The upload may arrive as a plain path or as a file-like wrapper whose
    # ``.name`` is the path. Check for paths first: ``pathlib.Path.name`` is
    # only the basename, so it must not be used as the location.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    kw = build_keyword_dict()
    pages = extract_pdf_pages(
        pdf_path=pdf_path,
        use_ocr=use_ocr,
        ocr_min_chars=int(ocr_min_chars),
        ocr_dpi=int(ocr_dpi),
    )
    page_ranking, cat_summary, meta = score_pages(pages, kw)
    top_pages = page_ranking[:int(topk_pages)]

    payload = {
        "meta": meta,
        "category_summary": cat_summary,
        "page_ranking": page_ranking,
    }

    # Write into a fresh per-run directory so concurrent requests cannot
    # overwrite each other's downloads; the file names themselves are kept.
    out_dir = tempfile.mkdtemp(prefix="om_keywords_")
    json_path = os.path.join(out_dir, "underwriting_keywords_output.json")
    csv_pages_path = os.path.join(out_dir, "page_ranking.csv")
    csv_cats_path = os.path.join(out_dir, "category_summary.csv")

    cats_headers = ["category", "weight", "hits", "weighted_hits"]
    pages_headers = ["page", "source", "text_chars", "hits_total", "score_weighted", "top_snippets"]

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    write_csv(csv_pages_path, page_ranking, headers=pages_headers)
    write_csv(csv_cats_path, cat_summary, headers=cats_headers)

    summary = (
        f"Total pages: {meta['total_pages']} | "
        f"Total hits: {meta['total_hits']} | "
        f"Weighted score: {meta['total_weighted_score']} | "
        f"Sources: {meta['sources']}"
    )

    # Tables are list-of-lists with the header row first.
    cats_table = [cats_headers] + [[r[h] for h in cats_headers] for r in cat_summary]
    pages_table = [pages_headers] + [[r[h] for h in pages_headers] for r in top_pages]

    return cats_table, pages_table, summary, [json_path, csv_pages_path, csv_cats_path], "Done."
# Build the Gradio UI. Components render in creation order, so the statement
# order below is also the on-page layout.
with gr.Blocks(title="OM Underwriting Keyword Extractor") as demo:
    gr.Markdown(
        "# OM Underwriting Keyword Extractor\n"
        "Upload a real estate OM PDF and extract underwriting keyword signals.\n\n"
        "**This build uses minimal deps (no pandas/numpy/torch).** OCR fallback uses Tesseract."
    )
    # Inputs: the PDF upload next to a column of extraction settings.
    with gr.Row():
        pdf = gr.File(label="Upload OM PDF", file_types=[".pdf"])
        with gr.Column():
            use_ocr = gr.Checkbox(value=True, label="Enable OCR fallback (recommended for OM)")
            ocr_min_chars = gr.Slider(0, 3000, value=350, step=50, label="OCR trigger: if text chars on page <")
            ocr_dpi = gr.Slider(120, 300, value=200, step=10, label="OCR render DPI")
            topk_pages = gr.Slider(5, 60, value=15, step=1, label="Show Top-K pages")
    run_btn = gr.Button("Extract Keywords")
    # Outputs: two read-only tables, a text summary, downloads and a status line.
    gr.Markdown("## Category Summary (sorted by weighted hits)")
    out_cats = gr.Dataframe(interactive=False)
    gr.Markdown("## Top Pages (highest underwriting signal)")
    out_pages = gr.Dataframe(interactive=False)
    out_summary = gr.Textbox(label="Run Summary", interactive=False)
    out_files = gr.File(label="Download Outputs (JSON + CSVs)", file_count="multiple")
    out_status = gr.Textbox(label="Status", interactive=False)
    # The order of `inputs` matches run_extract's parameters; the order of
    # `outputs` matches the 5-tuple it returns.
    run_btn.click(
        fn=run_extract,
        inputs=[pdf, use_ocr, ocr_min_chars, ocr_dpi, topk_pages],
        outputs=[out_cats, out_pages, out_summary, out_files, out_status],
    )
# Start the app when this module is executed or imported.
demo.launch()