Spaces:

SarahXia0405
/

RE_Extractor

Sleeping

File size: 17,713 Bytes

60e673f
 
 
39dfff2
 
 
 
ad53390
 
57c627b
 
 
60e673f
 
 
 
 
 
 
 
 
 
39dfff2
 
 
57c627b
39dfff2
 
 
60e673f
 
 
39dfff2
 
60e673f
39dfff2
 
 
60e673f
 
39dfff2
60e673f
 
 
 
 
 
39dfff2
 
 
57c627b
39dfff2
 
 
 
60e673f
 
 
39dfff2
 
 
 
60e673f
 
 
 
 
39dfff2
 
 
 
 
57c627b
39dfff2
 
57c627b
39dfff2
60e673f
 
 
 
 
39dfff2
 
60e673f
 
 
 
 
39dfff2
 
 
 
 
 
 
 
 
57c627b
39dfff2
 
 
 
 
60e673f
 
 
39dfff2
60e673f
 
39dfff2
 
60e673f
 
 
 
 
 
39dfff2
 
 
 
 
 
 
 
57c627b
 
39dfff2
 
 
 
60e673f
 
 
39dfff2
 
 
60e673f
 
39dfff2
60e673f
 
 
 
 
39dfff2
57c627b
39dfff2
 
 
57c627b
60e673f
 
 
39dfff2
60e673f
39dfff2
60e673f
 
 
 
 
39dfff2
 
60e673f
39dfff2
 
 
 
 
 
57c627b
60e673f
 
 
 
 
 
39dfff2
60e673f
 
 
 
 
39dfff2
 
 
57c627b
39dfff2
 
57c627b
 
60e673f
 
 
 
 
39dfff2
 
60e673f
 
 
 
 
39dfff2
57c627b
39dfff2
 
 
 
57c627b
 
60e673f
 
 
39dfff2
60e673f
 
 
39dfff2
60e673f
 
 
 
 
 
57c627b
60e673f
 
 
 
 
 
 
 
d34d9dc
57c627b
d34d9dc
57c627b
d34d9dc
57c627b
 
d34d9dc
 
ad53390
 
 
39dfff2
 
 
57c627b
39dfff2
57c627b
39dfff2
57c627b
 
 
 
 
d34d9dc
ad53390
57c627b
39dfff2
60e673f
 
57c627b
60e673f
 
 
 
 
57c627b
60e673f
 
39dfff2
 
 
 
57c627b
39dfff2
 
 
57c627b
 
39dfff2
57c627b
39dfff2
57c627b
 
39dfff2
 
60e673f
 
39dfff2
57c627b
60e673f
 
57c627b
 
 
 
 
 
 
60e673f
 
57c627b
60e673f
 
 
 
57c627b
60e673f
 
57c627b
60e673f
57c627b
60e673f
 
 
39dfff2
60e673f
57c627b
60e673f
57c627b
 
60e673f
 
 
 
57c627b
 
60e673f
 
 
39dfff2
 
57c627b
60e673f
57c627b
60e673f
 
 
57c627b
60e673f
57c627b
60e673f
 
57c627b
60e673f
57c627b
60e673f
 
39dfff2
60e673f
 
 
57c627b
 
 
 
 
 
 
 
 
 
60e673f
 
 
 
39dfff2
57c627b
 
 
 
60e673f
57c627b
60e673f
 
57c627b
 
 
 
 
 
60e673f
 
 
 
 
 
 
39dfff2
60e673f
 
 
 
 
 
39dfff2
 
60e673f
 
57c627b
 
 
 
 
 
 
 
 
 
39dfff2
60e673f
 
 
 
 
 
 
57c627b
 
 
 
 
 
 
 
 
 
 
60e673f
 
 
 
 
 
 
57c627b
 
 
 
 
 
 
 
60e673f
 
 
 
 
ad53390
57c627b
60e673f
 
 
 
 
57c627b
39dfff2
60e673f
39dfff2
60e673f

import gradio as gr
import regex as re
from dataclasses import dataclass
from typing import Dict, List, Tuple, Any

from pypdf import PdfReader
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import json
import csv
import os


# ----------------------------
# 1) Underwriting keyword dictionary
# ----------------------------
def build_keyword_dict() -> Dict[str, Dict[str, Any]]:
    """Return the underwriting keyword dictionary used for scoring.

    The result maps a category name to a config dict with three keys:

    * ``"weight"`` -- float multiplier applied to every hit in the category.
    * ``"terms"``  -- literal phrases (lower-case).
    * ``"regex"``  -- raw regular-expression strings.

    A fresh dict is built on every call, so callers may mutate the result.

    Note: several concepts appear both as a literal term and as a regex
    (e.g. "cap rate"), and a few appear in more than one category
    (e.g. DSCR / LTV), so a single mention can produce several hits.
    """
    return {
        "Pricing_Valuation": {
            "weight": 3.0,
            "terms": [
                "purchase price", "asking price", "offer price",
                "price per unit", "price per sf", "price per square foot",
                "cap rate", "going-in cap", "exit cap", "terminal cap",
                "valuation", "appraisal",
                "irr", "levered irr", "unlevered irr",
                "equity multiple", "cash-on-cash", "cash on cash",
                "yield on cost", "break-even occupancy", "breakeven occupancy",
            ],
            "regex": [
                r"\bcap\s*rate\b",
                r"\bgoing[-\s]*in\s+cap\b",
                r"\bexit\s+cap\b|\bterminal\s+cap\b",
                r"\bIRR\b",
                r"\bequity\s+multiple\b",
                r"\bcash[-\s]*on[-\s]*cash\b",
                r"\byield\s+on\s+cost\b",
                r"\bDSCR\b",
                r"\bLTV\b|\bLTC\b",
                # Dollar amounts per SF / per unit. These must NOT start with
                # ``\b``: "$" is a non-word character, so a word boundary in
                # front of it only exists when a word character precedes the
                # "$" -- which made " $125 psf" (the normal case) unmatchable.
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/sf|/SF|per\s*sf|per\s*SF|psf|PSF)\b",
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/unit|per\s*unit)\b",
            ],
        },
        "NOI_CashFlow": {
            "weight": 3.0,
            "terms": [
                "noi", "net operating income",
                "t-12", "t12", "trailing 12", "ttm",
                "ytd", "annualized", "run rate", "pro forma",
                "stabilized noi", "underwritten noi",
                "cash flow", "net cash flow", "ebitda",
                "effective gross income", "egi",
                "gross potential rent", "gpr", "scheduled rent",
                "other income", "ancillary income",
            ],
            "regex": [
                r"\bNOI\b|\bNet\s+Operating\s+Income\b",
                r"\bT-?12\b|\bTrailing\s*12\b|\bTTM\b|\bYTD\b",
                r"\bPro\s*Forma\b|\bUnderwritten\b|\bStabilized\b",
                r"\bEBITDA\b",
                r"\bEGI\b|\bEffective\s+Gross\s+Income\b",
            ],
        },
        "Occupancy_Rents": {
            "weight": 2.5,
            "terms": [
                "occupancy", "physical occupancy", "economic occupancy",
                "vacancy", "vacancy rate",
                "market rent", "in-place rent", "in place rent",
                "effective rent", "asking rent",
                "rent growth", "rental rate growth",
                "concessions", "free rent",
                "loss to lease", "mark-to-market", "mark to market",
                "renewal rate", "retention", "turnover",
                "absorption",
                "bad debt", "credit loss", "delinquency",
            ],
            "regex": [
                r"\boccupanc(?:y|ies)\b",
                r"\bvacanc(?:y|ies)\b",
                r"\bloss\s+to\s+lease\b",
                r"\bmark[-\s]*to[-\s]*market\b",
                r"\bconcession(?:s)?\b|\bfree\s+rent\b",
            ],
        },
        "Leases_Tenants": {
            "weight": 3.0,
            "terms": [
                "rent roll", "tenant", "tenant mix", "top tenants",
                "lease abstract", "lease term", "remaining term",
                "walt", "wale", "weighted average lease term",
                "commencement", "expiration", "lease expiration",
                "options", "renewal options",
                "escalations", "steps", "bumps", "rent schedule",
                "base rent", "minimum rent",
                "cam", "nnn", "triple net", "reimbursements",
                "expense stop", "base year", "gross-up", "gross up",
                "ti", "tenant improvements",
                "leasing commission", "lc",
                "security deposit", "letter of credit", "loc",
                "guaranty", "guarantee",
                "assignment", "sublease",
                "credit rating", "tenant financials",
            ],
            "regex": [
                r"\bRent\s+Roll\b",
                r"\bWALT\b|\bWALE\b|\bWeighted\s+Average\s+Lease\s+Term\b",
                r"\bNNN\b|\bTriple\s+Net\b|\bCAM\b",
                r"\bTI\b|\bTenant\s+Improvements?\b",
                r"\bLeasing\s+Commission\b|\bLC\b",
                r"\bLetter\s+of\s+Credit\b|\bLOC\b",
                r"\bLease\s+(?:Abstract|Term|Expiration|Commencement)\b",
            ],
        },
        "Expenses": {
            "weight": 2.3,
            "terms": [
                "operating expenses", "opex",
                "property tax", "real estate taxes", "taxes",
                "insurance",
                "utilities", "water", "sewer", "electric", "gas",
                "repairs and maintenance", "r&m", "maintenance",
                "payroll", "personnel",
                "management fee",
                "contract services",
                "landscaping", "trash", "janitorial",
                "marketing", "admin",
                "hoa", "coa",
                "reserves", "replacement reserves",
                "recoverable", "non-recoverable",
                "reassessment", "tax appeal", "protest",
            ],
            "regex": [
                r"\bOpEx\b|\bOperating\s+Expenses\b",
                r"\bReal\s+Estate\s+Taxes?\b|\bProperty\s+Taxes?\b|\bTaxes?\b",
                r"\bInsurance\b",
                r"\bUtilities?\b",
                r"\bManagement\s+Fee\b",
                r"\breassessment\b|\btax\s+appeal\b|\bprotest\b",
                r"\brecoverable\b|\bnon[-\s]*recoverable\b",
            ],
        },
        "CapEx_ValueAdd": {
            "weight": 2.7,
            "terms": [
                "capex", "capital expenditures",
                "renovation", "repositioning", "value-add", "value add",
                "deferred maintenance",
                "replacement reserves",
                "budget", "scope", "timeline", "phasing",
                "rent premium", "upgrade",
            ],
            "regex": [
                r"\bCapEx\b|\bCapital\s+Expenditures?\b",
                r"\bValue[-\s]*Add\b",
                r"\bDeferred\s+Maintenance\b",
                r"\bRent\s+Premium\b",
            ],
        },
        "Debt_Financing": {
            "weight": 2.8,
            "terms": [
                "loan", "debt", "financing",
                "ltv", "ltc", "dscr",
                "interest rate", "coupon", "sofr", "spread",
                "fixed", "floating",
                "amortization", "interest only", "io",
                "maturity", "term",
                "prepayment", "yield maintenance", "defeasance",
                "covenants",
                "recourse", "non-recourse", "nonrecourse",
                "refinance",
            ],
            "regex": [
                r"\bLTV\b|\bLTC\b|\bDSCR\b",
                r"\bSOFR\b",
                r"\bInterest\s+Only\b|\bIO\b",
                r"\bYield\s+Maintenance\b|\bDefeasance\b",
                r"\bNon[-\s]*Recourse\b",
            ],
        },
        "Market_Demographics": {
            "weight": 1.8,
            "terms": [
                "market", "submarket", "trade area",
                "demographics", "population", "households",
                "median household income", "mhi",
                "employment", "job growth",
                "major employers",
                "supply pipeline", "under construction", "deliveries",
                "comparable", "comp set",
                "traffic counts",
            ],
            "regex": [
                r"\bDemographics\b",
                r"\bPopulation\b|\bHouseholds\b",
                r"\bMedian\s+Household\s+Income\b|\bMHI\b",
                r"\bUnder\s+Construction\b|\bDeliveries\b|\bPipeline\b",
                r"\bTraffic\s+Counts?\b",
            ],
        },
        "Risk_Legal_DD": {
            "weight": 2.0,
            "terms": [
                "risk factors", "assumptions", "underwriting assumptions",
                "forward-looking", "disclaimer", "disclosures",
                "environmental", "phase i", "phase ii",
                "zoning", "entitlements",
                "survey", "alta",
                "title", "easement", "encumbrance",
                "ada", "flood zone", "fema",
                "litigation", "property condition assessment", "pca",
            ],
            "regex": [
                r"\bRisk\s+Factors\b|\bDisclosures?\b|\bDisclaimer\b",
                r"\bForward[-\s]*Looking\b",
                r"\bPhase\s*I\b|\bPhase\s*II\b|\bEnvironmental\b",
                r"\bZoning\b|\bEntitlements?\b",
                r"\bFlood\s+Zone\b|\bFEMA\b",
                r"\bLitigation\b",
            ],
        },
    }


# ----------------------------
# 2) PDF extraction
# ----------------------------
@dataclass
class PageText:
    """Text recovered from a single PDF page plus how it was obtained."""

    # Page number as stored by the extractor (it numbers pages from 1).
    page: int
    # The page's text content.
    text: str
    # Which route produced ``text``: "text" (embedded text layer) or "ocr".
    source: str
    # Number of characters in ``text``.
    text_chars: int


def extract_text_layer(pdf_path: str) -> List[str]:
    """Return the embedded text of every page in the PDF at ``pdf_path``.

    The result has one string per page, in page order. A page for which the
    reader yields nothing (``None`` or ``""``) contributes an empty string.
    """
    document = PdfReader(pdf_path)
    return [pg.extract_text() or "" for pg in document.pages]


def ocr_page_tesseract(img: Image.Image) -> str:
    """Run Tesseract OCR (English) on a page image and return the text.

    Returns an empty string when Tesseract yields a falsy result.
    """
    # Flags are handed to Tesseract verbatim: OCR engine mode 1,
    # page segmentation mode 6.
    recognized = pytesseract.image_to_string(img, lang="eng", config="--oem 1 --psm 6")
    return recognized if recognized else ""


def extract_pdf_pages(pdf_path: str, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int) -> List[PageText]:
    """Extract text for every page, falling back to OCR on sparse pages.

    Args:
        pdf_path: Path of the PDF to read.
        use_ocr: When true, pages whose stripped text layer is shorter than
            ``ocr_min_chars`` are rendered and OCR'd.
        ocr_min_chars: OCR trigger threshold, in characters.
        ocr_dpi: Resolution used to render a page image for OCR.

    Returns:
        One ``PageText`` per page, numbered from 1. The OCR result is kept
        only when it is strictly longer than the text layer; otherwise the
        text layer is used and ``source`` is ``"text"``.
    """
    pages: List[PageText] = []

    for page_no, raw in enumerate(extract_text_layer(pdf_path), start=1):
        base = (raw or "").strip()
        base_chars = len(base)

        if use_ocr and base_chars < int(ocr_min_chars):
            # Render just this page (first_page/last_page are 1-based).
            imgs = convert_from_path(pdf_path, dpi=int(ocr_dpi), first_page=page_no, last_page=page_no)
            # Guard against the renderer handing back no image for the page:
            # indexing imgs[0] blindly would raise IndexError and abort the
            # whole document. With no image we simply keep the text layer.
            ocr_text = (ocr_page_tesseract(imgs[0]) or "").strip() if imgs else ""
            if len(ocr_text) > base_chars:
                pages.append(PageText(page=page_no, text=ocr_text, source="ocr", text_chars=len(ocr_text)))
                continue

        pages.append(PageText(page=page_no, text=base, source="text", text_chars=base_chars))

    return pages


# ----------------------------
# 3) Matching & scoring (no pandas)
# ----------------------------
def normalize_text(s: str) -> str:
    """Lower-case ``s``; any falsy input (``None``, ``""``) becomes ``""``."""
    if not s:
        return ""
    return s.lower()


def compile_patterns(kw: Dict[str, Dict[str, Any]]) -> Dict[str, List[re.Pattern]]:
    """Compile every category's terms and regexes into pattern objects.

    For each category the returned list holds the literal-term patterns
    first (in ``"terms"`` order, blank terms dropped) followed by the
    ``"regex"`` patterns. All patterns are case-insensitive.
    """
    compiled: Dict[str, List[re.Pattern]] = {}
    for category, spec in kw.items():
        literal_pats: List[re.Pattern] = []
        for raw_term in spec.get("terms", []):
            cleaned = (raw_term or "").strip().lower()
            if cleaned:
                # Escape the phrase, then let each space match any run of
                # whitespace so line-wrapped phrases are still found.
                escaped = re.escape(cleaned).replace(r"\ ", r"\s+")
                literal_pats.append(re.compile(rf"(?i)\b{escaped}\b"))
        regex_pats = [re.compile(rf"(?i){expr}") for expr in spec.get("regex", [])]
        compiled[category] = literal_pats + regex_pats
    return compiled


def find_snippets(text: str, patterns: List[re.Pattern], window: int = 90, max_snippets: int = 4) -> List[str]:
    """Collect short context snippets around pattern matches in ``text``.

    Args:
        text: Text to search.
        patterns: Compiled patterns, tried in order.
        window: Number of characters of context kept on each side of a match.
        max_snippets: Upper bound on the number of snippets returned; a value
            of zero or less yields an empty list.

    Returns:
        Up to ``max_snippets`` snippets with whitespace runs collapsed to a
        single space. Each distinct text span is reported at most once, even
        when several patterns match the same occurrence.
    """
    if max_snippets <= 0:
        return []

    snips: List[str] = []
    seen_spans = set()
    for pat in patterns:
        for m in pat.finditer(text):
            s = max(0, m.start() - window)
            e = min(len(text), m.end() + window)
            # Overlapping patterns (a literal term and its regex twin) hit the
            # same occurrence; without this check the identical snippet would
            # be emitted twice and crowd out other matches.
            if (s, e) in seen_spans:
                continue
            seen_spans.add((s, e))
            snips.append(re.sub(r"\s+", " ", text[s:e].strip()))
            if len(snips) >= max_snippets:
                return snips
    return snips


def score_pages(pages: List[PageText], kw: Dict[str, Dict[str, Any]]):
    """Score each page against the keyword dictionary and summarise the run.

    Returns a 3-tuple ``(page_ranking, category_summary, meta)``:

    * ``page_ranking`` -- one dict per page (page, source, text_chars,
      hits_total, score_weighted, top_snippets), sorted by weighted score and
      then hit count, both descending.
    * ``category_summary`` -- one dict per category (category, weight, hits,
      weighted_hits), sorted by weighted hits and then hits, both descending.
    * ``meta`` -- page count, total hits, total weighted score, and how many
      pages came from the text layer versus OCR.
    """
    compiled = compile_patterns(kw)
    totals_by_cat = dict.fromkeys(kw, 0)
    grand_weighted = 0.0
    records = []

    for pg in pages:
        lowered = normalize_text(pg.text)
        hits_on_page = 0
        weighted_on_page = 0.0
        per_cat = {}

        for cat, cfg in kw.items():
            # Count every match of every pattern belonging to this category.
            n = sum(1 for pat in compiled[cat] for _ in pat.finditer(lowered))
            per_cat[cat] = n
            totals_by_cat[cat] += n
            if n:
                hits_on_page += n
                weighted_on_page += n * float(cfg["weight"])

        grand_weighted += weighted_on_page

        # Snippets come from the three categories with the most raw hits on
        # this page (ties keep dictionary order), two snippets per category.
        ranked = sorted(per_cat.items(), key=lambda kv: kv[1], reverse=True)
        lines: List[str] = [
            f"[{cat}] {snip}"
            for cat, n in ranked[:3]
            if n > 0
            for snip in find_snippets(lowered, compiled[cat], window=90, max_snippets=2)
        ]

        records.append({
            "page": pg.page,
            "source": pg.source,
            "text_chars": pg.text_chars,
            "hits_total": int(hits_on_page),
            "score_weighted": round(weighted_on_page, 2),
            "top_snippets": "\n".join(lines[:6]),
        })

    summary = [
        {
            "category": cat,
            "weight": cfg["weight"],
            "hits": int(totals_by_cat[cat]),
            "weighted_hits": round(totals_by_cat[cat] * float(cfg["weight"]), 2),
        }
        for cat, cfg in kw.items()
    ]

    # Both sorts are stable, so equal-scoring entries keep their input order.
    records.sort(key=lambda rec: (rec["score_weighted"], rec["hits_total"]), reverse=True)
    summary.sort(key=lambda rec: (rec["weighted_hits"], rec["hits"]), reverse=True)

    from_text = sum(1 for pg in pages if pg.source == "text")
    from_ocr = sum(1 for pg in pages if pg.source == "ocr")
    meta = {
        "total_pages": len(pages),
        "total_hits": int(sum(totals_by_cat.values())),
        "total_weighted_score": round(float(grand_weighted), 2),
        "sources": {
            "text": from_text,
            "ocr": from_ocr,
        }
    }
    return records, summary, meta


def write_csv(path: str, rows: List[dict], headers: List[str]):
    """Write ``rows`` to ``path`` as UTF-8 CSV with ``headers`` as first line.

    Each row dict is projected onto ``headers``; a key missing from a row is
    written as an empty cell.
    """
    with open(path, "w", newline="", encoding="utf-8") as fh:
        out = csv.writer(fh)
        out.writerow(headers)
        out.writerows([row.get(col, "") for col in headers] for row in rows)


# ----------------------------
# 4) Gradio app
# ----------------------------
def run_extract(pdf_file, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int, topk_pages: int):
    """Gradio callback: extract, score, and export keyword signals for a PDF.

    Args:
        pdf_file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        use_ocr: Enable OCR fallback for pages with little embedded text.
        ocr_min_chars: OCR trigger threshold (characters of page text).
        ocr_dpi: Render resolution for OCR.
        topk_pages: Number of top-ranked pages to show in the pages table.

    Returns:
        A 5-tuple ``(category_table, pages_table, summary_text, output_files,
        status)``. The tables are lists of rows whose first row holds the
        column names. ``output_files`` lists the JSON and two CSV paths.
        When no file was uploaded, only the status message is populated.
    """
    # Local import so this function does not depend on a file-level change.
    import tempfile

    if pdf_file is None:
        return None, None, "", None, "Please upload a PDF."

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    kw = build_keyword_dict()

    pages = extract_pdf_pages(
        pdf_path=pdf_path,
        use_ocr=use_ocr,
        ocr_min_chars=int(ocr_min_chars),
        ocr_dpi=int(ocr_dpi),
    )

    page_ranking, cat_summary, meta = score_pages(pages, kw)

    # A negative value would otherwise slice from the end of the ranking.
    topk = max(0, int(topk_pages))
    top_pages = page_ranking[:topk]

    payload = {
        "meta": meta,
        "category_summary": cat_summary,
        "page_ranking": page_ranking,
    }

    cats_headers = ["category", "weight", "hits", "weighted_hits"]
    pages_headers = ["page", "source", "text_chars", "hits_total", "score_weighted", "top_snippets"]

    # Write into a fresh per-run directory. Fixed names in the working
    # directory would let concurrent requests overwrite each other's files,
    # so one user could download another user's results.
    out_dir = tempfile.mkdtemp(prefix="re_extractor_")
    json_path = os.path.join(out_dir, "underwriting_keywords_output.json")
    csv_pages_path = os.path.join(out_dir, "page_ranking.csv")
    csv_cats_path = os.path.join(out_dir, "category_summary.csv")

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

    # The CSV holds the full ranking; only the on-screen table is top-K.
    write_csv(csv_pages_path, page_ranking, headers=pages_headers)
    write_csv(csv_cats_path, cat_summary, headers=cats_headers)

    summary = (
        f"Total pages: {meta['total_pages']} | "
        f"Total hits: {meta['total_hits']} | "
        f"Weighted score: {meta['total_weighted_score']} | "
        f"Sources: {meta['sources']}"
    )

    # Tables are list-of-lists with the header names as the first row.
    cats_table = [cats_headers] + [[r[h] for h in cats_headers] for r in cat_summary]
    pages_table = [pages_headers] + [[r[h] for h in pages_headers] for r in top_pages]

    return cats_table, pages_table, summary, [json_path, csv_pages_path, csv_cats_path], "Done."


# Build the UI. Components are laid out in the order they are created inside
# the Blocks context, so statement order here is the page layout.
with gr.Blocks(title="OM Underwriting Keyword Extractor") as demo:
    gr.Markdown(
        "# OM Underwriting Keyword Extractor\n"
        "Upload a real estate OM PDF and extract underwriting keyword signals.\n\n"
        "**This build uses minimal deps (no pandas/numpy/torch).** OCR fallback uses Tesseract."
    )

    # Inputs: the PDF upload on one side, extraction settings in a column.
    with gr.Row():
        pdf = gr.File(label="Upload OM PDF", file_types=[".pdf"])
        with gr.Column():
            use_ocr = gr.Checkbox(value=True, label="Enable OCR fallback (recommended for OM)")
            # Pages whose text layer has fewer characters than this get OCR'd.
            ocr_min_chars = gr.Slider(0, 3000, value=350, step=50, label="OCR trigger: if text chars on page <")
            ocr_dpi = gr.Slider(120, 300, value=200, step=10, label="OCR render DPI")
            topk_pages = gr.Slider(5, 60, value=15, step=1, label="Show Top-K pages")

    run_btn = gr.Button("Extract Keywords")

    # Outputs, in the same order as run_extract's return tuple.
    gr.Markdown("## Category Summary (sorted by weighted hits)")
    out_cats = gr.Dataframe(interactive=False)

    gr.Markdown("## Top Pages (highest underwriting signal)")
    out_pages = gr.Dataframe(interactive=False)

    out_summary = gr.Textbox(label="Run Summary", interactive=False)
    out_files = gr.File(label="Download Outputs (JSON + CSVs)", file_count="multiple")
    out_status = gr.Textbox(label="Status", interactive=False)

    # Wire the button: inputs map positionally onto run_extract's parameters,
    # outputs map positionally onto its five return values.
    run_btn.click(
        fn=run_extract,
        inputs=[pdf, use_ocr, ocr_min_chars, ocr_dpi, topk_pages],
        outputs=[out_cats, out_pages, out_summary, out_files, out_status],
    )

# Runs unconditionally (there is no ``__main__`` guard), so the app is
# launched whenever this module is executed or imported.
demo.launch()