# RE_Extractor / app.py
# Hugging Face Space by SarahXia0405 — commit 57c627b (verified), "Update app.py"
import gradio as gr
import regex as re
from dataclasses import dataclass
from typing import Dict, List, Tuple, Any
from pypdf import PdfReader
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import json
import csv
import os
# ----------------------------
# 1) Underwriting keyword dictionary
# ----------------------------
def build_keyword_dict() -> Dict[str, Dict[str, Any]]:
    """Return the underwriting keyword dictionary.

    Each category maps to a config with:
      - "weight": float multiplier applied to every hit in the category,
      - "terms": plain phrases matched case-insensitively on word boundaries,
      - "regex": raw regex fragments compiled with an (?i) prefix downstream.

    Fix vs. original: the two price-per-SF / price-per-unit patterns started
    with ``\\b\\$``, but ``\\b`` before ``$`` requires a *word* character
    immediately preceding the dollar sign, so prices after a space (the normal
    case, e.g. "at $25 psf") never matched. The spurious ``\\b`` is removed.
    """
    return {
        "Pricing_Valuation": {
            "weight": 3.0,
            "terms": [
                "purchase price", "asking price", "offer price",
                "price per unit", "price per sf", "price per square foot",
                "cap rate", "going-in cap", "exit cap", "terminal cap",
                "valuation", "appraisal",
                "irr", "levered irr", "unlevered irr",
                "equity multiple", "cash-on-cash", "cash on cash",
                "yield on cost", "break-even occupancy", "breakeven occupancy",
            ],
            "regex": [
                r"\bcap\s*rate\b",
                r"\bgoing[-\s]*in\s+cap\b",
                r"\bexit\s+cap\b|\bterminal\s+cap\b",
                r"\bIRR\b",
                r"\bequity\s+multiple\b",
                r"\bcash[-\s]*on[-\s]*cash\b",
                r"\byield\s+on\s+cost\b",
                r"\bDSCR\b",
                r"\bLTV\b|\bLTC\b",
                # No \b before \$: a word boundary cannot sit between a space
                # and '$', which made the original pattern unmatchable in
                # ordinary prose like "priced at $25 psf".
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/sf|/SF|per\s*sf|per\s*SF|psf|PSF)\b",
                r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:/unit|per\s*unit)\b",
            ],
        },
        "NOI_CashFlow": {
            "weight": 3.0,
            "terms": [
                "noi", "net operating income",
                "t-12", "t12", "trailing 12", "ttm",
                "ytd", "annualized", "run rate", "pro forma",
                "stabilized noi", "underwritten noi",
                "cash flow", "net cash flow", "ebitda",
                "effective gross income", "egi",
                "gross potential rent", "gpr", "scheduled rent",
                "other income", "ancillary income",
            ],
            "regex": [
                r"\bNOI\b|\bNet\s+Operating\s+Income\b",
                r"\bT-?12\b|\bTrailing\s*12\b|\bTTM\b|\bYTD\b",
                r"\bPro\s*Forma\b|\bUnderwritten\b|\bStabilized\b",
                r"\bEBITDA\b",
                r"\bEGI\b|\bEffective\s+Gross\s+Income\b",
            ],
        },
        "Occupancy_Rents": {
            "weight": 2.5,
            "terms": [
                "occupancy", "physical occupancy", "economic occupancy",
                "vacancy", "vacancy rate",
                "market rent", "in-place rent", "in place rent",
                "effective rent", "asking rent",
                "rent growth", "rental rate growth",
                "concessions", "free rent",
                "loss to lease", "mark-to-market", "mark to market",
                "renewal rate", "retention", "turnover",
                "absorption",
                "bad debt", "credit loss", "delinquency",
            ],
            "regex": [
                r"\boccupanc(?:y|ies)\b",
                r"\bvacanc(?:y|ies)\b",
                r"\bloss\s+to\s+lease\b",
                r"\bmark[-\s]*to[-\s]*market\b",
                r"\bconcession(?:s)?\b|\bfree\s+rent\b",
            ],
        },
        "Leases_Tenants": {
            "weight": 3.0,
            "terms": [
                "rent roll", "tenant", "tenant mix", "top tenants",
                "lease abstract", "lease term", "remaining term",
                "walt", "wale", "weighted average lease term",
                "commencement", "expiration", "lease expiration",
                "options", "renewal options",
                "escalations", "steps", "bumps", "rent schedule",
                "base rent", "minimum rent",
                "cam", "nnn", "triple net", "reimbursements",
                "expense stop", "base year", "gross-up", "gross up",
                "ti", "tenant improvements",
                "leasing commission", "lc",
                "security deposit", "letter of credit", "loc",
                "guaranty", "guarantee",
                "assignment", "sublease",
                "credit rating", "tenant financials",
            ],
            "regex": [
                r"\bRent\s+Roll\b",
                r"\bWALT\b|\bWALE\b|\bWeighted\s+Average\s+Lease\s+Term\b",
                r"\bNNN\b|\bTriple\s+Net\b|\bCAM\b",
                r"\bTI\b|\bTenant\s+Improvements?\b",
                r"\bLeasing\s+Commission\b|\bLC\b",
                r"\bLetter\s+of\s+Credit\b|\bLOC\b",
                r"\bLease\s+(?:Abstract|Term|Expiration|Commencement)\b",
            ],
        },
        "Expenses": {
            "weight": 2.3,
            "terms": [
                "operating expenses", "opex",
                "property tax", "real estate taxes", "taxes",
                "insurance",
                "utilities", "water", "sewer", "electric", "gas",
                "repairs and maintenance", "r&m", "maintenance",
                "payroll", "personnel",
                "management fee",
                "contract services",
                "landscaping", "trash", "janitorial",
                "marketing", "admin",
                "hoa", "coa",
                "reserves", "replacement reserves",
                "recoverable", "non-recoverable",
                "reassessment", "tax appeal", "protest",
            ],
            "regex": [
                r"\bOpEx\b|\bOperating\s+Expenses\b",
                r"\bReal\s+Estate\s+Taxes?\b|\bProperty\s+Taxes?\b|\bTaxes?\b",
                r"\bInsurance\b",
                r"\bUtilities?\b",
                r"\bManagement\s+Fee\b",
                r"\breassessment\b|\btax\s+appeal\b|\bprotest\b",
                r"\brecoverable\b|\bnon[-\s]*recoverable\b",
            ],
        },
        "CapEx_ValueAdd": {
            "weight": 2.7,
            "terms": [
                "capex", "capital expenditures",
                "renovation", "repositioning", "value-add", "value add",
                "deferred maintenance",
                "replacement reserves",
                "budget", "scope", "timeline", "phasing",
                "rent premium", "upgrade",
            ],
            "regex": [
                r"\bCapEx\b|\bCapital\s+Expenditures?\b",
                r"\bValue[-\s]*Add\b",
                r"\bDeferred\s+Maintenance\b",
                r"\bRent\s+Premium\b",
            ],
        },
        "Debt_Financing": {
            "weight": 2.8,
            "terms": [
                "loan", "debt", "financing",
                "ltv", "ltc", "dscr",
                "interest rate", "coupon", "sofr", "spread",
                "fixed", "floating",
                "amortization", "interest only", "io",
                "maturity", "term",
                "prepayment", "yield maintenance", "defeasance",
                "covenants",
                "recourse", "non-recourse", "nonrecourse",
                "refinance",
            ],
            "regex": [
                r"\bLTV\b|\bLTC\b|\bDSCR\b",
                r"\bSOFR\b",
                r"\bInterest\s+Only\b|\bIO\b",
                r"\bYield\s+Maintenance\b|\bDefeasance\b",
                r"\bNon[-\s]*Recourse\b",
            ],
        },
        "Market_Demographics": {
            "weight": 1.8,
            "terms": [
                "market", "submarket", "trade area",
                "demographics", "population", "households",
                "median household income", "mhi",
                "employment", "job growth",
                "major employers",
                "supply pipeline", "under construction", "deliveries",
                "comparable", "comp set",
                "traffic counts",
            ],
            "regex": [
                r"\bDemographics\b",
                r"\bPopulation\b|\bHouseholds\b",
                r"\bMedian\s+Household\s+Income\b|\bMHI\b",
                r"\bUnder\s+Construction\b|\bDeliveries\b|\bPipeline\b",
                r"\bTraffic\s+Counts?\b",
            ],
        },
        "Risk_Legal_DD": {
            "weight": 2.0,
            "terms": [
                "risk factors", "assumptions", "underwriting assumptions",
                "forward-looking", "disclaimer", "disclosures",
                "environmental", "phase i", "phase ii",
                "zoning", "entitlements",
                "survey", "alta",
                "title", "easement", "encumbrance",
                "ada", "flood zone", "fema",
                "litigation", "property condition assessment", "pca",
            ],
            "regex": [
                r"\bRisk\s+Factors\b|\bDisclosures?\b|\bDisclaimer\b",
                r"\bForward[-\s]*Looking\b",
                r"\bPhase\s*I\b|\bPhase\s*II\b|\bEnvironmental\b",
                r"\bZoning\b|\bEntitlements?\b",
                r"\bFlood\s+Zone\b|\bFEMA\b",
                r"\bLitigation\b",
            ],
        },
    }
# ----------------------------
# 2) PDF extraction
# ----------------------------
@dataclass
class PageText:
    """Extracted text for a single PDF page, plus where it came from."""
    page: int        # 1-based page number
    text: str        # stripped page text (embedded layer or OCR output)
    source: str      # "text" or "ocr"
    text_chars: int  # len(text) after stripping
def extract_text_layer(pdf_path: str) -> List[str]:
    """Return the embedded text layer of every page ("" for pages without one)."""
    return [page.extract_text() or "" for page in PdfReader(pdf_path).pages]
def ocr_page_tesseract(img: Image.Image) -> str:
    """Run Tesseract OCR on a rendered page image (LSTM engine, single uniform block)."""
    tess_config = "--oem 1 --psm 6"
    recognized = pytesseract.image_to_string(img, lang="eng", config=tess_config)
    return recognized or ""
def extract_pdf_pages(pdf_path: str, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int) -> List[PageText]:
    """Extract per-page text, OCRing pages whose embedded text layer is sparse.

    A page is OCR'd when *use_ocr* is set and its stripped text layer has fewer
    than *ocr_min_chars* characters; the OCR result is kept only if it yields
    more characters than the text layer did.
    """
    results: List[PageText] = []
    for page_no, raw in enumerate(extract_text_layer(pdf_path), start=1):
        stripped = (raw or "").strip()
        n_chars = len(stripped)
        if use_ocr and n_chars < int(ocr_min_chars):
            # Render just this one page and try OCR as a fallback.
            rendered = convert_from_path(pdf_path, dpi=int(ocr_dpi), first_page=page_no, last_page=page_no)
            ocr_text = (ocr_page_tesseract(rendered[0]) or "").strip()
            if len(ocr_text) > n_chars:
                results.append(PageText(page=page_no, text=ocr_text, source="ocr", text_chars=len(ocr_text)))
                continue
        results.append(PageText(page=page_no, text=stripped, source="text", text_chars=n_chars))
    return results
# ----------------------------
# 3) Matching & scoring (no pandas)
# ----------------------------
def normalize_text(s: str) -> str:
    """Lower-case *s*; treat None/empty as the empty string."""
    if not s:
        return ""
    return s.lower()
def compile_patterns(kw: Dict[str, Dict[str, Any]]) -> Dict[str, List[re.Pattern]]:
    """Compile each category's terms and raw regexes into case-insensitive patterns.

    Plain terms are escaped and wrapped in word boundaries; whitespace inside a
    multi-word term is generalized to ``\\s+`` so a phrase still matches across
    line wraps in extracted PDF text.

    Robustness fix: the original only replaced the escaped form ``\\ `` with
    ``\\s+``. The file aliases the third-party ``regex`` module as ``re``, and
    ``regex.escape`` (``special_only=True`` by default) may leave spaces
    unescaped, which silently broke every multi-word term. Both the escaped and
    bare-space forms are now normalized.
    """
    compiled: Dict[str, List[re.Pattern]] = {}
    for cat, cfg in kw.items():
        pats: List[re.Pattern] = []
        for term in cfg.get("terms", []):
            term = (term or "").strip().lower()
            if not term:
                continue  # skip blank entries defensively
            # Normalize spaces whether or not escape() backslashed them.
            pat = re.escape(term).replace(r"\ ", " ").replace(" ", r"\s+")
            pats.append(re.compile(rf"(?i)\b{pat}\b"))
        for rp in cfg.get("regex", []):
            # Raw fragments already carry their own \b anchors where needed.
            pats.append(re.compile(rf"(?i){rp}"))
        compiled[cat] = pats
    return compiled
def find_snippets(text: str, patterns: List[re.Pattern], window: int = 90, max_snippets: int = 4) -> List[str]:
    """Collect up to *max_snippets* whitespace-normalized context windows.

    For each pattern match, *window* characters on either side are kept,
    stripped, and collapsed to single spaces. Stops as soon as the cap is hit.
    """
    collected: List[str] = []
    for pattern in patterns:
        for match in pattern.finditer(text):
            lo = max(0, match.start() - window)
            hi = min(len(text), match.end() + window)
            cleaned = re.sub(r"\s+", " ", text[lo:hi].strip())
            collected.append(cleaned)
            if len(collected) >= max_snippets:
                return collected
    return collected
def score_pages(pages: List[PageText], kw: Dict[str, Dict[str, Any]]):
    """Score every page against the keyword dictionary.

    Returns a 3-tuple:
      - page records sorted by (weighted score, total hits) descending,
      - per-category summary sorted by (weighted hits, hits) descending,
      - meta dict with document-level totals and text/OCR page counts.
    """
    compiled = compile_patterns(kw)
    overall_hits = {cat: 0 for cat in kw}
    overall_weighted = 0.0
    page_records = []
    for pg in pages:
        lowered = normalize_text(pg.text)
        cat_hits: Dict[str, int] = {}
        total_hits = 0
        weighted = 0.0
        # Count matches per category on this page.
        for cat, cfg in kw.items():
            n = sum(1 for pat in compiled[cat] for _ in pat.finditer(lowered))
            cat_hits[cat] = n
            overall_hits[cat] += n
            if n:
                total_hits += n
                weighted += n * float(cfg["weight"])
        overall_weighted += weighted
        # Pull a couple of example snippets from the three strongest categories.
        snippet_lines: List[str] = []
        ranked = sorted(cat_hits.items(), key=lambda kv: kv[1], reverse=True)
        for cat, n in ranked[:3]:
            if n > 0:
                for snip in find_snippets(lowered, compiled[cat], window=90, max_snippets=2):
                    snippet_lines.append(f"[{cat}] {snip}")
        page_records.append({
            "page": pg.page,
            "source": pg.source,
            "text_chars": pg.text_chars,
            "hits_total": int(total_hits),
            "score_weighted": round(weighted, 2),
            "top_snippets": "\n".join(snippet_lines[:6]),
        })
    cat_summary = [
        {
            "category": cat,
            "weight": cfg["weight"],
            "hits": int(overall_hits[cat]),
            "weighted_hits": round(overall_hits[cat] * float(cfg["weight"]), 2),
        }
        for cat, cfg in kw.items()
    ]
    # Stable in-place sorts produce the same ordering as sorted() copies.
    page_records.sort(key=lambda r: (r["score_weighted"], r["hits_total"]), reverse=True)
    cat_summary.sort(key=lambda r: (r["weighted_hits"], r["hits"]), reverse=True)
    meta = {
        "total_pages": len(pages),
        "total_hits": int(sum(overall_hits.values())),
        "total_weighted_score": round(float(overall_weighted), 2),
        "sources": {
            "text": sum(1 for p in pages if p.source == "text"),
            "ocr": sum(1 for p in pages if p.source == "ocr"),
        }
    }
    return page_records, cat_summary, meta
def write_csv(path: str, rows: List[dict], headers: List[str]):
    """Write *rows* (dicts) to *path* as CSV, columns ordered by *headers*.

    Missing keys are written as empty cells.
    """
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(headers)
        writer.writerows([row.get(h, "") for h in headers] for row in rows)
# ----------------------------
# 4) Gradio app
# ----------------------------
def run_extract(pdf_file, use_ocr: bool, ocr_min_chars: int, ocr_dpi: int, topk_pages: int):
    """Gradio click handler: extract underwriting keyword signals from the upload.

    Returns a 5-tuple matching the app's output components:
    (category table, top-pages table, summary string, downloadable file paths,
    status message). Writes one JSON and two CSV files into the working
    directory as a side effect.
    """
    if pdf_file is None:
        # Nothing uploaded yet — leave tables empty and prompt the user.
        return None, None, "", None, "Please upload a PDF."
    kw = build_keyword_dict()
    # NOTE(review): pdf_file.name assumes a file-like upload object — newer
    # Gradio versions may pass a plain path string; confirm against the UI.
    pages = extract_pdf_pages(
        pdf_path=pdf_file.name,
        use_ocr=use_ocr,
        ocr_min_chars=int(ocr_min_chars),
        ocr_dpi=int(ocr_dpi),
    )
    page_ranking, cat_summary, meta = score_pages(pages, kw)
    topk = int(topk_pages)
    top_pages = page_ranking[:topk]
    # Persist the full results (not just the displayed Top-K) for download.
    payload = {
        "meta": meta,
        "category_summary": cat_summary,
        "page_ranking": page_ranking,
    }
    json_path = "underwriting_keywords_output.json"
    csv_pages_path = "page_ranking.csv"
    csv_cats_path = "category_summary.csv"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    write_csv(
        csv_pages_path,
        page_ranking,
        headers=["page", "source", "text_chars", "hits_total", "score_weighted", "top_snippets"]
    )
    write_csv(
        csv_cats_path,
        cat_summary,
        headers=["category", "weight", "hits", "weighted_hits"]
    )
    summary = (
        f"Total pages: {meta['total_pages']} | "
        f"Total hits: {meta['total_hits']} | "
        f"Weighted score: {meta['total_weighted_score']} | "
        f"Sources: {meta['sources']}"
    )
    # Gradio Dataframe expects list-of-lists with headers
    cats_headers = ["category", "weight", "hits", "weighted_hits"]
    cats_table = [cats_headers] + [[r[h] for h in cats_headers] for r in cat_summary]
    pages_headers = ["page", "source", "text_chars", "hits_total", "score_weighted", "top_snippets"]
    pages_table = [pages_headers] + [[r[h] for h in pages_headers] for r in top_pages]
    return cats_table, pages_table, summary, [json_path, csv_pages_path, csv_cats_path], "Done."
# Build the Gradio UI: upload + OCR controls on top, result tables and
# downloadable outputs below. run_extract's return tuple maps 1:1 onto the
# five components listed in `outputs`.
with gr.Blocks(title="OM Underwriting Keyword Extractor") as demo:
    gr.Markdown(
        "# OM Underwriting Keyword Extractor\n"
        "Upload a real estate OM PDF and extract underwriting keyword signals.\n\n"
        "**This build uses minimal deps (no pandas/numpy/torch).** OCR fallback uses Tesseract."
    )
    with gr.Row():
        pdf = gr.File(label="Upload OM PDF", file_types=[".pdf"])
        with gr.Column():
            # OCR tuning knobs: when to trigger OCR and at what render quality.
            use_ocr = gr.Checkbox(value=True, label="Enable OCR fallback (recommended for OM)")
            ocr_min_chars = gr.Slider(0, 3000, value=350, step=50, label="OCR trigger: if text chars on page <")
            ocr_dpi = gr.Slider(120, 300, value=200, step=10, label="OCR render DPI")
            topk_pages = gr.Slider(5, 60, value=15, step=1, label="Show Top-K pages")
    run_btn = gr.Button("Extract Keywords")
    gr.Markdown("## Category Summary (sorted by weighted hits)")
    out_cats = gr.Dataframe(interactive=False)
    gr.Markdown("## Top Pages (highest underwriting signal)")
    out_pages = gr.Dataframe(interactive=False)
    out_summary = gr.Textbox(label="Run Summary", interactive=False)
    out_files = gr.File(label="Download Outputs (JSON + CSVs)", file_count="multiple")
    out_status = gr.Textbox(label="Status", interactive=False)
    run_btn.click(
        fn=run_extract,
        inputs=[pdf, use_ocr, ocr_min_chars, ocr_dpi, topk_pages],
        outputs=[out_cats, out_pages, out_summary, out_files, out_status],
    )
demo.launch()