# mortgage_OCR / extractor.py
# mlbench123's picture
# Update extractor.py
# 8cf23c5 verified
import re
# -----------------------------
# INTEREST RATE EXTRACTION
# -----------------------------
def extract_interest_rate(text):
    """
    Extract the interest rate (in percent) from document text.

    The text can mention several rates, so labelled rates are tried in a
    fixed priority order and the first label that is followed by a
    percentage value wins.

    Args:
        text: Document text to search.

    Returns:
        The rate as a float (e.g. 4.25 for "4.25%"), 0.0 when the text marks
        a zero-interest loan, or None when no rate can be identified.
    """
    # NOTE(review): the label literals below look mis-encoded in this copy of
    # the file (presumably Hebrew) - confirm against the original source.
    rate_labels = [
        # 1. Comparison rate (most stable & always present)
        r"砖讬注讜专\s+讛专讬讘讬转\s+诇爪专讻讬\s+讛砖讜讜讗讛",
        # 2. Forecast total interest rate
        r"讛专讬讘讬转\s+讛讻讜诇诇转\s+讛讞讝讜讬讛",
        # 3. Adjusted interest rate
        r"砖讬注讜专\s+讛专讬讘讬转\s+讛诪转讜讗诪转",
        # 4. Base interest rate
        r"砖讬注讜专\s+讛专讬讘讬转",
    ]
    # Optional ":" / "-" separator, then the value and a percent sign.
    # The fractional part is optional so whole-number rates ("5%") are
    # extracted as well as decimal ones ("5.25%").
    value_suffix = r"\s*[:\-]?\s*(\d+(?:\.\d+)?)\s*%"

    for label in rate_labels:
        match = re.search(label + value_suffix, text)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                # Defensive: fall through to the next, lower-priority label.
                continue

    # Special case: Bank of Israel 0% loans
    if "诪转讜讜讛 讘谞拽 讬砖专讗诇" in text:
        return 0.0
    # Only treat the "rate 0" marker as zero interest when the value really
    # is zero. A plain substring test would also fire on "0.5" or "0,75" and
    # misreport a non-zero rate as 0.0.
    if re.search(r"专讬讘讬转 0+(?:\.0+)?(?![.,]?\d)", text):
        return 0.0

    return None
# -----------------------------
# LOAN AMOUNT EXTRACTION
# -----------------------------
def extract_loan_amount(text):
    """
    Return the loan amount taken from the labelled execution amount.

    Only the explicit execution-amount labels below are consulted; balances,
    totals and monthly values are never used as a fallback.

    Args:
        text: Document text to search.

    Returns:
        The amount as a float with thousands separators removed, or None
        when no execution-amount label yields a parsable number.
    """
    # NOTE(review): the label literals below look mis-encoded in this copy of
    # the file (presumably Hebrew) - confirm against the original source.
    execution_amount_patterns = (
        # Canonical execution amount
        r"住讻讜诐\s+讞诇拽\s+讝讛\s+讘注转\s+讛讘讬爪讜注\s*[:\-]?\s*([\d,]+(?:\.\d{2})?)",
        # Variant formatting sometimes seen
        r"住讻讜诐\s+讛讛诇讜讜讗讛\s+讘注转\s+讛讘讬爪讜注\s*[:\-]?\s*([\d,]+(?:\.\d{2})?)",
    )

    for regex in execution_amount_patterns:
        found = re.search(regex, text)
        if found is None:
            continue

        # Drop thousands separators before converting.
        digits = found.group(1).replace(",", "")
        try:
            amount = float(digits)
        except ValueError:
            # The capture can be commas only, which leaves an empty string;
            # move on to the next pattern.
            continue
        return amount

    return None