import re
from typing import Optional, Sequence

# -----------------------------
# SHARED PATTERN FRAGMENTS
# -----------------------------

# Optional ":" or "-" between a label and its value, with optional whitespace.
_SEPARATOR = r"\s*[:\-]?\s*"

# A percentage such as "4.25%" or "5%"; the fractional part is optional so
# that whole-number rates are recognised as well.
_RATE_VALUE = r"(\d+(?:\.\d+)?)\s*%"

# An amount such as "250,000.00" or "1000" (thousands separators allowed).
_AMOUNT_VALUE = r"([\d,]+(?:\.\d{2})?)"


def _first_number(patterns: Sequence["re.Pattern"], text: str) -> Optional[float]:
    """Return the first capture group, in pattern priority order, as a float.

    Thousands separators (",") are stripped before conversion. A pattern
    whose captured text is not a valid number (e.g. only commas) is skipped
    and the next pattern is tried. Returns None when nothing usable is found.
    """
    for pattern in patterns:
        match = pattern.search(text)
        if not match:
            continue
        try:
            return float(match.group(1).replace(",", ""))
        except ValueError:
            continue
    return None


# -----------------------------
# INTEREST RATE EXTRACTION
# -----------------------------

# Labels ordered from most to least authoritative; the first match wins.
_INTEREST_RATE_PATTERNS = tuple(
    re.compile(label + _SEPARATOR + _RATE_VALUE)
    for label in (
        # 1. Comparison rate (most stable & always present)
        r"שיעור\s+הריבית\s+לצרכי\s+השוואה",
        # 2. Forecast total interest rate
        r"הריבית\s+הכוללת\s+החזויה",
        # 3. Adjusted interest rate
        r"שיעור\s+הריבית\s+המתואמת",
        # 4. Base interest rate
        r"שיעור\s+הריבית",
    )
)

# "interest 0" where the value really is zero: "0", "0.0", "0.00", ...
# The negative lookahead rejects values such as "0.5" or "05", which a plain
# substring test for "ריבית 0" would wrongly treat as a zero rate.
_ZERO_RATE_RE = re.compile(r"ריבית\s+0(?:\.0+)?(?!\.?\d)")

# Marker text for the Bank of Israel 0% loan outline.
_BANK_OF_ISRAEL_OUTLINE = "מתווה בנק ישראל"


def extract_interest_rate(text: Optional[str]) -> Optional[float]:
    """Extract the interest rate (in percent) using semantic priority.

    The document can contain multiple rates; the most authoritative labelled
    rate is selected (comparison rate, then forecast total rate, then
    adjusted rate, then base rate).

    Args:
        text: Raw document text. ``None`` or an empty string yields ``None``.

    Returns:
        The rate as a float (e.g. ``4.25`` for "4.25%"), ``0.0`` for the
        Bank of Israel outline / explicit zero-interest wording, or ``None``
        when no rate can be determined.
    """
    if not text:
        return None

    rate = _first_number(_INTEREST_RATE_PATTERNS, text)
    if rate is not None:
        return rate

    # Special case: Bank of Israel 0% loans carry no labelled rate line.
    if _BANK_OF_ISRAEL_OUTLINE in text or _ZERO_RATE_RE.search(text):
        return 0.0

    return None


# -----------------------------
# LOAN AMOUNT EXTRACTION
# -----------------------------

_LOAN_AMOUNT_PATTERNS = tuple(
    re.compile(label + _SEPARATOR + _AMOUNT_VALUE)
    for label in (
        # Canonical execution amount
        r"סכום\s+חלק\s+זה\s+בעת\s+הביצוע",
        # Variant formatting sometimes seen
        r"סכום\s+ההלוואה\s+בעת\s+הביצוע",
    )
)


def extract_loan_amount(text: Optional[str]) -> Optional[float]:
    """Extract the loan amount ONLY from the execution-amount field.

    Balances, totals and monthly values are never used as a fallback.

    Args:
        text: Raw document text. ``None`` or an empty string yields ``None``.

    Returns:
        The amount as a float with thousands separators removed, or ``None``
        when no execution amount is present.
    """
    if not text:
        return None
    return _first_number(_LOAN_AMOUNT_PATTERNS, text)