File size: 2,010 Bytes
4385db3
 
 
8cf23c5
 
 
 
4385db3
 
8cf23c5
 
 
4385db3
 
8cf23c5
 
 
 
 
 
 
 
4385db3
8cf23c5
 
4385db3
 
 
8cf23c5
4385db3
 
8cf23c5
 
 
 
 
 
 
 
4385db3
 
 
 
8cf23c5
 
 
 
4385db3
 
8cf23c5
 
4385db3
 
8cf23c5
 
 
4385db3
8cf23c5
 
 
4385db3
8cf23c5
 
 
 
 
 
 
 
4385db3
8cf23c5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re


# -----------------------------
# INTEREST RATE EXTRACTION
# -----------------------------

def extract_interest_rate(text):
    """
    Extract the interest rate (in percent) using semantic priority.

    The document can contain multiple rates. The labelled patterns below
    are tried in priority order and the first label found in the text wins,
    regardless of where it appears in the document.

    Args:
        text: Document text to search.

    Returns:
        The rate as a float (e.g. 4.35 for "4.35%"), 0.0 when only a
        zero-interest marker is present, or None when nothing matches.
    """
    # NOTE(review): the non-ASCII literals in this function look like Hebrew
    # text decoded with the wrong codec -- confirm the file encoding. They
    # are reproduced exactly as found.

    # The captured number accepts whole percentages ("5%") as well as
    # decimal ones ("4.35%"); requiring a decimal point would silently skip
    # a higher-priority label whose rate happens to be a whole number.
    priority_patterns = (
        # 1. Comparison rate (most stable & always present)
        r"砖讬注讜专\s+讛专讬讘讬转\s+诇爪专讻讬\s+讛砖讜讜讗讛\s*[:\-]?\s*(\d+(?:\.\d+)?)\s*%",

        # 2. Forecast total interest rate
        r"讛专讬讘讬转\s+讛讻讜诇诇转\s+讛讞讝讜讬讛\s*[:\-]?\s*(\d+(?:\.\d+)?)\s*%",

        # 3. Adjusted interest rate
        r"砖讬注讜专\s+讛专讬讘讬转\s+讛诪转讜讗诪转\s*[:\-]?\s*(\d+(?:\.\d+)?)\s*%",

        # 4. Base interest rate
        r"砖讬注讜专\s+讛专讬讘讬转\s*[:\-]?\s*(\d+(?:\.\d+)?)\s*%",
    )

    for pattern in priority_patterns:
        match = re.search(pattern, text)
        if match:
            # The group only ever holds digits with an optional fractional
            # part, so float() cannot fail here.
            return float(match.group(1))

    # Special case: Bank of Israel 0% loans
    if "诪转讜讜讛 讘谞拽 讬砖专讗诇" in text:
        return 0.0

    # A literal zero rate ("0", "0.0", "0,00", ...). The lookahead rejects
    # non-zero values such as "0.75", which a plain substring test would
    # misreport as a 0% loan.
    if re.search(r"专讬讘讬转 0(?:[.,]0+)?(?![.,]?\d)", text):
        return 0.0

    return None


# -----------------------------
# LOAN AMOUNT EXTRACTION
# -----------------------------

def extract_loan_amount(text):
    """
    Return the loan amount taken from an execution-amount label.

    Only the two labelled patterns below are consulted; balances, totals
    and monthly values are never used as a fallback.

    Args:
        text: Document text to search.

    Returns:
        The amount as a float with thousands separators removed, or None
        when no label matches or the captured text is not a number.
    """
    # NOTE(review): the non-ASCII literals in this function look like Hebrew
    # text decoded with the wrong codec -- confirm the file encoding. They
    # are reproduced exactly as found.
    execution_amount_patterns = (
        # Canonical execution amount
        r"住讻讜诐\s+讞诇拽\s+讝讛\s+讘注转\s+讛讘讬爪讜注\s*[:\-]?\s*([\d,]+(?:\.\d{2})?)",
        # Variant formatting sometimes seen
        r"住讻讜诐\s+讛讛诇讜讜讗讛\s+讘注转\s+讛讘讬爪讜注\s*[:\-]?\s*([\d,]+(?:\.\d{2})?)",
    )

    for regex in execution_amount_patterns:
        found = re.search(regex, text)
        if found is None:
            continue

        # Drop thousands separators before converting.
        digits = found.group(1).replace(",", "")
        try:
            amount = float(digits)
        except ValueError:
            # The capture can consist solely of commas, leaving an empty
            # string; fall through to the next pattern in that case.
            continue
        return amount

    return None