File size: 12,155 Bytes
9d21edd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""

Strict Domain Analyzer for Legal Documents.

Implements specific checks for:

- Entity Roles (Vendor vs Vendee)

- Domain Categories (Financial, Possession, Ownership, etc.)

- Timeline Logic (Agreement vs Registration)

- Numeric Consistency within context

"""

import re

# =========================
# 1. STRICT CLASSIFICATION
# =========================

def is_legal_boilerplate(text):
    """Detects standard legal headers, footers, and witness blocks."""
    t = text.lower()
    patterns = [
        "in witness whereof", "signed and delivered", "witnesses:", 
        "schedule", "jurisdiction", "arbitration", "notice",
        "all that piece and parcel", "north by", "south by"
    ]
    # If it's very short (< 5 words) and contains a keyword
    words = t.split()
    if len(words) < 5 and any(p in t for p in patterns):
        return True
    
    # If it's just a signature block
    if "signed by" in t or "witness" in t:
        return True
        
    return False

def get_clause_domain(text):
    """

    Classify clause into strict legal domains.

    Returns: 'FINANCIAL', 'POSSESSION', 'OWNERSHIP', 'ENCUMBRANCE', 'ADMINISTRATIVE', 'RECITAL', 'DEFINITION', 'OPERATIVE' or 'GENERAL'

    """
    t = text.lower()
    
    # 1. RECITAL (Background)
    if t.startswith("whereas") or "and whereas" in t:
        return "RECITAL"
        
    # 2. DEFINITION
    if "shall mean" in t or "expression vendor" in t or "expression vendee" in t:
        return "DEFINITION"

    # 3. FINANCIAL (Money, Consideration)
    if any(w in t for w in ["rs.", "rupees", "paid", "consideration", "sum of", "amount", "price", "cheque", "bank"]):
        return "FINANCIAL"

    # 4. POSSESSION (Handover, Vacant)
    if any(w in t for w in ["possession", "handed over", "delivered", "vacant"]):
        return "POSSESSION"

    # 5. OWNERSHIP / TITLE
    if any(w in t for w in ["owner", "title", "interest", "rights", "absolute", "fee simple"]):
        return "OWNERSHIP"
        
    # 6. ENCUMBRANCE (Loans, Mortgages)
    if any(w in t for w in ["encumbrance", "mortgage", "loan", "charge", "lien", "litigation"]):
        return "ENCUMBRANCE"
        
    # 7. ADMINISTRATIVE (Boilerplate)
    if any(w in t for w in ["witness", "signed", "schedule", "jurisdiction", "arbitration", "notice"]):
        return "ADMINISTRATIVE"

    # 8. OPERATIVE (Action)
    if t.startswith("that") or "hereby" in t or "now this deed" in t:
        return "OPERATIVE"

    return "GENERAL"

def get_entities(text):
    """

    Strictly detect if clause belongs to a specific entity.

    """
    t = text.lower()
    entities = set()
    if "vendor" in t: entities.add("Vendor")
    if "vendee" in t: entities.add("Vendee")
    return entities

# =========================
# 2. EXTRACTION HELPERS
# =========================

def extract_numbers(text):
    """Extract numeric values for comparison."""
    # Matches Rs. 100, 1,00,000, 500 sq ft (just the numbers)
    return [int(n.replace(",", "")) for n in re.findall(r'\b\d{1,3}(?:,\d{3})*\b', text)]

def has_negation(text):
    neg_words = ["not", "never", "no", "cannot", "must not", "shall not"]
    return any(w in text.lower() for w in neg_words)

def has_exception_language(text):
    """Detects legal exception/qualification identifiers."""
    qualifiers = [
        "subject to", "notwithstanding", "except as provided", 
        "unless otherwise", "provided however", "without prejudice"
    ]
    return any(q in text.lower() for q in qualifiers)

def is_definition(text):
    """Strictly checks if a clause is a definition."""
    t = text.lower()
    if "shall mean" in t or "means" in t or "defined as" in t:
        return True
    return False

def is_party_intro(text):
    """Detects if a clause is just listing a party description."""
    t = text.lower()
    
    # Strong Indicators: Address patterns, Relations, IDs
    # Regex for "Door No", "D.No", "residing at"
    address_pattern = r"(door\s*no|d\.no|residing\s*at|post\s*,\s*village)"
    
    # Regex for relations: "son of", "wife of", "daughter of", "w/o", "s/o", or just "son", "wife" in context
    relation_pattern = r"\b(son|wife|daughter|husband|father|mother|s/o|w/o|d/o)\b"
    
    # Regex for IDs: "aadhaar", "pan no", "id card"
    id_pattern = r"(aadhaar|pan\s*no|id\s*card|mobile\s*no)"
    
    # Check for presence of these patterns
    has_address = re.search(address_pattern, t)
    has_relation = re.search(relation_pattern, t)
    has_id = re.search(id_pattern, t)
    
    # If it has at least 2 strong components (e.g. Relation + ID, or Address + Relation), it's a bio
    score = 0
    if has_address: score += 1
    if has_relation: score += 1
    if has_id: score += 1
    
    return score >= 2

# =========================
# 3. CORE LOGIC GATES
# =========================

def analyze_pair(text1, text2, similarity, threshold=0.75):
    """

    Strict Analyzer returning (Label, Score, Reason).

    Args:

        threshold: Minimum similarity score to consider as CANDIDATE (default 0.75)

    """
    # Force Reload Trigger
    
    # --- GATE 0: BOILERPLATE CHECK ---
    if is_legal_boilerplate(text1) or is_legal_boilerplate(text2):
        return None, 0.0, "Boilerplate (Skipped)"

    # --- GATE 1: DOMAIN MISMATCH ---
    d1 = get_clause_domain(text1)
    d2 = get_clause_domain(text2)
    
    # If domains are totally different, SKIP.
    # Exception: OPERATIVE and GENERAL might overlap, but strictly FINANCIAL vs POSSESSION should skip.
    if d1 != "GENERAL" and d2 != "GENERAL" and d1 != d2:
        # RELAXATION: Only bypass if similarity is VERY high (suggesting misclassification).
        # Otherwise, DO NOT compare apples (Financial) to oranges (Possession), 
        # even in Deep Search mode.
        if similarity < 0.85:
            return None, 0.0, "Domain Mismatch"

    # --- HARDENED CHECK: GENERAL vs SPECIFIC ---
    # Common source of noise: "Any other details" matching "The price is Rs 100"
    # Block GENERAL vs Specific unless similarity is high
    if (d1 == "GENERAL" and d2 != "GENERAL") or (d2 == "GENERAL" and d1 != "GENERAL"):
        if similarity < 0.80:
             return None, 0.0, "General vs Specific Domain (Skipped)"

    # --- SPECIFIC FILTER: MONEY vs TIMELINE ---
    # Prevents "Price is X" vs "Payment due on Date Y" (confusing numbers/dates)
    # Check if one clause is purely FINANCIAL and other is purely TIMELINE/DATE based
    is_financial = d1 == "FINANCIAL" or d2 == "FINANCIAL"
    has_date = re.search(r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}", text1) or \
               re.search(r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}", text2)
    
    if is_financial and has_date:
        # If one talks about Price/Amount and other has a Date, 
        # unless they are explicitly about "Payment Schedule", they are likely different.
        if "schedule" not in text1.lower() and "schedule" not in text2.lower():
             if similarity < 0.85:
                 return None, 0.0, "Financial vs Timeline Mismatch"

    # --- SPECIFIC FILTER: ELIGIBILITY vs ASSISTANCE ---
    # Prevents "Eligibility criteria" vs "Assistance details" (Common in schemes)
    # Check for keywords like "eligible", "qualify" vs "grant", "support", "help"
    t1_lower, t2_lower = text1.lower(), text2.lower()
    is_eligibility = any(w in t1_lower for w in ["eligible", "qualify", "criteria", "requirement"]) or \
                     any(w in t2_lower for w in ["eligible", "qualify", "criteria", "requirement"])
    is_assistance = any(w in t1_lower for w in ["provide", "grant", "subsidy", "support", "assistance"]) or \
                    any(w in t2_lower for w in ["provide", "grant", "subsidy", "support", "assistance"])

    if is_eligibility and is_assistance:
         # Unless precise overlap, these are distinct sections
         if similarity < 0.85:
              return None, 0.0, "Eligibility vs Assistance Mismatch"

    # --- GATE 1.5: PARTY DESCRIPTION CHECK ---
    # If both clauses are just descriptions of people (addresses, relations), skip.
    if is_party_intro(text1) and is_party_intro(text2):
        return None, 0.0, "Party Description (Skipped)"

    # --- GATE 2: ENTITY MISMATCH ---
    e1 = get_entities(text1)
    e2 = get_entities(text2)
    # If one is Vendor ONLY and other is Vendee ONLY -> SKIP
    if e1 and e2 and e1 != e2 and not (e1 & e2):
        # RELAXATION: Only bypass if similarity is VERY high.
        if similarity < 0.85:
            return None, 0.0, "Entity Role Mismatch"
    
    # --- GATE 2.5: DEFINITION GUARD ---
    # Don't compare definitions with operative clauses generally
    if is_definition(text1) or is_definition(text2):
        # Only compare if both are definitions (conflicting definitions)
        if not (is_definition(text1) and is_definition(text2)):
             return None, 0.0, "Definition vs Operative"

    # --- GATE 3: POSSESSION TIMELINE ---
    # "Possession at agreement" vs "Possession at registration" is NOT a contradiction.
    if d1 == "POSSESSION" and d2 == "POSSESSION":
        keywords_a = ["agreement", "earnest"]
        keywords_b = ["registration", "sale deed", "final"]
        
        has_a = any(k in text1.lower() for k in keywords_a)
        has_b = any(k in text2.lower() for k in keywords_b)
        
        # If one talks about start and other about end, it's a sequence.
        if (has_a and any(k in text2.lower() for k in keywords_b)) or \
           (has_b and any(k in text1.lower() for k in keywords_a)):
             return None, 0.0, "Possession Timeline Sequence"

    # --- GATE 4: NUMERIC REASONING ---
    # Only compare numbers if context allows
    nums1 = extract_numbers(text1)
    nums2 = extract_numbers(text2)
    
    if nums1 and nums2 and nums1 != nums2:
        # MAGNITUDE CHECK: If numbers differ by > 100x, likely different units (e.g. Price vs Area)
        # e.g. 5,50,000 vs 1.25 -> Ratio is huge.
        max1, max2 = max(nums1), max(nums2)
        if max1 > 0 and max2 > 0:
            ratio = max1 / max2 if max1 > max2 else max2 / max1
            if ratio > 100:
                 return None, 0.0, "Numeric Magnitude Mismatch (Likely Unit Diff)"

        # Check if they are in the same domain (likely valid comparison)
        if d1 == d2 and d1 != "GENERAL":
             return "NUMERIC_INCONSISTENCY", 0.9, f"Mismatch in {d1} values"
        
        # If General, be careful. 
        # But if similarity is VERY high, it might be a contradiction.
        if similarity > 0.9:
             return "NUMERIC_INCONSISTENCY", 0.85, "Numeric Mismatch in similar context"

    # --- GATE 4.5: EXCEPTION/HIERARCHY CHECK ---
    # If high similarity but one has exception language
    # We use a slightly lower threshold for exception detection to be safe
    exception_threshold = max(0.65, threshold - 0.05)
    if similarity > exception_threshold: 
        has_ex1 = has_exception_language(text1)
        has_ex2 = has_exception_language(text2)
        
        if (has_ex1 and not has_ex2) or (has_ex2 and not has_ex1):
            return "QUALIFICATION", similarity, "Legal Exception/Qualification detected (Not a Conflict)"

    # --- GATE 5: LOGICAL NEGATION ---
    if (has_negation(text1) and not has_negation(text2)) or \
       (has_negation(text2) and not has_negation(text1)):
        # Only flag if high similarity implies they are talking about the same thing
        # Negation check requires fairly high confidence they are related
        if similarity > 0.85:
            return "LEGAL_CONFLICT", 0.8, "Logical Negation detected"

    # --- FINAL GATE: CANDIDATE FOR NLI ---
    # If we are here, we passed the blocks. 
    # If similarity is high, let NLI decide.
    if similarity > threshold: 
        return "CANDIDATE", similarity, "High Similarity - Pending NLI"
        
    return None, 0.0, "Low Similarity"