"""
Strict Domain Analyzer for Legal Documents.
Implements specific checks for:
- Entity Roles (Vendor vs Vendee)
- Domain Categories (Financial, Possession, Ownership, etc.)
- Timeline Logic (Agreement vs Registration)
- Numeric Consistency within context
"""
import re
# =========================
# 1. STRICT CLASSIFICATION
# =========================
def is_legal_boilerplate(text):
    """Report whether *text* looks like a header, footer, or witness block.

    Matching is done on the lower-cased text with plain substring tests.
    The text counts as boilerplate when either rule holds:

    * it contains "signed by" or "witness" (any length), or
    * it has fewer than five whitespace-separated words and contains one
      of the boilerplate keywords listed below.
    """
    lowered = text.lower()

    # Signature / witness wording is treated as boilerplate outright.
    if "signed by" in lowered or "witness" in lowered:
        return True

    # The keyword rule only applies to very short fragments.
    if len(lowered.split()) >= 5:
        return False

    keywords = (
        "in witness whereof", "signed and delivered", "witnesses:",
        "schedule", "jurisdiction", "arbitration", "notice",
        "all that piece and parcel", "north by", "south by",
    )
    return any(keyword in lowered for keyword in keywords)
def get_clause_domain(text):
    """Classify a clause into one strict legal domain.

    The lower-cased text is tested against each domain in a fixed priority
    order and the first match wins:

    RECITAL, DEFINITION, FINANCIAL, POSSESSION, OWNERSHIP, ENCUMBRANCE,
    ADMINISTRATIVE, OPERATIVE, and finally GENERAL when nothing matches.

    Keywords are matched as plain substrings (so "owners" matches "owner"),
    with one exception: the currency abbreviation "rs." must begin a word.
    A bare substring test for "rs." also fires on the end of ordinary
    sentence-final words such as "heirs." or "owners.", which would push
    unrelated clauses into FINANCIAL.

    Args:
        text: The clause text.

    Returns:
        One of 'RECITAL', 'DEFINITION', 'FINANCIAL', 'POSSESSION',
        'OWNERSHIP', 'ENCUMBRANCE', 'ADMINISTRATIVE', 'OPERATIVE' or
        'GENERAL'.
    """
    t = text.lower()

    # 1. RECITAL (background)
    if t.startswith("whereas") or "and whereas" in t:
        return "RECITAL"

    # 2. DEFINITION
    if "shall mean" in t or "expression vendor" in t or "expression vendee" in t:
        return "DEFINITION"

    # 3. FINANCIAL (money, consideration).
    # "rs." is anchored to a word start so "heirs." / "vendors." do not count.
    financial_words = (
        "rupees", "paid", "consideration", "sum of",
        "amount", "price", "cheque", "bank",
    )
    if re.search(r"\brs\.", t) or any(w in t for w in financial_words):
        return "FINANCIAL"

    # 4-7. Remaining keyword-driven domains, in priority order.
    keyword_domains = (
        ("POSSESSION", ("possession", "handed over", "delivered", "vacant")),
        ("OWNERSHIP", ("owner", "title", "interest", "rights", "absolute", "fee simple")),
        ("ENCUMBRANCE", ("encumbrance", "mortgage", "loan", "charge", "lien", "litigation")),
        ("ADMINISTRATIVE", ("witness", "signed", "schedule", "jurisdiction", "arbitration", "notice")),
    )
    for domain, keywords in keyword_domains:
        if any(w in t for w in keywords):
            return domain

    # 8. OPERATIVE (action)
    if t.startswith("that") or "hereby" in t or "now this deed" in t:
        return "OPERATIVE"

    return "GENERAL"
def get_entities(text):
    """Return the set of party roles mentioned in *text*.

    The lower-cased text is searched for the substrings "vendor" and
    "vendee"; the result contains "Vendor" and/or "Vendee" accordingly and
    is empty when neither appears.
    """
    lowered = text.lower()
    role_labels = (("vendor", "Vendor"), ("vendee", "Vendee"))
    return {label for needle, label in role_labels if needle in lowered}
# =========================
# 2. EXTRACTION HELPERS
# =========================
def extract_numbers(text):
    """Extract the integer values that appear in *text*, in order.

    Recognised forms:

    * plain digit runs of any length, e.g. "550000" or "2023";
    * comma-grouped numbers whose last group has three digits and whose
      inner groups have two or three digits, covering both Western
      ("1,000,000") and Indian lakh/crore ("1,00,000") grouping.

    Commas are stripped before conversion. Decimal points are not
    interpreted: "1.25" yields 1 and 25.

    Args:
        text: Arbitrary clause text.

    Returns:
        A list of ints; empty when the text contains no digits.
    """
    # The grouped alternative is tried first so "1,00,000" is read as one
    # number; anything that does not fit falls back to a bare digit run.
    number_pattern = r"\b(?:\d{1,3}(?:,\d{2,3})*,\d{3}|\d+)\b"
    return [int(token.replace(",", "")) for token in re.findall(number_pattern, text)]
def has_negation(text):
    """Report whether *text* contains a negation word.

    The words "not", "never", "no" and "cannot" are matched as whole words,
    case-insensitively. Whole-word matching is essential here: a plain
    substring test for "no" or "not" fires on "notice", "acknowledges",
    "north", "now" and so on, which makes almost every clause look negated.
    "must not" and "shall not" are covered by the whole word "not".

    "no" directly followed by a number (optionally after "." or ":"), as in
    "Door No. 12" or "Survey No 45", is treated as a numbering label and is
    not counted as a negation.

    Args:
        text: The clause text.

    Returns:
        True if a negation word is present, otherwise False.
    """
    negation_pattern = r"\b(?:not|never|cannot|no(?![.:]?\s*\d))\b"
    return re.search(negation_pattern, text.lower()) is not None
def has_exception_language(text):
    """Report whether *text* contains a legal exception/qualification phrase.

    The lower-cased text is checked, by substring, for any of the
    qualifier phrases listed below.
    """
    lowered = text.lower()
    for phrase in (
        "subject to", "notwithstanding", "except as provided",
        "unless otherwise", "provided however", "without prejudice",
    ):
        if phrase in lowered:
            return True
    return False
def is_definition(text):
    """Report whether *text* reads as a definition clause.

    True when the lower-cased text contains "shall mean", "means" or
    "defined as" as a substring.
    """
    lowered = text.lower()
    markers = ("shall mean", "means", "defined as")
    return any(marker in lowered for marker in markers)
def is_party_intro(text):
    """Report whether *text* is merely a description of a party.

    Three kinds of evidence are searched for in the lower-cased text:

    * an address marker ("door no", "d.no", "residing at", "post, village"),
    * a family relation ("son", "wife", "s/o", "w/o", ... as whole words),
    * an identifier marker ("aadhaar", "pan no", "id card", "mobile no").

    The clause is treated as a party introduction when at least two of the
    three kinds are present.
    """
    lowered = text.lower()
    evidence_patterns = (
        # Address: "Door No", "D.No", "residing at", "post, village"
        r"(door\s*no|d\.no|residing\s*at|post\s*,\s*village)",
        # Relation: "son of", "wife of", "s/o", "w/o", ...
        r"\b(son|wife|daughter|husband|father|mother|s/o|w/o|d/o)\b",
        # Identifier: "aadhaar", "pan no", "id card", "mobile no"
        r"(aadhaar|pan\s*no|id\s*card|mobile\s*no)",
    )
    matched = sum(1 for pattern in evidence_patterns if re.search(pattern, lowered))
    return matched >= 2
# =========================
# 3. CORE LOGIC GATES
# =========================
def analyze_pair(text1, text2, similarity, threshold=0.75):
    """Run a pair of clauses through the strict rule gates.

    The gates are evaluated in order and the first one that fires decides
    the result. Early gates discard pairs that should not be compared
    (boilerplate, mismatched domains, party descriptions, ...); later gates
    label likely numeric inconsistencies, qualifications and negations.
    Pairs that survive every gate and are similar enough are handed on as
    candidates.

    Args:
        text1: First clause text.
        text2: Second clause text.
        similarity: Similarity score for the pair, compared against the
            fixed cut-offs used below (0.80, 0.85, 0.9) and *threshold*.
        threshold: Minimum similarity score to consider as CANDIDATE
            (default 0.75). The exception gate uses
            ``max(0.65, threshold - 0.05)``.

    Returns:
        A ``(label, score, reason)`` tuple. ``label`` is ``None`` (with a
        score of 0.0) when the pair is skipped, otherwise one of
        "NUMERIC_INCONSISTENCY", "QUALIFICATION", "LEGAL_CONFLICT" or
        "CANDIDATE".
    """
    # --- GATE 0: BOILERPLATE CHECK ---
    if is_legal_boilerplate(text1) or is_legal_boilerplate(text2):
        return None, 0.0, "Boilerplate (Skipped)"

    t1_lower, t2_lower = text1.lower(), text2.lower()

    # --- GATE 1: DOMAIN MISMATCH ---
    d1 = get_clause_domain(text1)
    d2 = get_clause_domain(text2)
    general1 = d1 == "GENERAL"
    general2 = d2 == "GENERAL"

    # Two different specific domains are only compared when similarity is
    # very high (which suggests one of them was misclassified).
    if not general1 and not general2 and d1 != d2 and similarity < 0.85:
        return None, 0.0, "Domain Mismatch"

    # GENERAL vs a specific domain is a common source of noise; require a
    # high similarity before comparing.
    if general1 != general2 and similarity < 0.80:
        return None, 0.0, "General vs Specific Domain (Skipped)"

    # --- SPECIFIC FILTER: MONEY vs TIMELINE ---
    # A financial clause paired with a clause carrying a date is usually a
    # different topic unless a payment "schedule" is mentioned.
    date_pattern = r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}"
    is_financial = d1 == "FINANCIAL" or d2 == "FINANCIAL"
    has_date = re.search(date_pattern, text1) or re.search(date_pattern, text2)
    mentions_schedule = "schedule" in t1_lower or "schedule" in t2_lower
    if is_financial and has_date and not mentions_schedule and similarity < 0.85:
        return None, 0.0, "Financial vs Timeline Mismatch"

    # --- SPECIFIC FILTER: ELIGIBILITY vs ASSISTANCE ---
    # Fires when eligibility wording and assistance wording both occur
    # anywhere across the two clauses.
    eligibility_words = ("eligible", "qualify", "criteria", "requirement")
    assistance_words = ("provide", "grant", "subsidy", "support", "assistance")
    is_eligibility = any(w in t1_lower for w in eligibility_words) or \
        any(w in t2_lower for w in eligibility_words)
    is_assistance = any(w in t1_lower for w in assistance_words) or \
        any(w in t2_lower for w in assistance_words)
    if is_eligibility and is_assistance and similarity < 0.85:
        return None, 0.0, "Eligibility vs Assistance Mismatch"

    # --- GATE 1.5: PARTY DESCRIPTION CHECK ---
    # Two clauses that merely describe people (addresses, relations) are
    # not worth comparing.
    if is_party_intro(text1) and is_party_intro(text2):
        return None, 0.0, "Party Description (Skipped)"

    # --- GATE 2: ENTITY MISMATCH ---
    # One clause is Vendor-only and the other Vendee-only.
    e1 = get_entities(text1)
    e2 = get_entities(text2)
    if e1 and e2 and e1.isdisjoint(e2) and similarity < 0.85:
        return None, 0.0, "Entity Role Mismatch"

    # --- GATE 2.5: DEFINITION GUARD ---
    # A definition is only compared with another definition.
    if is_definition(text1) != is_definition(text2):
        return None, 0.0, "Definition vs Operative"

    # --- GATE 3: POSSESSION TIMELINE ---
    # "Possession at agreement" vs "possession at registration" is a
    # sequence, not a contradiction. Both orderings of the pair are checked.
    if d1 == "POSSESSION" and d2 == "POSSESSION":
        start_keywords = ("agreement", "earnest")
        end_keywords = ("registration", "sale deed", "final")
        start1 = any(k in t1_lower for k in start_keywords)
        start2 = any(k in t2_lower for k in start_keywords)
        end1 = any(k in t1_lower for k in end_keywords)
        end2 = any(k in t2_lower for k in end_keywords)
        if (start1 and end2) or (end1 and start2):
            return None, 0.0, "Possession Timeline Sequence"

    # --- GATE 4: NUMERIC REASONING ---
    nums1 = extract_numbers(text1)
    nums2 = extract_numbers(text2)
    if nums1 and nums2 and nums1 != nums2:
        # Magnitude check: largest values differing by more than 100x are
        # likely different units (e.g. price vs area), not a conflict.
        max1, max2 = max(nums1), max(nums2)
        if max1 > 0 and max2 > 0:
            ratio = max(max1, max2) / min(max1, max2)
            if ratio > 100:
                return None, 0.0, "Numeric Magnitude Mismatch (Likely Unit Diff)"

        # Same specific domain: treat differing numbers as an inconsistency.
        if d1 == d2 and not general1:
            return "NUMERIC_INCONSISTENCY", 0.9, f"Mismatch in {d1} values"

        # Otherwise only flag when the clauses are almost identical.
        if similarity > 0.9:
            return "NUMERIC_INCONSISTENCY", 0.85, "Numeric Mismatch in similar context"

    # --- GATE 4.5: EXCEPTION/HIERARCHY CHECK ---
    # Slightly lower threshold than the candidate gate, to be safe.
    exception_threshold = max(0.65, threshold - 0.05)
    if similarity > exception_threshold:
        if has_exception_language(text1) != has_exception_language(text2):
            return "QUALIFICATION", similarity, "Legal Exception/Qualification detected (Not a Conflict)"

    # --- GATE 5: LOGICAL NEGATION ---
    # Exactly one clause is negated; only meaningful when the clauses are
    # very likely about the same thing.
    if has_negation(text1) != has_negation(text2) and similarity > 0.85:
        return "LEGAL_CONFLICT", 0.8, "Logical Negation detected"

    # --- FINAL GATE: CANDIDATE FOR NLI ---
    if similarity > threshold:
        return "CANDIDATE", similarity, "High Similarity - Pending NLI"

    return None, 0.0, "Low Similarity"