Spaces:

JAYASREESS
/

final_year

Sleeping

File size: 12,155 Bytes

9d21edd

"""

Strict Domain Analyzer for Legal Documents.

Implements specific checks for:

- Entity Roles (Vendor vs Vendee)

- Domain Categories (Financial, Possession, Ownership, etc.)

- Timeline Logic (Agreement vs Registration)

- Numeric Consistency within context

"""

import re

# =========================
# 1. STRICT CLASSIFICATION
# =========================

def is_legal_boilerplate(text):
    """Report whether a clause looks like a legal header, footer or witness block.

    A clause counts as boilerplate when either:
      * it mentions "signed by" or "witness" anywhere (any length), or
      * it is shorter than five whitespace-separated words and contains one
        of the boilerplate marker phrases.

    Matching is case-insensitive substring matching.
    """
    lowered = text.lower()

    # Signature / attestation wording is treated as boilerplate outright.
    if "witness" in lowered or "signed by" in lowered:
        return True

    # Beyond that, only very short fragments can qualify.
    if len(lowered.split()) >= 5:
        return False

    markers = (
        "in witness whereof", "signed and delivered", "witnesses:",
        "schedule", "jurisdiction", "arbitration", "notice",
        "all that piece and parcel", "north by", "south by",
    )
    for marker in markers:
        if marker in lowered:
            return True
    return False

def get_clause_domain(text):
    """
    Classify a clause into a strict legal domain.

    The checks run in a fixed priority order and the first match wins, so a
    clause containing both money and possession wording is FINANCIAL.
    Matching is case-insensitive; apart from the "rs." abbreviation (see
    below) keywords are matched as plain substrings.

    Args:
        text: The clause text.

    Returns: 'RECITAL', 'DEFINITION', 'FINANCIAL', 'POSSESSION', 'OWNERSHIP',
        'ENCUMBRANCE', 'ADMINISTRATIVE', 'OPERATIVE' or 'GENERAL'
    """
    # Strip so that leading whitespace does not defeat the startswith() checks.
    t = text.lower().strip()

    # 1. RECITAL (Background)
    if t.startswith("whereas") or "and whereas" in t:
        return "RECITAL"

    # 2. DEFINITION
    if "shall mean" in t or "expression vendor" in t or "expression vendee" in t:
        return "DEFINITION"

    # 3. FINANCIAL (Money, Consideration)
    # "rs." must begin at a word boundary: as a bare substring it also fires on
    # any sentence that ends in a plural such as "owners." or "vendors.".
    financial_words = ["rupees", "paid", "consideration", "sum of", "amount", "price", "cheque", "bank"]
    if re.search(r"\brs\.", t) or any(w in t for w in financial_words):
        return "FINANCIAL"

    # 4. POSSESSION (Handover, Vacant)
    if any(w in t for w in ["possession", "handed over", "delivered", "vacant"]):
        return "POSSESSION"

    # 5. OWNERSHIP / TITLE
    if any(w in t for w in ["owner", "title", "interest", "rights", "absolute", "fee simple"]):
        return "OWNERSHIP"

    # 6. ENCUMBRANCE (Loans, Mortgages)
    if any(w in t for w in ["encumbrance", "mortgage", "loan", "charge", "lien", "litigation"]):
        return "ENCUMBRANCE"

    # 7. ADMINISTRATIVE (Boilerplate)
    if any(w in t for w in ["witness", "signed", "schedule", "jurisdiction", "arbitration", "notice"]):
        return "ADMINISTRATIVE"

    # 8. OPERATIVE (Action)
    if t.startswith("that") or "hereby" in t or "now this deed" in t:
        return "OPERATIVE"

    return "GENERAL"

def get_entities(text):
    """
    Collect the party roles named in a clause.

    Looks for the substrings "vendor" and "vendee" (case-insensitive) and
    returns a set containing "Vendor" and/or "Vendee" accordingly; the set is
    empty when neither role is mentioned.
    """
    lowered = text.lower()
    return {role.capitalize() for role in ("vendor", "vendee") if role in lowered}

# =========================
# 2. EXTRACTION HELPERS
# =========================

def extract_numbers(text):
    """Extract the integer values that appear in a clause, in order.

    Handles plain digit runs of any length ("5000"), western thousands
    grouping ("1,000,000") and Indian lakh/crore grouping ("1,00,000").
    Separators are removed before conversion, so "1,00,000" yields 100000.
    A decimal such as "1.25" still yields its two integer parts.

    Args:
        text: The clause text.

    Returns:
        A list of ints; empty when the text has no standalone numbers.
    """
    # First alternative: comma-grouped amounts whose groups are 2 or 3 digits
    # (covers both western and Indian grouping). Second: any ungrouped run of
    # digits. The previous pattern allowed at most 3 leading digits, so "5000"
    # matched nothing and "1,00,000" was split into 1 and 0.
    pattern = r'\b\d{1,3}(?:,\d{2,3})+\b|\b\d+\b'
    return [int(token.replace(",", "")) for token in re.findall(pattern, text)]

def has_negation(text):
    """Report whether a clause contains an explicit negation word.

    Negation words are matched case-insensitively as whole words. Plain
    substring matching would treat "notice", "know", "another" or
    "notwithstanding" as negations because they contain "no"/"not".

    Args:
        text: The clause text.

    Returns:
        True if any negation word or phrase occurs as a whole word.
    """
    neg_words = ["not", "never", "no", "cannot", "must not", "shall not"]
    pattern = r"\b(?:" + "|".join(re.escape(w) for w in neg_words) + r")\b"
    return re.search(pattern, text.lower()) is not None

def has_exception_language(text):
    """Report whether a clause carries legal exception/qualification wording.

    Returns True when any of the qualifier phrases below occurs in the
    lower-cased text (substring match), otherwise False.
    """
    lowered = text.lower()
    for phrase in (
        "subject to", "notwithstanding", "except as provided",
        "unless otherwise", "provided however", "without prejudice",
    ):
        if phrase in lowered:
            return True
    return False

def is_definition(text):
    """Report whether a clause reads as a definition.

    True when the lower-cased text contains "shall mean", "means" or
    "defined as" (substring match), otherwise False.
    """
    lowered = text.lower()
    return any(marker in lowered for marker in ("shall mean", "means", "defined as"))

def is_party_intro(text):
    """Report whether a clause is merely a description of a party.

    Three kinds of signal are looked for in the lower-cased text:
      * an address marker ("door no", "d.no", "residing at", "post, village"),
      * a family relation word ("son", "wife", "s/o", "w/o", ...),
      * an identity marker ("aadhaar", "pan no", "id card", "mobile no").

    The clause is treated as a party bio when at least two of the three
    kinds are present (e.g. relation + address, or relation + ID).
    """
    lowered = text.lower()

    signal_patterns = (
        # Address wording
        r"(door\s*no|d\.no|residing\s*at|post\s*,\s*village)",
        # Family relations, spelled out or abbreviated
        r"\b(son|wife|daughter|husband|father|mother|s/o|w/o|d/o)\b",
        # Identity documents / contact numbers
        r"(aadhaar|pan\s*no|id\s*card|mobile\s*no)",
    )

    hits = sum(1 for pattern in signal_patterns if re.search(pattern, lowered))
    return hits >= 2

# =========================
# 3. CORE LOGIC GATES
# =========================

def analyze_pair(text1, text2, similarity, threshold=0.75):
    """
    Strict Analyzer returning (Label, Score, Reason).

    The pair is run through a sequence of gates. The early gates skip pairs
    that should not be compared at all (boilerplate, different domains,
    party descriptions, different entities, definition vs. non-definition,
    possession timeline sequences). The later gates label the pair.

    Args:
        text1: First clause text.
        text2: Second clause text.
        similarity: Similarity score of the two clauses, supplied by the
            caller; compared here against cut-offs between 0.65 and 0.9.
        threshold: Minimum similarity score to consider as CANDIDATE (default 0.75)

    Returns:
        A tuple (label, score, reason). label is None (with score 0.0) when
        the pair is skipped or not similar enough; otherwise it is one of
        "NUMERIC_INCONSISTENCY", "QUALIFICATION", "LEGAL_CONFLICT" or
        "CANDIDATE". reason is a short human-readable explanation.
    """
    t1_lower, t2_lower = text1.lower(), text2.lower()

    # --- GATE 0: BOILERPLATE CHECK ---
    if is_legal_boilerplate(text1) or is_legal_boilerplate(text2):
        return None, 0.0, "Boilerplate (Skipped)"

    # --- GATE 1: DOMAIN MISMATCH ---
    d1 = get_clause_domain(text1)
    d2 = get_clause_domain(text2)

    # Two different specific domains are not compared unless similarity is
    # very high, which suggests one of them was misclassified.
    if d1 != "GENERAL" and d2 != "GENERAL" and d1 != d2:
        if similarity < 0.85:
            return None, 0.0, "Domain Mismatch"

    # --- HARDENED CHECK: GENERAL vs SPECIFIC ---
    # Common source of noise: "Any other details" matching "The price is Rs 100".
    if (d1 == "GENERAL" and d2 != "GENERAL") or (d2 == "GENERAL" and d1 != "GENERAL"):
        if similarity < 0.80:
            return None, 0.0, "General vs Specific Domain (Skipped)"

    # --- SPECIFIC FILTER: MONEY vs TIMELINE ---
    # Prevents "Price is X" vs "Payment due on Date Y" (confusing numbers/dates).
    is_financial = d1 == "FINANCIAL" or d2 == "FINANCIAL"
    date_pattern = r"\d{1,2}[./-]\d{1,2}[./-]\d{2,4}"
    has_date = re.search(date_pattern, text1) or re.search(date_pattern, text2)

    if is_financial and has_date:
        # A payment "schedule" legitimately mixes amounts and dates.
        if "schedule" not in t1_lower and "schedule" not in t2_lower:
            if similarity < 0.85:
                return None, 0.0, "Financial vs Timeline Mismatch"

    # --- SPECIFIC FILTER: ELIGIBILITY vs ASSISTANCE ---
    # Prevents "Eligibility criteria" vs "Assistance details" (common in schemes).
    eligibility_words = ["eligible", "qualify", "criteria", "requirement"]
    assistance_words = ["provide", "grant", "subsidy", "support", "assistance"]
    is_eligibility = any(w in t1_lower for w in eligibility_words) or \
                     any(w in t2_lower for w in eligibility_words)
    is_assistance = any(w in t1_lower for w in assistance_words) or \
                    any(w in t2_lower for w in assistance_words)

    if is_eligibility and is_assistance:
        # Unless there is precise overlap, these are distinct sections.
        if similarity < 0.85:
            return None, 0.0, "Eligibility vs Assistance Mismatch"

    # --- GATE 1.5: PARTY DESCRIPTION CHECK ---
    # If both clauses are just descriptions of people (addresses, relations), skip.
    if is_party_intro(text1) and is_party_intro(text2):
        return None, 0.0, "Party Description (Skipped)"

    # --- GATE 2: ENTITY MISMATCH ---
    e1 = get_entities(text1)
    e2 = get_entities(text2)
    # If one is Vendor ONLY and other is Vendee ONLY -> SKIP
    if e1 and e2 and e1 != e2 and not (e1 & e2):
        # RELAXATION: Only bypass if similarity is VERY high.
        if similarity < 0.85:
            return None, 0.0, "Entity Role Mismatch"

    # --- GATE 2.5: DEFINITION GUARD ---
    # A definition is only compared with another definition (conflicting
    # definitions), never with a non-definition clause.
    def1 = is_definition(text1)
    def2 = is_definition(text2)
    if def1 != def2:
        return None, 0.0, "Definition vs Operative"

    # --- GATE 3: POSSESSION TIMELINE ---
    # "Possession at agreement" vs "Possession at registration" is NOT a contradiction.
    if d1 == "POSSESSION" and d2 == "POSSESSION":
        keywords_a = ["agreement", "earnest"]
        keywords_b = ["registration", "sale deed", "final"]

        t1_has_a = any(k in t1_lower for k in keywords_a)
        t1_has_b = any(k in t1_lower for k in keywords_b)
        t2_has_a = any(k in t2_lower for k in keywords_a)
        t2_has_b = any(k in t2_lower for k in keywords_b)

        # If one talks about the start and the other about the end, it's a
        # sequence. Both argument orders are checked so the verdict does not
        # depend on which clause is passed first.
        if (t1_has_a and t2_has_b) or (t1_has_b and t2_has_a):
            return None, 0.0, "Possession Timeline Sequence"

    # --- GATE 4: NUMERIC REASONING ---
    # Only compare numbers if context allows
    nums1 = extract_numbers(text1)
    nums2 = extract_numbers(text2)

    if nums1 and nums2 and nums1 != nums2:
        # MAGNITUDE CHECK: If the largest numbers differ by > 100x, they are
        # likely different units (e.g. Price vs Area).
        max1, max2 = max(nums1), max(nums2)
        if max1 > 0 and max2 > 0:
            ratio = max1 / max2 if max1 > max2 else max2 / max1
            if ratio > 100:
                return None, 0.0, "Numeric Magnitude Mismatch (Likely Unit Diff)"

        # Same specific domain: likely a valid comparison.
        if d1 == d2 and d1 != "GENERAL":
            return "NUMERIC_INCONSISTENCY", 0.9, f"Mismatch in {d1} values"

        # Otherwise be careful, but VERY high similarity may still be a contradiction.
        if similarity > 0.9:
            return "NUMERIC_INCONSISTENCY", 0.85, "Numeric Mismatch in similar context"

    # --- GATE 4.5: EXCEPTION/HIERARCHY CHECK ---
    # High similarity where exactly one clause has exception language.
    # A slightly lower threshold is used for exception detection to be safe.
    exception_threshold = max(0.65, threshold - 0.05)
    if similarity > exception_threshold:
        has_ex1 = has_exception_language(text1)
        has_ex2 = has_exception_language(text2)

        if (has_ex1 and not has_ex2) or (has_ex2 and not has_ex1):
            return "QUALIFICATION", similarity, "Legal Exception/Qualification detected (Not a Conflict)"

    # --- GATE 5: LOGICAL NEGATION ---
    neg1 = has_negation(text1)
    neg2 = has_negation(text2)
    if (neg1 and not neg2) or (neg2 and not neg1):
        # Only flag when high similarity implies both clauses talk about the
        # same thing.
        if similarity > 0.85:
            return "LEGAL_CONFLICT", 0.8, "Logical Negation detected"

    # --- FINAL GATE: CANDIDATE FOR NLI ---
    # All blocking gates passed; if similarity is high, let NLI decide.
    if similarity > threshold:
        return "CANDIDATE", similarity, "High Similarity - Pending NLI"

    return None, 0.0, "Low Similarity"