Major_Project / inference.py
riyasuryawanshi746's picture
Fixed PDF preprocessing and clause segmentation
6893de4 verified
# inference.py
# v5.4 — Confidence calibration fixed.
# Root cause of "Neural=0.993, Confidence=LOW 21%" bug:
# The agreement factor penalized high-neural / zero-symbolic as "disagreement",
# but this is a legitimate state (neural model is certain; no rules triggered).
# Fix: agreement is now only computed between the two scores when BOTH are
# non-trivial (> 0.05). When symbolic is near zero, we treat the neural score
# alone as the evidence and give an agreement factor between 0.50 and 0.80
# (scaled by how decisive the neural score is) rather than nearly 0.
from __future__ import annotations

import logging
# Clause-type labels that this module groups as intellectual-property clauses.
# NOTE(review): presumably used to decide `is_ip_clause` for
# `_neuro_symbolic_fusion` — confirm at the call site.
IP_CLAUSE_TYPES = {
    "Irrevocable Or Perpetual License",
    "IP Ownership Assignment",
    "Joint IP Ownership",
    "Source Code Escrow",
    "Unlimited/All-You-Can-Eat-License",
}
# Maps each symbolic rule ID to the names of the features that rule depends on
# (unchanged in v5.4). Entries are grouped by rule-ID prefix; insertion order
# is preserved.
RULE_FEATURE_DEPS = {
    # ICA_* rules
    "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
    "ICA_002": ["unilateral_termination", "notice_period_defined"],
    "ICA_003": ["non_compete_years"],
    "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
    "ICA_005": ["is_wagering_clause"],
    "ICA_006": ["restrains_legal_proceedings"],
    "ICA_007": [
        "has_indemnity_clause",
        "indemnity_capped",
        "has_uncapped_signal",
    ],
    "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
    "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
    "ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
    "ICA_011": ["unilateral_price_change"],
    # DPDPA_* rules
    "DPDPA_001": ["processes_personal_data", "has_data_retention_clause"],
    "DPDPA_002": ["assigns_all_ip", "includes_pre_existing_ip"],
    "DPDPA_003": ["processes_sensitive_data", "has_consent_clause"],
    "DPDPA_004": ["processes_personal_data", "has_breach_notification"],
    # ITA_* rules
    "ITA_001": ["handles_digital_data", "has_security_clause"],
    # CPA_* rules
    "CPA_001": ["is_consumer_contract", "has_one_sided_clause"],
}
# ── Risk-level thresholds (single source of truth) ──────────────────────────
# Fused-score cut-offs used by `level_from_score`:
#   score <  RISK_LOW_MAX                     -> Low
#   RISK_LOW_MAX <= score <= RISK_MEDIUM_MAX  -> Medium
#   score >  RISK_MEDIUM_MAX                  -> High
RISK_LOW_MAX: float = 0.50
RISK_MEDIUM_MAX: float = 0.80

# A score at or below this value is treated as "near zero" by the agreement
# logic in `_compute_confidence`.
_TRIVIAL_SCORE: float = 0.05
def level_from_score(score: float) -> tuple[str, str]:
    """Return ``(level_label, emoji)`` for a fused score (v5.4 thresholds).

    Args:
        score: Fused risk score to classify.

    Returns:
        ``("Low", green circle)`` when ``score < RISK_LOW_MAX``,
        ``("Medium", yellow circle)`` when ``score <= RISK_MEDIUM_MAX``,
        otherwise ``("High", red circle)``.
    """
    # The emoji are written as escapes so they survive any re-encoding of the
    # source file; the previous literals had been corrupted into mojibake.
    if score < RISK_LOW_MAX:
        return "Low", "\U0001F7E2"  # large green circle
    if score <= RISK_MEDIUM_MAX:
        return "Medium", "\U0001F7E1"  # large yellow circle
    return "High", "\U0001F534"  # large red circle
def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
    """Evaluate the symbolic rules against ``features`` and sum their penalties.

    Each rule is a mapping with a ``"condition"`` callable (called with
    ``features``) and a ``"penalty"`` value that is added to the total when
    the condition is truthy.

    Evaluation is best-effort: a rule whose condition or penalty raises is
    logged and skipped, and it is neither listed as triggered nor counted.

    Args:
        features: Feature mapping passed to every rule's condition.
        symbolic_rules: Rules to evaluate, in order.

    Returns:
        A dict with ``"symbolic_score"`` (the penalty total clamped to
        [0, 1] and rounded to 3 decimals) and ``"triggered_rules"`` (the
        rules whose condition held, in input order).
    """
    triggered, total = [], 0.0
    for index, rule in enumerate(symbolic_rules):
        try:
            fired = rule["condition"](features)
            # Resolve the penalty before recording the rule, so a malformed
            # rule can never appear as triggered without contributing.
            updated_total = total + rule["penalty"] if fired else total
        except Exception:
            # Deliberately broad: conditions are arbitrary callables and one
            # bad rule must not abort scoring. Log instead of hiding it.
            logging.getLogger(__name__).warning(
                "Symbolic rule #%d failed to evaluate; skipping it.",
                index,
                exc_info=True,
            )
            continue
        if fired:
            triggered.append(rule)
            total = updated_total
    return {
        # Clamp both ends: negative penalties must not push the score below 0.
        "symbolic_score": round(min(max(total, 0.0), 1.0), 3),
        "triggered_rules": triggered,
    }
def _neuro_symbolic_fusion(
    neural: float,
    symbolic: float,
    is_ip_clause: bool = False,
) -> dict:
    """Fuse the neural and symbolic risk scores into one weighted score.

    The blend is neural-dominant by design: 0.75/0.25 (neural/symbolic) by
    default, shifting to 0.60/0.40 for an IP clause whose symbolic score is
    positive. No artificial floor is applied, so a weak symbolic trigger does
    not inflate the result.

    Args:
        neural: Risk score from the neural model.
        symbolic: Risk score from the symbolic rules.
        is_ip_clause: Whether the clause is an IP clause.

    Returns:
        A dict with the fused ``"score"`` (clamped to [0, 1], rounded to 3
        decimals), its ``"level"`` and ``"emoji"`` from ``level_from_score``,
        and a ``"breakdown"`` holding the rounded inputs, the weights, the raw
        fused value and a human-readable ``"formula"``.
    """
    if is_ip_clause and symbolic > 0:
        w_n, w_s = 0.60, 0.40
    else:
        w_n, w_s = 0.75, 0.25

    raw = w_n * neural + w_s * symbolic
    score = round(min(max(raw, 0.0), 1.0), 3)
    level, emoji = level_from_score(score)

    # U+00D7 (multiplication sign) is written as an escape: the previous
    # literal had been corrupted into mojibake in the emitted formula.
    formula = (
        f"({w_n:.2f} \u00d7 {neural:.3f}) + ({w_s:.2f} \u00d7 {symbolic:.3f}) "
        f"= {round(raw, 3)}"
    )
    return {
        "score": score,
        "level": level,
        "emoji": emoji,
        "breakdown": {
            "neural_score": round(neural, 3),
            "symbolic_score": round(symbolic, 3),
            "weights": {"neural": w_n, "symbolic": w_s},
            "raw_fused": round(raw, 3),
            # Always False: this version applies no floor to the fused score.
            "floor_applied": False,
            "final": score,
            "formula": formula,
        },
    }
def _agreement_factor(neural: float, symbolic: float, neural_loaded: bool) -> float:
    """Rate how consistent the neural and symbolic signals are with each other."""
    if not neural_loaded:
        # No neural signal at all: moderate confidence.
        return 0.50
    if symbolic <= _TRIVIAL_SCORE:
        # No meaningful symbolic signal, so the neural score is the only
        # evidence. This is not disagreement; scale by how decisive the
        # neural score is (0 at 0.50, 1 at the extremes 0 and 1), giving
        # 0.50 for a borderline score up to 0.80 for a score at 0 or 1.
        decisiveness = abs(neural - 0.50) / 0.50
        return 0.50 + 0.30 * decisiveness
    if neural <= _TRIVIAL_SCORE:
        # Rules fired but the model scores near zero: genuine disagreement.
        return 0.30
    # Both signals are non-trivial: penalise by their actual divergence.
    return 1.0 - min(abs(neural - symbolic), 1.0)


def _compute_confidence(
    neural: float,
    symbolic: float,
    fused: float,
    num_triggered: int,
    neural_loaded: bool = True,
) -> dict:
    """Compute the three-factor confidence for a fused risk verdict (v5.4).

    Factors and their weights in the final score:
        boundary distance (0.40): distance of ``fused`` from the nearest
            risk-level boundary (``RISK_LOW_MAX`` / ``RISK_MEDIUM_MAX``),
            saturating at 0.20.
        agreement (0.35): consistency of the neural and symbolic scores. A
            near-zero symbolic score is not treated as disagreement; the
            neural score then stands alone and is rated by decisiveness.
        rule strength (0.25): 0.40 with no triggered rules, 0.70 with one,
            plus 0.10 per additional rule, capped at 1.0.

    Returns:
        A dict with ``"level"`` ("High" at >= 0.65, "Medium" at >= 0.40,
        otherwise "Low"), the rounded ``"score"`` and the rounded ``"factors"``.
    """
    # Factor 1: distance from the nearest risk boundary.
    gap_to_low = abs(fused - RISK_LOW_MAX)
    gap_to_medium = abs(fused - RISK_MEDIUM_MAX)
    boundary_dist = min(gap_to_low, gap_to_medium)
    dist_factor = min(boundary_dist / 0.20, 1.0)

    # Factor 2: agreement between the two signals.
    agree_factor = _agreement_factor(neural, symbolic, neural_loaded)

    # Factor 3: rule strength. A single rule yields exactly 0.70 from the
    # same expression, so only the zero-rule case needs its own branch.
    if num_triggered == 0:
        rule_factor = 0.40
    else:
        rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)

    confidence = round(
        0.40 * dist_factor + 0.35 * agree_factor + 0.25 * rule_factor, 3
    )

    # First band whose floor the score reaches wins; otherwise "Low".
    bands = ((0.65, "High"), (0.40, "Medium"))
    level = next(
        (label for floor, label in bands if confidence >= floor), "Low"
    )

    return {
        "level": level,
        "score": confidence,
        "factors": {
            "boundary_dist": round(boundary_dist, 3),
            "agreement": round(agree_factor, 3),
            "rule_strength": round(rule_factor, 3),
        },
    }