Spaces:

riyasuryawanshi746
/

Major_Project

Sleeping

App Files Files Community

Major_Project / inference.py

riyasuryawanshi746

Fixed PDF preprocessing and clause segmentation

6893de4 verified 8 days ago

raw

history blame contribute delete

7.08 kB

	# inference.py
	# v5.4 — Confidence calibration fixed.
	# Root cause of "Neural=0.993, Confidence=LOW 21%" bug:
	# The agreement factor penalized high-neural / zero-symbolic as "disagreement",
	# but this is a legitimate state (neural model is certain; no rules triggered).
	# Fix: agreement is now only computed between the two scores when BOTH are
	# non-trivial (> 0.05). When symbolic is near zero, we treat the neural score
	# alone as the evidence and give an agreement factor between 0.50 and 0.80,
	# scaled by how far the neural score is from 0.50, rather than nearly 0.

	from __future__ import annotations

	# Clause-type labels that count as IP clauses. The code that looks labels up
	# in this set is outside this excerpt.
	IP_CLAUSE_TYPES = {
	    "IP Ownership Assignment",
	    "Irrevocable Or Perpetual License",
	    "Joint IP Ownership",
	    "Source Code Escrow",
	    "Unlimited/All-You-Can-Eat-License",
	}

	# Rule ID -> names of the features declared as that rule's dependencies
	# (table carried over unchanged). Entries are grouped by rule-ID prefix.
	RULE_FEATURE_DEPS = {
	    # ICA_* rules
	    "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
	    "ICA_002": ["unilateral_termination", "notice_period_defined"],
	    "ICA_003": ["non_compete_years"],
	    "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
	    "ICA_005": ["is_wagering_clause"],
	    "ICA_006": ["restrains_legal_proceedings"],
	    "ICA_007": [
	        "has_indemnity_clause",
	        "indemnity_capped",
	        "has_uncapped_signal",
	    ],
	    "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
	    "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
	    "ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
	    "ICA_011": ["unilateral_price_change"],
	    # DPDPA_* rules
	    "DPDPA_001": ["processes_personal_data", "has_data_retention_clause"],
	    "DPDPA_002": ["assigns_all_ip", "includes_pre_existing_ip"],
	    "DPDPA_003": ["processes_sensitive_data", "has_consent_clause"],
	    "DPDPA_004": ["processes_personal_data", "has_breach_notification"],
	    # ITA_* rules
	    "ITA_001": ["handles_digital_data", "has_security_clause"],
	    # CPA_* rules
	    "CPA_001": ["is_consumer_contract", "has_one_sided_clause"],
	}

	# ── Risk-level thresholds (single source of truth) ──────────────────────────
	# level_from_score() maps a fused score to a risk level with these cut-offs,
	# and _compute_confidence() measures the distance to the same two values.
	RISK_LOW_MAX = 0.50 # score < 0.50 → Low
	RISK_MEDIUM_MAX = 0.80 # 0.50 ≤ score ≤ 0.80 → Medium; score > 0.80 → High

	# Scores at or below this value (the checks use <=) are treated as "near zero"
	# by the agreement logic in _compute_confidence().
	_TRIVIAL_SCORE = 0.05


	def level_from_score(score: float) -> tuple[str, str]:
	    """Map a fused risk score to its ``(level_label, emoji)`` pair.

	    Scores strictly below ``RISK_LOW_MAX`` are "Low", scores up to and
	    including ``RISK_MEDIUM_MAX`` are "Medium", and everything else is "High".

	    Args:
	        score: Fused risk score to classify.

	    Returns:
	        A ``(label, emoji)`` tuple: ``("Low", "🟢")``, ``("Medium", "🟡")`` or
	        ``("High", "🔴")``.
	    """
	    # The comparisons run in ascending order, so whatever satisfies neither
	    # of them ends up in the "High" band.
	    if score < RISK_LOW_MAX:
	        verdict = ("Low", "🟢")
	    elif score <= RISK_MEDIUM_MAX:
	        verdict = ("Medium", "🟡")
	    else:
	        verdict = ("High", "🔴")
	    return verdict


	def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
	    """Evaluate symbolic rules against ``features`` and total their penalties.

	    Each rule is indexed by ``"condition"`` (called with ``features``) and
	    ``"penalty"`` (added to the running total when the condition is truthy).
	    Evaluation is best-effort: a rule that raises while being evaluated is
	    logged and skipped, so one broken rule cannot abort scoring.

	    Args:
	        features: Feature mapping handed unchanged to every rule condition.
	        symbolic_rules: Rules to evaluate, in order.

	    Returns:
	        A dict with ``"symbolic_score"`` (the penalty total clamped to
	        ``[0, 1]`` and rounded to 3 decimals) and ``"triggered_rules"`` (the
	        rules whose condition was truthy, in input order).
	    """
	    # Function-scope import so the module's import block does not have to change.
	    import logging

	    log = logging.getLogger(__name__)
	    triggered, total = [], 0.0
	    for position, rule in enumerate(symbolic_rules):
	        try:
	            if rule["condition"](features):
	                # As before, a rule is recorded as triggered before its
	                # penalty is read, so it stays reported even if adding the
	                # penalty fails.
	                triggered.append(rule)
	                total += rule["penalty"]
	        except Exception:
	            # Still best-effort, but no longer silent: a rule that cannot be
	            # evaluated lowers the score without anyone noticing otherwise.
	            log.warning(
	                "Symbolic rule at index %d could not be evaluated; skipping it",
	                position,
	                exc_info=True,
	            )
	    return {
	        # Clamp on both sides, as the fusion step does: the previous
	        # min(total, 1.0) let a negative penalty total through as a negative
	        # score despite the documented [0, 1] range.
	        "symbolic_score": round(min(max(total, 0.0), 1.0), 3),
	        "triggered_rules": triggered,
	    }


	def _neuro_symbolic_fusion(
	    neural: float,
	    symbolic: float,
	    is_ip_clause: bool = False,
	) -> dict:
	    """Blend the neural and symbolic scores into one fused risk verdict.

	    The blend is a fixed-weight sum in which the neural score always carries
	    the larger weight: 0.60/0.40 (neural/symbolic) for an IP clause with a
	    positive symbolic score, 0.75/0.25 in every other case. The sum is clamped
	    to ``[0, 1]`` and rounded to 3 decimals. No minimum (floor) is imposed, so
	    a weak symbolic trigger does not inflate the result.

	    Args:
	        neural: Score from the neural model.
	        symbolic: Score from the symbolic rules.
	        is_ip_clause: Whether the clause counts as an IP clause; only changes
	            the weights when ``symbolic`` is also greater than 0.

	    Returns:
	        A dict with ``"score"`` (clamped, rounded blend), ``"level"`` and
	        ``"emoji"`` (from ``level_from_score``), and ``"breakdown"`` holding
	        the rounded inputs, the weights, the unclamped blend, a constant
	        ``"floor_applied": False`` flag, the final score and a readable
	        formula string.
	    """
	    use_ip_weights = is_ip_clause and symbolic > 0
	    neural_weight, symbolic_weight = (
	        (0.60, 0.40) if use_ip_weights else (0.75, 0.25)
	    )

	    blended = neural_weight * neural + symbolic_weight * symbolic
	    blended_3dp = round(blended, 3)
	    final = round(min(max(blended, 0.0), 1.0), 3)

	    label, icon = level_from_score(final)

	    # The formula reports the unclamped blend, not the final score.
	    neural_term = f"({neural_weight:.2f} × {neural:.3f})"
	    symbolic_term = f"({symbolic_weight:.2f} × {symbolic:.3f})"
	    formula = f"{neural_term} + {symbolic_term} = {blended_3dp}"

	    breakdown = {
	        "neural_score": round(neural, 3),
	        "symbolic_score": round(symbolic, 3),
	        "weights": {"neural": neural_weight, "symbolic": symbolic_weight},
	        "raw_fused": blended_3dp,
	        "floor_applied": False,
	        "final": final,
	        "formula": formula,
	    }
	    return {
	        "score": final,
	        "level": label,
	        "emoji": icon,
	        "breakdown": breakdown,
	    }


	def _compute_confidence(
	    neural: float,
	    symbolic: float,
	    fused: float,
	    num_triggered: int,
	    neural_loaded: bool = True,
	) -> dict:
	    """Score how much trust to place in a fused risk verdict (v5.4 calibration).

	    Three factors are combined as ``0.40 * distance + 0.35 * agreement +
	    0.25 * rule strength`` and the total is rounded to 3 decimals.

	    distance      – gap between ``fused`` and the nearer of the two risk-level
	                    boundaries (``RISK_LOW_MAX``, ``RISK_MEDIUM_MAX``),
	                    divided by 0.20 and capped at 1.0. A score far from both
	                    boundaries is a clear-cut decision.
	    agreement     – how well the neural and symbolic scores line up:
	                    * neural model not loaded: fixed 0.50;
	                    * symbolic at or below ``_TRIVIAL_SCORE``: not treated as
	                      disagreement; 0.50 when neural is 0.50, rising
	                      linearly to 0.80 when neural is 0 or 1;
	                    * neural at or below ``_TRIVIAL_SCORE`` while symbolic is
	                      above it: fixed 0.30;
	                    * otherwise ``1 - |neural - symbolic|``, never below 0.
	    rule strength – 0.40 with no triggered rules, 0.70 with one, then +0.10
	                    per additional rule, capped at 1.0.

	    Args:
	        neural: Score from the neural model.
	        symbolic: Score from the symbolic rules.
	        fused: Fused risk score whose verdict is being rated.
	        num_triggered: Number of symbolic rules that triggered.
	        neural_loaded: False when no neural signal is available.

	    Returns:
	        A dict with ``"level"`` ("High" from 0.65, "Medium" from 0.40, else
	        "Low"), ``"score"`` and ``"factors"``. ``factors["boundary_dist"]`` is
	        the raw distance to the nearest boundary, not the scaled factor.
	    """
	    # Factor 1: distance from the nearest risk boundary.
	    to_low_boundary = abs(fused - RISK_LOW_MAX)
	    to_medium_boundary = abs(fused - RISK_MEDIUM_MAX)
	    margin = min(to_low_boundary, to_medium_boundary)
	    margin_factor = min(margin / 0.20, 1.0)

	    # Factor 2: agreement between the two signals.
	    if not neural_loaded:
	        # No neural signal at all: settle on a moderate value.
	        agreement = 0.50
	    elif symbolic <= _TRIVIAL_SCORE:
	        # No meaningful symbolic score, so the neural score is the only
	        # evidence. That is not disagreement: rate it by how far the neural
	        # score sits from the undecided midpoint (0 at 0.50, 1 at 0 or 1).
	        decisiveness = abs(neural - 0.50) / 0.50
	        agreement = 0.50 + 0.30 * decisiveness
	    elif neural <= _TRIVIAL_SCORE:
	        # Symbolic score is non-trivial but the neural score is near zero:
	        # counted as genuine disagreement.
	        # NOTE(review): this is a flat 0.30 even when symbolic is only just
	        # above _TRIVIAL_SCORE, so agreement jumps at neural == _TRIVIAL_SCORE
	        # compared with the divergence formula below. Confirm this is intended.
	        agreement = 0.30
	    else:
	        # Both signals are non-trivial: use their actual divergence.
	        agreement = 1.0 - min(abs(neural - symbolic), 1.0)

	    # Factor 3: rule strength; more triggered rules means stronger evidence.
	    if num_triggered == 0:
	        rule_strength = 0.40
	    elif num_triggered == 1:
	        rule_strength = 0.70
	    else:
	        rule_strength = min(0.70 + 0.10 * (num_triggered - 1), 1.0)

	    confidence = round(
	        0.40 * margin_factor + 0.35 * agreement + 0.25 * rule_strength, 3
	    )
	    label = (
	        "High" if confidence >= 0.65
	        else "Medium" if confidence >= 0.40
	        else "Low"
	    )

	    factors = {
	        "boundary_dist": round(margin, 3),
	        "agreement": round(agreement, 3),
	        "rule_strength": round(rule_strength, 3),
	    }
	    return {"level": label, "score": confidence, "factors": factors}