Spaces:

Satyam810
/

lexai

Running

App Files Files Community

lexai / src /query_validator.py

Satyam810

fix: add pocso, ndps, act, case to validator legal signals

8d8908a 8 days ago

raw

history blame contribute delete

5.49 kB

	"""
	Input validation guard — runs before any ML code.

	Returns (True, "") if valid.
	Returns (False, error_msg) if invalid.

	Catches:
	1. Empty / whitespace only
	2. Too long (> QUERY_MAX_CHARS) — checked BEFORE word count
	3. Too short (< QUERY_MIN_CHARS)
	4. Too few words (< QUERY_MIN_WORDS)
	5. Predominantly Indic script (more than 25% of the characters)
	6. No legal signal words

	FIXES from v3.2.1 audit:
	- Too-long check now runs BEFORE short check (was shadowed)
	- Indic script detection uses ord() ranges instead of regex
	(regex pattern had encoding issues on some systems)
	- Legal signal check uses whole-word matching (word boundary)
	- Single-word error message now includes "too brief"
	"""

	import re
	from config import QUERY_MIN_CHARS, QUERY_MAX_CHARS, QUERY_MIN_WORDS

	# Lower-case vocabulary that marks a query as legal in nature; a query must
	# contain at least one entry. Entries with a space are multi-word phrases.
	LEGAL_SIGNALS = [
	    "ipc", "section", "accused", "court", "bail",
	    "murder", "rape", "fraud", "appeal", "conviction",
	    "acquittal", "sentence", "judge", "petitioner", "respondent",
	    "plaintiff", "defendant", "fir", "charge", "arrest",
	    "custody", "evidence", "witness", "verdict", "judgment",
	    "crpc", "article", "writ", "habeas", "injunction",
	    "decree", "theft", "robbery", "assault", "cheating",
	    "dacoity", "offence", "offense", "criminal", "civil",
	    "sessions", "magistrate", "high court", "supreme court", "tribunal",
	    "acquit", "convict", "imprison", "sentenced", "charged",
	    "alleged", "pocso", "ndps", "act", "case",
	]


	def _has_indic_script(text: str) -> bool:
	"""
	Detect Indic script characters using Unicode code point ranges.
	Uses ord() checks — avoids regex encoding issues on all platforms.

	Ranges covered:
	0x0900–0x097F Devanagari (Hindi, Marathi, Sanskrit)
	0x0980–0x09FF Bengali
	0x0A00–0x0A7F Gurmukhi (Punjabi)
	0x0A80–0x0AFF Gujarati
	0x0B00–0x0B7F Odia
	0x0B80–0x0BFF Tamil
	0x0C00–0x0C7F Telugu
	0x0C80–0x0CFF Kannada
	0x0D00–0x0D7F Malayalam
	"""
	indic_count = 0
	for ch in text:
	cp = ord(ch)
	if (0x0900 <= cp <= 0x097F or # Devanagari
	0x0980 <= cp <= 0x09FF or # Bengali
	0x0A00 <= cp <= 0x0A7F or # Gurmukhi
	0x0A80 <= cp <= 0x0AFF or # Gujarati
	0x0B00 <= cp <= 0x0B7F or # Odia
	0x0B80 <= cp <= 0x0BFF or # Tamil
	0x0C00 <= cp <= 0x0C7F or # Telugu
	0x0C80 <= cp <= 0x0CFF or # Kannada
	0x0D00 <= cp <= 0x0D7F): # Malayalam
	indic_count += 1
	return indic_count > len(text) * 0.25


	def _has_legal_signal(text_lower: str) -> bool:
	    """
	    Report whether the text contains at least one entry of LEGAL_SIGNALS.

	    Every signal, single word or multi-word phrase, is matched on word
	    boundaries: "court" does not match inside "courtyard", and
	    "high court" does not match inside "thigh courtyard". The words of a
	    phrase may be separated by any run of whitespace, so a double space
	    or a line break between them still counts.

	    Args:
	        text_lower: Query text. Matching is case-sensitive and the
	            signals are lower-case, so the caller is expected to
	            lower-case the text first.

	    Returns:
	        True if any signal occurs as a standalone word or phrase,
	        otherwise False.
	    """
	    for signal in LEGAL_SIGNALS:
	        # Escape each word on its own, then allow flexible whitespace
	        # between the words of a phrase.
	        phrase = r"\s+".join(re.escape(word) for word in signal.split())
	        # \b on both ends keeps the signal from matching inside a longer word.
	        if re.search(r"\b" + phrase + r"\b", text_lower):
	            return True
	    return False


	def validate_query(text: str) -> tuple:
	    """
	    Gate a user query before it is handed to the NLP/ML pipeline.

	    Checks run in a fixed order and the first one that fails decides the
	    error message: empty, too long, too short, too few words, Indic
	    script, no legal signal word.

	    Returns:
	        (True, "") — query passed every check
	        (False, human-readable error msg) — query was rejected
	    """
	    query = (text or "").strip()

	    # Check 1: nothing was entered (or only whitespace)
	    if not query:
	        return False, "Please describe your case. The search field is empty."

	    n_chars = len(query)

	    # Check 2: upper length bound. Runs ahead of the word-based check so
	    # that huge repetitive inputs ("word " * 1000) are rejected here.
	    if n_chars > QUERY_MAX_CHARS:
	        too_long_msg = (
	            f"Query too long ({n_chars:,} characters, limit {QUERY_MAX_CHARS:,}). "
	            "Summarize the key charges, facts, and evidence in a few sentences. "
	            "For a full judgment text, use the PDF upload feature."
	        )
	        return False, too_long_msg

	    # Check 3: lower length bound, measured in characters
	    if n_chars < QUERY_MIN_CHARS:
	        too_short_msg = (
	            f"Query too short ({n_chars} characters, minimum {QUERY_MIN_CHARS}). "
	            "Example: 'Accused charged under IPC Section 302 for murder "
	            "with eyewitness and forensic evidence.'"
	        )
	        return False, too_short_msg

	    # Check 4: lower length bound, measured in whitespace-separated words
	    n_words = len(query.split())
	    if n_words < QUERY_MIN_WORDS:
	        plural = "" if n_words == 1 else "s"
	        too_brief_msg = (
	            f"Query too brief ({n_words} word{plural}). "
	            "Please describe the charges, facts, and evidence in at least "
	            f"{QUERY_MIN_WORDS} words."
	        )
	        return False, too_brief_msg

	    # Check 5: query is written mostly in an Indic script
	    if _has_indic_script(query):
	        script_msg = (
	            "Query appears to be in a non-English script. "
	            "LexAI's embedding model (LegalBERT) was trained on English legal text. "
	            "Please enter your query in English for accurate results."
	        )
	        return False, script_msg

	    # Check 6: at least one legal signal word must be present
	    if _has_legal_signal(query.lower()):
	        return True, ""

	    no_signal_msg = (
	        "Query doesn't appear to describe a legal case. "
	        "Please include legal context such as charges (IPC section), "
	        "case type (murder, bail, fraud), court, or parties. "
	        "Example: 'Accused charged under IPC 420 for cheating. Victim filed FIR.'"
	    )
	    return False, no_signal_msg