""" Input validation guard — runs before any ML code. Returns (True, "") if valid. Returns (False, error_msg) if invalid. Catches: 1. Empty / whitespace only 2. Too long (> QUERY_MAX_CHARS) — checked BEFORE word count 3. Too short (< QUERY_MIN_CHARS) 4. Too few words (< QUERY_MIN_WORDS) 5. Non-Latin / Indic script 6. No legal signal words FIXES from v3.2.1 audit: - Too-long check now runs BEFORE short check (was shadowed) - Indic script detection uses ord() ranges instead of regex (regex pattern had encoding issues on some systems) - Legal signal check uses whole-word matching (word boundary) - Single-word error message now includes "too brief" """ import re from config import QUERY_MIN_CHARS, QUERY_MAX_CHARS, QUERY_MIN_WORDS # Legal signal words — at least one must appear as a standalone word LEGAL_SIGNALS = [ "ipc", "section", "accused", "court", "bail", "murder", "rape", "fraud", "appeal", "conviction", "acquittal", "sentence", "judge", "petitioner", "respondent", "plaintiff", "defendant", "fir", "charge", "arrest", "custody", "evidence", "witness", "verdict", "judgment", "crpc", "article", "writ", "habeas", "injunction", "decree", "theft", "robbery", "assault", "cheating", "dacoity", "offence", "offense", "criminal", "civil", "sessions", "magistrate", "high court", "supreme court", "tribunal", "acquit", "convict", "imprison", "sentenced", "charged", "alleged", "pocso", "ndps", "act", "case" ] def _has_indic_script(text: str) -> bool: """ Detect Indic script characters using Unicode code point ranges. Uses ord() checks — avoids regex encoding issues on all platforms. Ranges covered: 0x0900–0x097F Devanagari (Hindi, Marathi, Sanskrit) 0x0980–0x09FF Bengali 0x0A00–0x0A7F Gurmukhi (Punjabi) 0x0A80–0x0AFF Gujarati 0x0B00–0x0B7F Odia 0x0B80–0x0BFF Tamil 0x0C00–0x0C7F Telugu 0x0C80–0x0CFF Kannada 0x0D00–0x0D7F Malayalam """ indic_count = 0 for ch in text: cp = ord(ch) if (0x0900 <= cp <= 0x097F or # Devanagari 0x0980 <= cp <= 0x09FF or # Bengali 0x0A00 <= cp <= 0x0A7F or # Gurmukhi 0x0A80 <= cp <= 0x0AFF or # Gujarati 0x0B00 <= cp <= 0x0B7F or # Odia 0x0B80 <= cp <= 0x0BFF or # Tamil 0x0C00 <= cp <= 0x0C7F or # Telugu 0x0C80 <= cp <= 0x0CFF or # Kannada 0x0D00 <= cp <= 0x0D7F): # Malayalam indic_count += 1 return indic_count > len(text) * 0.25 def _has_legal_signal(text_lower: str) -> bool: """ Check for at least one legal signal word. Uses word-boundary matching to avoid false positives from substrings (e.g. "like" inside "Unlike", "in" inside "injunction"). """ for signal in LEGAL_SIGNALS: # Use \b word boundary for single-word signals # Use plain 'in' check for multi-word signals like "high court" if " " in signal: if signal in text_lower: return True else: if re.search(r'\b' + re.escape(signal) + r'\b', text_lower): return True return False def validate_query(text: str) -> tuple: """ Validate query before sending to NLP/ML pipeline. Returns: (True, "") — valid query (False, human-readable error msg) — invalid query """ # 1. Empty if not text or not text.strip(): return False, ( "Please describe your case. The search field is empty." ) text = text.strip() # 2. Too long — check BEFORE word count to catch "word " * 1000 if len(text) > QUERY_MAX_CHARS: return False, ( f"Query too long ({len(text):,} characters, limit {QUERY_MAX_CHARS:,}). " f"Summarize the key charges, facts, and evidence in a few sentences. " f"For a full judgment text, use the PDF upload feature." ) # 3. Too short (character count) if len(text) < QUERY_MIN_CHARS: return False, ( f"Query too short ({len(text)} characters, minimum {QUERY_MIN_CHARS}). " f"Example: 'Accused charged under IPC Section 302 for murder " f"with eyewitness and forensic evidence.'" ) # 4. Too few words word_count = len(text.split()) if word_count < QUERY_MIN_WORDS: return False, ( f"Query too brief ({word_count} word{'s' if word_count != 1 else ''}). " f"Please describe the charges, facts, and evidence in at least " f"{QUERY_MIN_WORDS} words." ) # 5. Non-Latin / Indic script if _has_indic_script(text): return False, ( "Query appears to be in a non-English script. " "LexAI's embedding model (LegalBERT) was trained on English legal text. " "Please enter your query in English for accurate results." ) # 6. No legal signal text_lower = text.lower() if not _has_legal_signal(text_lower): return False, ( "Query doesn't appear to describe a legal case. " "Please include legal context such as charges (IPC section), " "case type (murder, bail, fraud), court, or parties. " "Example: 'Accused charged under IPC 420 for cheating. Victim filed FIR.'" ) return True, ""