| """ |
| Input validation guard — runs before any ML code. |
| |
| Returns (True, "") if valid. |
| Returns (False, error_msg) if invalid. |
| |
| Catches: |
| 1. Empty / whitespace only |
| 2. Too long (> QUERY_MAX_CHARS) — checked BEFORE word count |
| 3. Too short (< QUERY_MIN_CHARS) |
| 4. Too few words (< QUERY_MIN_WORDS) |
| 5. Predominantly Indic script (more than 25% of the characters) |
| 6. No legal signal words |
| |
| FIXES from v3.2.1 audit: |
| - Too-long check now runs BEFORE short check (was shadowed) |
| - Indic script detection uses ord() ranges instead of regex |
| (regex pattern had encoding issues on some systems) |
| - Legal signal check uses whole-word matching (word boundary) |
| - Single-word error message now includes "too brief" |
| """ |
|
|
| import re |
| from config import QUERY_MIN_CHARS, QUERY_MAX_CHARS, QUERY_MIN_WORDS |
|
|
| |
# Lower-case terms whose presence marks a query as describing a legal matter.
# Single-word entries are matched as whole words, so a stem such as "convict"
# does NOT match "convicted" -- common inflections must be listed explicitly.
LEGAL_SIGNALS = [
    "ipc", "section", "accused", "court", "bail", "murder", "rape",
    "fraud", "appeal", "conviction", "acquittal", "sentence", "judge",
    "petitioner", "respondent", "plaintiff", "defendant", "fir", "charge",
    "arrest", "custody", "evidence", "witness", "verdict", "judgment",
    "crpc", "article", "writ", "habeas", "injunction", "decree",
    "theft", "robbery", "assault", "cheating", "dacoity", "offence",
    "offense", "criminal", "civil", "sessions", "magistrate", "high court",
    "supreme court", "tribunal", "acquit", "convict", "imprison",
    "sentenced", "charged", "alleged", "pocso", "ndps", "act", "case",
    # Inflected / variant forms of the stems above. Without these, whole-word
    # matching rejects ordinary phrasing such as "was convicted and
    # imprisoned" or "charges were framed".
    "acquitted", "convicted", "imprisoned", "imprisonment", "arrested",
    "charges", "judgement",
]
|
|
|
|
| def _has_indic_script(text: str) -> bool: |
| """ |
| Detect Indic script characters using Unicode code point ranges. |
| Uses ord() checks — avoids regex encoding issues on all platforms. |
| |
| Ranges covered: |
| 0x0900–0x097F Devanagari (Hindi, Marathi, Sanskrit) |
| 0x0980–0x09FF Bengali |
| 0x0A00–0x0A7F Gurmukhi (Punjabi) |
| 0x0A80–0x0AFF Gujarati |
| 0x0B00–0x0B7F Odia |
| 0x0B80–0x0BFF Tamil |
| 0x0C00–0x0C7F Telugu |
| 0x0C80–0x0CFF Kannada |
| 0x0D00–0x0D7F Malayalam |
| """ |
| indic_count = 0 |
| for ch in text: |
| cp = ord(ch) |
| if (0x0900 <= cp <= 0x097F or |
| 0x0980 <= cp <= 0x09FF or |
| 0x0A00 <= cp <= 0x0A7F or |
| 0x0A80 <= cp <= 0x0AFF or |
| 0x0B00 <= cp <= 0x0B7F or |
| 0x0B80 <= cp <= 0x0BFF or |
| 0x0C00 <= cp <= 0x0C7F or |
| 0x0C80 <= cp <= 0x0CFF or |
| 0x0D00 <= cp <= 0x0D7F): |
| indic_count += 1 |
| return indic_count > len(text) * 0.25 |
|
|
|
|
| def _has_legal_signal(text_lower: str) -> bool: |
| """ |
| Check for at least one legal signal word. |
| Uses word-boundary matching to avoid false positives from |
| substrings (e.g. "like" inside "Unlike", "in" inside "injunction"). |
| """ |
| for signal in LEGAL_SIGNALS: |
| |
| |
| if " " in signal: |
| if signal in text_lower: |
| return True |
| else: |
| if re.search(r'\b' + re.escape(signal) + r'\b', text_lower): |
| return True |
| return False |
|
|
|
|
def validate_query(text: str) -> tuple:
    """
    Gate a user query before it reaches the NLP/ML pipeline.

    The checks run in a fixed order and the first failure wins:
    empty -> too long -> too short -> too few words -> Indic script ->
    no legal signal word.

    Args:
        text: Raw query as entered by the user. Surrounding whitespace is
            stripped before any length or content check.

    Returns:
        (True, "") when the query passes every check, otherwise
        (False, message) where message is a human-readable explanation.
    """
    # Falsy input (None, "") and whitespace-only input both collapse to "".
    text = (text or "").strip()
    word_count = len(text.split())

    if not text:
        error = (
            "Please describe your case. The search field is empty."
        )
    elif len(text) > QUERY_MAX_CHARS:
        # Upper bound is tested ahead of the lower bounds.
        error = (
            f"Query too long ({len(text):,} characters, limit {QUERY_MAX_CHARS:,}). "
            f"Summarize the key charges, facts, and evidence in a few sentences. "
            f"For a full judgment text, use the PDF upload feature."
        )
    elif len(text) < QUERY_MIN_CHARS:
        error = (
            f"Query too short ({len(text)} characters, minimum {QUERY_MIN_CHARS}). "
            f"Example: 'Accused charged under IPC Section 302 for murder "
            f"with eyewitness and forensic evidence.'"
        )
    elif word_count < QUERY_MIN_WORDS:
        error = (
            f"Query too brief ({word_count} word{'s' if word_count != 1 else ''}). "
            f"Please describe the charges, facts, and evidence in at least "
            f"{QUERY_MIN_WORDS} words."
        )
    elif _has_indic_script(text):
        error = (
            "Query appears to be in a non-English script. "
            "LexAI's embedding model (LegalBERT) was trained on English legal text. "
            "Please enter your query in English for accurate results."
        )
    elif not _has_legal_signal(text.lower()):
        error = (
            "Query doesn't appear to describe a legal case. "
            "Please include legal context such as charges (IPC section), "
            "case type (murder, bail, fraud), court, or parties. "
            "Example: 'Accused charged under IPC 420 for cheating. Victim filed FIR.'"
        )
    else:
        error = ""

    # An empty message means every check passed.
    return not error, error
|
|