"""
Input validation guard — runs before any ML code.
Returns (True, "") if valid.
Returns (False, error_msg) if invalid.
Catches:
1. Empty / whitespace only
2. Too long (> QUERY_MAX_CHARS) — checked BEFORE word count
3. Too short (< QUERY_MIN_CHARS)
4. Too few words (< QUERY_MIN_WORDS)
5. Indic script (more than 25% of characters in Indic Unicode blocks)
6. No legal signal words
FIXES from v3.2.1 audit:
- Too-long check now runs BEFORE short check (was shadowed)
- Indic script detection uses ord() ranges instead of regex
(regex pattern had encoding issues on some systems)
- Legal signal check uses whole-word matching (word boundary)
- Single-word error message now includes "too brief"
"""
import re
from config import QUERY_MIN_CHARS, QUERY_MAX_CHARS, QUERY_MIN_WORDS
# Vocabulary of legal signal terms, all lowercase. A query must contain at
# least one of them to be accepted. Entries made of a single word are meant
# to be matched as standalone words; the two entries containing a space
# ("high court", "supreme court") are phrases.
LEGAL_SIGNALS = [
    "ipc", "section", "accused", "court", "bail",
    "murder", "rape", "fraud", "appeal", "conviction",
    "acquittal", "sentence", "judge", "petitioner", "respondent",
    "plaintiff", "defendant", "fir", "charge", "arrest",
    "custody", "evidence", "witness", "verdict", "judgment",
    "crpc", "article", "writ", "habeas", "injunction",
    "decree", "theft", "robbery", "assault", "cheating",
    "dacoity", "offence", "offense", "criminal", "civil",
    "sessions", "magistrate", "high court", "supreme court", "tribunal",
    "acquit", "convict", "imprison", "sentenced", "charged",
    "alleged", "pocso", "ndps", "act", "case",
]
def _has_indic_script(text: str) -> bool:
    """
    Decide whether a text is predominantly written in an Indic script.

    Characters are classified by code point (ord()), not by regex.
    The nine blocks below sit back to back in Unicode, so together they
    form the single interval 0x0900–0x0D7F:
    0x0900–0x097F Devanagari (Hindi, Marathi, Sanskrit)
    0x0980–0x09FF Bengali
    0x0A00–0x0A7F Gurmukhi (Punjabi)
    0x0A80–0x0AFF Gujarati
    0x0B00–0x0B7F Odia
    0x0B80–0x0BFF Tamil
    0x0C00–0x0C7F Telugu
    0x0C80–0x0CFF Kannada
    0x0D00–0x0D7F Malayalam

    Returns True when strictly more than 25% of ALL characters in the
    text (spaces, digits and punctuation included) fall in that interval.
    An empty string yields False.
    """
    first_cp = 0x0900  # start of Devanagari
    last_cp = 0x0D7F   # end of Malayalam
    hits = sum(1 for ch in text if first_cp <= ord(ch) <= last_cp)
    return hits > len(text) * 0.25
def _has_legal_signal(text_lower: str) -> bool:
    """
    Report whether the text contains at least one entry of LEGAL_SIGNALS.

    Single-word signals must appear as whole words (regex word boundary
    on both sides), so "act" does not fire inside "factory".
    Signals that contain a space (e.g. "high court") are matched as plain
    substrings instead.

    The text is compared as given against the lowercase signal list, so
    the caller is expected to pass already-lowercased text.
    """
    def _matches(signal: str) -> bool:
        # Phrases: plain substring test.
        if " " in signal:
            return signal in text_lower
        # Single words: require a word boundary on each side.
        pattern = r'\b' + re.escape(signal) + r'\b'
        return re.search(pattern, text_lower) is not None

    # any() stops at the first signal that matches.
    return any(_matches(signal) for signal in LEGAL_SIGNALS)
def validate_query(text: str) -> tuple:
    """
    Gate a raw user query before it is handed to the NLP/ML pipeline.

    Checks run in a fixed order and the first failing check decides the
    message: empty, too long, too short, too few words, Indic script,
    no legal signal word. Every check after the first works on the
    stripped text.

    Returns:
    (True, "") — query passed every check
    (False, human-readable error msg) — query rejected
    """
    # 1. Empty — falsy input and whitespace-only input both end up as "".
    query = text.strip() if text else ""
    if not query:
        return False, (
            "Please describe your case. The search field is empty."
        )

    n_chars = len(query)

    # 2. Too long — tested before the word count so that huge inputs such
    #    as "word " * 1000 are rejected with the length message.
    if n_chars > QUERY_MAX_CHARS:
        return False, (
            f"Query too long ({n_chars:,} characters, limit {QUERY_MAX_CHARS:,}). "
            "Summarize the key charges, facts, and evidence in a few sentences. "
            "For a full judgment text, use the PDF upload feature."
        )

    # 3. Too short (character count)
    if n_chars < QUERY_MIN_CHARS:
        return False, (
            f"Query too short ({n_chars} characters, minimum {QUERY_MIN_CHARS}). "
            "Example: 'Accused charged under IPC Section 302 for murder "
            "with eyewitness and forensic evidence.'"
        )

    # 4. Too few words (whitespace-separated tokens)
    n_words = len(query.split())
    if n_words < QUERY_MIN_WORDS:
        plural = "" if n_words == 1 else "s"
        return False, (
            f"Query too brief ({n_words} word{plural}). "
            "Please describe the charges, facts, and evidence in at least "
            f"{QUERY_MIN_WORDS} words."
        )

    # 5. Indic script
    if _has_indic_script(query):
        return False, (
            "Query appears to be in a non-English script. "
            "LexAI's embedding model (LegalBERT) was trained on English legal text. "
            "Please enter your query in English for accurate results."
        )

    # 6. Legal signal — lowercase first because the signal list is lowercase.
    if _has_legal_signal(query.lower()):
        return True, ""

    return False, (
        "Query doesn't appear to describe a legal case. "
        "Please include legal context such as charges (IPC section), "
        "case type (murder, bail, fraud), court, or parties. "
        "Example: 'Accused charged under IPC 420 for cheating. Victim filed FIR.'"
    )
|