File size: 5,485 Bytes
b2fe8d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d8908a
b2fe8d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Input validation guard — runs before any ML code.

Returns (True, "")            if valid.
Returns (False, error_msg)    if invalid.

Catches:
  1. Empty / whitespace only
  2. Too long  (> QUERY_MAX_CHARS)     — checked BEFORE word count
  3. Too short (< QUERY_MIN_CHARS)
  4. Too few words (< QUERY_MIN_WORDS)
  5. Indic script (the only non-Latin scripts detected; other scripts are not checked)
  6. No legal signal words

FIXES from v3.2.1 audit:
  - Too-long check now runs BEFORE short check (was shadowed)
  - Indic script detection uses ord() ranges instead of regex
    (regex pattern had encoding issues on some systems)
  - Legal signal check uses whole-word matching (word boundary)
  - Single-word error message now includes "too brief"
"""

import re
from config import QUERY_MIN_CHARS, QUERY_MAX_CHARS, QUERY_MIN_WORDS

# Vocabulary of lowercase legal keywords and phrases. A query has to contain
# at least one of them to pass the legal-signal check in _has_legal_signal().
# Entries are compared against the lowercased query, so keep them lowercase.
LEGAL_SIGNALS = [
    # statutes, offences, and core courtroom terms
    "ipc", "section", "accused", "court", "bail",
    "murder", "rape", "fraud", "appeal", "conviction",
    "acquittal", "sentence", "judge",
    # parties and procedure
    "petitioner", "respondent", "plaintiff", "defendant",
    "fir", "charge", "arrest", "custody",
    "evidence", "witness", "verdict", "judgment",
    # codes, constitutional remedies, and civil relief
    "crpc", "article", "writ", "habeas", "injunction", "decree",
    # further offences and case categories
    "theft", "robbery", "assault", "cheating", "dacoity",
    "offence", "offense", "criminal", "civil",
    # forums (includes the multi-word phrases)
    "sessions", "magistrate", "high court", "supreme court", "tribunal",
    # verb forms, special acts, and generic terms
    "acquit", "convict", "imprison", "sentenced", "charged", "alleged",
    "pocso", "ndps", "act", "case",
]


def _has_indic_script(text: str) -> bool:
    """
    Detect Indic script characters using Unicode code point ranges.
    Uses ord() checks — avoids regex encoding issues on all platforms.

    Ranges covered:
      0x0900–0x097F  Devanagari  (Hindi, Marathi, Sanskrit)
      0x0980–0x09FF  Bengali
      0x0A00–0x0A7F  Gurmukhi   (Punjabi)
      0x0A80–0x0AFF  Gujarati
      0x0B00–0x0B7F  Odia
      0x0B80–0x0BFF  Tamil
      0x0C00–0x0C7F  Telugu
      0x0C80–0x0CFF  Kannada
      0x0D00–0x0D7F  Malayalam
    """
    indic_count = 0
    for ch in text:
        cp = ord(ch)
        if (0x0900 <= cp <= 0x097F or   # Devanagari
            0x0980 <= cp <= 0x09FF or   # Bengali
            0x0A00 <= cp <= 0x0A7F or   # Gurmukhi
            0x0A80 <= cp <= 0x0AFF or   # Gujarati
            0x0B00 <= cp <= 0x0B7F or   # Odia
            0x0B80 <= cp <= 0x0BFF or   # Tamil
            0x0C00 <= cp <= 0x0C7F or   # Telugu
            0x0C80 <= cp <= 0x0CFF or   # Kannada
            0x0D00 <= cp <= 0x0D7F):    # Malayalam
            indic_count += 1
    return indic_count > len(text) * 0.25


def _has_legal_signal(text_lower: str) -> bool:
    """
    Check whether the text contains at least one entry of LEGAL_SIGNALS.

    Every signal — single word or multi-word phrase — must appear as a
    whole word/phrase, i.e. delimited by regex word boundaries (``\\b``).
    This avoids false positives from substrings: "act" does not match
    inside "action", and "high court" does not match inside
    "thigh courtesy".

    Args:
        text_lower: The query text, already lowercased by the caller
            (signals are lowercase and the search is case-sensitive).

    Returns:
        True if any signal is found as a standalone word or phrase,
        False otherwise (including for an empty string).
    """
    # re.escape keeps each signal literal (e.g. the space in "high court");
    # the surrounding \b anchors enforce whole-word matching for all entries.
    return any(
        re.search(r'\b' + re.escape(signal) + r'\b', text_lower)
        for signal in LEGAL_SIGNALS
    )


def validate_query(text: str) -> tuple:
    """
    Gatekeeper run on a query before it reaches the NLP/ML pipeline.

    The checks run in a fixed order and the first failure wins:
    empty, too long, too short, too few words, Indic script,
    no legal signal word.

    Returns:
        (True, "")                         — query passed every check
        (False, human-readable error msg)  — query failed one check
    """
    # 1. Nothing usable was entered (None, "", or whitespace only)
    if not (text and text.strip()):
        return False, (
            "Please describe your case. The search field is empty."
        )

    query = text.strip()
    char_count = len(query)

    # 2. Upper length bound — evaluated ahead of the word count so that
    #    inputs like "word " * 1000 are rejected here
    if char_count > QUERY_MAX_CHARS:
        too_long_msg = (
            f"Query too long ({char_count:,} characters, limit {QUERY_MAX_CHARS:,}). "
            f"Summarize the key charges, facts, and evidence in a few sentences. "
            f"For a full judgment text, use the PDF upload feature."
        )
        return False, too_long_msg

    # 3. Lower length bound (characters)
    if char_count < QUERY_MIN_CHARS:
        too_short_msg = (
            f"Query too short ({char_count} characters, minimum {QUERY_MIN_CHARS}). "
            f"Example: 'Accused charged under IPC Section 302 for murder "
            f"with eyewitness and forensic evidence.'"
        )
        return False, too_short_msg

    # 4. Lower bound on whitespace-separated words
    word_count = len(query.split())
    if word_count < QUERY_MIN_WORDS:
        plural_suffix = "" if word_count == 1 else "s"
        too_brief_msg = (
            f"Query too brief ({word_count} word{plural_suffix}). "
            f"Please describe the charges, facts, and evidence in at least "
            f"{QUERY_MIN_WORDS} words."
        )
        return False, too_brief_msg

    # 5. Query written largely in an Indic script
    if _has_indic_script(query):
        return False, (
            "Query appears to be in a non-English script. "
            "LexAI's embedding model (LegalBERT) was trained on English legal text. "
            "Please enter your query in English for accurate results."
        )

    # 6. No legal vocabulary found in the lowercased query
    if not _has_legal_signal(query.lower()):
        return False, (
            "Query doesn't appear to describe a legal case. "
            "Please include legal context such as charges (IPC section), "
            "case type (murder, bail, fraud), court, or parties. "
            "Example: 'Accused charged under IPC 420 for cheating. Victim filed FIR.'"
        )

    return True, ""