File size: 5,208 Bytes
ef31980
868e976
ef31980
 
f930d0c
 
 
ef31980
 
f930d0c
 
 
 
 
 
 
 
 
 
 
 
ef31980
 
 
f930d0c
 
 
 
 
 
ef31980
 
 
 
 
 
 
f930d0c
 
ef31980
f930d0c
 
ef31980
 
 
 
 
 
 
 
 
 
 
 
f930d0c
0767317
ef31980
 
 
 
0767317
ef31980
 
 
 
 
f930d0c
 
 
 
 
 
ef31980
f930d0c
ef31980
f930d0c
 
45d3479
0767317
45d3479
f930d0c
 
 
 
 
 
 
 
ef31980
f930d0c
 
 
 
 
 
 
ef31980
 
 
f930d0c
 
 
 
ef31980
f930d0c
ef31980
f930d0c
 
ef31980
 
 
f930d0c
ef31980
0767317
ef31980
 
 
f930d0c
ef31980
f930d0c
 
 
 
 
 
 
ef31980
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f930d0c
ef31980
f930d0c
ef31980
f930d0c
 
 
 
 
 
ef31980
f930d0c
 
 
ef31980
f930d0c
ef31980
f930d0c
ef31980
f930d0c
 
 
ef31980
f930d0c
 
 
ef31980
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# ====================================================
# web_search.py
# ====================================================

"""
Minimal Tavily wrapper: run a web search and return a list of UDISE codes.

Enhancement:
- Optional village support for more precise queries
"""

import logging
import os
import re
from typing import List, Optional

# optional Tavily SDK
try:
    from tavily import TavilyClient
except Exception:
    TavilyClient = None

# ----------------------------------------------------
# State → UDISE prefix mapping
# ----------------------------------------------------
# Display name of each state / union territory -> its two-digit code.
# The first two characters of a candidate 11-digit code are compared
# against these values when search results are filtered.
STATE_TO_UDISE_CODE = {
    "Jammu & Kashmir": "01",
    "Himachal Pradesh": "02",
    "Punjab": "03",
    "Chandigarh": "04",
    "Uttarakhand": "05",
    "Haryana": "06",
    "Delhi": "07",
    "Rajasthan": "08",
    "Uttar Pradesh": "09",
    "Bihar": "10",
    "Sikkim": "11",
    "Arunachal Pradesh": "12",
    "Nagaland": "13",
    "Manipur": "14",
    "Mizoram": "15",
    "Tripura": "16",
    "Meghalaya": "17",
    "Assam": "18",
    "West Bengal": "19",
    "Jharkhand": "20",
    "Odisha": "21",
    "Chhattisgarh": "22",
    "Madhya Pradesh": "23",
    "Gujarat": "24",
    "Daman & Diu": "25",
    "Dadra & Nagar Haveli": "26",
    "Maharashtra": "27",
    "Andhra Pradesh": "28",
    "Karnataka": "29",
    "Goa": "30",
    "Lakshadweep": "31",
    "Kerala": "32",
    "Tamil Nadu": "33",
    "Puducherry": "34",
    "Andaman & Nicobar Islands": "35",
    "Telangana": "36",
    "Ladakh": "37",
}

# Exactly 11 consecutive digits; the lookarounds reject an 11-digit window
# that is merely part of a longer run of digits.
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")


# ----------------------------------------------------
# Query builder (Village-aware)
# ----------------------------------------------------
def _build_query(
    school_name: Optional[str],
    state_name: Optional[str],
    district: Optional[str],
    village: Optional[str] = None,
) -> str:
    """Compose the free-text search query for a school's UDISE code.

    The query always starts with ``"UDISE code of"`` and is followed by one
    labelled fragment per supplied field, in the fixed order
    school -> village -> district -> state.

    Args:
        school_name: Name of the school.
        state_name: Name of the state.
        district: Name of the district.
        village: Optional village name, used to narrow the search.

    Returns:
        The assembled query string. Fields that are ``None``, empty, or
        whitespace-only are omitted entirely.
    """
    labelled_fields = (
        ("School", school_name),
        ("in village", village),
        ("district", district),
        ("state", state_name),
    )

    parts = ["UDISE code of"]
    for label, value in labelled_fields:
        # Strip *before* testing for emptiness: a whitespace-only value is
        # truthy and would otherwise leave a dangling label in the query.
        cleaned = value.strip() if value else ""
        if cleaned:
            parts.append(f"{label} {cleaned}")

    return " ".join(parts)


def _call_tavily(api_key: Optional[str], query: str) -> dict:
    """Run one Tavily search and wrap the outcome in a status dict.

    Args:
        api_key: Tavily API key. When falsy, the ``TAVILY_API_KEY``
            environment variable is used instead.
        query: Search query text.

    Returns:
        ``{"ok": True, "data": <response>}`` on success, otherwise
        ``{"ok": False, "error": <message>}``. This function does not raise
        for a missing key, a missing ``tavily`` package, or a failed search.
    """
    key = api_key or os.getenv("TAVILY_API_KEY")
    if not key:
        return {"ok": False, "error": "No Tavily API key provided."}
    if TavilyClient is None:
        # The optional SDK import at module load failed.
        return {"ok": False, "error": "tavily package not installed."}

    log = logging.getLogger(__name__)
    try:
        client = TavilyClient(key)
        log.debug("Tavily query: %s", query)
        # NOTE(review): country="india" presumably biases results toward
        # India; confirm against the Tavily SDK documentation.
        resp = client.search(query=query, country="india")
        log.debug("Tavily response: %s", resp)
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary around a
        # third-party client, and callers only inspect the "ok" flag.
        log.warning("Tavily search failed: %s", e)
        return {"ok": False, "error": str(e)}

    return {"ok": True, "data": resp}


def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
    """Map a loosely formatted state name to its ``STATE_TO_UDISE_CODE`` key.

    Matching ignores case and every non-letter character, so
    ``"tamil-nadu"`` resolves to ``"Tamil Nadu"``. If that exact comparison
    finds nothing, a second comparison ignores the standalone word "and",
    so ``"Jammu and Kashmir"`` resolves to ``"Jammu & Kashmir"``.

    Args:
        state_name: State name as supplied by the caller.

    Returns:
        The matching dictionary key, or ``None`` when ``state_name`` is
        empty or no key matches.
    """
    if not state_name:
        return None

    letters_only = re.sub(r"[^A-Za-z]", "", state_name).lower()

    # Same text with standalone "and" words dropped. The table spells the
    # connector as "&", which the letters-only form removes entirely.
    words = [w.lower() for w in re.findall(r"[A-Za-z]+", state_name)]
    without_and = "".join(w for w in words if w != "and")

    fallback = None
    for key in STATE_TO_UDISE_CODE:
        key_letters = re.sub(r"[^A-Za-z]", "", key).lower()
        if key_letters == letters_only:
            # An exact letters-only match always wins.
            return key
        if fallback is None and without_and and key_letters == without_and:
            fallback = key
    return fallback


# ----------------------------------------------------
# Public API
# ----------------------------------------------------
def tavily_search_codes(
    school_name: Optional[str],
    state_name: Optional[str] = None,
    district: Optional[str] = None,
    village: Optional[str] = None,
    api_key: Optional[str] = None,
    enforce_state_prefix: bool = True,
) -> List[str]:
    """Perform a Tavily search and return a list of unique UDISE codes.

    Village is used only to improve search precision; it plays no part in
    filtering the results.

    Args:
        school_name: School to look up. A falsy value short-circuits to an
            empty list without searching.
        state_name: State used in the query and, optionally, for filtering.
        district: District used in the query.
        village: Village used in the query.
        api_key: Tavily API key, forwarded to the search call.
        enforce_state_prefix: When true and ``state_name`` resolves to a
            known state, keep only codes starting with that state's prefix.
            If the state is not recognised, no state filter is applied.

    Returns:
        11-digit codes in first-seen order, without duplicates. Codes whose
        first two digits are not a known state prefix are always dropped.
        An empty list is returned when the search fails.
    """
    if not school_name:
        return []

    query = _build_query(school_name, state_name, district, village)
    call = _call_tavily(api_key, query)
    if not call.get("ok"):
        return []

    snippets = _extract_snippets(call.get("data") or {})

    # Built once instead of rescanning dict.values() for every regex match.
    valid_prefixes = frozenset(STATE_TO_UDISE_CODE.values())

    allowed_prefix = None
    if enforce_state_prefix:
        state_key = _normalize_state_key(state_name)
        if state_key:
            allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)

    found, seen = [], set()
    for text in snippets:
        for m in _UDISE_RE.finditer(text):
            code = m.group(1)
            if code in seen:
                continue

            prefix = code[:2]
            if prefix not in valid_prefixes:
                continue
            if allowed_prefix and prefix != allowed_prefix:
                continue

            seen.add(code)
            found.append(code)

    return found


def _extract_snippets(raw) -> List[str]:
    """Flatten a search response into a list of text snippets to scan."""
    if isinstance(raw, list):
        return [str(x) for x in raw]
    if not isinstance(raw, dict):
        return [str(raw)]

    # "data" may be present but not a dict (e.g. None); only look inside it
    # when it really is one, instead of calling .get() on it blindly.
    nested = raw.get("data")
    candidates = (
        raw.get("results")
        or (nested.get("results") if isinstance(nested, dict) else None)
        or raw.get("items")
        or []
    )

    snippets = []
    for item in candidates:
        if isinstance(item, dict):
            # The URL is scanned too, alongside the textual fields.
            snippets.append(
                " ".join(
                    str(item.get(field, ""))
                    for field in ("title", "content", "text", "url")
                )
            )
        else:
            snippets.append(str(item))
    return snippets