# NOTE: page-extraction residue preserved as a comment (not part of the module):
#   "Spaces: Running / Running / File size: 16,595 Bytes / b0e15c1", followed by a 1-323 line-number gutter.
"""Scheme Code Resolver
======================
Resolves missing AMFI scheme codes by fuzzy-matching the fund name from the
CSV against mfapi.in's /mf/search endpoint.
This runs as a PRE-TRIAGE step so that the NAV engine can fire for funds whose
scheme code was absent from the CSV.
"""
from __future__ import annotations
import difflib
import re
import time
import requests
# Search endpoint queried by _search_mfapi with a ``q`` query parameter.
MFAPI_SEARCH = "https://api.mfapi.in/mf/search"
# Threshold that _best_match compares the best SequenceMatcher ratio against.
MATCH_CUTOFF = 0.52 # minimum SequenceMatcher ratio to accept
# NOTE(review): not referenced anywhere in the visible code (the batch resolver
# issues its requests without pausing) -- confirm whether it is still needed.
SLEEP_BETWEEN = 0.25 # seconds between API calls (polite rate limit)
# Manual overrides for schemes that mfapi's search endpoint does not
# currently return, but whose AMFI codes are known and stable. Keys are
# normalized fund names (see _normalize); values are scheme codes stored as
# digit strings. Lookups are exact-match on the normalized name.
SCHEME_OVERRIDES: dict[str, str] = {
    # -- Pre-verified from AMFI NAV master (portal.amfiindia.com) --
    # These funds have empty scheme codes in source CSV and cannot be reliably
    # resolved via mfapi fuzzy search. Codes are Regular Plan - Growth only.
    # NOTE(review): several keys below name "retail" plans -- confirm that the
    # "Regular Plan - Growth only" statement above still holds for them.
    # Existing override
    "kotak tax saver scheme growth": "109234",
    # -- Debt: Banking and PSU --
    "hdfc banking and psu debt fund growth option": "128628",
    "icici prudential banking and psu debt fund growth": "112342",
    "kotak banking and psu debt growth": "123690",
    "invesco india banking and psu fund growth option": "118232",
    "sundaram banking psu fund formerly known as sundaram banking and psu debt fund regular plan growth": "100784",
    "hsbc banking and psu debt fund regular growth": "151104",
    "iti banking psu debt fund regular plan growth option": "148535",
    # -- Debt: Liquid --
    "dsp liquidity fund regular plan growth": "119120",
    "invesco india liquid fund growth": "104488",
    "invesco india liquid fund regular growth": "118769",
    "union liquid fund growth option": "115398",
    "parag parikh liquid fund regular plan growth": "149038",
    "motilal oswal liquid fund regular growth": "147622",
    "iti liquid fund regular plan growth option": "147153",
    "quantum liquid fund regular plan growth option": "103504",
    "lic mf liquid fund regular plan growth": "120716",
    "icici prudential liquid fund growth": "120593",
    "aditya birla sun life liquid fund retail growth": "100042",
    "aditya birla sun life liquid fund growth": "100047",
    "edelweiss liquid fund regular plan growth option": "140182",
    "edelweiss liquid fund retail plan growth option": "119114",
    "axis liquid fund retail plan growth option": "112090",
    "sbi liquid fund regular plan growth": "119822",
    "nippon india liquid fund retail option growth plan": "100837",
    # -- Debt: Overnight --
    "uti overnight fund regular plan growth option": "100814",
    "canara robeco overnight fund regular plan growth option": "147534",
    "dsp overnight fund regular plan growth": "146061",
    "franklin india overnight fund growth": "146210",
    "bandhan overnight fund regular plan growth": "146187",
    "iti overnight fund regular plan growth option": "148529",
    "union overnight fund regular plan growth option": "146997",
    "icici prudential overnight fund growth": "145811",
    "edelweiss overnight fund regular plan growth": "147569",
    "lic mf overnight fund regular plan growth": "146065",
    "hdfc overnight fund growth option": "145822",
    # -- Debt: Ultra Short Duration --
    "icici prudential ultra short term fund growth": "120505",
    "invesco india ultra short duration fund growth": "117825",
    "uti ultra short duration fund regular plan growth option": "102532",
    # NOTE(review): the next two keys (regular plan and retail) map to the same
    # code 119293 -- confirm this is intentional.
    "aditya birla sun life savings fund growth regular plan": "119293",
    "aditya birla sun life savings fund retail growth": "119293",
    "hdfc ultra short term fund growth option": "145539",
    "aditya birla sun life savings fund discipline advantage plan": "112016",
    "pgim india ultra short duration fund growth": "100474",
    "iti ultra short duration fund regular plan growth option": "148533",
    "motilal oswal ultra short term fund mofustf regular plan growth": "124233",
    "tata ultra short term fund regular plan growth": "146070",
    "kotak savings fund growth": "119270",
    "lic mf ultra short duration fund regular plan growth": "147770",
    "canara robeco ultra short term fund regular plan growth option": "119671",
    "sundaram ultra short duration fund formerly known as principal ultra short term fund growth option": "120826",
    "bank of india ultra short duration fund regular plan growth": "109269",
    # -- Debt: Short Duration --
    "hdfc short term debt fund growth option": "119247",
    "icici prudential short term fund growth option": "101758",
    "sbi short horizon debt fund short term fund retail growth": "106227",
    "sbi short term debt fund regular plan growth": "119831",
    "kotak bond short term plan growth": "101373",
    "dsp short term fund regular plan growth": "119598",
    "lic mf short duration fund regular plan growth": "145952",
    "mirae asset short duration fund regular plan growth": "148416",
    "invesco india short duration fund growth": "105185",
    "canara robeco short duration fund regular plan growth option": "119675",
    "groww short duration fund formerly known as indiabulls short term fund regular plan growth option": "123708",
    "tata short term bond fund regular plan growth option": "119802",
    # -- Debt: Medium Duration --
    "aditya birla sun life medium term plan growth regular plan": "111803",
    "axis strategic bond fund regular plan growth option": "116894",
    "icici prudential medium term bond fund growth": "120841",
    "hdfc medium term debt fund growth option": "119238",
    "kotak medium term fund growth": "119281",
    "dsp bond fund growth": "100078",
    "sundaram medium duration fund formerly known as sundaram medium term bond fund regular plan growth": "100603",
    # -- ETFs --
    "hdfc nifty100 low volatility 30 etf growth option": "145748",
    "hdfc nifty200 momentum 30 etf growth option": "146058",
    "hdfc nifty it etf growth option": "120493",
    "hdfc nifty private bank etf growth option": "145696",
    # -- Index Funds --
    "dsp nifty next 50 index fund regular plan growth": "143669",
    "uti nifty next 50 index fund regular plan growth option": "120713",
    "motilal oswal nifty smallcap 250 index regular plan": "147960",
    "icici prudential nifty pharma index fund growth": "143874",
    "dsp nifty 50 index fund regular plan growth": "143537",
    "motilal oswal nifty midcap 150 index fund regular plan": "147068",
    "sbi nifty index fund regular plan growth": "135818",
    "motilal oswal nifty bank index regular plan": "145552",
}
def _normalize(name: str) -> str:
"""Convert hyphenated CSV name to a clean lowercase string."""
return re.sub(r"[-_]+", " ", name).strip().lower()
def _search_query(name: str) -> str:
    """Build a short search query from the first six words of the normalized name."""
    words = _normalize(name).split()
    leading = words[:6]
    return " ".join(leading)
def _search_mfapi(query: str) -> list[dict]:
    """Query the mfapi search endpoint and return its candidate records.

    Best-effort: any failure (network error, HTTP error status, undecodable
    body) is printed and an empty list is returned, so callers never see an
    exception from here.

    The decoded payload is validated before it is returned: a body that is not
    a JSON array is reported and treated as "no results", and array elements
    that are not JSON objects are dropped. Callers iterate the result and call
    ``.get`` on each element, so an unvalidated payload would fail later, far
    from the cause.
    """
    try:
        resp = requests.get(MFAPI_SEARCH, params={"q": query}, timeout=15)
        resp.raise_for_status()
        payload = resp.json()
    except Exception as exc:
        # Deliberately broad: this is the network boundary and a failed lookup
        # must degrade to "no candidates" instead of aborting the run.
        print(f" [resolver] search error for '{query}': {exc}")
        return []
    if not isinstance(payload, list):
        print(
            f" [resolver] search error for '{query}': "
            f"unexpected response type {type(payload).__name__}"
        )
        return []
    return [item for item in payload if isinstance(item, dict)]
def _best_match(candidates: list[dict], target_name: str) -> dict | None:
    """Pick the candidate whose ``schemeName`` is most similar to *target_name*.

    Both names are normalized and compared with ``difflib.SequenceMatcher``.
    On equal ratios the earliest candidate is kept. Returns ``None`` when there
    are no candidates or the best ratio is below ``MATCH_CUTOFF``.
    """
    if not candidates:
        return None
    wanted = _normalize(target_name)
    top_ratio, top_entry = 0.0, None
    for entry in candidates:
        entry_name = _normalize(entry.get("schemeName", ""))
        ratio = difflib.SequenceMatcher(None, wanted, entry_name).ratio()
        # Strictly greater: an equal ratio never displaces an earlier candidate.
        if ratio > top_ratio:
            top_ratio, top_entry = ratio, entry
    return top_entry if top_ratio >= MATCH_CUTOFF else None
def _is_valid_scheme_code(code: str) -> bool:
"""AMFI scheme codes are purely numeric (e.g. 120586). Platform codes like GROWWEH are invalid."""
return bool(code and code.isdigit())
def resolve_scheme_code_for_fund_name(
    fund_name: str,
) -> tuple[str | None, str | None]:
    """Look up the scheme code for a single fund name.

    The override table is consulted first, using the normalized name as an
    exact key; a hit returns ``(code, "override")``. Otherwise mfapi is
    searched and the best fuzzy match, if any, returns
    ``(scheme_code, matched_scheme_name)``. ``(None, None)`` means nothing
    acceptable was found.
    """
    known_code = SCHEME_OVERRIDES.get(_normalize(fund_name))
    if known_code:
        return known_code, "override"

    hits = _search_mfapi(_search_query(fund_name))
    winner = _best_match(hits, fund_name)
    if not winner:
        return None, None
    return str(winner["schemeCode"]), winner.get("schemeName", "")
def resolve_missing_scheme_codes(
    rows: list[dict[str, str]],
    *,
    verbose: bool = True,
) -> tuple[list[dict[str, str]], dict[str, str]]:
    """Fill in missing scheme codes and fix codes that contradict the override table.

    Each row is a dict read via its ``"Fund"`` and ``"Scheme Code"`` keys. Rows
    are updated in place:

    * A row with a valid numeric code is left alone unless SCHEME_OVERRIDES
      holds a different code for its normalized fund name, in which case the
      code is replaced (counted as "corrected existing").
    * A row with a blank or non-numeric code is "missing". A non-numeric code
      is cleared, then the code is taken from SCHEME_OVERRIDES when the
      normalized name is present there, otherwise from an mfapi search plus
      fuzzy match. The searches run concurrently on a 20-worker thread pool.

    Args:
        rows: CSV rows to update in place.
        verbose: When true, print progress and a summary.

    Returns:
        ``(rows, resolved)`` where ``rows`` is the same list object that was
        passed in and ``resolved`` maps each fund name whose code was set or
        corrected to the code that was written.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    resolved: dict[str, str] = {}
    corrected_existing = 0

    # -- Collect rows that need resolution --
    target_rows: list[dict[str, str]] = []
    for row in rows:
        fund_name = (row.get("Fund") or "").strip()
        # Only names with at least two hyphens and no colon are processed;
        # presumably this filters out rows that are not fund slugs -- TODO confirm.
        if not fund_name or fund_name.count("-") < 2 or ":" in fund_name:
            continue
        raw_code = (row.get("Scheme Code") or "").strip()
        if _is_valid_scheme_code(raw_code):
            # A numeric code is only touched when the override table knows a
            # different canonical code for this exact (normalized) name.
            override_code = SCHEME_OVERRIDES.get(_normalize(fund_name))
            if override_code and raw_code != override_code:
                row["Scheme Code"] = override_code
                resolved[fund_name] = override_code
                corrected_existing += 1
            continue
        if raw_code:
            row["Scheme Code"] = ""  # clear invalid platform codes e.g. GROWWEH
        target_rows.append(row)

    total_missing = len(target_rows)
    if total_missing == 0:
        if verbose:
            if corrected_existing:
                print(f"[resolver] Corrected {corrected_existing} existing scheme codes via override table.")
            else:
                print("[resolver] No missing scheme codes found.")
        return rows, resolved

    if verbose:
        print(f"[resolver] Resolving {total_missing} missing scheme codes (parallel)…")

    # -- Phase A: override table, one dict lookup per fund, no network --
    mfapi_needed: list[dict[str, str]] = []
    override_count = 0
    for row in target_rows:
        fund_name = (row.get("Fund") or "").strip()
        code = SCHEME_OVERRIDES.get(_normalize(fund_name))
        if code:
            row["Scheme Code"] = code
            resolved[fund_name] = code
            override_count += 1
        else:
            mfapi_needed.append(row)

    if verbose and override_count:
        print(f" [resolver] {override_count} resolved via override table (instant)")
    if verbose and corrected_existing:
        print(f" [resolver] {corrected_existing} existing codes corrected via override table")

    # -- Phase B: mfapi search on a thread pool --
    if not mfapi_needed:
        if verbose:
            # Counted per row (not len(resolved)) so corrected-existing rows and
            # duplicate fund names cannot skew the ratio.
            print(f"[resolver] Done. {override_count}/{total_missing} resolved.")
        return rows, resolved

    total_mfapi = len(mfapi_needed)
    mfapi_count = 0
    with ThreadPoolExecutor(max_workers=20) as executor:
        # Rows reaching this point have no override entry, so the single-name
        # resolver goes straight to the mfapi search for them.
        future_to_row = {
            executor.submit(
                resolve_scheme_code_for_fund_name,
                (row.get("Fund") or "").strip(),
            ): row
            for row in mfapi_needed
        }
        # Results are consumed in this thread only, so the progress counter and
        # the row/dict updates below need no locking.
        for n, future in enumerate(as_completed(future_to_row), start=1):
            row = future_to_row[future]
            fund_name = (row.get("Fund") or "").strip()
            try:
                code, matched_name = future.result()
            except Exception as exc:
                # One failed lookup must not abort the batch; report it and
                # treat the fund as unresolved.
                code = matched_name = None
                if verbose:
                    print(f" [resolver] lookup failed for '{fund_name}': {exc!r}")
            if code:
                row["Scheme Code"] = code
                resolved[fund_name] = code
                mfapi_count += 1
                if verbose:
                    print(f" [{n}/{total_mfapi}] OK {fund_name[:55]}")
                    print(f" -> [{code}] {(matched_name or '')[:55]}")
            elif verbose:
                print(f" [{n}/{total_mfapi}] NO {fund_name[:55]} -- no match")

    if verbose:
        resolved_missing = override_count + mfapi_count
        corrected_note = (
            f", {corrected_existing} corrected existing codes" if corrected_existing else ""
        )
        print(f"[resolver] Done. {resolved_missing}/{total_missing} resolved "
              f"({override_count} overrides + {mfapi_count} mfapi{corrected_note}).")
    return rows, resolved