# File: MF/src/scheme_resolver.py
# Repository page header captured with the file (presumably uploader
# "Parthiban97", commit b0e15c1 "Upload 15 files", marked verified).
"""Scheme Code Resolver
======================
Resolves missing AMFI scheme codes by fuzzy-matching the fund name from the
CSV against mfapi.in's /mf/search endpoint.
This runs as a PRE-TRIAGE step so that the NAV engine can fire for funds whose
scheme code was absent from the CSV.
"""
from __future__ import annotations
import difflib
import re
import time
import requests
# Search endpoint; _search_mfapi sends the query text as the `q` parameter.
MFAPI_SEARCH = "https://api.mfapi.in/mf/search"
# Lowest difflib.SequenceMatcher ratio at which _best_match accepts a candidate.
MATCH_CUTOFF = 0.52 # minimum SequenceMatcher ratio to accept
# NOTE(review): SLEEP_BETWEEN is not referenced by any code visible in this
# file section; confirm whether it is still needed.
SLEEP_BETWEEN = 0.25 # seconds between API calls (polite rate limit)
# Manual overrides for schemes that mfapi's search endpoint does not
# currently return, but whose AMFI codes are known and stable.
#
# Keys are normalized fund names (see _normalize): lowercase, with hyphens and
# underscores replaced by spaces. Lookups are exact dict matches on the
# normalized name, so a key that is not already in that form can never hit.
# Values are the scheme codes as numeric strings; they are compared directly
# with the (stripped) "Scheme Code" text of a row.
SCHEME_OVERRIDES: dict[str, str] = {
    # ── Pre-verified from AMFI NAV master (portal.amfiindia.com) ──────────────
    # These funds have empty scheme codes in source CSV and cannot be reliably
    # resolved via mfapi fuzzy search. Codes are Regular Plan - Growth only.
    # NOTE(review): several keys below name "retail" plans, and two different
    # names map to "119293"; confirm these against the statement above.
    # Existing override
    "kotak tax saver scheme growth": "109234",
    # ── Debt: Banking and PSU ─────────────────────────────────────────────────
    "hdfc banking and psu debt fund growth option": "128628",
    "icici prudential banking and psu debt fund growth": "112342",
    "kotak banking and psu debt growth": "123690",
    "invesco india banking and psu fund growth option": "118232",
    "sundaram banking psu fund formerly known as sundaram banking and psu debt fund regular plan growth": "100784",
    "hsbc banking and psu debt fund regular growth": "151104",
    "iti banking psu debt fund regular plan growth option": "148535",
    # ── Debt: Liquid ──────────────────────────────────────────────────────────
    "dsp liquidity fund regular plan growth": "119120",
    "invesco india liquid fund growth": "104488",
    "invesco india liquid fund regular growth": "118769",
    "union liquid fund growth option": "115398",
    "parag parikh liquid fund regular plan growth": "149038",
    "motilal oswal liquid fund regular growth": "147622",
    "iti liquid fund regular plan growth option": "147153",
    "quantum liquid fund regular plan growth option": "103504",
    "lic mf liquid fund regular plan growth": "120716",
    "icici prudential liquid fund growth": "120593",
    "aditya birla sun life liquid fund retail growth": "100042",
    "aditya birla sun life liquid fund growth": "100047",
    "edelweiss liquid fund regular plan growth option": "140182",
    "edelweiss liquid fund retail plan growth option": "119114",
    "axis liquid fund retail plan growth option": "112090",
    "sbi liquid fund regular plan growth": "119822",
    "nippon india liquid fund retail option growth plan": "100837",
    # ── Debt: Overnight ───────────────────────────────────────────────────────
    "uti overnight fund regular plan growth option": "100814",
    "canara robeco overnight fund regular plan growth option": "147534",
    "dsp overnight fund regular plan growth": "146061",
    "franklin india overnight fund growth": "146210",
    "bandhan overnight fund regular plan growth": "146187",
    "iti overnight fund regular plan growth option": "148529",
    "union overnight fund regular plan growth option": "146997",
    "icici prudential overnight fund growth": "145811",
    "edelweiss overnight fund regular plan growth": "147569",
    "lic mf overnight fund regular plan growth": "146065",
    "hdfc overnight fund growth option": "145822",
    # ── Debt: Ultra Short Duration ────────────────────────────────────────────
    "icici prudential ultra short term fund growth": "120505",
    "invesco india ultra short duration fund growth": "117825",
    "uti ultra short duration fund regular plan growth option": "102532",
    "aditya birla sun life savings fund growth regular plan": "119293",
    "aditya birla sun life savings fund retail growth": "119293",
    "hdfc ultra short term fund growth option": "145539",
    "aditya birla sun life savings fund discipline advantage plan": "112016",
    "pgim india ultra short duration fund growth": "100474",
    "iti ultra short duration fund regular plan growth option": "148533",
    "motilal oswal ultra short term fund mofustf regular plan growth": "124233",
    "tata ultra short term fund regular plan growth": "146070",
    "kotak savings fund growth": "119270",
    "lic mf ultra short duration fund regular plan growth": "147770",
    "canara robeco ultra short term fund regular plan growth option": "119671",
    "sundaram ultra short duration fund formerly known as principal ultra short term fund growth option": "120826",
    "bank of india ultra short duration fund regular plan growth": "109269",
    # ── Debt: Short Duration ──────────────────────────────────────────────────
    "hdfc short term debt fund growth option": "119247",
    "icici prudential short term fund growth option": "101758",
    "sbi short horizon debt fund short term fund retail growth": "106227",
    "sbi short term debt fund regular plan growth": "119831",
    "kotak bond short term plan growth": "101373",
    "dsp short term fund regular plan growth": "119598",
    "lic mf short duration fund regular plan growth": "145952",
    "mirae asset short duration fund regular plan growth": "148416",
    "invesco india short duration fund growth": "105185",
    "canara robeco short duration fund regular plan growth option": "119675",
    "groww short duration fund formerly known as indiabulls short term fund regular plan growth option": "123708",
    "tata short term bond fund regular plan growth option": "119802",
    # ── Debt: Medium Duration ─────────────────────────────────────────────────
    "aditya birla sun life medium term plan growth regular plan": "111803",
    "axis strategic bond fund regular plan growth option": "116894",
    "icici prudential medium term bond fund growth": "120841",
    "hdfc medium term debt fund growth option": "119238",
    "kotak medium term fund growth": "119281",
    "dsp bond fund growth": "100078",
    "sundaram medium duration fund formerly known as sundaram medium term bond fund regular plan growth": "100603",
    # ── ETFs ──────────────────────────────────────────────────────────────────
    "hdfc nifty100 low volatility 30 etf growth option": "145748",
    "hdfc nifty200 momentum 30 etf growth option": "146058",
    "hdfc nifty it etf growth option": "120493",
    "hdfc nifty private bank etf growth option": "145696",
    # ── Index Funds ───────────────────────────────────────────────────────────
    "dsp nifty next 50 index fund regular plan growth": "143669",
    "uti nifty next 50 index fund regular plan growth option": "120713",
    "motilal oswal nifty smallcap 250 index regular plan": "147960",
    "icici prudential nifty pharma index fund growth": "143874",
    "dsp nifty 50 index fund regular plan growth": "143537",
    "motilal oswal nifty midcap 150 index fund regular plan": "147068",
    "sbi nifty index fund regular plan growth": "135818",
    "motilal oswal nifty bank index regular plan": "145552",
}
def _normalize(name: str) -> str:
"""Convert hyphenated CSV name to a clean lowercase string."""
return re.sub(r"[-_]+", " ", name).strip().lower()
def _search_query(name: str) -> str:
    """Build a short search string from the leading words of *name*.

    The name is normalized, split on whitespace, and only the first six
    tokens are kept so the search stays focused.
    """
    tokens = _normalize(name).split()
    leading = tokens[:6]
    return " ".join(leading)
def _search_mfapi(query: str) -> list[dict]:
    """Query the mfapi search endpoint and return its candidate entries.

    This is a best-effort call: transport errors, HTTP error statuses and
    undecodable bodies are reported on stdout and yield an empty list
    instead of propagating.

    Args:
        query: Search text sent as the ``q`` parameter.

    Returns:
        The dict entries of the JSON array returned by the service, or an
        empty list on any failure or unexpected payload shape.
    """
    try:
        resp = requests.get(MFAPI_SEARCH, params={"q": query}, timeout=15)
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers a body that is not valid JSON.
        print(f" [resolver] search error for '{query}': {exc}")
        return []
    if not isinstance(payload, list):
        # Callers iterate the result and call .get() on each entry, so a
        # JSON object or scalar must not be handed back as-is.
        print(
            f" [resolver] search error for '{query}': "
            f"unexpected response type {type(payload).__name__}"
        )
        return []
    return [entry for entry in payload if isinstance(entry, dict)]
def _best_match(candidates: list[dict], target_name: str) -> dict | None:
if not candidates:
return None
target = _normalize(target_name)
best_score = 0.0
best_item = None
for item in candidates:
candidate = _normalize(item.get("schemeName", ""))
score = difflib.SequenceMatcher(None, target, candidate).ratio()
if score > best_score:
best_score = score
best_item = item
if best_score >= MATCH_CUTOFF:
return best_item
return None
def _is_valid_scheme_code(code: str) -> bool:
"""AMFI scheme codes are purely numeric (e.g. 120586). Platform codes like GROWWEH are invalid."""
return bool(code and code.isdigit())
def resolve_scheme_code_for_fund_name(
    fund_name: str,
) -> tuple[str | None, str | None]:
    """
    Look up the scheme code for a single fund name.

    The override table is consulted first, using the normalized name; only
    when it has no entry is the mfapi search queried and fuzzy-matched.

    Returns:
        ``(code, "override")`` for an override hit,
        ``(code, matched_scheme_name)`` for a search hit, and
        ``(None, None)`` when nothing acceptable was found.
    """
    known_code = SCHEME_OVERRIDES.get(_normalize(fund_name))
    if known_code:
        return known_code, "override"

    search_hits = _search_mfapi(_search_query(fund_name))
    hit = _best_match(search_hits, fund_name)
    if not hit:
        return None, None
    return str(hit["schemeCode"]), hit.get("schemeName", "")
def resolve_missing_scheme_codes(
    rows: list[dict[str, str]],
    *,
    verbose: bool = True,
) -> tuple[list[dict[str, str]], dict[str, str]]:
    """
    Fill in blank/invalid scheme codes and fix codes that disagree with
    SCHEME_OVERRIDES.

    Rows are updated in place. A row is considered only when its "Fund"
    value is non-empty, contains at least two hyphens and no colon
    (presumably this filters out header/summary rows - verify against the
    CSV producer). For each considered row:

    1. If the override table knows the normalized fund name and the row's
       code differs, the override code is written. This counts as a
       "corrected existing" code when the old value was a valid numeric
       code, and as an override resolution when it was blank or invalid.
    2. Otherwise a valid numeric code is left alone.
    3. Otherwise an invalid non-empty code is cleared and the fund is looked
       up through the mfapi search, with lookups running concurrently in a
       thread pool.

    Args:
        rows: CSV rows as dicts using the "Fund" and "Scheme Code" keys.
        verbose: Print progress and a summary to stdout.

    Returns:
        ``(rows, resolved)`` where ``rows`` is the same list object and
        ``resolved`` maps fund name -> code for every row that was filled
        in or corrected.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    resolved: dict[str, str] = {}
    corrected_existing = 0  # valid numeric codes replaced by an override
    override_count = 0      # blank/invalid codes filled from the override table
    mfapi_needed: list[tuple[str, dict[str, str]]] = []

    # ── Classify rows; everything the override table can settle is done here ──
    for row in rows:
        fund_name = (row.get("Fund") or "").strip()
        if not fund_name or fund_name.count("-") < 2 or ":" in fund_name:
            continue
        raw_code = (row.get("Scheme Code") or "").strip()
        has_valid_code = _is_valid_scheme_code(raw_code)
        override_code = SCHEME_OVERRIDES.get(_normalize(fund_name))
        # If the canonical code for this exact fund name is known, apply it
        # even when the CSV already contains a numeric but stale code.
        if override_code and raw_code != override_code:
            row["Scheme Code"] = override_code
            resolved[fund_name] = override_code
            if has_valid_code:
                corrected_existing += 1
            else:
                override_count += 1
            continue
        if has_valid_code:
            continue
        if raw_code:
            row["Scheme Code"] = ""  # clear invalid platform codes e.g. GROWWEH
        mfapi_needed.append((fund_name, row))

    total_missing = override_count + len(mfapi_needed)
    if total_missing == 0:
        if verbose:
            if corrected_existing:
                print(f"[resolver] Corrected {corrected_existing} existing scheme codes via override table.")
            else:
                print("[resolver] No missing scheme codes found.")
        return rows, resolved

    if verbose:
        print(f"[resolver] Resolving {total_missing} missing scheme codes (parallel)…")
        if override_count:
            print(f" [resolver] {override_count} resolved via override table (instant)")
        if corrected_existing:
            print(f" [resolver] {corrected_existing} existing codes corrected via override table")

    if not mfapi_needed:
        if verbose:
            print(f"[resolver] Done. {override_count}/{total_missing} resolved.")
        return rows, resolved

    # ── mfapi search: one concurrent lookup per remaining row ─────────────────
    total_mfapi = len(mfapi_needed)
    mfapi_count = 0
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_target = {
            executor.submit(resolve_scheme_code_for_fund_name, fund_name): (fund_name, row)
            for fund_name, row in mfapi_needed
        }
        # Results are consumed in this (calling) thread only, so the progress
        # counter and the shared dicts need no lock.
        for n, future in enumerate(as_completed(future_to_target), start=1):
            fund_name, row = future_to_target[future]
            try:
                code, matched_name = future.result()
            except Exception as exc:
                # One failed lookup must not abort the batch, but it should
                # not disappear silently either.
                code = matched_name = None
                if verbose:
                    print(f" [resolver] lookup failed for '{fund_name}': {exc}")
            if code:
                row["Scheme Code"] = code
                resolved[fund_name] = code
                mfapi_count += 1
                if verbose:
                    print(f" [{n}/{total_mfapi}] OK {fund_name[:55]}")
                    print(f" -> [{code}] {(matched_name or '')[:55]}")
            elif verbose:
                print(f" [{n}/{total_mfapi}] NO {fund_name[:55]} -- no match")

    if verbose:
        corrected_note = (
            f", {corrected_existing} corrected existing codes"
            if corrected_existing
            else ""
        )
        # Counted per row, matching how total_missing is counted.
        print(
            f"[resolver] Done. {override_count + mfapi_count}/{total_missing} resolved "
            f"({override_count} overrides + {mfapi_count} mfapi{corrected_note})."
        )
    return rows, resolved