# MF/src/weightage.py
"""
Weightage scoring algorithm for mutual fund schemes.
Scoring method: Sum of column weights where cell qualifies for Light Green (Top/Bottom 10)
AND is NOT overridden by Light Red fill (threshold violations).
Weight Distribution (Advisor-revised, March 2026):
1. Sortino Ratio: 1.300 (Top 10, higher is better)
2. Sharpe Ratio: 1.200 (Top 10, higher is better)
3. Information Ratio: 1.000 (Top 10, higher is better, Light Red if < 0)
4. Alpha: 1.000 (Top 10, higher is better, Light Red if < 1)
5. Maximum Drawdown: 1.350 (Top 10, closest to 0 is better)
6. Down Market Capture: 1.000 (Bottom 10, lower is better)
7. Standard Deviation: 1.000 (Bottom 10, lower is better)
8. 10 Years CAGR: 0.750 (Top 10, higher is better, Light Red if < Category Avg)
9. 5 Years CAGR: 0.600 (Top 10, higher is better, Light Red if < Category Avg)
10. 3 Years CAGR: 0.400 (Top 10, higher is better, Light Red if < Category Avg)
11. P/E Ratio: 0.150 (Bottom 10, lower is better)
12. TER: 0.150 (Bottom 10, lower is better)
13. Turnover (%): 0.100 (Bottom 10, lower is better)
Total: 10.000
"""
import math
from typing import List, Optional, Dict
from src.models import Fund
# ─── Weight map (Advisor-revised March 2026) ─────────────────────────────────
WEIGHTS: Dict[str, float] = {
    "sortino": 1.30,
    "sharpe": 1.20,
    "info_ratio": 1.00,
    "alpha": 1.00,
    "max_drawdown": 1.35,
    "down_capture": 1.00,
    "std_dev": 1.00,
    "cagr_10y": 0.75,
    "cagr_5y": 0.60,
    "cagr_3y": 0.40,
    "pe_ratio": 0.15,
    "ter": 0.15,
    "turnover": 0.10,
}

# Sanity-check: total should equal 10.000.
# Raise explicitly instead of using `assert`, because assert statements are
# stripped under optimized bytecode (`python -O`) and this check must always run.
_TOTAL = round(sum(WEIGHTS.values()), 3)
if _TOTAL != 10.000:
    raise ValueError(f"WEIGHTS do not sum to 10.000 — got {_TOTAL}")

# Metrics where higher is better → Top 10
TOP_10_METRICS = [
    "sharpe", "sortino", "alpha",
    "info_ratio", "max_drawdown",
    "cagr_3y", "cagr_5y", "cagr_10y",
]

# Metrics where lower is better → Bottom 10
BOTTOM_10_METRICS = [
    "ter", "turnover", "std_dev",
    "down_capture", "pe_ratio",
]

# Dual-condition metrics: a cell can qualify for green AND still be overridden
# by a Light Red threshold violation. Value is (rule_type, threshold).
DUAL_CONDITION_RULES: Dict[str, tuple] = {
    "alpha": ("below_value", 1),              # Light Red if alpha < 1%
    "info_ratio": ("below_value", 0),         # Light Red if IR < 0
    "cagr_3y": ("below_category_avg", None),  # Light Red if < category avg
    "cagr_5y": ("below_category_avg", None),
    "cagr_10y": ("below_category_avg", None),
}
# ─── Value helpers ────────────────────────────────────────────────────────────
def _is_valid(v) -> bool:
"""True if v is a real, non-zero, non-NaN number."""
if v is None:
return False
if isinstance(v, float) and (v != v): # NaN check
return False
# 0.0 is treated as missing/not-applicable for risk metrics
if v == 0:
return False
return True
def _is_valid_drawdown(v) -> bool:
"""
For Maximum Drawdown specifically: 0.0 is a genuine data-quality gap
(overnight/liquid funds sometimes publish 0 when the real figure was never
fetched). Treat 0 as invalid so that only funds with a real (negative)
drawdown value compete in the ranking.
"""
if v is None:
return False
if isinstance(v, float) and v != v: # NaN
return False
if v == 0:
return False # ← exact zero excluded; see drawdown_zero_fix() below
return True
# ─── Ranking helpers ──────────────────────────────────────────────────────────
def _top_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """
    Return True if *fund* ranks among the top-N (highest values) peers
    on *metric*.

    Special case: for Information Ratio an exact 0.0 participates in the
    ranking (Excel treats 0 as a real value; only negative IR is "red"),
    whereas every other metric treats 0 as missing via _is_valid().
    """
    if metric == "info_ratio":
        # Only None/NaN are invalid here; 0.0 is a genuine value.
        def usable(v):
            return v is not None and not (isinstance(v, float) and v != v)
    else:
        usable = _is_valid

    value = getattr(fund, metric, None)
    if not usable(value):
        return False

    pool = sorted(
        (getattr(f, metric, None) for f in peers
         if usable(getattr(f, metric, None))),
        reverse=True,
    )
    # Need at least two valid values to form a meaningful ranking.
    if len(pool) < 2:
        return False
    # Mirror Excel's "Top N items" conditional formatting, with N capped
    # at the number of valid funds.
    cutoff = pool[min(n, len(pool)) - 1]
    return value >= cutoff
def _top_n_drawdown(fund: Fund, peers: List[Fund], n: int = 10) -> bool:
    """
    Special top-N test for Maximum Drawdown.

    "Closest to 0" is best: -5% beats -20%, so a plain descending sort still
    places the best fund first. Only non-zero, non-None values participate
    (see _is_valid_drawdown). A strict N is used (no fallback), and — as in
    _top_n/_bottom_n — at least two valid values are required, so a single
    liquid fund with a real drawdown doesn't qualify just because of
    category size.
    """
    fund_val = getattr(fund, "max_drawdown", None)
    if not _is_valid_drawdown(fund_val):
        return False
    valid = [getattr(f, "max_drawdown", None) for f in peers
             if _is_valid_drawdown(getattr(f, "max_drawdown", None))]
    # BUG FIX: was `if not valid`, which let a lone valid fund always rank
    # itself top-1, contradicting the documented intent and the >=2 guard
    # used by _top_n/_bottom_n.
    if len(valid) < 2:
        return False
    effective_n = min(n, len(valid))
    valid.sort(reverse=True)  # -5 > -20 → -5 is rank-1
    return fund_val >= valid[effective_n - 1]
def _bottom_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """
    Return True if *fund* ranks among the bottom-N (lowest values) peers
    on *metric*.

    Mirrors Excel's "Bottom N items" conditional formatting, with N capped
    at the number of valid funds; at least two valid values are required.
    """
    value = getattr(fund, metric, None)
    if not _is_valid(value):
        return False

    pool = sorted(
        getattr(f, metric, None) for f in peers
        if _is_valid(getattr(f, metric, None))
    )
    if len(pool) < 2:
        return False
    cutoff_index = min(n, len(pool)) - 1
    return value <= pool[cutoff_index]
def _category_avg(peers: List[Fund], metric: str) -> Optional[float]:
    """Arithmetic mean of the valid *metric* values across *peers*.

    Returns None when no peer carries a valid value.
    """
    total = 0.0
    count = 0
    for f in peers:
        v = getattr(f, metric, None)
        if _is_valid(v):
            total += v
            count += 1
    return total / count if count else None
def _light_red(fund: Fund, metric: str, cat_avg: Optional[float]) -> bool:
    """Return True when *metric* triggers the Light Red override for *fund*.

    Only metrics listed in DUAL_CONDITION_RULES can fire, and an invalid
    value (None/NaN/0 per _is_valid) never fires.
    """
    rule = DUAL_CONDITION_RULES.get(metric)
    if rule is None:
        return False
    rule_type, threshold = rule

    value = getattr(fund, metric, None)
    if not _is_valid(value):
        return False

    if rule_type == "below_value":
        return value < threshold
    if rule_type == "below_category_avg":
        return cat_avg is not None and value < cat_avg
    return False
# ─── Drawdown zero-cell fix ───────────────────────────────────────────────────
def drawdown_zero_fix(
    funds: List[Fund],
    *,
    verbose: bool = True,
) -> int:
    """
    Detect funds whose max_drawdown is missing (exactly 0 or None) and
    recompute it from live NAV history via the NAV engine.

    Strategy
    --------
    1. Select funds where max_drawdown is 0/None, the category is not
       debt-like (those have tiny/no drawdown), the fund is at least ~3
       years old (younger funds lack 3Y NAV history), and the fund was not
       already attempted by the csv_enrichment NAV phase.
    2. For each selected fund that carries a scheme code (``_scheme_code``
       attribute), call compute_nav_metrics_for_scheme() requesting only
       ["Maximum Drawdown"], fanned out over a thread pool.
    3. If a real non-zero value comes back, write it to fund.max_drawdown.

    Parameters
    ----------
    funds : list[Fund]
        Funds to inspect; fixed entries are mutated in place.
    verbose : bool, keyword-only
        When True, print per-fund progress and a summary.

    Returns
    -------
    int
        The count of cells successfully fixed.

    NOTE: This function requires network access (mfapi.in + yfinance).
          It is intentionally separated from compute_scores() so callers
          can opt in only when enrichment is desired.
    """
    # Import here to avoid circular dependency at module level
    try:
        from src.nav_metrics_engine import NavEngineCache, compute_nav_metrics_for_scheme
    except ImportError:
        if verbose:
            print("[drawdown_fix] nav_metrics_engine not available — skipping.")
        return 0

    # Categories whose funds are expected to have near-zero drawdown; skip them.
    DEBT_PREFIXES = ("debt", "liquid", "overnight", "money market", "gilt",
                     "fixed maturity", "interval", "fmp")

    from datetime import datetime as _dt
    _now = _dt.now()

    def _fund_age_years(f) -> float | None:
        # Age in years derived from the optional _launch_date attribute;
        # None when the launch date is unknown or not a datetime.
        ld = getattr(f, "_launch_date", None)
        if not isinstance(ld, _dt):
            return None
        return (_now - ld).days / 365.25

    # Funds already attempted by the csv_enrichment NAV phase: if enrichment
    # couldn't fill MDD, a second pass won't either, so skip them.
    try:
        from src.csv_enrichment import _NAV_ATTEMPTED_FUNDS as _nav_attempted
    except Exception:
        _nav_attempted = set()

    zero_funds = [
        f for f in funds
        if (
            # Only target funds where drawdown is truly missing (0 or None)
            (f.max_drawdown == 0 or f.max_drawdown is None)
            # AND only equity/hybrid — debt funds have tiny/no drawdown
            and not any(f.category.lower().startswith(pfx) for pfx in DEBT_PREFIXES)
            # AND fund must be ≥3 years old — younger funds can't have 3Y NAV history
            and (_fund_age_years(f) is None or _fund_age_years(f) >= 3.0)
            # AND skip funds already attempted by csv_enrichment NAV phase
            and f.name not in _nav_attempted
        )
    ]
    if not zero_funds:
        if verbose:
            print("[drawdown_fix] No zero/missing drawdown cells found.")
        return 0
    if verbose:
        print(f"[drawdown_fix] Attempting to fix {len(zero_funds)} drawdown cells …")

    from concurrent.futures import ThreadPoolExecutor, as_completed as _as_completed
    import threading as _threading

    # Bulk-preload cache before parallel workers start (2 SQL queries instead of N)
    try:
        from src.nav_metrics_engine import _bulk_preload_cache, resolve_benchmark_ticker
        _scheme_codes = [getattr(f, "_scheme_code", None) or "" for f in zero_funds]
        _bench_tickers = [resolve_benchmark_ticker(getattr(f, "benchmark", "") or "") for f in zero_funds]
        _bulk_preload_cache(_scheme_codes, _bench_tickers)
    except Exception:
        pass  # graceful degradation — workers will fall back to per-query

    cache = NavEngineCache()
    fixed = 0
    _lock = _threading.Lock()

    # Partition into funds we can look up (scheme code present) and those we can't.
    with_code = [
        (f, getattr(f, "_scheme_code", None) or "", getattr(f, "benchmark", "") or "")
        for f in zero_funds
        if (getattr(f, "_scheme_code", None) or "").strip()
    ]
    no_code = [f for f in zero_funds if not (getattr(f, "_scheme_code", None) or "").strip()]
    if verbose:
        for f in no_code:
            print(f" SKIP {f.name[:55]} — no scheme code available")

    def _fix_one(args):
        # Worker: fetch only Maximum Drawdown for a single fund.
        fund, scheme_code, benchmark = args
        metrics, skip = compute_nav_metrics_for_scheme(
            scheme_code=scheme_code,
            benchmark_type=benchmark,
            needed_metrics=["Maximum Drawdown"],
            cache=cache,
        )
        mdd = metrics.get("Maximum Drawdown")
        reason = skip.get("Maximum Drawdown", "unknown")
        return fund, mdd, reason

    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = {executor.submit(_fix_one, item): item for item in with_code}
        for fut in _as_completed(futures):
            try:
                fund, mdd, reason = fut.result()
            except Exception as exc:
                # BUG FIX: worker failures were silently swallowed; surface
                # them when verbose so data gaps are at least visible.
                if verbose:
                    failed_fund = futures[fut][0]
                    print(f" ERROR {failed_fund.name[:55]} — {exc}")
                continue
            if mdd is not None and mdd != 0:
                # Results are applied here on the consuming thread; the lock
                # guards the shared counter/Fund mutation just in case.
                with _lock:
                    fund.max_drawdown = mdd
                    fixed += 1
                if verbose:
                    print(f" FIXED {fund.name[:55]} → MDD = {mdd:.3f}%")
            else:
                if verbose:
                    print(f" MISS {fund.name[:55]} — {reason}")
    if verbose:
        print(f"[drawdown_fix] Done. Fixed {fixed}/{len(zero_funds)} cells.")
    return fixed
# ─── Main scoring engine ──────────────────────────────────────────────────────
def compute_scores(funds: List[Fund]) -> List[Fund]:
    """
    Score and rank all funds within their categories.

    For every weighted metric a fund earns the metric's weight when it is
    "Light Green" (Top-N or Bottom-N within its category peer group, as
    appropriate) and no dual-condition "Light Red" override fires. The
    final score is capped at 10.0 (model scale).

    Also sets per fund:
        rank_in_category - 1 = best within category
        is_top_quartile  - True for the top ceil(N/4) funds

    Returns the same list (mutated in-place) for convenience.
    """
    # Bucket funds by category.
    by_category: Dict[str, List[Fund]] = {}
    for f in funds:
        by_category.setdefault(f.category, []).append(f)

    for peers in by_category.values():
        # Category averages feed the CAGR dual-condition (Light Red) rules.
        avgs = {
            m: _category_avg(peers, m)
            for m in ("cagr_3y", "cagr_5y", "cagr_10y")
        }

        for f in peers:
            total = 0.0
            for metric, weight in WEIGHTS.items():
                # ── Green check: Top-N / Bottom-N membership ─────────────
                if metric == "max_drawdown":
                    green = _top_n_drawdown(f, peers)
                elif metric in TOP_10_METRICS:
                    green = _top_n(f, peers, metric)
                elif metric in BOTTOM_10_METRICS:
                    green = _bottom_n(f, peers, metric)
                else:
                    green = False
                # ── Light Red override zeroes the contribution ──────────
                if green and metric in DUAL_CONDITION_RULES:
                    if _light_red(f, metric, avgs.get(metric)):
                        green = False
                if green:
                    total += weight
            f.score = round(min(total, 10.0), 3)

        # ── Rank within category: best score first; ties broken by name,
        #    then by the optional `order` attribute ────────────────────────
        ranked = sorted(
            peers,
            key=lambda x: (-(x.score or 0), (x.name or "").lower(), getattr(x, "order", 0)),
        )
        quartile_cutoff = max(1, math.ceil(len(ranked) / 4))
        for position, f in enumerate(ranked, start=1):
            f.rank_in_category = position
            f.is_top_quartile = position <= quartile_cutoff
    return funds