# MF/src/weightage.py
"""
Weightage scoring algorithm for mutual fund schemes.
Scoring method: Sum of column weights where cell qualifies for Light Green (Top/Bottom 10)
AND is NOT overridden by Light Red fill (threshold violations).
Weight Distribution (Advisor-revised, March 2026):
1. Sortino Ratio: 1.300 (Top 10, higher is better)
2. Sharpe Ratio: 1.200 (Top 10, higher is better)
3. Information Ratio: 1.000 (Top 10, higher is better, Light Red if < 0)
4. Alpha: 1.000 (Top 10, higher is better, Light Red if < 1)
5. Maximum Drawdown: 1.350 (Top 10, closest to 0 is better)
6. Down Market Capture: 1.000 (Bottom 10, lower is better)
7. Standard Deviation: 1.000 (Bottom 10, lower is better)
8. 10 Years CAGR: 0.750 (Top 10, higher is better, Light Red if < Category Avg)
9. 5 Years CAGR: 0.600 (Top 10, higher is better, Light Red if < Category Avg)
10. 3 Years CAGR: 0.400 (Top 10, higher is better, Light Red if < Category Avg)
11. P/E Ratio: 0.150 (Bottom 10, lower is better)
12. TER: 0.150 (Bottom 10, lower is better)
13. Turnover (%): 0.100 (Bottom 10, lower is better)
Total: 10.000
"""
import math
from typing import List, Optional, Dict
from src.models import Fund
# ─── Weight map (Advisor-revised March 2026) ─────────────────────────────────
WEIGHTS: Dict[str, float] = {
    "sortino": 1.30,
    "sharpe": 1.20,
    "info_ratio": 1.00,
    "alpha": 1.00,
    "max_drawdown": 1.35,
    "down_capture": 1.00,
    "std_dev": 1.00,
    "cagr_10y": 0.75,
    "cagr_5y": 0.60,
    "cagr_3y": 0.40,
    "pe_ratio": 0.15,
    "ter": 0.15,
    "turnover": 0.10,
}

# Sanity-check: total should equal 10.000.
# Raise explicitly instead of using `assert`, because assert statements are
# stripped under optimized bytecode (`python -O`) and this check must always run.
_TOTAL = round(sum(WEIGHTS.values()), 3)
if _TOTAL != 10.000:
    raise ValueError(f"WEIGHTS do not sum to 10.000 — got {_TOTAL}")

# Metrics where higher is better → Top 10
TOP_10_METRICS = [
    "sharpe", "sortino", "alpha",
    "info_ratio", "max_drawdown",
    "cagr_3y", "cagr_5y", "cagr_10y",
]

# Metrics where lower is better → Bottom 10
BOTTOM_10_METRICS = [
    "ter", "turnover", "std_dev",
    "down_capture", "pe_ratio",
]

# Dual-condition metrics: a cell can qualify for green AND still be overridden
# by a Light Red threshold violation. Value is (rule_type, threshold).
DUAL_CONDITION_RULES: Dict[str, tuple] = {
    "alpha": ("below_value", 1),              # Light Red if alpha < 1%
    "info_ratio": ("below_value", 0),         # Light Red if IR < 0
    "cagr_3y": ("below_category_avg", None),  # Light Red if < category avg
    "cagr_5y": ("below_category_avg", None),
    "cagr_10y": ("below_category_avg", None),
}
# ─── Value helpers ────────────────────────────────────────────────────────────
def _is_valid(v) -> bool:
"""True if v is a real, non-zero, non-NaN number."""
if v is None:
return False
if isinstance(v, float) and (v != v): # NaN check
return False
# 0.0 is treated as missing/not-applicable for risk metrics
if v == 0:
return False
return True
def _is_valid_drawdown(v) -> bool:
"""
For Maximum Drawdown specifically: 0.0 is a genuine data-quality gap
(overnight/liquid funds sometimes publish 0 when the real figure was never
fetched). Treat 0 as invalid so that only funds with a real (negative)
drawdown value compete in the ranking.
"""
if v is None:
return False
if isinstance(v, float) and v != v: # NaN
return False
if v == 0:
return False # ← exact zero excluded; see drawdown_zero_fix() below
return True
# ─── Ranking helpers ──────────────────────────────────────────────────────────
def _top_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """
    Return True if *fund* ranks among the top-N (highest values) peers
    on *metric*.

    Special case: for Information Ratio an exact 0.0 participates in the
    ranking (Excel treats 0 as a real value; only negative IR is "red"),
    whereas every other metric treats 0 as missing via _is_valid().
    """
    if metric == "info_ratio":
        # Only None/NaN are invalid here; 0.0 is a genuine value.
        def usable(v):
            return v is not None and not (isinstance(v, float) and v != v)
    else:
        usable = _is_valid

    value = getattr(fund, metric, None)
    if not usable(value):
        return False

    pool = sorted(
        (getattr(f, metric, None) for f in peers
         if usable(getattr(f, metric, None))),
        reverse=True,
    )
    # Need at least two valid values to form a meaningful ranking.
    if len(pool) < 2:
        return False
    # Mirror Excel's "Top N items" conditional formatting, with N capped
    # at the number of valid funds.
    cutoff = pool[min(n, len(pool)) - 1]
    return value >= cutoff
def _top_n_drawdown(fund: Fund, peers: List[Fund], n: int = 10) -> bool:
    """
    Special top-N test for Maximum Drawdown.

    "Closest to 0" is best: -5% beats -20%, so a plain descending sort still
    places the best fund first. Only non-zero, non-None values participate
    (see _is_valid_drawdown). A strict N is used (no fallback), and — as in
    _top_n/_bottom_n — at least two valid values are required, so a single
    liquid fund with a real drawdown doesn't qualify just because of
    category size.
    """
    fund_val = getattr(fund, "max_drawdown", None)
    if not _is_valid_drawdown(fund_val):
        return False
    valid = [getattr(f, "max_drawdown", None) for f in peers
             if _is_valid_drawdown(getattr(f, "max_drawdown", None))]
    # BUG FIX: was `if not valid`, which let a lone valid fund always rank
    # itself top-1, contradicting the documented intent and the >=2 guard
    # used by _top_n/_bottom_n.
    if len(valid) < 2:
        return False
    effective_n = min(n, len(valid))
    valid.sort(reverse=True)  # -5 > -20 → -5 is rank-1
    return fund_val >= valid[effective_n - 1]
def _bottom_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """
    Return True if *fund* ranks among the bottom-N (lowest values) peers
    on *metric*.

    Mirrors Excel's "Bottom N items" conditional formatting, with N capped
    at the number of valid funds; at least two valid values are required.
    """
    value = getattr(fund, metric, None)
    if not _is_valid(value):
        return False

    pool = sorted(
        getattr(f, metric, None) for f in peers
        if _is_valid(getattr(f, metric, None))
    )
    if len(pool) < 2:
        return False
    cutoff_index = min(n, len(pool)) - 1
    return value <= pool[cutoff_index]
def _category_avg(peers: List[Fund], metric: str) -> Optional[float]:
    """Arithmetic mean of the valid *metric* values across *peers*.

    Returns None when no peer carries a valid value.
    """
    total = 0.0
    count = 0
    for f in peers:
        v = getattr(f, metric, None)
        if _is_valid(v):
            total += v
            count += 1
    return total / count if count else None
def _light_red(fund: Fund, metric: str, cat_avg: Optional[float]) -> bool:
    """Return True when *metric* triggers the Light Red override for *fund*.

    Only metrics listed in DUAL_CONDITION_RULES can fire, and an invalid
    value (None/NaN/0 per _is_valid) never fires.
    """
    rule = DUAL_CONDITION_RULES.get(metric)
    if rule is None:
        return False
    rule_type, threshold = rule

    value = getattr(fund, metric, None)
    if not _is_valid(value):
        return False

    if rule_type == "below_value":
        return value < threshold
    if rule_type == "below_category_avg":
        return cat_avg is not None and value < cat_avg
    return False
# ─── Drawdown zero-cell fix ───────────────────────────────────────────────────
def drawdown_zero_fix(
    funds: List[Fund],
    *,
    verbose: bool = True,
) -> int:
    """
    Detect funds whose max_drawdown is missing (exactly 0 or None) and
    recompute it from live NAV history via the NAV engine.

    Strategy
    --------
    1. Select funds where max_drawdown is 0/None, the category is not
       debt-like (those have tiny/no drawdown), the fund is at least ~3
       years old (younger funds lack 3Y NAV history), and the fund was not
       already attempted by the csv_enrichment NAV phase.
    2. For each selected fund that carries a scheme code (``_scheme_code``
       attribute), call compute_nav_metrics_for_scheme() requesting only
       ["Maximum Drawdown"], fanned out over a thread pool.
    3. If a real non-zero value comes back, write it to fund.max_drawdown.

    Parameters
    ----------
    funds : list[Fund]
        Funds to inspect; fixed entries are mutated in place.
    verbose : bool, keyword-only
        When True, print per-fund progress and a summary.

    Returns
    -------
    int
        The count of cells successfully fixed.

    NOTE: This function requires network access (mfapi.in + yfinance).
          It is intentionally separated from compute_scores() so callers
          can opt in only when enrichment is desired.
    """
    # Import here to avoid circular dependency at module level
    try:
        from src.nav_metrics_engine import NavEngineCache, compute_nav_metrics_for_scheme
    except ImportError:
        if verbose:
            print("[drawdown_fix] nav_metrics_engine not available — skipping.")
        return 0

    # Categories whose funds are expected to have near-zero drawdown; skip them.
    DEBT_PREFIXES = ("debt", "liquid", "overnight", "money market", "gilt",
                     "fixed maturity", "interval", "fmp")

    from datetime import datetime as _dt
    _now = _dt.now()

    def _fund_age_years(f) -> float | None:
        # Age in years derived from the optional _launch_date attribute;
        # None when the launch date is unknown or not a datetime.
        ld = getattr(f, "_launch_date", None)
        if not isinstance(ld, _dt):
            return None
        return (_now - ld).days / 365.25

    # Funds already attempted by the csv_enrichment NAV phase: if enrichment
    # couldn't fill MDD, a second pass won't either, so skip them.
    try:
        from src.csv_enrichment import _NAV_ATTEMPTED_FUNDS as _nav_attempted
    except Exception:
        _nav_attempted = set()

    zero_funds = [
        f for f in funds
        if (
            # Only target funds where drawdown is truly missing (0 or None)
            (f.max_drawdown == 0 or f.max_drawdown is None)
            # AND only equity/hybrid — debt funds have tiny/no drawdown
            and not any(f.category.lower().startswith(pfx) for pfx in DEBT_PREFIXES)
            # AND fund must be ≥3 years old — younger funds can't have 3Y NAV history
            and (_fund_age_years(f) is None or _fund_age_years(f) >= 3.0)
            # AND skip funds already attempted by csv_enrichment NAV phase
            and f.name not in _nav_attempted
        )
    ]
    if not zero_funds:
        if verbose:
            print("[drawdown_fix] No zero/missing drawdown cells found.")
        return 0
    if verbose:
        print(f"[drawdown_fix] Attempting to fix {len(zero_funds)} drawdown cells …")

    from concurrent.futures import ThreadPoolExecutor, as_completed as _as_completed
    import threading as _threading

    # Bulk-preload cache before parallel workers start (2 SQL queries instead of N)
    try:
        from src.nav_metrics_engine import _bulk_preload_cache, resolve_benchmark_ticker
        _scheme_codes = [getattr(f, "_scheme_code", None) or "" for f in zero_funds]
        _bench_tickers = [resolve_benchmark_ticker(getattr(f, "benchmark", "") or "") for f in zero_funds]
        _bulk_preload_cache(_scheme_codes, _bench_tickers)
    except Exception:
        pass  # graceful degradation — workers will fall back to per-query

    cache = NavEngineCache()
    fixed = 0
    _lock = _threading.Lock()

    # Partition into funds we can look up (scheme code present) and those we can't.
    with_code = [
        (f, getattr(f, "_scheme_code", None) or "", getattr(f, "benchmark", "") or "")
        for f in zero_funds
        if (getattr(f, "_scheme_code", None) or "").strip()
    ]
    no_code = [f for f in zero_funds if not (getattr(f, "_scheme_code", None) or "").strip()]
    if verbose:
        for f in no_code:
            print(f" SKIP {f.name[:55]} — no scheme code available")

    def _fix_one(args):
        # Worker: fetch only Maximum Drawdown for a single fund.
        fund, scheme_code, benchmark = args
        metrics, skip = compute_nav_metrics_for_scheme(
            scheme_code=scheme_code,
            benchmark_type=benchmark,
            needed_metrics=["Maximum Drawdown"],
            cache=cache,
        )
        mdd = metrics.get("Maximum Drawdown")
        reason = skip.get("Maximum Drawdown", "unknown")
        return fund, mdd, reason

    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = {executor.submit(_fix_one, item): item for item in with_code}
        for fut in _as_completed(futures):
            try:
                fund, mdd, reason = fut.result()
            except Exception as exc:
                # BUG FIX: worker failures were silently swallowed; surface
                # them when verbose so data gaps are at least visible.
                if verbose:
                    failed_fund = futures[fut][0]
                    print(f" ERROR {failed_fund.name[:55]} — {exc}")
                continue
            if mdd is not None and mdd != 0:
                # Results are applied here on the consuming thread; the lock
                # guards the shared counter/Fund mutation just in case.
                with _lock:
                    fund.max_drawdown = mdd
                    fixed += 1
                if verbose:
                    print(f" FIXED {fund.name[:55]} → MDD = {mdd:.3f}%")
            else:
                if verbose:
                    print(f" MISS {fund.name[:55]} — {reason}")
    if verbose:
        print(f"[drawdown_fix] Done. Fixed {fixed}/{len(zero_funds)} cells.")
    return fixed
# ─── Main scoring engine ──────────────────────────────────────────────────────
def compute_scores(funds: List[Fund]) -> List[Fund]:
    """
    Score and rank all funds within their categories.

    For every weighted metric a fund earns the metric's weight when it is
    "Light Green" (Top-N or Bottom-N within its category peer group, as
    appropriate) and no dual-condition "Light Red" override fires. The
    final score is capped at 10.0 (model scale).

    Also sets per fund:
        rank_in_category - 1 = best within category
        is_top_quartile  - True for the top ceil(N/4) funds

    Returns the same list (mutated in-place) for convenience.
    """
    # Bucket funds by category.
    by_category: Dict[str, List[Fund]] = {}
    for f in funds:
        by_category.setdefault(f.category, []).append(f)

    for peers in by_category.values():
        # Category averages feed the CAGR dual-condition (Light Red) rules.
        avgs = {
            m: _category_avg(peers, m)
            for m in ("cagr_3y", "cagr_5y", "cagr_10y")
        }

        for f in peers:
            total = 0.0
            for metric, weight in WEIGHTS.items():
                # ── Green check: Top-N / Bottom-N membership ─────────────
                if metric == "max_drawdown":
                    green = _top_n_drawdown(f, peers)
                elif metric in TOP_10_METRICS:
                    green = _top_n(f, peers, metric)
                elif metric in BOTTOM_10_METRICS:
                    green = _bottom_n(f, peers, metric)
                else:
                    green = False
                # ── Light Red override zeroes the contribution ──────────
                if green and metric in DUAL_CONDITION_RULES:
                    if _light_red(f, metric, avgs.get(metric)):
                        green = False
                if green:
                    total += weight
            f.score = round(min(total, 10.0), 3)

        # ── Rank within category: best score first; ties broken by name,
        #    then by the optional `order` attribute ────────────────────────
        ranked = sorted(
            peers,
            key=lambda x: (-(x.score or 0), (x.name or "").lower(), getattr(x, "order", 0)),
        )
        quartile_cutoff = max(1, math.ceil(len(ranked) / 4))
        for position, f in enumerate(ranked, start=1):
            f.rank_in_category = position
            f.is_top_quartile = position <= quartile_cutoff
    return funds