MF / src /reference_data.py
Parthiban97's picture
Upload 15 files
b0e15c1 verified
"""
Reference data extractor from Processed_data.xlsx
This module extracts BM Index, Category Average, and fund weightage data that the advisor
has manually filled in Processed_data.xlsx, so we can use it when processing
raw CSV files that have blank BM/Category rows.
"""
import openpyxl
from typing import Dict, Any, Optional, Tuple
from pathlib import Path
def extract_reference_data(processed_xlsx_path: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]:
"""
Extract BM Index, Category Average, and fund weightage data from Processed_data.xlsx.
Returns:
(bm_data, cat_avg_data, fund_weightages) where:
- bm_data: dict mapping category name to CAGR values for BM Index
- cat_avg_data: dict mapping category name to CAGR/values for Category Average
- fund_weightages: dict mapping fund name to manually adjusted weightage value
"""
xlsx_path = Path(processed_xlsx_path)
if not xlsx_path.exists():
print(f"Warning: Reference file not found: {processed_xlsx_path}")
return {}, {}, {}
wb = openpyxl.load_workbook(str(xlsx_path))
ws = wb.active
bm_data = {}
cat_avg_data = {}
fund_weightages = {}
current_category = None
# Find the Weightage column index by scanning the header row
weightage_col_idx = None
for col_idx in range(1, ws.max_column + 1):
header_val = ws.cell(1, col_idx).value
if header_val and 'Weightage' in str(header_val):
weightage_col_idx = col_idx
break
for i in range(1, ws.max_row + 1):
cell_val = ws.cell(i, 1).value
# Check if it's a category header
if cell_val and ':' in str(cell_val) and any(x in str(cell_val) for x in ['Equity', 'Debt', 'Hybrid', 'Solution', 'Other']):
current_category = cell_val
# Check if it's BM Index row
elif cell_val == 'BM Index' and current_category:
bm_1y = ws.cell(i, 6).value
bm_3y = ws.cell(i, 7).value
bm_5y = ws.cell(i, 8).value
bm_10y = ws.cell(i, 9).value
# Only store if at least one value is present
if any([bm_1y, bm_3y, bm_5y, bm_10y]):
bm_data[current_category] = {
'cagr_1y': bm_1y,
'cagr_3y': bm_3y,
'cagr_5y': bm_5y,
'cagr_10y': bm_10y
}
# Check if it's Category Average row
elif cell_val == 'Category Average' and current_category:
cat_1y = ws.cell(i, 6).value
cat_3y = ws.cell(i, 7).value
cat_5y = ws.cell(i, 8).value
cat_10y = ws.cell(i, 9).value
pe = ws.cell(i, 12).value
pb = ws.cell(i, 13).value
# Only store if at least one CAGR value is present
if any([cat_1y, cat_3y, cat_5y, cat_10y]):
cat_avg_data[current_category] = {
'cagr_1y': cat_1y,
'cagr_3y': cat_3y,
'cagr_5y': cat_5y,
'cagr_10y': cat_10y,
'pe_ratio': pe,
'pb_ratio': pb
}
# Check if it's a fund row (not category header, BM Index, or Category Average)
elif cell_val and cell_val not in ['BM Index', 'Category Average', 'Fund'] and current_category:
# Extract fund name
fund_name = str(cell_val).strip()
# Extract weightage if we found the Weightage column
if weightage_col_idx:
weightage_val = ws.cell(i, weightage_col_idx).value
if weightage_val is not None:
try:
# Convert to int if possible, otherwise round float to nearest int
if isinstance(weightage_val, float):
fund_weightages[fund_name] = int(round(weightage_val))
else:
fund_weightages[fund_name] = int(weightage_val)
except (ValueError, TypeError):
# If conversion fails, skip this fund
pass
wb.close()
print(f"Loaded reference data: {len(bm_data)} categories with BM Index, {len(cat_avg_data)} with Category Average, {len(fund_weightages)} fund weightages")
return bm_data, cat_avg_data, fund_weightages
def get_fund_weightage_from_reference(fund_name: str, fund_weightages: Dict[str, int]) -> Optional[int]:
"""
Get the manually adjusted weightage for a fund from reference data.
Args:
fund_name: Name of the fund
fund_weightages: Dictionary of fund name to weightage from Processed_data.xlsx
Returns:
Weightage value if found, None otherwise
"""
# Try exact match first
if fund_name in fund_weightages:
return fund_weightages[fund_name]
# Try partial match (in case of slight name differences)
for ref_fund_name, weightage in fund_weightages.items():
if fund_name.lower() in ref_fund_name.lower() or ref_fund_name.lower() in fund_name.lower():
return weightage
return None
# Default reference file path
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"
# Default reference file path
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"