""" Reference data extractor from Processed_data.xlsx This module extracts BM Index, Category Average, and fund weightage data that the advisor has manually filled in Processed_data.xlsx, so we can use it when processing raw CSV files that have blank BM/Category rows. """ import openpyxl from typing import Dict, Any, Optional, Tuple from pathlib import Path def extract_reference_data(processed_xlsx_path: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]: """ Extract BM Index, Category Average, and fund weightage data from Processed_data.xlsx. Returns: (bm_data, cat_avg_data, fund_weightages) where: - bm_data: dict mapping category name to CAGR values for BM Index - cat_avg_data: dict mapping category name to CAGR/values for Category Average - fund_weightages: dict mapping fund name to manually adjusted weightage value """ xlsx_path = Path(processed_xlsx_path) if not xlsx_path.exists(): print(f"Warning: Reference file not found: {processed_xlsx_path}") return {}, {}, {} wb = openpyxl.load_workbook(str(xlsx_path)) ws = wb.active bm_data = {} cat_avg_data = {} fund_weightages = {} current_category = None # Find the Weightage column index by scanning the header row weightage_col_idx = None for col_idx in range(1, ws.max_column + 1): header_val = ws.cell(1, col_idx).value if header_val and 'Weightage' in str(header_val): weightage_col_idx = col_idx break for i in range(1, ws.max_row + 1): cell_val = ws.cell(i, 1).value # Check if it's a category header if cell_val and ':' in str(cell_val) and any(x in str(cell_val) for x in ['Equity', 'Debt', 'Hybrid', 'Solution', 'Other']): current_category = cell_val # Check if it's BM Index row elif cell_val == 'BM Index' and current_category: bm_1y = ws.cell(i, 6).value bm_3y = ws.cell(i, 7).value bm_5y = ws.cell(i, 8).value bm_10y = ws.cell(i, 9).value # Only store if at least one value is present if any([bm_1y, bm_3y, bm_5y, bm_10y]): bm_data[current_category] = { 'cagr_1y': bm_1y, 'cagr_3y': bm_3y, 'cagr_5y': bm_5y, 'cagr_10y': bm_10y } # Check if it's Category Average row elif cell_val == 'Category Average' and current_category: cat_1y = ws.cell(i, 6).value cat_3y = ws.cell(i, 7).value cat_5y = ws.cell(i, 8).value cat_10y = ws.cell(i, 9).value pe = ws.cell(i, 12).value pb = ws.cell(i, 13).value # Only store if at least one CAGR value is present if any([cat_1y, cat_3y, cat_5y, cat_10y]): cat_avg_data[current_category] = { 'cagr_1y': cat_1y, 'cagr_3y': cat_3y, 'cagr_5y': cat_5y, 'cagr_10y': cat_10y, 'pe_ratio': pe, 'pb_ratio': pb } # Check if it's a fund row (not category header, BM Index, or Category Average) elif cell_val and cell_val not in ['BM Index', 'Category Average', 'Fund'] and current_category: # Extract fund name fund_name = str(cell_val).strip() # Extract weightage if we found the Weightage column if weightage_col_idx: weightage_val = ws.cell(i, weightage_col_idx).value if weightage_val is not None: try: # Convert to int if possible, otherwise round float to nearest int if isinstance(weightage_val, float): fund_weightages[fund_name] = int(round(weightage_val)) else: fund_weightages[fund_name] = int(weightage_val) except (ValueError, TypeError): # If conversion fails, skip this fund pass wb.close() print(f"Loaded reference data: {len(bm_data)} categories with BM Index, {len(cat_avg_data)} with Category Average, {len(fund_weightages)} fund weightages") return bm_data, cat_avg_data, fund_weightages def get_fund_weightage_from_reference(fund_name: str, fund_weightages: Dict[str, int]) -> Optional[int]: """ Get the manually adjusted weightage for a fund from reference data. Args: fund_name: Name of the fund fund_weightages: Dictionary of fund name to weightage from Processed_data.xlsx Returns: Weightage value if found, None otherwise """ # Try exact match first if fund_name in fund_weightages: return fund_weightages[fund_name] # Try partial match (in case of slight name differences) for ref_fund_name, weightage in fund_weightages.items(): if fund_name.lower() in ref_fund_name.lower() or ref_fund_name.lower() in fund_name.lower(): return weightage return None # Default reference file path DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx" # Default reference file path DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"