Spaces:
Running
Running
| """ | |
| Reference data extractor from Processed_data.xlsx | |
| This module extracts BM Index, Category Average, and fund weightage data that the advisor | |
| has manually filled in Processed_data.xlsx, so we can use it when processing | |
| raw CSV files that have blank BM/Category rows. | |
| """ | |
| import openpyxl | |
| from typing import Dict, Any, Optional, Tuple | |
| from pathlib import Path | |
| def extract_reference_data(processed_xlsx_path: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]: | |
| """ | |
| Extract BM Index, Category Average, and fund weightage data from Processed_data.xlsx. | |
| Returns: | |
| (bm_data, cat_avg_data, fund_weightages) where: | |
| - bm_data: dict mapping category name to CAGR values for BM Index | |
| - cat_avg_data: dict mapping category name to CAGR/values for Category Average | |
| - fund_weightages: dict mapping fund name to manually adjusted weightage value | |
| """ | |
| xlsx_path = Path(processed_xlsx_path) | |
| if not xlsx_path.exists(): | |
| print(f"Warning: Reference file not found: {processed_xlsx_path}") | |
| return {}, {}, {} | |
| wb = openpyxl.load_workbook(str(xlsx_path)) | |
| ws = wb.active | |
| bm_data = {} | |
| cat_avg_data = {} | |
| fund_weightages = {} | |
| current_category = None | |
| # Find the Weightage column index by scanning the header row | |
| weightage_col_idx = None | |
| for col_idx in range(1, ws.max_column + 1): | |
| header_val = ws.cell(1, col_idx).value | |
| if header_val and 'Weightage' in str(header_val): | |
| weightage_col_idx = col_idx | |
| break | |
| for i in range(1, ws.max_row + 1): | |
| cell_val = ws.cell(i, 1).value | |
| # Check if it's a category header | |
| if cell_val and ':' in str(cell_val) and any(x in str(cell_val) for x in ['Equity', 'Debt', 'Hybrid', 'Solution', 'Other']): | |
| current_category = cell_val | |
| # Check if it's BM Index row | |
| elif cell_val == 'BM Index' and current_category: | |
| bm_1y = ws.cell(i, 6).value | |
| bm_3y = ws.cell(i, 7).value | |
| bm_5y = ws.cell(i, 8).value | |
| bm_10y = ws.cell(i, 9).value | |
| # Only store if at least one value is present | |
| if any([bm_1y, bm_3y, bm_5y, bm_10y]): | |
| bm_data[current_category] = { | |
| 'cagr_1y': bm_1y, | |
| 'cagr_3y': bm_3y, | |
| 'cagr_5y': bm_5y, | |
| 'cagr_10y': bm_10y | |
| } | |
| # Check if it's Category Average row | |
| elif cell_val == 'Category Average' and current_category: | |
| cat_1y = ws.cell(i, 6).value | |
| cat_3y = ws.cell(i, 7).value | |
| cat_5y = ws.cell(i, 8).value | |
| cat_10y = ws.cell(i, 9).value | |
| pe = ws.cell(i, 12).value | |
| pb = ws.cell(i, 13).value | |
| # Only store if at least one CAGR value is present | |
| if any([cat_1y, cat_3y, cat_5y, cat_10y]): | |
| cat_avg_data[current_category] = { | |
| 'cagr_1y': cat_1y, | |
| 'cagr_3y': cat_3y, | |
| 'cagr_5y': cat_5y, | |
| 'cagr_10y': cat_10y, | |
| 'pe_ratio': pe, | |
| 'pb_ratio': pb | |
| } | |
| # Check if it's a fund row (not category header, BM Index, or Category Average) | |
| elif cell_val and cell_val not in ['BM Index', 'Category Average', 'Fund'] and current_category: | |
| # Extract fund name | |
| fund_name = str(cell_val).strip() | |
| # Extract weightage if we found the Weightage column | |
| if weightage_col_idx: | |
| weightage_val = ws.cell(i, weightage_col_idx).value | |
| if weightage_val is not None: | |
| try: | |
| # Convert to int if possible, otherwise round float to nearest int | |
| if isinstance(weightage_val, float): | |
| fund_weightages[fund_name] = int(round(weightage_val)) | |
| else: | |
| fund_weightages[fund_name] = int(weightage_val) | |
| except (ValueError, TypeError): | |
| # If conversion fails, skip this fund | |
| pass | |
| wb.close() | |
| print(f"Loaded reference data: {len(bm_data)} categories with BM Index, {len(cat_avg_data)} with Category Average, {len(fund_weightages)} fund weightages") | |
| return bm_data, cat_avg_data, fund_weightages | |
| def get_fund_weightage_from_reference(fund_name: str, fund_weightages: Dict[str, int]) -> Optional[int]: | |
| """ | |
| Get the manually adjusted weightage for a fund from reference data. | |
| Args: | |
| fund_name: Name of the fund | |
| fund_weightages: Dictionary of fund name to weightage from Processed_data.xlsx | |
| Returns: | |
| Weightage value if found, None otherwise | |
| """ | |
| # Try exact match first | |
| if fund_name in fund_weightages: | |
| return fund_weightages[fund_name] | |
| # Try partial match (in case of slight name differences) | |
| for ref_fund_name, weightage in fund_weightages.items(): | |
| if fund_name.lower() in ref_fund_name.lower() or ref_fund_name.lower() in fund_name.lower(): | |
| return weightage | |
| return None | |
| # Default reference file path | |
| DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx" | |
| # Default reference file path | |
| DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx" | |