Spaces:

Parthiban97
/

MF

Running

File size: 5,509 Bytes

b0e15c1

"""

Reference data extractor from Processed_data.xlsx



This module extracts BM Index, Category Average, and fund weightage data that the advisor

has manually filled in Processed_data.xlsx, so we can use it when processing

raw CSV files that have blank BM/Category rows.

"""

import openpyxl
from typing import Dict, Any, Optional, Tuple
from pathlib import Path


def extract_reference_data(processed_xlsx_path: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]:
    """

    Extract BM Index, Category Average, and fund weightage data from Processed_data.xlsx.



    Returns:

        (bm_data, cat_avg_data, fund_weightages) where:

        - bm_data: dict mapping category name to CAGR values for BM Index

        - cat_avg_data: dict mapping category name to CAGR/values for Category Average

        - fund_weightages: dict mapping fund name to manually adjusted weightage value

    """
    xlsx_path = Path(processed_xlsx_path)
    if not xlsx_path.exists():
        print(f"Warning: Reference file not found: {processed_xlsx_path}")
        return {}, {}, {}

    wb = openpyxl.load_workbook(str(xlsx_path))
    ws = wb.active

    bm_data = {}
    cat_avg_data = {}
    fund_weightages = {}
    current_category = None

    # Find the Weightage column index by scanning the header row
    weightage_col_idx = None
    for col_idx in range(1, ws.max_column + 1):
        header_val = ws.cell(1, col_idx).value
        if header_val and 'Weightage' in str(header_val):
            weightage_col_idx = col_idx
            break

    for i in range(1, ws.max_row + 1):
        cell_val = ws.cell(i, 1).value

        # Check if it's a category header
        if cell_val and ':' in str(cell_val) and any(x in str(cell_val) for x in ['Equity', 'Debt', 'Hybrid', 'Solution', 'Other']):
            current_category = cell_val

        # Check if it's BM Index row
        elif cell_val == 'BM Index' and current_category:
            bm_1y = ws.cell(i, 6).value
            bm_3y = ws.cell(i, 7).value
            bm_5y = ws.cell(i, 8).value
            bm_10y = ws.cell(i, 9).value

            # Only store if at least one value is present
            if any([bm_1y, bm_3y, bm_5y, bm_10y]):
                bm_data[current_category] = {
                    'cagr_1y': bm_1y,
                    'cagr_3y': bm_3y,
                    'cagr_5y': bm_5y,
                    'cagr_10y': bm_10y
                }

        # Check if it's Category Average row
        elif cell_val == 'Category Average' and current_category:
            cat_1y = ws.cell(i, 6).value
            cat_3y = ws.cell(i, 7).value
            cat_5y = ws.cell(i, 8).value
            cat_10y = ws.cell(i, 9).value
            pe = ws.cell(i, 12).value
            pb = ws.cell(i, 13).value

            # Only store if at least one CAGR value is present
            if any([cat_1y, cat_3y, cat_5y, cat_10y]):
                cat_avg_data[current_category] = {
                    'cagr_1y': cat_1y,
                    'cagr_3y': cat_3y,
                    'cagr_5y': cat_5y,
                    'cagr_10y': cat_10y,
                    'pe_ratio': pe,
                    'pb_ratio': pb
                }

        # Check if it's a fund row (not category header, BM Index, or Category Average)
        elif cell_val and cell_val not in ['BM Index', 'Category Average', 'Fund'] and current_category:
            # Extract fund name
            fund_name = str(cell_val).strip()

            # Extract weightage if we found the Weightage column
            if weightage_col_idx:
                weightage_val = ws.cell(i, weightage_col_idx).value
                if weightage_val is not None:
                    try:
                        # Convert to int if possible, otherwise round float to nearest int
                        if isinstance(weightage_val, float):
                            fund_weightages[fund_name] = int(round(weightage_val))
                        else:
                            fund_weightages[fund_name] = int(weightage_val)
                    except (ValueError, TypeError):
                        # If conversion fails, skip this fund
                        pass

    wb.close()

    print(f"Loaded reference data: {len(bm_data)} categories with BM Index, {len(cat_avg_data)} with Category Average, {len(fund_weightages)} fund weightages")

    return bm_data, cat_avg_data, fund_weightages


def get_fund_weightage_from_reference(fund_name: str, fund_weightages: Dict[str, int]) -> Optional[int]:
    """

    Get the manually adjusted weightage for a fund from reference data.



    Args:

        fund_name: Name of the fund

        fund_weightages: Dictionary of fund name to weightage from Processed_data.xlsx



    Returns:

        Weightage value if found, None otherwise

    """
    # Try exact match first
    if fund_name in fund_weightages:
        return fund_weightages[fund_name]

    # Try partial match (in case of slight name differences)
    for ref_fund_name, weightage in fund_weightages.items():
        if fund_name.lower() in ref_fund_name.lower() or ref_fund_name.lower() in fund_name.lower():
            return weightage

    return None


# Default reference file path
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"


# Default reference file path
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"