File size: 5,509 Bytes
b0e15c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""

Reference data extractor from Processed_data.xlsx



This module extracts BM Index, Category Average, and fund weightage data that the advisor

has manually filled in Processed_data.xlsx, so we can use it when processing

raw CSV files that have blank BM/Category rows.

"""

import openpyxl
from typing import Dict, Any, Optional, Tuple
from pathlib import Path


def extract_reference_data(processed_xlsx_path: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]:
    """

    Extract BM Index, Category Average, and fund weightage data from Processed_data.xlsx.



    Returns:

        (bm_data, cat_avg_data, fund_weightages) where:

        - bm_data: dict mapping category name to CAGR values for BM Index

        - cat_avg_data: dict mapping category name to CAGR/values for Category Average

        - fund_weightages: dict mapping fund name to manually adjusted weightage value

    """
    xlsx_path = Path(processed_xlsx_path)
    if not xlsx_path.exists():
        print(f"Warning: Reference file not found: {processed_xlsx_path}")
        return {}, {}, {}

    wb = openpyxl.load_workbook(str(xlsx_path))
    ws = wb.active

    bm_data = {}
    cat_avg_data = {}
    fund_weightages = {}
    current_category = None

    # Find the Weightage column index by scanning the header row
    weightage_col_idx = None
    for col_idx in range(1, ws.max_column + 1):
        header_val = ws.cell(1, col_idx).value
        if header_val and 'Weightage' in str(header_val):
            weightage_col_idx = col_idx
            break

    for i in range(1, ws.max_row + 1):
        cell_val = ws.cell(i, 1).value

        # Check if it's a category header
        if cell_val and ':' in str(cell_val) and any(x in str(cell_val) for x in ['Equity', 'Debt', 'Hybrid', 'Solution', 'Other']):
            current_category = cell_val

        # Check if it's BM Index row
        elif cell_val == 'BM Index' and current_category:
            bm_1y = ws.cell(i, 6).value
            bm_3y = ws.cell(i, 7).value
            bm_5y = ws.cell(i, 8).value
            bm_10y = ws.cell(i, 9).value

            # Only store if at least one value is present
            if any([bm_1y, bm_3y, bm_5y, bm_10y]):
                bm_data[current_category] = {
                    'cagr_1y': bm_1y,
                    'cagr_3y': bm_3y,
                    'cagr_5y': bm_5y,
                    'cagr_10y': bm_10y
                }

        # Check if it's Category Average row
        elif cell_val == 'Category Average' and current_category:
            cat_1y = ws.cell(i, 6).value
            cat_3y = ws.cell(i, 7).value
            cat_5y = ws.cell(i, 8).value
            cat_10y = ws.cell(i, 9).value
            pe = ws.cell(i, 12).value
            pb = ws.cell(i, 13).value

            # Only store if at least one CAGR value is present
            if any([cat_1y, cat_3y, cat_5y, cat_10y]):
                cat_avg_data[current_category] = {
                    'cagr_1y': cat_1y,
                    'cagr_3y': cat_3y,
                    'cagr_5y': cat_5y,
                    'cagr_10y': cat_10y,
                    'pe_ratio': pe,
                    'pb_ratio': pb
                }

        # Check if it's a fund row (not category header, BM Index, or Category Average)
        elif cell_val and cell_val not in ['BM Index', 'Category Average', 'Fund'] and current_category:
            # Extract fund name
            fund_name = str(cell_val).strip()

            # Extract weightage if we found the Weightage column
            if weightage_col_idx:
                weightage_val = ws.cell(i, weightage_col_idx).value
                if weightage_val is not None:
                    try:
                        # Convert to int if possible, otherwise round float to nearest int
                        if isinstance(weightage_val, float):
                            fund_weightages[fund_name] = int(round(weightage_val))
                        else:
                            fund_weightages[fund_name] = int(weightage_val)
                    except (ValueError, TypeError):
                        # If conversion fails, skip this fund
                        pass

    wb.close()

    print(f"Loaded reference data: {len(bm_data)} categories with BM Index, {len(cat_avg_data)} with Category Average, {len(fund_weightages)} fund weightages")

    return bm_data, cat_avg_data, fund_weightages


def get_fund_weightage_from_reference(fund_name: str, fund_weightages: Dict[str, int]) -> Optional[int]:
    """

    Get the manually adjusted weightage for a fund from reference data.



    Args:

        fund_name: Name of the fund

        fund_weightages: Dictionary of fund name to weightage from Processed_data.xlsx



    Returns:

        Weightage value if found, None otherwise

    """
    # Try exact match first
    if fund_name in fund_weightages:
        return fund_weightages[fund_name]

    # Try partial match (in case of slight name differences)
    for ref_fund_name, weightage in fund_weightages.items():
        if fund_name.lower() in ref_fund_name.lower() or ref_fund_name.lower() in fund_name.lower():
            return weightage

    return None


# Default reference file path
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"


# Default reference file path
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"