File size: 5,202 Bytes
e9084d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import configparser
import os
import ast
import pandas as pd
import logging

# Logger shared by everything in this configuration module.
logger = logging.getLogger("config")

# Parser that holds every section read from common.properties.
config = configparser.ConfigParser()

# Directory two levels above this file; relative paths are resolved against it.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Try backend/config first, then config/ as fallback
conf_file_path = os.path.join(project_root, "backend", "config", "common.properties")
if not os.path.exists(conf_file_path):
    conf_file_path = os.path.join(project_root, "config", "common.properties")

if not os.path.exists(conf_file_path):
    raise FileNotFoundError(f"common.properties not found. Searched in backend/config/ and config/ under {project_root}")

# ConfigParser.read() silently skips any file it cannot open and returns the
# list of files it actually parsed. An empty result means the path exists but
# was unreadable (e.g. a directory or a permission problem); fail here with a
# clear message instead of a confusing NoSectionError further down.
if not config.read(conf_file_path):
    raise OSError(f"common.properties exists but could not be read: {conf_file_path}")

logger.info(f"Config loaded from: {conf_file_path}")
# Diagnostic replacement for the old commented-out debug prints.
logger.debug("Config sections found: %s", config.sections())

# Export any config constants if needed

def _literal_option(section, option):
    """Read ``option`` from ``section`` and parse its text as a Python literal."""
    raw_text = config.get(section, option)
    return ast.literal_eval(raw_text)


# Options from the [IDENTIFIERS] section that are exported as raw strings.
# NOTE: FLOOR_NO_KEYWORD is read from the option named "FLOOR_KEYWORD".
APARTMENT_IDENTIFIER = config.get("IDENTIFIERS", "APARTMENT_IDENTIFIER")
FLAT_NUMBER_IDENTIFIER = config.get("IDENTIFIERS", "FLAT_NUMBER_IDENTIFIER")
HOUSE_NUMBER_IDENTIFIER = config.get("IDENTIFIERS", "HOUSE_NUMBER_IDENTIFIER")
STREET_KEYWORD = config.get("IDENTIFIERS", "STREET_KEYWORD")
FLOOR_NO_KEYWORD = config.get("IDENTIFIERS", "FLOOR_KEYWORD")
# Parsed from the INDIAN_SURNAMES option (presumably a collection of surnames;
# the concrete type depends on the literal written in the config file).
SURNAME_IDENTIFIER = _literal_option("IDENTIFIERS", "INDIAN_SURNAMES")

# Literal-valued options from the [MAPPING_DICT] section.
STATE_MAPPING = _literal_option("MAPPING_DICT", "STATE_MAPPING")
CITY_MAPPING = _literal_option("MAPPING_DICT", "CITY_MAPPING")
ADDRESS_MAPPING = _literal_option("MAPPING_DICT", "ADDRESS_MAPPING")

# Literal-valued options from the [MATCHING_LOGIC] section.
MODEL_WEIGHTS = _literal_option("MATCHING_LOGIC", "MODEL_WEIGHTS")
MATCHING_RULES = _literal_option("MATCHING_LOGIC", "MATCHING_RULES")

# Optional settings. Each group below is read as a unit: if any option in the
# group is missing or cannot be parsed, the whole group falls back to the
# defaults in its ``except`` branch. The broad ``except Exception`` is kept on
# purpose so a bad optional value never prevents the module from importing,
# but the reason is now logged instead of being silently discarded.

# Name-specific weights (original note: embedding 0.7 + fuzz 0.2 + phonetic 0.1;
# the actual values come from the config file).
try:
    NAME_MODEL_WEIGHTS = ast.literal_eval(config.get("NAME_MATCHING", "NAME_MODEL_WEIGHTS"))
    NAME_MATCH_ADJUSTMENTS = ast.literal_eval(config.get("NAME_MATCHING", "NAME_MATCH_ADJUSTMENTS"))
except Exception as exc:
    logger.warning("NAME_MATCHING settings unavailable (%s); using default name weights and adjustments", exc)
    NAME_MODEL_WEIGHTS = MODEL_WEIGHTS
    NAME_MATCH_ADJUSTMENTS = {"surname_penalty": -30, "initial_boost": 30, "subset_boost": 40}

# Address-specific weights (embedding + fuzz, no phonetic)
try:
    ADDRESS_MODEL_WEIGHTS = ast.literal_eval(config.get("ADDRESS_MATCHING", "ADDRESS_MODEL_WEIGHTS"))
    ADDRESS_MATCH_ADJUSTMENTS = ast.literal_eval(config.get("ADDRESS_MATCHING", "ADDRESS_MATCH_ADJUSTMENTS"))
except Exception as exc:
    logger.warning("ADDRESS_MATCHING settings unavailable (%s); using default address weights and adjustments", exc)
    ADDRESS_MODEL_WEIGHTS = MODEL_WEIGHTS
    ADDRESS_MATCH_ADJUSTMENTS = {"house_match_boost": 30, "house_mismatch_penalty": 70}

# Embedding model identifiers, stripped of surrounding whitespace.
try:
    MODEL_1_NAME = config.get("EMBEDDING_MODELS", "MODEL_1_NAME").strip()
    MODEL_2_NAME = config.get("EMBEDDING_MODELS", "MODEL_2_NAME").strip()
except Exception as exc:
    logger.warning("EMBEDDING_MODELS settings unavailable (%s); using default model names", exc)
    MODEL_1_NAME = "sentence-transformers/all-mpnet-base-v2"
    MODEL_2_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# =========================================================
# CSV DATA LOADING (replacing MySQL)
# =========================================================

def load_csv_file(csv_path: str, file_name: str) -> pd.DataFrame:
    """Read a CSV into a DataFrame, never raising to the caller.

    Args:
        csv_path: Location of the CSV. A relative path is resolved against
            ``project_root``; the result is normalised to an absolute path.
        file_name: Label used only in log messages.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file does not
        exist (logged as a warning) or when anything fails while resolving
        or reading it (logged as an error).
    """
    try:
        # Anchor relative paths at the project root, then collapse ./ and ../
        anchored = csv_path if os.path.isabs(csv_path) else os.path.join(project_root, csv_path)
        resolved = os.path.abspath(anchored)

        # Guard clause: a missing file is reported and yields an empty frame.
        if not os.path.exists(resolved):
            logger.warning(f"CSV file not found: {resolved}")
            return pd.DataFrame()

        frame = pd.read_csv(resolved)
        logger.info(f"Loaded {file_name}: {len(frame)} rows from {resolved}")
        return frame
    except Exception as e:
        logger.error(f"Failed to load {file_name}: {e}")
        return pd.DataFrame()

# Load CSV reference tables

def _load_reference_table(option: str) -> pd.DataFrame:
    """Load the CSV whose path is stored under ``option`` in the [csv] section.

    Each table is loaded independently, so a missing or malformed option only
    leaves that one table empty instead of discarding tables that were already
    loaded successfully.

    Args:
        option: Option name in the ``csv`` config section; also used as the
            label in log messages.

    Returns:
        The loaded DataFrame, or an empty DataFrame when the option cannot be
        read from the config or ``load_csv_file`` could not load the file.
    """
    try:
        csv_path = config.get("csv", option)
    except configparser.Error as exc:
        logger.warning(f"CSV path for '{option}' is not configured: {exc}")
        return pd.DataFrame()
    return load_csv_file(csv_path, option)


name_variation_df = _load_reference_table("name_variation_standard")
hno_variation_df = _load_reference_table("hno_variation_standard")
city_prev_pres_df = _load_reference_table("city_prev_pres")
state_name_standard_df = _load_reference_table("state_name_standard")
sur_comm_names_df = _load_reference_table("sur_comm_names")
pin_city_state_df = _load_reference_table("pin_city_state")

# Report success only when every table actually contains data; load_csv_file
# returns an empty frame (without raising) for missing or unreadable files.
_reference_tables = {
    "name_variation_standard": name_variation_df,
    "hno_variation_standard": hno_variation_df,
    "city_prev_pres": city_prev_pres_df,
    "state_name_standard": state_name_standard_df,
    "sur_comm_names": sur_comm_names_df,
    "pin_city_state": pin_city_state_df,
}
_unloaded_tables = [name for name, table in _reference_tables.items() if table.empty]
if _unloaded_tables:
    logger.warning(f"Some CSV files may not have loaded: {', '.join(_unloaded_tables)}")
else:
    logger.info("All CSV files loaded successfully")

# Legacy string exports for backward compatibility
pin_city_state = "pin_city_state"
sur_comm_names = "sur_comm_names"
city_prev_pres = "city_prev_pres"
state_name_standard = "state_name_standard"
hno_variation_standard = "hno_variation_standard"
name_variation_standard = "name_variation_standard"