import re

import pandas as pd

from synthesis_qa_backend import ResearchSynthesizer
from config import API_KEY, INDEX_PATH, METADATA_PATH, SPECIFIC_COUNTRIES


class DataHandler:
    """Load the research synthesizer and document metadata for the app.

    Construction immediately calls :meth:`load_data`. After that the
    instance exposes:

    * ``synthesizer``    -- a ``ResearchSynthesizer`` or ``None`` on failure.
    * ``docs_df``        -- metadata rows de-duplicated on ``record_id``
      (an empty DataFrame on failure).
    * ``countries_list`` -- sorted country names for a dropdown.
    * ``sectors_list``   -- sorted sector names for a dropdown.
    """

    def __init__(self):
        """Initialise empty state, then attempt to load everything."""
        self.synthesizer = None
        self.docs_df = pd.DataFrame()
        self.countries_list = []
        self.sectors_list = []
        self.load_data()

    def load_data(self) -> None:
        """Initialize the research system and load the metadata CSV.

        Any exception is reported with ``print`` and swallowed (best-effort
        start-up); in that case every attribute is reset to its empty value
        so callers never see a partially loaded state.
        """
        try:
            self.synthesizer = ResearchSynthesizer(INDEX_PATH, METADATA_PATH, API_KEY)
            metadata_df = pd.read_csv(METADATA_PATH)
            self.docs_df = metadata_df.drop_duplicates(subset=['record_id'])
            print(f"✅ Loaded {len(self.docs_df)} unique documents")

            # Get unique values for dropdowns
            self.countries_list, self.sectors_list = self._get_unique_values()
        except Exception as e:
            print(f"❌ Error loading system: {e}")
            self.synthesizer = None
            self.docs_df = pd.DataFrame()
            # Also clear the dropdown values: without this, a failed reload
            # after an earlier success would leave stale lists next to an
            # empty DataFrame.
            self.countries_list = []
            self.sectors_list = []

    def _get_unique_values(self) -> tuple:
        """Get unique values for dropdowns.

        Returns:
            A ``(countries_list, sectors_list)`` pair of sorted lists.
            Countries come from the ``study_countries`` column, whose cells
            may hold several names separated by ``,`` or ``;``; only names
            present in ``SPECIFIC_COUNTRIES`` are kept. Sectors are the
            distinct non-null values of ``world_bank_sector``. A missing
            column (or an empty DataFrame) yields an empty list.
        """
        if self.docs_df.empty:
            return [], []

        countries_list = []
        sectors_list = []

        if 'study_countries' in self.docs_df.columns:
            found = set()
            # dropna() already removes None/NaN cells, so only textual
            # placeholders need to be filtered below.
            for raw_value in self.docs_df['study_countries'].dropna():
                text = str(raw_value)
                if text.lower() in ('nan', 'none', ''):
                    continue
                # Normalise ';' to ',' so both separators split the same way.
                for name in text.replace(';', ',').split(','):
                    name = name.strip()
                    if len(name) > 1 and name in SPECIFIC_COUNTRIES:
                        found.add(name)
            countries_list = sorted(found)

        if 'world_bank_sector' in self.docs_df.columns:
            sectors_list = sorted(
                self.docs_df['world_bank_sector'].dropna().unique().tolist()
            )

        return countries_list, sectors_list

    def get_data(self) -> dict:
        """Return all data objects.

        Returns:
            A dict with the keys ``'synthesizer'``, ``'docs_df'``,
            ``'countries_list'`` and ``'sectors_list'`` mapping to the
            corresponding instance attributes (not copies).
        """
        return {
            'synthesizer': self.synthesizer,
            'docs_df': self.docs_df,
            'countries_list': self.countries_list,
            'sectors_list': self.sectors_list,
        }