import pandas as pd import ast import logging from typing import List, Dict from app.config import settings from app.models import Product logger = logging.getLogger(__name__) class DataLoader: def __init__(self): self.df = None self.products = [] def load_data(self) -> pd.DataFrame: """Load dataset from CSV""" try: self.df = pd.read_csv(settings.DATA_PATH) logger.info(f"Loaded {len(self.df)} products from dataset") self._preprocess_data() return self.df except Exception as e: logger.error(f"Error loading data: {str(e)}") raise def _preprocess_data(self): """Preprocess the data""" # Parse categories from string to list if 'categories' in self.df.columns: self.df['categories'] = self.df['categories'].apply(self._parse_list) # Parse images from string to list if 'images' in self.df.columns: self.df['images'] = self.df['images'].apply(self._parse_list) # Fill NaN values self.df = self.df.fillna("") # Convert to Product models self.products = [self._row_to_product(row) for _, row in self.df.iterrows()] logger.info(f"Preprocessed {len(self.products)} products") def _parse_list(self, value): """Parse string representation of list""" if pd.isna(value) or value == "": return [] try: if isinstance(value, str): # Try to parse as Python literal return ast.literal_eval(value) return value if isinstance(value, list) else [] except: # If parsing fails, split by comma return [item.strip() for item in str(value).split(',')] def _row_to_product(self, row) -> Product: """Convert DataFrame row to Product model""" return Product( uniq_id=str(row.get('uniq_id', '')), title=str(row.get('title', '')), brand=str(row.get('brand', '')) if row.get('brand') else None, description=str(row.get('description', '')) if row.get('description') else None, price=str(row.get('price', '')) if row.get('price') else None, categories=row.get('categories', []) if isinstance(row.get('categories'), list) else [], images=row.get('images', []) if isinstance(row.get('images'), list) else [], manufacturer=str(row.get('manufacturer', '')) if row.get('manufacturer') else None, package_dimensions=str(row.get('package_dimensions', '')) if row.get('package_dimensions') else None, country_of_origin=str(row.get('country_of_origin', '')) if row.get('country_of_origin') else None, material=str(row.get('material', '')) if row.get('material') else None, color=str(row.get('color', '')) if row.get('color') else None ) def get_products(self) -> List[Product]: """Get all products as Product models""" if not self.products: self.load_data() return self.products def get_dataframe(self) -> pd.DataFrame: """Get products as DataFrame""" if self.df is None: self.load_data() return self.df def get_product_by_id(self, uniq_id: str) -> Product: """Get a specific product by ID""" for product in self.get_products(): if product.uniq_id == uniq_id: return product return None # Global instance data_loader = DataLoader()