File size: 3,623 Bytes
df7b648
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import ast
import logging
from typing import Dict, List, Optional

import pandas as pd

from app.config import settings
from app.models import Product

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class DataLoader:
    """Load the product CSV and expose it as a DataFrame and as Product models.

    The dataset is read from ``settings.DATA_PATH``. Loading is lazy: the
    getters call ``load_data()`` on first use and the results are cached on
    the instance in ``df`` and ``products``.
    """

    # Optional text columns. Each one feeds the Product field of the same name
    # and becomes None when the cell is empty or missing.
    _OPTIONAL_TEXT_FIELDS = (
        'brand',
        'description',
        'price',
        'manufacturer',
        'package_dimensions',
        'country_of_origin',
        'material',
        'color',
    )

    def __init__(self):
        self.df = None       # preprocessed DataFrame; None until a load succeeds
        self.products = []   # Product models built from df, one per row

    def load_data(self) -> pd.DataFrame:
        """Load dataset from CSV and preprocess it.

        Returns:
            The preprocessed DataFrame (also stored in ``self.df``).

        Raises:
            Exception: whatever reading or preprocessing raised; it is logged
                and re-raised unchanged.
        """
        previous_df, previous_products = self.df, self.products
        try:
            self.df = pd.read_csv(settings.DATA_PATH)
            logger.info("Loaded %d products from dataset", len(self.df))
            self._preprocess_data()
        except Exception as e:
            # Roll back so a failed (re)load never leaves a raw, half-processed
            # frame behind for get_dataframe() to return as if it were valid.
            self.df, self.products = previous_df, previous_products
            logger.exception("Error loading data: %s", e)
            raise
        return self.df

    def _preprocess_data(self):
        """Normalise ``self.df`` and rebuild ``self.products`` from it."""
        # List-like columns arrive from the CSV as strings; turn them into lists.
        for column in ('categories', 'images'):
            if column in self.df.columns:
                self.df[column] = self.df[column].apply(self._parse_list)

        # Fill NaN values
        self.df = self.df.fillna("")

        # Convert to Product models
        self.products = [self._row_to_product(row) for _, row in self.df.iterrows()]
        logger.info("Preprocessed %d products", len(self.products))

    def _parse_list(self, value):
        """Parse a cell holding a list (or its string representation).

        Args:
            value: a list, a string such as ``"['a', 'b']"`` or ``"a, b"``,
                or a missing value (None / NaN / empty string).

        Returns:
            Always a list. Lists are returned unchanged; strings are parsed
            as a Python literal and fall back to comma splitting; anything
            else yields ``[]``.
        """
        # Check for list first: pd.isna() on a list returns an array, which
        # cannot be used as a boolean.
        if isinstance(value, list):
            return value
        if not isinstance(value, str):
            return []  # None, NaN, numbers, ...

        text = value.strip()
        if not text:
            return []

        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None  # not a Python literal; handled by the split below

        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, (tuple, set)):
            return list(parsed)
        if isinstance(parsed, str):
            # A quoted scalar such as "'Sofa'": keep the unquoted content.
            text = parsed

        # Plain text or a non-sequence literal: treat as comma separated and
        # drop empty items so "a,,b" does not produce a blank entry.
        stripped = (item.strip() for item in text.split(','))
        return [item for item in stripped if item]

    @staticmethod
    def _optional_text(value):
        """Return ``value`` as a string, or None when the cell is empty.

        Missing means None, NaN or an empty string. Zero and False are real
        values and are kept, unlike a plain truthiness test.
        """
        if isinstance(value, str):
            return value or None
        if pd.api.types.is_scalar(value):
            return None if pd.isna(value) else str(value)
        # Containers: an empty one counts as missing.
        return str(value) if value else None

    def _row_to_product(self, row) -> Product:
        """Convert DataFrame row to Product model.

        Args:
            row: a mapping-like row supporting ``.get`` (e.g. a pandas Series).

        Returns:
            A Product whose optional text fields are None when empty and whose
            ``categories`` / ``images`` are always lists.
        """
        optional = {
            name: self._optional_text(row.get(name))
            for name in self._OPTIONAL_TEXT_FIELDS
        }
        categories = row.get('categories')
        images = row.get('images')
        return Product(
            uniq_id=str(row.get('uniq_id', '')),
            title=str(row.get('title', '')),
            categories=categories if isinstance(categories, list) else [],
            images=images if isinstance(images, list) else [],
            **optional,
        )

    def get_products(self) -> List[Product]:
        """Get all products as Product models, loading the dataset if needed."""
        # Only load when nothing has been loaded yet. An empty dataset is a
        # valid result and must not trigger a CSV re-read on every call.
        if not self.products and self.df is None:
            self.load_data()
        return self.products

    def get_dataframe(self) -> pd.DataFrame:
        """Get products as DataFrame, loading the dataset if needed."""
        if self.df is None:
            self.load_data()
        return self.df

    def get_product_by_id(self, uniq_id: str) -> Optional[Product]:
        """Get a specific product by ID.

        Returns:
            The first Product whose ``uniq_id`` equals ``uniq_id``, or None
            when there is no match.
        """
        return next(
            (product for product in self.get_products() if product.uniq_id == uniq_id),
            None,
        )

# Global instance: a module-level DataLoader created at import time.
# Constructing it only initialises empty state (df=None, products=[]); the CSV
# is read later, by load_data() or the first getter call that needs the data.
data_loader = DataLoader()