File size: 6,564 Bytes
6aa09c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Any, Optional
from utils.logger import setup_logger
import io

logger = setup_logger(__name__)

class DataProcessor:
    """Core data processing class for handling file operations and data analysis."""

    def __init__(self):
        # Extensions accepted by read_file (lower-case, without the dot).
        self.supported_formats = ['csv', 'xlsx', 'xls']

    def read_file(self, file_content: bytes, filename: str) -> pd.DataFrame:
        """
        Read an uploaded file and return a pandas DataFrame.

        Args:
            file_content: Raw file content as bytes.
            filename: Name of the uploaded file; its extension selects the parser.

        Returns:
            pandas DataFrame with the file's contents.

        Raises:
            ValueError: If the file format is not supported, or the CSV could
                not be decoded with any supported encoding.
            Exception: Propagated unchanged if pandas fails to parse the file.
        """
        try:
            file_extension = filename.lower().split('.')[-1]

            if file_extension not in self.supported_formats:
                raise ValueError(f"Unsupported file format: {file_extension}")

            if file_extension == 'csv':
                # Try common encodings in order; move on only when decoding
                # (not parsing) fails.
                for encoding in ['utf-8', 'latin-1', 'cp1252']:
                    try:
                        df = pd.read_csv(io.BytesIO(file_content), encoding=encoding)
                        logger.info(f"Successfully read CSV file with {encoding} encoding")
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    raise ValueError("Could not decode CSV file with any supported encoding")

            else:
                # Only 'xlsx'/'xls' remain after the supported-format check.
                df = pd.read_excel(io.BytesIO(file_content))
                # BUGFIX: log the real filename (was a hard-coded placeholder).
                logger.info(f"Successfully read Excel file: {filename}")

            logger.info(f"Loaded dataset with shape: {df.shape}")
            return df

        except Exception as e:
            # BUGFIX: include the real filename (was a hard-coded placeholder).
            logger.error(f"Error reading file {filename}: {str(e)}")
            raise

    def analyze_data_quality(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze data quality and return comprehensive statistics.

        Args:
            df: pandas DataFrame to analyze (may be empty).

        Returns:
            Dictionary with keys 'shape', 'columns', 'dtypes',
            'missing_values' (per-column null counts), 'duplicates'
            (duplicated-row count), and 'column_stats' (per-column details;
            numeric columns gain mean/median/std/min/max/outliers, object
            columns gain most_common/avg_length).
        """
        row_count = len(df)
        analysis = {
            'shape': df.shape,
            'columns': list(df.columns),
            'dtypes': df.dtypes.to_dict(),
            'missing_values': {},
            'duplicates': int(df.duplicated().sum()),
            'column_stats': {}
        }

        for col in df.columns:
            missing_count = int(df[col].isnull().sum())
            col_analysis = {
                'dtype': str(df[col].dtype),
                'missing_count': missing_count,
                # BUGFIX: guard against ZeroDivisionError on an empty frame.
                'missing_percentage': (missing_count / row_count) * 100 if row_count else 0.0,
                'unique_values': int(df[col].nunique()),
                'sample_values': df[col].dropna().head(5).tolist()
            }

            # Use pandas dtype predicates so int32/float32/etc. are covered
            # too (the old check matched only 'int64'/'float64'); booleans
            # are deliberately excluded, as before.
            if pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col]):
                col_analysis.update({
                    'mean': df[col].mean(),
                    'median': df[col].median(),
                    'std': df[col].std(),
                    'min': df[col].min(),
                    'max': df[col].max(),
                    'outliers': self._detect_outliers(df[col])
                })
            elif df[col].dtype == 'object':
                col_analysis.update({
                    'most_common': df[col].value_counts().head(3).to_dict(),
                    'avg_length': df[col].astype(str).str.len().mean()
                })

            analysis['column_stats'][col] = col_analysis
            analysis['missing_values'][col] = col_analysis['missing_count']

        logger.info(f"Data quality analysis completed for {df.shape[0]} rows, {df.shape[1]} columns")
        return analysis

    def _detect_outliers(self, series: pd.Series) -> int:
        """
        Detect outliers using the 1.5 * IQR method.

        Args:
            series: pandas Series; non-numeric input yields 0.

        Returns:
            Number of values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR,
            as a plain ``int``.
        """
        # Accept any numeric dtype (int32, float32, ...), not just 64-bit;
        # reject booleans as the original dtype whitelist implicitly did.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            return 0

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Cast so callers get a plain int instead of a numpy scalar.
        return int(((series < lower_bound) | (series > upper_bound)).sum())

    def detect_data_types(self, df: pd.DataFrame) -> Dict[str, str]:
        """
        Detect and suggest optimal data types for each column.

        Args:
            df: pandas DataFrame.

        Returns:
            Dictionary mapping each column name to a suggested dtype string:
            'int64', 'float64', 'bool', 'datetime64[ns]', or 'object'.
        """
        suggestions = {}

        for col in df.columns:
            current_type = str(df[col].dtype)

            # Already optimal — keep as-is.
            if current_type in ['int64', 'float64', 'bool', 'datetime64[ns]']:
                suggestions[col] = current_type
                continue

            non_null_series = df[col].dropna()

            if len(non_null_series) == 0:
                suggestions[col] = 'object'
                continue

            # Numeric: decide int vs float from the converted dtype. The old
            # "contains a dot" heuristic mislabeled scientific notation
            # (e.g. "1e5") as int64.
            try:
                converted = pd.to_numeric(non_null_series)
            except (ValueError, TypeError):
                pass
            else:
                suggestions[col] = 'float64' if pd.api.types.is_float_dtype(converted) else 'int64'
                continue

            # Datetime: anything dateutil can parse across the whole column.
            try:
                pd.to_datetime(non_null_series)
                suggestions[col] = 'datetime64[ns]'
                continue
            except (ValueError, TypeError):
                pass

            # Boolean: every value is a recognized truthy/falsy token.
            unique_vals = set(non_null_series.astype(str).str.lower())
            if unique_vals.issubset({'true', 'false', '1', '0', 'yes', 'no'}):
                suggestions[col] = 'bool'
                continue

            suggestions[col] = 'object'

        return suggestions