SamadhiDBS committed on
Commit
a897569
·
verified ·
1 Parent(s): 7a809d2

Update app/data_processor.py

Browse files
Files changed (1) hide show
  1. app/data_processor.py +127 -131
app/data_processor.py CHANGED
@@ -1,132 +1,128 @@
1
- ## data ingestion & preprocessing & schema detection
2
-
3
- import pandas as pd
4
- import numpy as np
5
- from pathlib import Path
6
- import json
7
-
8
- class DataProcessor:
9
- def __init__(self):
10
- self.df = None
11
- self.schema = {}
12
-
13
- def load_data(self, file_path):
14
- ##______________load csv or json file________________________
15
- file_ext = Path(file_path).suffix.lower()
16
-
17
- if file_ext == '.csv':
18
- self.df = pd.read_csv(file_path)
19
- elif file_ext == '.json':
20
- self.df = pd.read_json(file_path)
21
- else:
22
- raise ValueError("Unsupported file type. Use CSV or JSON file")
23
-
24
- return self.df
25
-
26
- def load_from_upload(self, uploaded_file):
27
- ###__________load from stramlit upload_____________
28
-
29
- if uploaded_file.name.endswith('.csv'):
30
- self.df = pd.read_csv(uploaded_file)
31
- elif uploaded_file.name.endswith('.json'):
32
- self.df = pd.read_json(uploaded_file)
33
- else:
34
- raise ValueError("Unsupported file type")
35
-
36
- return self.df
37
-
38
- def preprocess(self):
39
- """
40
- Step 2: Clean the data - Enhanced version
41
- """
42
- print("🔄 Preprocessing data...")
43
-
44
- # FIRST: Replace '?' and other placeholders with NaN
45
- placeholder_values = ['?', 'None', 'null', 'NULL', 'NaN', 'nan', '', ' ', 'Unknown', 'unknown']
46
- self.df = self.df.replace(placeholder_values, pd.NA)
47
-
48
- # Remove duplicate rows
49
- initial_rows = len(self.df)
50
- self.df = self.df.drop_duplicates()
51
- print(f" Removed {initial_rows - len(self.df)} duplicates")
52
-
53
- # Handle missing values
54
- missing_before = self.df.isnull().sum().sum()
55
-
56
- # For numeric columns: fill with median
57
- numeric_cols = self.df.select_dtypes(include=[np.number]).columns
58
- for col in numeric_cols:
59
- self.df[col] = self.df[col].fillna(self.df[col].median())
60
-
61
- # For categorical columns: fill with mode or 'Unknown'
62
- categorical_cols = self.df.select_dtypes(include=['object']).columns
63
- for col in categorical_cols:
64
- if not self.df[col].isnull().all():
65
- mode_val = self.df[col].mode()
66
- if len(mode_val) > 0:
67
- self.df[col] = self.df[col].fillna(mode_val[0])
68
- else:
69
- self.df[col] = self.df[col].fillna("Unknown")
70
-
71
- missing_after = self.df.isnull().sum().sum()
72
- print(f" Filled {missing_before - missing_after} missing values")
73
-
74
- # Convert data types intelligently
75
- self._convert_types()
76
-
77
- return self.df
78
-
79
- def _convert_types(self):
80
- ##________auto-convert data typpes_______
81
-
82
- # try to convert object columns to datetime
83
- for col in self.df.columns:
84
- if self.df[col].dtype == 'object':
85
- try:
86
- self.df[col] = pd.to_datetime(self.df[col])
87
- print(f" Converted {col} to datetime")
88
- except:
89
- pass
90
-
91
- def detect_schema(self):
92
- """
93
- Step 3: Detect schema - identify column types
94
- """
95
- self.schema = {
96
- 'numeric': [],
97
- 'categorical': [],
98
- 'datetime': [],
99
- 'text': []
100
- }
101
-
102
- for col in self.df.columns:
103
- if pd.api.types.is_datetime64_any_dtype(self.df[col]):
104
- self.schema['datetime'].append(col)
105
- elif pd.api.types.is_numeric_dtype(self.df[col]):
106
- self.schema['numeric'].append(col)
107
- elif pd.api.types.is_object_dtype(self.df[col]):
108
- # Check if it's categorical (few unique values)
109
- unique_ratio = self.df[col].nunique() / len(self.df)
110
- # Lower threshold to catch more categories (0.05 = 5%)
111
- if unique_ratio < 0.5: # Changed from 0.05 to 0.5 to catch product, category, region
112
- self.schema['categorical'].append(col)
113
- else:
114
- self.schema['text'].append(col)
115
-
116
- print("\n📊 Schema Detected:")
117
- print(f" Numeric columns: {self.schema['numeric']}")
118
- print(f" Categorical columns: {self.schema['categorical']}")
119
- print(f" Date columns: {self.schema['datetime']}")
120
-
121
- return self.schema
122
-
123
- def get_summary(self):
124
- ##__________get basic data summary_________
125
-
126
- return{
127
- 'rows': len(self.df),
128
- 'columns': len(self.df.columns),
129
- 'column_names': list(self.df.columns),
130
- 'missing_values': self.df.isnull().sum().to_dict(),
131
- 'memory_usage': self.df.memory_usage(deep=True).sum() / 1024**2 # MB
132
  }
 
1
+ ## data ingestion & preprocessing & schema detection
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from pathlib import Path
6
+ import json
7
+
8
class DataProcessor:
    """Load a tabular dataset, clean it, and classify its columns.

    Intended call order: ``load_data`` or ``load_from_upload``, then
    ``preprocess``, then ``detect_schema``. ``get_summary`` can be called at
    any point after data has been loaded. Every method that needs data raises
    ``ValueError`` when nothing has been loaded yet.
    """

    # Strings that stand for "no value" in raw data; they are turned into
    # real missing values before any filling happens.
    PLACEHOLDER_VALUES = (
        '?', 'None', 'null', 'NULL', 'NaN', 'nan', '', ' ', 'Unknown', 'unknown',
    )

    # A non-numeric, non-datetime column is 'categorical' when
    # (distinct values / rows) is strictly below this ratio, else 'text'.
    # 0.5 is deliberately generous so columns such as product, category or
    # region are caught even in small datasets.
    CATEGORICAL_MAX_UNIQUE_RATIO = 0.5

    def __init__(self):
        # The working DataFrame; None until one of the load methods succeeds.
        self.df = None
        # Result of the last detect_schema() call (empty before that).
        self.schema = {}

    @staticmethod
    def _read_table(source, name, error_message):
        """Read *source* as CSV or JSON, chosen by the extension of *name*.

        The extension check is case-insensitive. Raises ``ValueError`` with
        *error_message* for any other extension.
        """
        lowered = str(name).lower()
        if lowered.endswith('.csv'):
            return pd.read_csv(source)
        if lowered.endswith('.json'):
            return pd.read_json(source)
        raise ValueError(error_message)

    @staticmethod
    def _is_text_like(column):
        """Return True for object/string columns (categoricals excluded)."""
        if isinstance(column.dtype, pd.CategoricalDtype):
            return False
        return bool(
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )

    def _require_data(self):
        """Return the loaded DataFrame or raise ``ValueError`` if none exists."""
        if self.df is None:
            raise ValueError(
                "No data loaded. Call load_data() or load_from_upload() first"
            )
        return self.df

    def load_data(self, file_path):
        """Load a CSV or JSON file from disk.

        Args:
            file_path: Path (``str`` or ``Path``) ending in ``.csv`` or
                ``.json`` (any letter case).

        Returns:
            The loaded DataFrame, also stored on ``self.df``.

        Raises:
            ValueError: If the extension is neither CSV nor JSON.
        """
        self.df = self._read_table(
            file_path, file_path, "Unsupported file type. Use CSV or JSON file"
        )
        return self.df

    def load_from_upload(self, uploaded_file):
        """Load a CSV or JSON file from an uploaded file object.

        Args:
            uploaded_file: File-like object with a ``name`` attribute (for
                example a Streamlit upload). The extension of ``name`` selects
                the parser and is matched case-insensitively, consistent with
                ``load_data``.

        Returns:
            The loaded DataFrame, also stored on ``self.df``.

        Raises:
            ValueError: If the extension is neither CSV nor JSON.
        """
        self.df = self._read_table(
            uploaded_file, uploaded_file.name, "Unsupported file type"
        )
        return self.df

    def _restore_numeric_columns(self, frame, nulls_before):
        """Make columns numeric again when only placeholders made them text.

        A column such as ``["10", "?", "30"]`` is read as text. Once the
        placeholder is gone every remaining value is a number, so it is
        converted to float and can be median-filled like any numeric column.
        Only columns in which placeholder replacement actually created new
        missing values are considered.
        """
        gained_nulls = frame.isnull().sum() > nulls_before
        for col in gained_nulls[gained_nulls].index:
            column = frame[col]
            if not self._is_text_like(column):
                continue
            present = column.notna()
            if not present.any():
                continue  # nothing left to judge the column by
            as_number = pd.to_numeric(column, errors="coerce")
            if as_number[present].notna().all():
                frame[col] = as_number.astype("float64")

    def preprocess(self):
        """Step 2: clean the loaded data.

        In order: replace placeholder strings with missing values, recover
        numeric columns that were text only because of placeholders, drop
        duplicate rows, fill numeric gaps with the column median, fill text
        gaps with the most frequent value (or ``"Unknown"`` when the column
        has no values at all), then try to parse text columns as datetimes.

        Returns:
            The cleaned DataFrame, also stored on ``self.df``.

        Raises:
            ValueError: If no data has been loaded.
        """
        frame = self._require_data()
        print("🔄 Preprocessing data...")

        # FIRST: turn '?' and similar placeholders into real missing values.
        nulls_before_cleanup = frame.isnull().sum()
        frame = frame.replace(list(self.PLACEHOLDER_VALUES), pd.NA)
        self._restore_numeric_columns(frame, nulls_before_cleanup)

        # Remove duplicate rows.
        initial_rows = len(frame)
        frame = frame.drop_duplicates()
        print(f" Removed {initial_rows - len(frame)} duplicates")

        missing_before = frame.isnull().sum().sum()

        # Numeric columns: fill with the median.
        for col in frame.select_dtypes(include=[np.number]).columns:
            frame[col] = frame[col].fillna(frame[col].median())

        # Text columns: fill with the mode; a column with no values at all
        # has no mode and falls back to 'Unknown'.
        text_cols = [col for col in frame.columns if self._is_text_like(frame[col])]
        for col in text_cols:
            modes = frame[col].mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else "Unknown"
            frame[col] = frame[col].fillna(fill_value)

        missing_after = frame.isnull().sum().sum()
        print(f" Filled {missing_before - missing_after} missing values")

        self.df = frame
        self._convert_types()
        return self.df

    def _convert_types(self):
        """Convert text columns to datetime when every value parses as one.

        Best effort: a column that cannot be parsed is left unchanged.
        """
        import warnings  # local import: only this method needs it

        frame = self._require_data()
        for col in frame.columns:
            column = frame[col]
            if not self._is_text_like(column):
                continue
            try:
                # pandas warns when it cannot infer one date format; that is
                # expected for non-date text, so keep the output quiet.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    parsed = pd.to_datetime(column)
            except (ValueError, TypeError, OverflowError):
                # Not a date column. Unlike a bare ``except:``, this lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
            frame[col] = parsed
            print(f" Converted {col} to datetime")

    def detect_schema(self):
        """Step 3: classify every column by kind.

        Datetime and numeric columns are recognised by dtype. Every other
        column is 'categorical' when its share of distinct values is below
        ``CATEGORICAL_MAX_UNIQUE_RATIO`` and 'text' otherwise, so no column
        is left out of the schema. With zero rows the share is taken as 0.

        Returns:
            Dict with keys 'numeric', 'categorical', 'datetime' and 'text',
            each a list of column names. Also stored on ``self.schema``.

        Raises:
            ValueError: If no data has been loaded.
        """
        frame = self._require_data()
        schema = {
            'numeric': [],
            'categorical': [],
            'datetime': [],
            'text': [],
        }

        row_count = len(frame)
        for col in frame.columns:
            column = frame[col]
            if pd.api.types.is_datetime64_any_dtype(column):
                schema['datetime'].append(col)
            elif pd.api.types.is_numeric_dtype(column):
                schema['numeric'].append(col)
            else:
                # Guard the division: an empty frame has no rows to count.
                unique_ratio = column.nunique() / row_count if row_count else 0.0
                if unique_ratio < self.CATEGORICAL_MAX_UNIQUE_RATIO:
                    schema['categorical'].append(col)
                else:
                    schema['text'].append(col)

        self.schema = schema

        print("\n📊 Schema Detected:")
        print(f" Numeric columns: {self.schema['numeric']}")
        print(f" Categorical columns: {self.schema['categorical']}")
        print(f" Date columns: {self.schema['datetime']}")

        return self.schema

    def get_summary(self):
        """Return basic facts about the loaded data.

        All numbers are plain Python ``int``/``float`` so the result can be
        passed to ``json.dumps`` directly.

        Returns:
            Dict with 'rows', 'columns', 'column_names', 'missing_values'
            (missing count per column) and 'memory_usage' (in MB).

        Raises:
            ValueError: If no data has been loaded.
        """
        frame = self._require_data()
        missing_per_column = frame.isnull().sum()
        return {
            'rows': len(frame),
            'columns': len(frame.columns),
            'column_names': list(frame.columns),
            'missing_values': {
                col: int(count) for col, count in missing_per_column.items()
            },
            'memory_usage': float(frame.memory_usage(deep=True).sum() / 1024**2),  # MB
        }