dineth554 committed on
Commit
53290d9
·
verified ·
1 Parent(s): 5244794

Upload inference.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. inference.py +242 -0
inference.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fraud Detection Inference Script
3
+ Load the trained model from Safetensors format and make predictions on sample data.
4
+ """
5
+ import os
6
+ import sys
7
+ import pandas as pd
8
+ import numpy as np
9
+ from safetensors.numpy import load_file
10
+
11
# Filesystem locations of the serialized model and the transaction dataset.
# Both live under the same project root.
_PROJECT_ROOT = '/app/credit_card_fraud_1403'
SAFETENSORS_PATH = f'{_PROJECT_ROOT}/model/fraud_detector.safetensors'
DATA_PATH = f'{_PROJECT_ROOT}/data/creditcard.csv'
14
+
15
class SafetensorsRFClassifier:
    """Random forest classifier rebuilt from tensors stored in Safetensors format.

    Every tree is described by flat arrays (``children_left``,
    ``children_right``, ``feature``, ``threshold``, ``value``) that are walked
    from the root (node 0) down to a leaf.  Prediction follows the rule used by
    sklearn's ``RandomForestClassifier``: the per-tree class probabilities are
    averaged and the class with the highest mean probability is returned.
    """

    # Per-tree arrays copied verbatim from the tensor mapping.  ``impurity``
    # and ``n_node_samples`` are kept for completeness; prediction does not
    # read them.
    _TREE_ARRAYS = (
        'children_left',
        'children_right',
        'feature',
        'threshold',
        'value',
        'value_shape',
        'impurity',
        'n_node_samples',
    )

    def __init__(self, tensors):
        """Build the forest from a mapping of tensor name to NumPy array.

        Args:
            tensors: Mapping that holds the ``metadata/*`` entries and, for
                each tree ``i``, the ``tree_{i:03d}/*`` arrays.

        Raises:
            KeyError: If one of the expected tensors is missing.
        """
        self.n_estimators = int(tensors['metadata/n_estimators'][0])
        self.n_features = int(tensors['metadata/n_features'][0])
        self.n_classes = int(tensors['metadata/n_classes'][0])
        self.classes_ = tensors['metadata/classes']
        self.trees = []

        for i in range(self.n_estimators):
            prefix = f'tree_{i:03d}'
            tree = {'node_count': int(tensors[f'{prefix}/node_count'][0])}
            for name in self._TREE_ARRAYS:
                tree[name] = tensors[f'{prefix}/{name}']
            self.trees.append(tree)

    @staticmethod
    def _leaf_values(tree):
        """Return the tree's ``value`` array reshaped to its stored shape."""
        shape = tuple(int(dim) for dim in tree['value_shape'])
        return np.asarray(tree['value']).reshape(shape)

    @staticmethod
    def _apply_tree(tree, X):
        """Return, for every row of X, the index of the leaf it lands in.

        All rows are advanced one level per iteration, so the loop runs once
        per tree level instead of once per sample.
        """
        X = np.asarray(X)
        left = np.asarray(tree['children_left'])
        right = np.asarray(tree['children_right'])
        feature = np.asarray(tree['feature'])
        threshold = np.asarray(tree['threshold'])

        nodes = np.zeros(X.shape[0], dtype=np.intp)
        # A node is a leaf when its two child pointers are equal.
        active = np.flatnonzero(left[nodes] != right[nodes])
        while active.size:
            current = nodes[active]
            go_left = X[active, feature[current]] <= threshold[current]
            nodes[active] = np.where(go_left, left[current], right[current])
            # Keep only the rows that have not reached a leaf yet.
            active = active[left[nodes[active]] != right[nodes[active]]]
        return nodes

    def _predict_tree(self, tree, X):
        """Return the class index with the largest leaf value for each row.

        Args:
            tree: One entry of ``self.trees``.
            X: 2-D array of samples.

        Returns:
            ``int32`` array of class indices, one per row of X.
        """
        leaves = self._apply_tree(tree, X)
        leaf_values = self._leaf_values(tree)[leaves, 0]
        return np.argmax(leaf_values, axis=1).astype(np.int32)

    def _predict_proba_tree(self, tree, X):
        """Return per-class probabilities from a single tree.

        The leaf values are normalised so each row sums to 1.  A leaf whose
        values sum to zero yields a uniform distribution over all
        ``n_classes`` classes.

        Args:
            tree: One entry of ``self.trees``.
            X: 2-D array of samples.

        Returns:
            ``float32`` array of shape ``(n_samples, n_classes)``.
        """
        leaves = self._apply_tree(tree, X)
        counts = self._leaf_values(tree)[leaves, 0].astype(np.float64)
        totals = counts.sum(axis=1, keepdims=True)

        # Start from the uniform fallback and overwrite rows with a usable total.
        probas = np.full(counts.shape, 1.0 / self.n_classes, dtype=np.float64)
        np.divide(counts, totals, out=probas, where=totals > 0)
        return probas.astype(np.float32)

    def predict(self, X):
        """Predict class labels for samples in X.

        The label is the entry of ``classes_`` with the highest probability
        averaged over all trees.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            Array of labels taken from ``classes_``, one per sample.
        """
        probas = self.predict_proba(X)
        best = np.argmax(probas, axis=1)
        return np.asarray(self.classes_).take(best, axis=0)

    def predict_proba(self, X):
        """Predict class probabilities for samples in X.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            ``float32`` array of shape ``(n_samples, n_classes)`` holding the
            mean of the per-tree probabilities.
        """
        X = np.asarray(X, dtype=np.float32)

        probas = np.zeros((X.shape[0], self.n_classes), dtype=np.float32)
        for tree in self.trees:
            probas += self._predict_proba_tree(tree, X)

        probas /= self.n_estimators
        return probas
114
+
115
+
116
class SafetensorsScaler:
    """RobustScaler-style transformer rebuilt from Safetensors tensors.

    ``features_`` is treated as the column indices to scale; the i-th entry
    of ``center_`` / ``scale_`` belongs to the i-th entry of ``features_``.
    NOTE(review): the index interpretation follows the ``feature_idx`` naming
    used when iterating ``features_`` -- confirm against the export script.
    """

    def __init__(self, tensors):
        """Read the scaler parameters from a mapping of tensor name to array.

        Args:
            tensors: Mapping containing ``scaler/center``, ``scaler/scale``
                and ``scaler/features``.

        Raises:
            KeyError: If one of the three tensors is missing.
        """
        self.center_ = tensors['scaler/center']
        self.scale_ = tensors['scaler/scale']
        self.features_ = tensors['scaler/features']

    def transform(self, X):
        """Return a scaled ``float32`` copy of X.

        Only the columns listed in ``features_`` are changed: each becomes
        ``(x - center) / scale``, or ``x / scale`` when ``center_`` is empty.
        All other columns are copied through untouched, and the input is not
        modified.

        Args:
            X: Array-like of shape ``(n_samples, n_columns)``.

        Returns:
            ``float32`` array with the same shape as X.
        """
        X = np.asarray(X, dtype=np.float32)
        X_scaled = X.copy()

        columns = np.asarray(self.features_).astype(np.intp)
        if columns.size == 0:
            return X_scaled

        n_scaled = columns.size
        scale = np.asarray(self.scale_)[:n_scaled]
        selected = X[:, columns]
        if len(self.center_) > 0:
            center = np.asarray(self.center_)[:n_scaled]
            X_scaled[:, columns] = (selected - center) / scale
        else:
            X_scaled[:, columns] = selected / scale

        return X_scaled
136
+
137
+
138
def load_artifacts_safetensors():
    """Read the Safetensors file and build the classifier and scaler from it.

    Returns:
        A ``(model, scaler)`` tuple: a ``SafetensorsRFClassifier`` and a
        ``SafetensorsScaler`` constructed from the same tensor mapping.
    """
    print("Loading model artifacts from Safetensors...")

    tensor_map = load_file(SAFETENSORS_PATH)
    print(f"✓ Loaded {len(tensor_map)} tensors from {SAFETENSORS_PATH}")

    # Both objects read their parameters out of the one tensor mapping.
    classifier = SafetensorsRFClassifier(tensor_map)
    feature_scaler = SafetensorsScaler(tensor_map)

    print(f"✓ Model initialized with {classifier.n_estimators} estimators")
    print("✓ Scaler initialized")

    return classifier, feature_scaler
154
+
155
+
156
def load_sample_data(n_samples=5):
    """Draw a reproducible random mix of fraud and legitimate transactions.

    The rows are sampled from the full CSV at ``DATA_PATH`` (not from a
    held-out split).  Up to ``n_samples // 2 + 1`` fraud rows are taken and
    the remainder is filled with legitimate rows.  A private generator seeded
    with 42 is used, so the draw is repeatable and the global NumPy random
    state is left untouched.

    Args:
        n_samples: Number of rows requested; must be non-negative.  Fewer
            rows are returned if the file does not contain enough of them.

    Returns:
        ``(X_samples, y_true)``: a DataFrame with the ``Class`` column
        removed, and the matching ``Class`` values as a NumPy array.

    Raises:
        ValueError: If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    print(f"\nLoading {n_samples} random sample transactions...")
    df = pd.read_csv(DATA_PATH)

    # Local generator: yields the same sequence as np.random.seed(42) would,
    # without reseeding the process-wide generator.
    rng = np.random.RandomState(42)

    # Index labels of each class, kept as integer arrays even when empty.
    fraud_indices = df.index[df['Class'] == 1].to_numpy()
    legit_indices = df.index[df['Class'] == 0].to_numpy()

    # Clamp both counts so a request of 0 rows, or more rows than exist,
    # cannot produce a negative or oversized sample size.
    n_fraud = min(n_samples // 2 + 1, n_samples, len(fraud_indices))
    n_legit = min(n_samples - n_fraud, len(legit_indices))

    sampled_fraud = rng.choice(fraud_indices, n_fraud, replace=False)
    sampled_legit = rng.choice(legit_indices, n_legit, replace=False)

    sample_indices = np.concatenate([sampled_fraud, sampled_legit])
    rng.shuffle(sample_indices)

    samples = df.loc[sample_indices]

    X_samples = samples.drop(['Class'], axis=1)
    y_true = samples['Class'].values

    return X_samples, y_true
184
+
185
+
186
def predict(model, scaler, X_samples):
    """Scale the Time and Amount columns, then run the model on the samples.

    Args:
        model: Object exposing ``predict(X)`` and ``predict_proba(X)``.
        scaler: Object exposing ``center_`` and ``scale_``; entry 0 is applied
            to column 0 (Time) and entry 1 to column 29 (Amount).
        X_samples: DataFrame or 2-D array-like of feature rows.  It is not
            modified.

    Returns:
        ``(predictions, probabilities)``: the model's predicted labels and the
        second column of its ``predict_proba`` output.
    """
    # Always work on an owned, writable float copy.  A DataFrame's ``.values``
    # can be a read-only view when pandas Copy-on-Write is active, and an
    # all-integer frame would otherwise truncate the scaled values.
    X_processed = np.array(X_samples, dtype=np.float64)

    has_center = len(scaler.center_) > 0
    features_to_scale = [0, 29]  # Time and Amount column indices
    for i, feature_idx in enumerate(features_to_scale):
        column = X_processed[:, feature_idx]
        if has_center:
            column = column - scaler.center_[i]
        X_processed[:, feature_idx] = column / scaler.scale_[i]

    predictions = model.predict(X_processed)
    probabilities = model.predict_proba(X_processed)[:, 1]

    return predictions, probabilities
204
+
205
+
206
def main():
    """Run the demo end to end: load artifacts, sample rows, predict, report.

    Prints a table comparing the true and predicted label of each sample.

    Returns:
        The ``(predictions, probabilities)`` pair produced by ``predict``.
    """
    rule = "=" * 60

    print(rule)
    print("FRAUD DETECTION INFERENCE (SAFETENSORS)")
    print(rule)

    model, scaler = load_artifacts_safetensors()
    X_samples, y_true = load_sample_data(n_samples=5)
    predictions, probabilities = predict(model, scaler, X_samples)

    print("\n" + rule)
    print("PREDICTION RESULTS")
    print(rule)
    print(f"{'Sample':<8} {'True':<8} {'Predicted':<10} {'Prob':<10} {'Result'}")
    print("-" * 60)

    # One table row per sample, numbered from 1.
    rows = zip(y_true, predictions, probabilities)
    for number, (actual, predicted, prob) in enumerate(rows, start=1):
        true_label = "FRAUD" if actual == 1 else "LEGIT"
        pred_label = "FRAUD" if predicted == 1 else "LEGIT"
        verdict = "✓ CORRECT" if predicted == actual else "✗ WRONG"

        print(f"{number:<8} {true_label:<8} {pred_label:<10} {prob:.4f} {verdict}")

    print(rule)
    print("\nInference completed successfully!")

    return predictions, probabilities
239
+
240
+
241
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()