curfox_model_trainer

Sleeping

App Files Files Community

Arafath10 committed on Mar 5, 2024

Commit

1fbf289

verified ·

1 Parent(s): 7d96763

Update main.py

Browse files

Files changed (1) hide show

main.py +105 -76

main.py CHANGED Viewed

@@ -35,90 +35,117 @@ app.add_middleware(
 from joblib import dump
 def train_the_model(data):
-    data = data
-    # Select columns
-    selected_columns = ['customer_name', 'customer_address', 'customer_phone',
-                        'customer_email', 'cod', 'weight',
-                        'origin_city.name', 'destination_city.name', 'status.name']
-    # Handling missing values
-    data_filled = data[selected_columns].fillna('Missing')
-    # Encoding categorical variables
-    encoders = {col: LabelEncoder() for col in selected_columns if data_filled[col].dtype == 'object'}
-    for col, encoder in encoders.items():
-        data_filled[col] = encoder.fit_transform(data_filled[col])
-    # Splitting the dataset
-    X = data_filled.drop('status.name', axis=1)
-    y = data_filled['status.name']
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-    # Setup the hyperparameter grid to search
-    param_grid = {
-        'max_depth': [3, 4, 5],
-        'learning_rate': [0.01, 0.1, 0.4],
-        'n_estimators': [100, 200, 300],
-        'subsample': [0.8, 0.9, 1],
-        'colsample_bytree': [0.3, 0.7]
-    }
-    # Initialize the classifier
-    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
-    # Setup GridSearchCV
-    grid_search = GridSearchCV(xgb, param_grid, cv=10, n_jobs=-1, scoring='accuracy')
-    # Fit the grid search to the data
-    grid_search.fit(X_train, y_train)
-    # Get the best parameters
-    best_params = grid_search.best_params_
-    print("Best parameters:", best_params)
-    # Train the model with best parameters
-    best_xgb = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
-    best_xgb.fit(X_train, y_train)
-    # Predict on the test set
-    y_pred = best_xgb.predict(X_test)
-    y_pred_proba = best_xgb.predict_proba(X_test)
-    # Evaluate the model
-    accuracy = accuracy_score(y_test, y_pred)
-    classification_rep = classification_report(y_test, y_pred)
-    # Print the results
-    print("Accuracy:", accuracy)
-    print("Classification Report:\n", classification_report(y_test, y_pred))
-    # Save the model
-    model_filename = 'xgb_model.joblib'
-    dump(best_xgb, model_filename)
-    # Save the encoders
-    encoders_filename = 'encoders.joblib'
-    dump(encoders, encoders_filename)
-    print(f"Model saved as {model_filename}")
-    print(f"Encoders saved as {encoders_filename}")
-@app.get("/trigger_the_data_fecher_every_30min")
-async def your_continuous_function(page: int):
     print("data fetcher running.....")
     # Initialize an empty DataFrame to store the combined data
     combined_df = pd.DataFrame()
     # Update the payload for each page
-    url = "https://dev3.api.curfox.parallaxtec.com/api/ml/order-list?sort=id&paginate=500&page="+str(page)
     payload = {}
     headers = {
                     'Accept': 'application/json',
-                    'X-Tenant': 'royalexpress'
                   }
     response = requests.request("GET", url, headers=headers, data=payload)
@@ -127,8 +154,10 @@ async def your_continuous_function(page: int):
     json_response = response.json()
     # Extracting 'data' for conversion
     data = json_response['data']
     df = pd.json_normalize(data)
     # Concatenate the current page's DataFrame with the combined DataFrame
     combined_df = pd.concat([combined_df, df], ignore_index=True)
@@ -139,8 +168,8 @@ async def your_continuous_function(page: int):
     train_the_model(data)
-    return "model trained with new page : "+str(page)+" data"
 @app.get("/test_api")
 async def test_api():
-    return "kpi_result"

 from joblib import dump
 def train_the_model(data):
+    try:
+        new_data = data
+        encoders = load('encoders.joblib')
+        xgb_model = load('xgb_model.joblib')
+        selected_columns = ['customer_name', 'customer_address', 'customer_phone',
+                            'customer_email', 'cod', 'weight', 'origin_city.name',
+                            'destination_city.name', 'status.name']
+        new_data_filled = new_data[selected_columns].fillna('Missing')
+        for col, encoder in encoders.items():
+            if col in new_data_filled.columns:
+                unseen_categories = set(new_data_filled[col]) - set(encoder.classes_)
+                if unseen_categories:
+                    for category in unseen_categories:
+                        encoder.classes_ = np.append(encoder.classes_, category)
+                    new_data_filled[col] = encoder.transform(new_data_filled[col])
+                else:
+                    new_data_filled[col] = encoder.transform(new_data_filled[col])
+        X_new = new_data_filled.drop('status.name', axis=1)
+        y_new = new_data_filled['status.name']
+        xgb_model.fit(X_new, y_new)
+        dump(xgb_model, 'xgb_model.joblib')
+        print("Model updated with new data.")
+        updated_model_accuracy = evaluate_model(xgb_model, X_test, y_test)
+        print("Updated model accuracy:", updated_model_accuracy)
+    except:
+        data = data
+        # Select columns
+        selected_columns = ['customer_name', 'customer_address', 'customer_phone',
+                            'customer_email', 'cod', 'weight',
+                            'origin_city.name', 'destination_city.name', 'status.name']
+        # Handling missing values
+        data_filled = data[selected_columns].fillna('Missing')
+        # Encoding categorical variables
+        encoders = {col: LabelEncoder() for col in selected_columns if data_filled[col].dtype == 'object'}
+        for col, encoder in encoders.items():
+            data_filled[col] = encoder.fit_transform(data_filled[col])
+        # Splitting the dataset
+        X = data_filled.drop('status.name', axis=1)
+        y = data_filled['status.name']
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+        # Setup the hyperparameter grid to search
+        param_grid = {
+            'max_depth': [3, 4, 5],
+            'learning_rate': [0.01, 0.1, 0.4],
+            'n_estimators': [100, 200, 300],
+            'subsample': [0.8, 0.9, 1],
+            'colsample_bytree': [0.3, 0.7]
+        }
+        # Initialize the classifier
+        xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
+        # Setup GridSearchCV
+        grid_search = GridSearchCV(xgb, param_grid, cv=10, n_jobs=-1, scoring='accuracy')
+        # Fit the grid search to the data
+        grid_search.fit(X_train, y_train)
+        # Get the best parameters
+        best_params = grid_search.best_params_
+        print("Best parameters:", best_params)
+        # Train the model with best parameters
+        best_xgb = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
+        best_xgb.fit(X_train, y_train)
+        # Predict on the test set
+        y_pred = best_xgb.predict(X_test)
+        y_pred_proba = best_xgb.predict_proba(X_test)
+        # Evaluate the model
+        accuracy = accuracy_score(y_test, y_pred)
+        classification_rep = classification_report(y_test, y_pred)
+        # Print the results
+        print("Accuracy:", accuracy)
+        print("Classification Report:\n", classification_report(y_test, y_pred))
+        # Save the model
+        model_filename = 'xgb_model.joblib'
+        dump(best_xgb, model_filename)
+        # Save the encoders
+        encoders_filename = 'encoders.joblib'
+        dump(encoders, encoders_filename)
+        print(f"Model saved as {model_filename}")
+        print(f"Encoders saved as {encoders_filename}")
+        print("new base model trained")
+@app.get("/trigger_the_data_fecher")
+async def your_continuous_function(page: int,paginate: int,Tenant: str):
     print("data fetcher running.....")
     # Initialize an empty DataFrame to store the combined data
     combined_df = pd.DataFrame()
     # Update the payload for each page
+    url = "https://dev3.api.curfox.parallaxtec.com/api/ml/order-list?sort=id&paginate="+str(paginate)+"&page="+str(page)
     payload = {}
     headers = {
                     'Accept': 'application/json',
+                    'X-Tenant': Tenant #'royalexpress'
                   }
     response = requests.request("GET", url, headers=headers, data=payload)
     json_response = response.json()
     # Extracting 'data' for conversion
     data = json_response['data']
+    data_count = len(data)
     df = pd.json_normalize(data)
     # Concatenate the current page's DataFrame with the combined DataFrame
     combined_df = pd.concat([combined_df, df], ignore_index=True)
     train_the_model(data)
+    return "model trained with page number: "+str(page)+" data count :"+str(data_count)
 @app.get("/test_api")
 async def test_api():
+    """Return the fixed string "api_working"; registered for GET /test_api."""
+    return "api_working"