Spaces:

hhhar
/

ChurnPredUpdated

Sleeping

App Files Files Community

hhhar committed on Oct 5, 2024

Commit

2ea9add

verified ·

1 Parent(s): 43959e6

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -59

app.py CHANGED Viewed

@@ -3,62 +3,46 @@ import joblib
 import numpy as np
 import os
 import pandas as pd
-import io
 import openpyxl
-import re
 # Load the preprocessor
 preprocessor_path = 'modelExports/preprocessor.pkl'
 preprocessor = joblib.load(preprocessor_path)
-def find_header_row(df, required_columns, max_rows_to_check=10):
-    for i in range(min(max_rows_to_check, len(df))):
-        row_values = [str(val).strip() for val in df.iloc[i].values]
-        if all(col in row_values for col in required_columns):
-            return i  # Header row found at row i
-    return -1  # Header row not found
 def process_uploaded_file(uploaded_file, required_columns):
     try:
         file_extension = uploaded_file.name.split('.')[-1].lower()
         if file_extension == 'csv':
-            # Read the first few rows to check for headers
-            df = pd.read_csv(uploaded_file, nrows=10, header=None)
         elif file_extension in ['xlsx', 'xls']:
-            # Read the first few rows of the Excel file
-            df = pd.read_excel(uploaded_file, nrows=10, header=None, engine='openpyxl')
         else:
             st.error("Unsupported file format. Please upload a CSV or Excel file.")
             return None
-        header_row = find_header_row(df, required_columns)
-        if header_row == -1:
-            st.error(f"Required columns not found in the first {len(df)} rows.")
-            st.write("Expected columns:", required_columns)
-            st.write("Found data rows:", df.head().values.tolist())
-            return None
-        # Re-read the file with the correct header row
-        uploaded_file.seek(0)  # Reset file pointer
-        if file_extension == 'csv':
-            df = pd.read_csv(uploaded_file, header=header_row)
-        else:
-            df = pd.read_excel(uploaded_file, header=header_row, engine='openpyxl')
-        st.write("Column names:", df.columns.tolist())
-        # Return the DataFrame without modifying column names
         return df
     except Exception as e:
         st.error(f"Error reading the file: {e}")
         return None
 def predict_with_model(model, data, includes_preprocessor):
     if includes_preprocessor:
         return model.predict(data)
@@ -145,29 +129,29 @@ if interface == "Single Prediction":
     input_data = {}
     # Categorical inputs
-    input_data['CRM_PID_Value_Segment'] = st.selectbox(
-        'CRM_PID_Value_Segment', crm_pid_value_segment_options)
-    input_data['EffectiveSegment'] = st.selectbox(
-        'EffectiveSegment', effective_segment_options)
-    input_data['KA_name'] = st.selectbox('KA_name', ka_name_options)
     # Numerical inputs
-    input_data['Billing_ZIP'] = st.number_input(
-        'Billing_ZIP', min_value=0, format="%d")
-    input_data['Active_subscribers'] = st.number_input(
-        'Active_subscribers', min_value=0, format="%d")
-    input_data['Not_Active_subscribers'] = st.number_input(
-        'Not_Active_subscribers', min_value=0, format="%d")
-    input_data['Suspended_subscribers'] = st.number_input(
-        'Suspended_subscribers', min_value=0, format="%d")
-    input_data['Total_SUBs'] = st.number_input(
-        'Total_SUBs', min_value=0, format="%d")
-    input_data['AvgMobileRevenue'] = st.number_input(
-        'AvgMobileRevenue', min_value=0.0, format="%.2f")
-    input_data['AvgFIXRevenue'] = st.number_input(
-        'AvgFIXRevenue', min_value=0.0, format="%.2f")
-    input_data['TotalRevenue'] = st.number_input(
-        'TotalRevenue', min_value=0.0, format="%.2f")
     input_data['ARPU'] = st.number_input('ARPU', min_value=0.0, format="%.2f")
     # Predict churn
@@ -175,6 +159,9 @@ if interface == "Single Prediction":
         # Convert input data to DataFrame
         input_df = pd.DataFrame([input_data])
         # Preprocess the data only if needed
         input_data_transformed = preprocessor.transform(input_df)
@@ -261,16 +248,15 @@ elif interface == "Batch Prediction":
         if df is None:
             st.stop()
-        # Standardize column names to match model expectations
-        df.columns = df.columns.str.strip().str.upper().str.replace(' ', '_')
-        st.write("Standardized DataFrame columns:", df.columns.tolist())
-        # Check for missing columns
-        missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            st.error(f"The following required columns are missing: {missing_columns}")
-            st.stop()
         # Fill missing values if any
         df.fillna({

 import numpy as np
 import os
 import pandas as pd
 import openpyxl
 # Load the preprocessor
 preprocessor_path = 'modelExports/preprocessor.pkl'
 preprocessor = joblib.load(preprocessor_path)
 def process_uploaded_file(uploaded_file, required_columns):
     try:
         file_extension = uploaded_file.name.split('.')[-1].lower()
         if file_extension == 'csv':
+            df = pd.read_csv(uploaded_file)
         elif file_extension in ['xlsx', 'xls']:
+            df = pd.read_excel(uploaded_file, engine='openpyxl')
         else:
             st.error("Unsupported file format. Please upload a CSV or Excel file.")
             return None
+        # Standardize column names to uppercase and strip spaces
+        df.columns = df.columns.str.upper().str.strip()
+        st.write("DataFrame columns:", df.columns.tolist())
+        # Standardize required columns to uppercase and strip spaces
+        required_columns_upper = [col.upper().strip() for col in required_columns]
+        # Check if all required columns are present
+        missing_columns = [col for col in required_columns_upper if col not in df.columns]
+        if missing_columns:
+            st.error(f"The following required columns are missing: {missing_columns}")
+            return None
+        st.write(f"Uploaded data has {df.shape[0]} rows and {df.shape[1]} columns.")
         return df
     except Exception as e:
         st.error(f"Error reading the file: {e}")
         return None
 def predict_with_model(model, data, includes_preprocessor):
     if includes_preprocessor:
         return model.predict(data)
     input_data = {}
     # Categorical inputs
+    input_data['CRM_PID_VALUE_SEGMENT'] = st.selectbox(
+        'CRM_PID_VALUE_SEGMENT', crm_pid_value_segment_options)
+    input_data['EFFECTIVESEGMENT'] = st.selectbox(
+        'EFFECTIVESEGMENT', effective_segment_options)
+    input_data['KA_NAME'] = st.selectbox('KA_NAME', ka_name_options)
     # Numerical inputs
+    input_data['BILLING_ZIP'] = st.number_input(
+        'BILLING_ZIP', min_value=0, format="%d")
+    input_data['ACTIVE_SUBSCRIBERS'] = st.number_input(
+        'ACTIVE_SUBSCRIBERS', min_value=0, format="%d")
+    input_data['NOT_ACTIVE_SUBSCRIBERS'] = st.number_input(
+        'NOT_ACTIVE_SUBSCRIBERS', min_value=0, format="%d")
+    input_data['SUSPENDED_SUBSCRIBERS'] = st.number_input(
+        'SUSPENDED_SUBSCRIBERS', min_value=0, format="%d")
+    input_data['TOTAL_SUBS'] = st.number_input(
+        'TOTAL_SUBS', min_value=0, format="%d")
+    input_data['AVGMOBILEREVENUE'] = st.number_input(
+        'AVGMOBILEREVENUE', min_value=0.0, format="%.2f")
+    input_data['AVGFIXREVENUE'] = st.number_input(
+        'AVGFIXREVENUE', min_value=0.0, format="%.2f")
+    input_data['TOTALREVENUE'] = st.number_input(
+        'TOTALREVENUE', min_value=0.0, format="%.2f")
     input_data['ARPU'] = st.number_input('ARPU', min_value=0.0, format="%.2f")
     # Predict churn
         # Convert input data to DataFrame
         input_df = pd.DataFrame([input_data])
+        # Standardize column names to uppercase
+        input_df.columns = input_df.columns.str.upper().str.strip()
         # Preprocess the data only if needed
         input_data_transformed = preprocessor.transform(input_df)
         if df is None:
             st.stop()
+        # Convert numerical columns to numeric data types
+        numerical_columns = [
+            'BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
+            'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
+            'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
+        ]
+        for col in numerical_columns:
+            df[col] = pd.to_numeric(df[col], errors='coerce')
         # Fill missing values if any
         df.fillna({