Spaces:

Amitpwa
/

mlOps

Sleeping

App Files Files Community

Amitpwa committed on Jul 15, 2024

Commit

dfa4582

verified ·

1 Parent(s): 0294945

uploaded all files

Browse files

added requirements file

Files changed (3) hide show

app.py +92 -0
plaintext +8 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import streamlit as st
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import GridSearchCV
+# import joblib
+# Load dataset
+df = pd.read_csv('ds_salaries.csv')
+# Load the original dataset to get unique values for dropdowns
+df_original = pd.read_csv('ds_salaries.csv')
+# Load the best model
+# best_model = joblib.load('best_model.pkl')
+numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
+categorical_cols = df.select_dtypes(include=['object']).columns
+df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.mean()))
+df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))
+# Drop the salary_currency column as it's not needed for prediction
+df = df.drop(columns=['salary_currency'])
+# Encode categorical variables
+categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
+df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
+# Define features and target variable
+X = df.drop(['salary', 'salary_in_usd'], axis=1)
+y = df['salary_in_usd']
+# Split the data
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+# Model training and experiment tracking with MLflow
+models = {
+    'Linear Regression': LinearRegression(),
+    'Decision Tree': DecisionTreeRegressor(),
+    'Random Forest': RandomForestRegressor(),
+    'Gradient Boosting': GradientBoostingRegressor()
+}
+param_grid = {
+    'n_estimators': [100, 200, 300],
+    'max_depth': [None, 10, 20, 30]
+}
+grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='r2')
+grid_search.fit(X_train, y_train)
+# Streamlit app
+st.title('Data Science Salary Predictor')
+# Input features
+experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
+employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
+job_title = st.selectbox('Job Title', df_original['job_title'].unique())
+employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
+remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
+company_location = st.selectbox('Company Location', df_original['company_location'].unique())
+company_size = st.selectbox('Company Size', df_original['company_size'].unique())
+# Predict salary
+input_data = pd.DataFrame({
+    'work_year': [2023],
+    'experience_level': [experience_level],
+    'employment_type': [employment_type],
+    'job_title': [job_title],
+    'employee_residence': [employee_residence],
+    'remote_ratio': [remote_ratio],
+    'company_location': [company_location],
+    'company_size': [company_size]
+})
+# Encode categorical variables
+categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
+input_data = pd.get_dummies(input_data, columns=categorical_columns, drop_first=True)
+# Align input data with training data columns
+input_data = input_data.reindex(columns=X_train.columns, fill_value=0)
+# joblib.dump(grid_search.best_estimator_, 'best_model.pkl')
+# Predict the salary
+salary_prediction = grid_search.best_estimator_.predict(input_data)[0]
+st.write(f'Predicted Salary: ${salary_prediction:.2f}')

plaintext ADDED Viewed

	@@ -0,0 +1,8 @@

+app.py
+requirements.txt
+joblib
+pandas
+seaborn
+scikit-learn
+matplotlib
+mlflow

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+joblib
+pandas
+seaborn
+scikit-learn
+matplotlib
+mlflow