This notebook performs end-to-end lead-conversion modeling for ExtraaLearn: data loading, exploratory data analysis, preprocessing, model comparison (Random Forest vs. Gradient Boosting), hyperparameter tuning, test-set evaluation, and model export.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
%cd /content/drive/MyDrive/extraLearn
/content/drive/MyDrive/extraLearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('ExtraaLearn.csv')
df.head()
| | ID | age | current_occupation | first_interaction | profile_completed | website_visits | time_spent_on_website | page_views_per_visit | last_activity | print_media_type1 | print_media_type2 | digital_media | educational_channels | referral | status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EXT001 | 57 | Unemployed | Website | High | 7 | 1639 | 1.861 | Website Activity | Yes | No | Yes | No | No | 1 |
| 1 | EXT002 | 56 | Professional | Mobile App | Medium | 2 | 83 | 0.320 | Website Activity | No | No | No | Yes | No | 0 |
| 2 | EXT003 | 52 | Professional | Website | Medium | 3 | 330 | 0.074 | Website Activity | No | No | Yes | No | No | 0 |
| 3 | EXT004 | 53 | Unemployed | Website | High | 4 | 464 | 2.057 | Website Activity | No | No | No | No | No | 1 |
| 4 | EXT005 | 23 | Student | Website | High | 4 | 600 | 16.914 | Email Activity | No | No | No | No | No | 0 |
# Normalize ordinal labels
if 'profile_completed' in df.columns:
    df['profile_completed'] = df['profile_completed'].replace({
        'Low': 'Low (0-50%)',
        'Medium': 'Medium (50-75%)',
        'High': 'High (75-100%)'
    })
# Display summary
df.info()
df.describe(include='all')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   ID                     4612 non-null   object
 1   age                    4612 non-null   int64
 2   current_occupation     4612 non-null   object
 3   first_interaction      4612 non-null   object
 4   profile_completed      4612 non-null   object
 5   website_visits         4612 non-null   int64
 6   time_spent_on_website  4612 non-null   int64
 7   page_views_per_visit   4612 non-null   float64
 8   last_activity          4612 non-null   object
 9   print_media_type1      4612 non-null   object
 10  print_media_type2      4612 non-null   object
 11  digital_media          4612 non-null   object
 12  educational_channels   4612 non-null   object
 13  referral               4612 non-null   object
 14  status                 4612 non-null   int64
dtypes: float64(1), int64(4), object(10)
memory usage: 540.6+ KB
| | ID | age | current_occupation | first_interaction | profile_completed | website_visits | time_spent_on_website | page_views_per_visit | last_activity | print_media_type1 | print_media_type2 | digital_media | educational_channels | referral | status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4612 | 4612.000000 | 4612 | 4612 | 4612 | 4612.000000 | 4612.000000 | 4612.000000 | 4612 | 4612 | 4612 | 4612 | 4612 | 4612 | 4612.000000 |
| unique | 4612 | NaN | 3 | 2 | 3 | NaN | NaN | NaN | 3 | 2 | 2 | 2 | 2 | 2 | NaN |
| top | EXT4612 | NaN | Professional | Website | High (75-100%) | NaN | NaN | NaN | Email Activity | No | No | No | No | No | NaN |
| freq | 1 | NaN | 2616 | 2542 | 2264 | NaN | NaN | NaN | 2278 | 4115 | 4379 | 4085 | 3907 | 4519 | NaN |
| mean | NaN | 46.201214 | NaN | NaN | NaN | 3.566782 | 724.011275 | 3.026126 | NaN | NaN | NaN | NaN | NaN | NaN | 0.298569 |
| std | NaN | 13.161454 | NaN | NaN | NaN | 2.829134 | 743.828683 | 1.968125 | NaN | NaN | NaN | NaN | NaN | NaN | 0.457680 |
| min | NaN | 18.000000 | NaN | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 25% | NaN | 36.000000 | NaN | NaN | NaN | 2.000000 | 148.750000 | 2.077750 | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 50% | NaN | 51.000000 | NaN | NaN | NaN | 3.000000 | 376.000000 | 2.792000 | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 |
| 75% | NaN | 57.000000 | NaN | NaN | NaN | 5.000000 | 1336.750000 | 3.756250 | NaN | NaN | NaN | NaN | NaN | NaN | 1.000000 |
| max | NaN | 63.000000 | NaN | NaN | NaN | 30.000000 | 2537.000000 | 18.434000 | NaN | NaN | NaN | NaN | NaN | NaN | 1.000000 |
# Distribution of target
sns.countplot(data=df, x='status')
plt.title('Target Distribution')
plt.show()
# Numeric distributions
num_cols = ['age','website_visits','time_spent_on_website','page_views_per_visit']
df[num_cols].hist(figsize=(10,6))
plt.show()
# Conversion rate by categorical vars
cat_cols = ['current_occupation','first_interaction','last_activity','profile_completed']
for col in cat_cols:
    rate = df.groupby(col)['status'].mean().sort_values(ascending=False)
    rate.plot(kind='bar', figsize=(6,3), title=f'Conversion Rate by {col}')
    plt.show()
NUM_COLS = ['age','website_visits','time_spent_on_website','page_views_per_visit']
BIN_COLS = ['print_media_type1','print_media_type2','digital_media','educational_channels','referral']
CAT_COLS = ['current_occupation','first_interaction','last_activity']
ORD_COLS = ['profile_completed']
TARGET = 'status'
ordinal_map = [['Low (0-50%)','Medium (50-75%)','High (75-100%)']]
num_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
bin_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('enc', OrdinalEncoder(categories=[['No','Yes']]*len(BIN_COLS)))
])
cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])
ord_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(categories=ordinal_map))
])
preprocess = ColumnTransformer([
    ('num', num_pipe, NUM_COLS),
    ('bin', bin_pipe, BIN_COLS),
    ('cat', cat_pipe, CAT_COLS),
    ('ord', ord_pipe, ORD_COLS)
])
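As a quick sanity check of this pattern, the transformer can be fit on a toy frame and its output shape inspected. This sketch is illustrative only: it uses a synthetic two-column frame mirroring one numeric and one nominal column of the real schema, not the full `preprocess` object.

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy frame standing in for the real data (includes a missing value to exercise imputation).
toy = pd.DataFrame({
    'age': [25.0, 40.0, None],
    'first_interaction': ['Website', 'Mobile App', 'Website'],
})
toy_pre = ColumnTransformer([
    ('num', Pipeline([('imp', SimpleImputer(strategy='median')),
                      ('scaler', StandardScaler())]), ['age']),
    ('cat', Pipeline([('imp', SimpleImputer(strategy='most_frequent')),
                      ('ohe', OneHotEncoder(handle_unknown='ignore'))]), ['first_interaction']),
])
out = toy_pre.fit_transform(toy)
print(out.shape)  # (3, 3): one scaled numeric column plus two one-hot columns
```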
X = df[NUM_COLS + BIN_COLS + CAT_COLS + ORD_COLS]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
rf = Pipeline([
    ('pre', preprocess),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42))
])
gb = Pipeline([
    ('pre', preprocess),
    ('clf', GradientBoostingClassifier(random_state=42))
])
models = {'RandomForest': rf, 'GradientBoosting': gb}
cv_scores = {}
for name, model in models.items():
    scores = []
    for tr, va in cv.split(X_train, y_train):
        model.fit(X_train.iloc[tr], y_train.iloc[tr])
        proba = model.predict_proba(X_train.iloc[va])[:,1]
        scores.append(roc_auc_score(y_train.iloc[va], proba))
    cv_scores[name] = np.mean(scores)
cv_scores
{'RandomForest': np.float64(0.9144631480533882),
'GradientBoosting': np.float64(0.9228610159849652)}
best_model_name = max(cv_scores, key=cv_scores.get)
best_model_name
'GradientBoosting'
if best_model_name == 'RandomForest':
    base = rf
    params = {
        'clf__n_estimators': [150, 200, 300],
        'clf__max_depth': [None, 6, 10],
        'clf__min_samples_split': [2, 5],
        'clf__min_samples_leaf': [1, 2]
    }
else:
    base = gb
    params = {
        'clf__n_estimators': [100, 150, 200],
        'clf__learning_rate': [0.03, 0.05, 0.1],
        'clf__max_depth': [2, 3],
        'clf__subsample': [0.8, 1.0]
    }
rs = RandomizedSearchCV(base, params, n_iter=10, scoring='roc_auc', cv=cv, n_jobs=-1, random_state=42)
rs.fit(X_train, y_train)
rs.best_params_
{'clf__subsample': 1.0,
'clf__n_estimators': 150,
'clf__max_depth': 3,
'clf__learning_rate': 0.05}
best_model = rs.best_estimator_
proba = best_model.predict_proba(X_test)[:,1]
pred = (proba >= 0.5).astype(int)
eval_metrics = {
    'roc_auc': roc_auc_score(y_test, proba),
    'pr_auc': average_precision_score(y_test, proba),
    'f1': f1_score(y_test, pred),
    'recall': recall_score(y_test, pred),
    'precision': precision_score(y_test, pred),
    'confusion_matrix': confusion_matrix(y_test, pred)
}
eval_metrics
{'roc_auc': np.float64(0.9278862307640614),
'pr_auc': np.float64(0.8383560457393716),
'f1': 0.7680890538033395,
'recall': 0.75,
'precision': 0.7870722433460076,
'confusion_matrix': array([[591, 56],
[ 69, 207]])}
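The 0.5 cutoff used above is only the default; when recall on converters matters more than precision, the threshold can be tuned on validation data instead. A minimal self-contained sketch using `precision_recall_curve`, with synthetic labels and scores standing in for `y_test` and the model's predicted probabilities:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# Synthetic stand-ins for validation labels and model probabilities (illustrative only).
rng = np.random.default_rng(42)
y_true = rng.integers(0, 2, size=500)
scores = np.clip(0.4 * y_true + 0.6 * rng.random(500), 0.0, 1.0)

# precision/recall have one more entry than thresholds; drop the last point
# (recall=0 sentinel) so each F1 value aligns with a concrete threshold.
prec, rec, thr = precision_recall_curve(y_true, scores)
f1 = 2 * prec[:-1] * rec[:-1] / np.maximum(prec[:-1] + rec[:-1], 1e-12)
best_threshold = thr[np.argmax(f1)]
print(round(float(best_threshold), 3))
```

The chosen threshold would then replace the hard-coded 0.5 when converting probabilities to class predictions.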
joblib.dump(best_model, 'best_model.joblib')
['best_model.joblib']
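To confirm a saved artifact round-trips correctly, the dump/load cycle can be checked by comparing predictions before and after reload. This is a self-contained sketch using a stand-in `LogisticRegression` on toy data rather than the tuned pipeline:

```python
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression

# Stand-in estimator; in the notebook, best_model would be dumped/loaded instead.
X_demo = np.arange(20, dtype=float).reshape(-1, 1)
y_demo = (X_demo.ravel() > 10).astype(int)
demo = LogisticRegression().fit(X_demo, y_demo)

joblib.dump(demo, 'demo_model.joblib')
reloaded = joblib.load('demo_model.joblib')
same = bool((reloaded.predict(X_demo) == demo.predict(X_demo)).all())
print(same)  # True
```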
The insights below are based on the EDA findings and final model performance.

Conversion rates by last_activity: website and email interactions reflect active interest, while phone interactions lag behind.

Correlation with target: website_visits and page_views_per_visit show minimal individual correlation with conversion, while longer time spent on the website suggests the research behavior typical of high-intent users.

Conversion rates by occupation: professionals show higher readiness and ability to pay, making them a key target group.

The final tuned model (a Gradient Boosting Classifier) achieved ROC-AUC ≈ 0.928, PR-AUC ≈ 0.838, F1 ≈ 0.768, recall = 0.75, and precision ≈ 0.787 on the held-out test set, confirming that it reliably identifies leads with strong conversion likelihood.
Since profile completion has the highest impact, prompt leads to complete their profiles early in the funnel. Because website-origin leads convert roughly 4× better, prioritize the website channel for acquisition. Assign higher priority to leads with high time_spent_on_website: these are hot leads warranting an immediate call or WhatsApp follow-up.

Using model-generated conversion probabilities, leads can be segmented into action tiers:
| Tier | Probability Range | Action |
|---|---|---|
| Hot Leads | p ≥ 0.80 | Immediate sales call / WhatsApp follow-up |
| Warm Leads | 0.50 ≤ p < 0.80 | Email + webinar invites + remarketing |
| Cold Leads | p < 0.50 | Long-term nurture campaigns |
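The tier mapping in the table above can be expressed as a small helper; the thresholds come from the table, while the function name is illustrative:

```python
import numpy as np

def tier_leads(proba):
    """Map model conversion probabilities to action tiers (thresholds per the table)."""
    proba = np.asarray(proba, dtype=float)
    return np.where(proba >= 0.80, 'Hot',
                    np.where(proba >= 0.50, 'Warm', 'Cold'))

print(tier_leads([0.92, 0.61, 0.12]))  # ['Hot' 'Warm' 'Cold']
```

In practice this would be applied to `best_model.predict_proba(X)[:, 1]` for incoming leads.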
This maximizes sales efficiency and reduces customer acquisition cost (CAC).
✔ Profile completion, website entry, and deeper engagement consistently indicate high intent.
✔ Professionals and website-origin leads are the highest-value groups.
✔ The model effectively identifies warm/hot leads, enabling smarter sales operations.
✔ Targeted interventions based on these insights can significantly boost overall conversion rates.
➡ Hugging Face Backend Space URL:
https://huggingface.co/spaces/manoj112025/extraaLearnModel
➡ Hugging Face Frontend Space URL:
https://huggingface.co/spaces/manoj112025/StreamLitExtraLearnFrontendModel