Spaces:
Sleeping
Sleeping
File size: 12,307 Bytes
687eeb7 dc9f3db 687eeb7 dc9f3db 687eeb7 dc9f3db 687eeb7 dc9f3db 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 dc9f3db 687eeb7 dc9f3db 687eeb7 45f02e0 687eeb7 dc9f3db 687eeb7 45f02e0 687eeb7 45f02e0 687eeb7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 |
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import logging
import gradio as gr
# Root logger: DEBUG level, each record prefixed with timestamp and severity.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Built-in sample sessions, stored column-wise (position i in every list
# describes the same user). This frame is the initial value of `data`.
_sample_columns = {
    'User ID': [1, 2, 3, 4, 5],
    'Session Duration': [300, 450, 200, 600, 350],
    'Pages Visited': [5, 8, 3, 12, 7],
    'Ads Clicked': [2, 1, 0, 3, 2],
    'User Interests': ['technology', 'sports', 'technology', 'arts', 'sports'],
    'Engagement Score': [0.8, 0.5, 0.3, 0.9, 0.7],
    'Device Type': ['mobile', 'desktop', 'mobile', 'tablet', 'desktop'],
    'Time of Day': ['morning', 'afternoon', 'evening', 'morning', 'afternoon'],
    'Time Spent per Page': [30, 25, 45, 20, 50],
    'Click Through Rate': [0.1, 0.2, 0.05, 0.3, 0.15],
    'Conversion Rate': [0.05, 0.1, 0, 0.2, 0.1],
    'Frequency of Visits': [10, 20, 5, 15, 10],
    'Bounce Rate': [0.2, 0.1, 0.5, 0.05, 0.3],
}
data = pd.DataFrame(_sample_columns)
logging.info("Sample data prepared.")
# Schema every uploaded CSV must satisfy: column name -> expected Python type.
# 'User ID' is required here as well as the feature columns.
expected_columns = {
    'User ID': int,
    'Session Duration': int,
    'Pages Visited': int,
    'Ads Clicked': int,
    'User Interests': str,
    'Engagement Score': float,
    'Device Type': str,
    'Time of Day': str,
    'Time Spent per Page': int,
    'Click Through Rate': float,
    'Conversion Rate': float,
    'Frequency of Visits': int,
    'Bounce Rate': float
}


def _dtype_matches(actual_dtype, expected_type):
    """Return True when a column dtype is acceptable for ``expected_type``.

    The check is by dtype *family* rather than exact equality, because
    ``np.dtype(int)`` is platform dependent (it is not always int64) and
    pandas may store text as either ``object`` or a dedicated string dtype.
    """
    types = pd.api.types
    if expected_type is str:
        return types.is_object_dtype(actual_dtype) or types.is_string_dtype(actual_dtype)
    if expected_type is int:
        return types.is_integer_dtype(actual_dtype)
    if expected_type is float:
        # A float column whose values all happen to be whole numbers is
        # parsed as integers; that is still valid numeric input.
        return types.is_float_dtype(actual_dtype) or types.is_integer_dtype(actual_dtype)
    # Any other expected type falls back to the exact dtype comparison.
    return actual_dtype == np.dtype(expected_type)


def validate_data(user_data):
    """Check that a DataFrame has every expected column with a suitable dtype.

    Args:
        user_data: DataFrame to validate (extra columns are ignored).

    Returns:
        tuple[bool, str]: ``(True, "Data is valid.")`` on success, otherwise
        ``(False, reason)`` for the first problem found.
    """
    missing = [col for col in expected_columns if col not in user_data.columns]
    if missing:
        logging.error("Missing columns in the uploaded data.")
        logging.debug("Missing columns: %s", missing)
        return False, "Missing columns in the uploaded data."
    for col, dtype in expected_columns.items():
        actual = user_data[col].dtype
        if not _dtype_matches(actual, dtype):
            message = f"Incorrect data type for column {col}. Expected {dtype}, got {actual}."
            logging.error(message)
            return False, message
    logging.info("Data is valid.")
    return True, "Data is valid."
def load_user_data(file):
    """Load an uploaded CSV, validate it, and retrain the clustering pipeline.

    The module-level ``data`` and ``pipeline`` are replaced together, and
    only after the new data has been read, validated and fitted. If any
    step fails, the previously loaded data and fitted model stay in place.

    Args:
        file: Value delivered by the upload widget; handed to
            ``pd.read_csv`` unchanged. ``None`` is reported without reading
            (NOTE(review): presumably what the widget sends when the upload
            is cleared - confirm against the Gradio version in use).

    Returns:
        str: A status message for the UI - success text, the validation
        failure reason, or the text of the exception that occurred.
    """
    # Local import: `clone` is only needed here and sklearn is already a
    # dependency of this module.
    from sklearn.base import clone

    global data, pipeline
    if file is None:
        return "No file uploaded."
    try:
        user_data = pd.read_csv(file)
        is_valid, message = validate_data(user_data)
        if not is_valid:
            return message
        # Fit an unfitted copy so that a failure (e.g. too few rows for the
        # configured number of clusters) cannot leave the live pipeline
        # half-refitted or out of sync with `data`.
        retrained = clone(pipeline).fit(user_data)
    except Exception as e:
        logging.exception("Failed to load user data.")
        return str(e)
    # Success: swap data and model together.
    data = user_data
    pipeline = retrained
    return "Data uploaded, validated, and model retrained successfully. You can now make predictions by selecting the 'Cluster Prediction' tab above"
# Columns routed to each preprocessing branch. The numeric branch comes first,
# so its scaled values occupy the leading positions of the transformed matrix.
_numeric_columns = [
    'Session Duration', 'Pages Visited', 'Ads Clicked', 'Engagement Score',
    'Time Spent per Page', 'Click Through Rate', 'Conversion Rate',
    'Frequency of Visits', 'Bounce Rate',
]
_categorical_columns = ['User Interests', 'Device Type', 'Time of Day']

# Standardise numeric features and one-hot encode categorical ones;
# categories not seen during fitting are ignored rather than raising.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), _numeric_columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), _categorical_columns),
])
logging.info("Preprocessor setup complete.")

# Three clusters with a fixed seed.
kmeans = KMeans(n_clusters=3, random_state=42)
logging.info("KMeans clustering configured.")

# Preprocessing followed by clustering; the step names are looked up
# elsewhere in this module via `named_steps`.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', kmeans)])
logging.info("Pipeline created.")

# Train on the current module-level data at import time.
pipeline.fit(data)
def generate_insights(cluster_characteristics):
    """Turn a cluster's metric values into a string of recommendations.

    Args:
        cluster_characteristics: Mapping that provides 'Engagement Score',
            'Conversion Rate', 'Click Through Rate', 'Bounce Rate',
            'Frequency of Visits' and 'Time Spent per Page'.

    Returns:
        str: The advice sentences of every rule that fired, in rule order,
        joined by single spaces; an empty string when no rule applies.
    """
    # (predicate, advice) pairs, evaluated top to bottom. The thresholds are
    # illustrative examples rather than calibrated values.
    rules = (
        (lambda m: m['Engagement Score'] > 0.7 and m['Conversion Rate'] < 0.1,
         "High engagement but low conversion: Consider optimizing the checkout process or providing targeted offers."),
        (lambda m: m['Click Through Rate'] > 0.2,
         "High click-through rate: Users are interacting well with ads. Increase ad relevance to boost conversions."),
        (lambda m: m['Bounce Rate'] > 0.3,
         "High bounce rate: Review landing page design and content relevance to improve user retention."),
        (lambda m: m['Frequency of Visits'] > 15,
         "Frequent visits: Users are returning often, consider loyalty programs or personalized content to maintain engagement."),
        (lambda m: m['Time Spent per Page'] < 20,
         "Low time spent per page: Content may not be engaging or relevant enough. Consider content optimization."),
        (lambda m: m['Conversion Rate'] > 0.15,
         "High conversion rate: Effective ad targeting. Explore scaling up ad spend on similar user segments."),
    )
    return " ".join(
        advice for applies, advice in rules if applies(cluster_characteristics)
    )
def predict_cluster(session_duration, pages_visited, ads_clicked, engagement_score, user_interests, device_type, time_of_day, time_spent_per_page, click_through_rate, conversion_rate, frequency_of_visits, bounce_rate):
    """Assign one session to a cluster and describe that cluster.

    The session is run through the module-level ``pipeline``. The centroid
    of the predicted cluster is then mapped back to the original feature
    space (via each transformer's ``inverse_transform``) and passed to
    ``generate_insights``.

    Args:
        session_duration, pages_visited, ads_clicked, engagement_score,
        time_spent_per_page, click_through_rate, conversion_rate,
        frequency_of_visits, bounce_rate: Numeric feature values.
        user_interests, device_type, time_of_day: Categorical feature values.

    Returns:
        str: Three lines - predicted cluster, the cluster's decoded centroid
        as a dict, and the generated insights. If any argument is ``None``,
        a message naming the missing fields is returned instead.
    """
    logging.info("Starting cluster prediction.")
    # Insertion order matches the original input frame's column order.
    features = {
        'Session Duration': session_duration,
        'Pages Visited': pages_visited,
        'Ads Clicked': ads_clicked,
        'Engagement Score': engagement_score,
        'User Interests': user_interests,
        'Device Type': device_type,
        'Time of Day': time_of_day,
        'Time Spent per Page': time_spent_per_page,
        'Click Through Rate': click_through_rate,
        'Conversion Rate': conversion_rate,
        'Frequency of Visits': frequency_of_visits,
        'Bounce Rate': bounce_rate
    }
    # A None value would otherwise surface as an error from inside the
    # pipeline; report it in plain terms instead.
    missing = [name for name, value in features.items() if value is None]
    if missing:
        message = "Please provide a value for: " + ", ".join(missing)
        logging.warning(message)
        return message

    input_df = pd.DataFrame({name: [value] for name, value in features.items()})
    logging.debug(f"Input DataFrame: {input_df}")
    cluster = pipeline.predict(input_df)[0]
    logging.info(f"Predicted cluster: {cluster}")

    centroid = pipeline.named_steps['cluster'].cluster_centers_[cluster]
    transformers = pipeline.named_steps['preprocessor'].named_transformers_

    # Decode features for insights. The preprocessor emits the numeric block
    # first, so the centroid splits at the number of numeric features.
    num_features = ['Session Duration', 'Pages Visited', 'Ads Clicked', 'Engagement Score', 'Time Spent per Page', 'Click Through Rate', 'Conversion Rate', 'Frequency of Visits', 'Bounce Rate']
    cat_features = ['User Interests', 'Device Type', 'Time of Day']
    split = len(num_features)
    original_num_values = transformers['num'].inverse_transform([centroid[:split]])[0]
    original_cat_values = transformers['cat'].inverse_transform([centroid[split:]])[0]

    # .tolist() converts NumPy scalars to plain Python values so the dict
    # renders as e.g. 0.8 rather than np.float64(0.8) in the returned text.
    cluster_characteristics = dict(zip(num_features, original_num_values.tolist()))
    cluster_characteristics.update(zip(cat_features, original_cat_values.tolist()))

    # Generate actionable insights
    insights = generate_insights(cluster_characteristics)
    logging.info("Cluster prediction completed.")
    return f"Predicted Cluster: {cluster}\nCharacteristics: {cluster_characteristics}\nActionable Insights: {insights}"
def ad_performance_analytics():
    """Summarise average ad metrics of the module-level ``data`` frame.

    Returns:
        str: Three lines giving the mean 'Click Through Rate',
        'Conversion Rate' and 'Bounce Rate', each formatted as a percentage
        with two decimals.
    """
    logging.info("Calculating ad performance analytics.")
    ctr_mean, conversion_mean, bounce_mean = (
        data[column].mean()
        for column in ('Click Through Rate', 'Conversion Rate', 'Bounce Rate')
    )
    logging.debug(f"Average CTR: {ctr_mean}, Average Conversion Rate: {conversion_mean}, Average Bounce Rate: {bounce_mean}")
    # One report line per metric, joined with newlines (no trailing newline).
    report_lines = [
        f"Average Click Through Rate: {ctr_mean:.2%}",
        f"Average Conversion Rate: {conversion_mean:.2%}",
        f"Average Bounce Rate: {bounce_mean:.2%}",
    ]
    logging.info("Ad performance analytics calculation completed.")
    return "\n".join(report_lines)
# Gradio UI: three tabs (CSV upload, cluster prediction, ad analytics).
# NOTE(review): the original indentation of this section was lost; the nesting
# below (in particular which widgets sit inside gr.Row()) is a best-effort
# reconstruction and should be confirmed against the intended layout.
with gr.Blocks() as demo:
    # Tab 1: upload a CSV; load_user_data runs whenever the file input changes
    # and its returned status string is written to the textbox.
    with gr.Tab("Upload Data"):
        gr.Markdown("""
**Upload your data file in CSV format. Ensure it contains the following columns with appropriate data types:**
- User ID (int)
- Session Duration (int)
- Pages Visited (int)
- Ads Clicked (int)
- User Interests (str)
- Engagement Score (float)
- Device Type (str)
- Time of Day (str)
- Time Spent per Page (int)
- Click Through Rate (float)
- Conversion Rate (float)
- Frequency of Visits (int)
- Bounce Rate (float)
**Note:** You can upload your own data for analysis, or continue using the existing sample data for predictions by selecting the **'Cluster Prediction'** tab above.
""")
        file_input = gr.File(label="Upload your CSV data file")
        upload_message = gr.Textbox()
        file_input.change(load_user_data, inputs=file_input, outputs=upload_message)
    # Tab 2: one input widget per predict_cluster parameter, pre-filled with
    # default values.
    with gr.Tab("Cluster Prediction"):
        with gr.Row():
            gr.Markdown("**This form allows you to input user session data to predict which cluster the user belongs to and provides actionable insights based on their behavior.**")
            session_duration = gr.Number(label="Session Duration", value=300) # Set initial value
            pages_visited = gr.Number(label="Pages Visited", value=5) # Set initial value
            ads_clicked = gr.Number(label="Ads Clicked", value=2) # Set initial value
            engagement_score = gr.Slider(0, 1, label="Engagement Score", value=0.5) # Set initial value
            user_interests = gr.Dropdown(['technology', 'sports', 'arts'], label="User Interests", value='technology') # Set initial value
            device_type = gr.Radio(['mobile', 'desktop', 'tablet'], label="Device Type", value='mobile') # Set initial value
            time_of_day = gr.Radio(['morning', 'afternoon', 'evening'], label="Time of Day", value='morning') # Set initial value
            time_spent_per_page = gr.Number(label="Time Spent per Page", value=30) # Set initial value
            click_through_rate = gr.Slider(0, 1, step=0.01, label="Click Through Rate", value=0.1) # Set initial value
            conversion_rate = gr.Slider(0, 1, step=0.01, label="Conversion Rate", value=0.05) # Set initial value
            frequency_of_visits = gr.Number(label="Frequency of Visits", value=10) # Set initial value
            bounce_rate = gr.Slider(0, 1, step=0.01, label="Bounce Rate", value=0.2) # Set initial value
        predict_button = gr.Button("Predict")
        output_textbox = gr.Textbox(label="Prediction Output", lines=4)
        # The inputs list must stay in the same order as predict_cluster's
        # positional parameters.
        predict_button.click(
            predict_cluster,
            inputs=[
                session_duration, pages_visited, ads_clicked, engagement_score, user_interests, device_type,
                time_of_day, time_spent_per_page, click_through_rate, conversion_rate, frequency_of_visits, bounce_rate
            ],
            outputs=output_textbox
        )
        logging.info("Gradio predict button configured.")
    # Tab 3: button that shows the report returned by ad_performance_analytics
    # (it takes no inputs).
    with gr.Tab("Ad Performance Analytics"):
        gr.Markdown("""
**This form provides a summary of key performance metrics for ads.**
- **Average Click-Through Rate (CTR):** Measures the percentage of ad views that result in clicks. Higher values indicate more effective ad engagement.
- **Average Conversion Rate:** Indicates the percentage of clicks that convert into actions, such as purchases or sign-ups. This metric helps assess the effectiveness of ad targeting and the overall conversion potential.
- **Average Bounce Rate:** Reflects the percentage of single-page visits. Lower bounce rates suggest that the landing pages are relevant to the visitors' interests.
Understanding these metrics can help optimize ad strategies and improve overall campaign performance.
""")
        analytics_button = gr.Button("Analyze Ad Performance")
        analytics_output = gr.Textbox(label="Analytics Output", lines=3)
        analytics_button.click(
            ad_performance_analytics,
            outputs=analytics_output
        )
        logging.info("Gradio analytics button configured.")
# Start the web app at import/run time.
# NOTE(review): launch() typically blocks when run as a script, in which case
# the log line below is only emitted after the server stops - confirm.
demo.launch()
logging.info("Gradio interface launched.")
|