# skyvera's picture
# Upload 3 files
# dc9f3db verified
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import logging
import gradio as gr
# Configure the root logger once at import time: DEBUG and above, with a
# timestamp and level in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Initial hardcoded sample data. Each key is a column and each list holds
# that column's values, so position i across all lists describes the
# session of the i-th 'User ID'.
data = pd.DataFrame(
    data={
        'User ID': [1, 2, 3, 4, 5],
        'Session Duration': [300, 450, 200, 600, 350],
        'Pages Visited': [5, 8, 3, 12, 7],
        'Ads Clicked': [2, 1, 0, 3, 2],
        'User Interests': ['technology', 'sports', 'technology', 'arts', 'sports'],
        'Engagement Score': [0.8, 0.5, 0.3, 0.9, 0.7],
        'Device Type': ['mobile', 'desktop', 'mobile', 'tablet', 'desktop'],
        'Time of Day': ['morning', 'afternoon', 'evening', 'morning', 'afternoon'],
        'Time Spent per Page': [30, 25, 45, 20, 50],
        'Click Through Rate': [0.1, 0.2, 0.05, 0.3, 0.15],
        'Conversion Rate': [0.05, 0.1, 0, 0.2, 0.1],
        'Frequency of Visits': [10, 20, 5, 15, 10],
        'Bounce Rate': [0.2, 0.1, 0.5, 0.05, 0.3],
    },
)
logging.info("Sample data prepared.")
# Schema every uploaded CSV must satisfy: column name -> expected Python type.
# 'User ID' is required here even though it is not among the columns the
# preprocessor selects for clustering.
expected_columns = {
    'User ID': int,
    'Session Duration': int,
    'Pages Visited': int,
    'Ads Clicked': int,
    'User Interests': str,
    'Engagement Score': float,
    'Device Type': str,
    'Time of Day': str,
    'Time Spent per Page': int,
    'Click Through Rate': float,
    'Conversion Rate': float,
    'Frequency of Visits': int,
    'Bounce Rate': float,
}


def _dtype_matches(actual, expected):
    """Return True when pandas dtype *actual* is acceptable for *expected*."""
    types = pd.api.types
    if expected is str:
        # Text is stored as ``object`` or as a dedicated string dtype,
        # depending on the pandas version and reader options.
        return types.is_object_dtype(actual) or types.is_string_dtype(actual)
    if expected is int:
        # Any integer width is fine; np.dtype(int) itself is platform-dependent.
        return types.is_integer_dtype(actual)
    if expected is float:
        # A float column whose values all happen to be whole numbers is
        # parsed as integer by read_csv; that is still valid numeric data.
        return types.is_float_dtype(actual) or types.is_integer_dtype(actual)
    return actual == np.dtype(expected)


def validate_data(user_data):
    """Check that *user_data* has every expected column with a usable dtype.

    Integer columns must hold an integer dtype of any width, float columns
    may be float or integer, and string columns may be ``object`` or a
    pandas string dtype. Extra columns are ignored.

    Args:
        user_data: DataFrame to check against ``expected_columns``.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is ``"Data is valid."``
        on success, otherwise a description of the first problem found.
    """
    missing = [col for col in expected_columns if col not in user_data.columns]
    if missing:
        message = f"Missing columns in the uploaded data: {', '.join(missing)}."
        logging.error(message)
        return False, message

    for col, dtype in expected_columns.items():
        actual = user_data[col].dtype
        if _dtype_matches(actual, dtype):
            continue
        message = f"Incorrect data type for column {col}. Expected {dtype}, got {actual}."
        logging.error(message)
        return False, message

    logging.info("Data is valid.")
    return True, "Data is valid."
def load_user_data(file):
    """Load an uploaded CSV, validate it, and retrain the clustering pipeline.

    The new data is fitted on a fresh copy of the pipeline. The module-level
    ``data`` and ``pipeline`` are only rebound after that fit succeeds, so a
    rejected or unfittable upload leaves the previous data and model in use.

    Args:
        file: Value received from the upload widget; passed unchanged to
            ``pd.read_csv``. ``None`` is treated as "nothing uploaded"
            (presumably what the widget sends when the file is cleared --
            confirm against the Gradio version in use).

    Returns:
        A status message for display: the success text, the validation
        failure message, or the text of any exception that was raised.
    """
    global data, pipeline

    if file is None:
        return "No file uploaded."

    try:
        user_data = pd.read_csv(file)
        is_valid, message = validate_data(user_data)
        if not is_valid:
            return message

        # Local import: only needed on this path, and sklearn is already a
        # dependency of this file.
        from sklearn.base import clone

        # Fit an unfitted copy rather than the live pipeline. Refitting in
        # place can fail halfway (e.g. fewer rows than clusters), leaving a
        # refit preprocessor paired with the old cluster centres.
        candidate = clone(pipeline)
        candidate.fit(user_data)
    except Exception as e:
        logging.exception("Failed to load uploaded data.")
        return str(e)

    # Swap in the new data and model together, only after success. The
    # module-level ``preprocessor`` / ``kmeans`` names keep pointing at the
    # original estimators; predictions go through ``pipeline``.
    data, pipeline = user_data, candidate
    return "Data uploaded, validated, and model retrained successfully. You can now make predictions by selecting the 'Cluster Prediction' tab above"
# Updated preprocessing: standardise the numeric session metrics and one-hot
# encode the categorical ones. Categories not seen during fitting are ignored
# at prediction time instead of raising. Columns not named here (such as
# 'User ID') are not handed to either transformer.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            [
                'Session Duration',
                'Pages Visited',
                'Ads Clicked',
                'Engagement Score',
                'Time Spent per Page',
                'Click Through Rate',
                'Conversion Rate',
                'Frequency of Visits',
                'Bounce Rate',
            ],
        ),
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore'),
            ['User Interests', 'Device Type', 'Time of Day'],
        ),
    ],
)
logging.info("Preprocessor setup complete.")

# Clustering: three clusters, with a fixed seed.
kmeans = KMeans(random_state=42, n_clusters=3)
logging.info("KMeans clustering configured.")

# Define the pipeline: preprocessing first, clustering on its output.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('cluster', kmeans),
    ],
)
logging.info("Pipeline created.")

# Fit the pipeline to the sample data so predictions work before any upload.
pipeline.fit(data)
def generate_insights(cluster_characteristics):
    """Build a recommendation string from a mapping of metric name to value.

    Each rule compares one or two metrics against an example (hypothetical)
    threshold. The advice of every rule that fires is joined with single
    spaces, in rule order; the result is ``""`` when no rule fires.

    Args:
        cluster_characteristics: Mapping that provides 'Engagement Score',
            'Conversion Rate', 'Click Through Rate', 'Bounce Rate',
            'Frequency of Visits' and 'Time Spent per Page'.

    Returns:
        The space-joined advice sentences.
    """
    metrics = cluster_characteristics
    # (trigger, advice) pairs, evaluated top to bottom. The triggers are
    # callables so each metric is only looked up when its rule is checked.
    rules = (
        (
            lambda: metrics['Engagement Score'] > 0.7 and metrics['Conversion Rate'] < 0.1,
            "High engagement but low conversion: Consider optimizing the checkout process or providing targeted offers.",
        ),
        (
            lambda: metrics['Click Through Rate'] > 0.2,
            "High click-through rate: Users are interacting well with ads. Increase ad relevance to boost conversions.",
        ),
        (
            lambda: metrics['Bounce Rate'] > 0.3,
            "High bounce rate: Review landing page design and content relevance to improve user retention.",
        ),
        (
            lambda: metrics['Frequency of Visits'] > 15,
            "Frequent visits: Users are returning often, consider loyalty programs or personalized content to maintain engagement.",
        ),
        (
            lambda: metrics['Time Spent per Page'] < 20,
            "Low time spent per page: Content may not be engaging or relevant enough. Consider content optimization.",
        ),
        (
            lambda: metrics['Conversion Rate'] > 0.15,
            "High conversion rate: Effective ad targeting. Explore scaling up ad spend on similar user segments.",
        ),
    )
    return " ".join(advice for fires, advice in rules if fires())
def predict_cluster(session_duration, pages_visited, ads_clicked, engagement_score, user_interests, device_type, time_of_day, time_spent_per_page, click_through_rate, conversion_rate, frequency_of_visits, bounce_rate):
    """Assign one session to a cluster and describe that cluster.

    The session is run through the fitted module-level ``pipeline``. The
    centre of the predicted cluster is then mapped back to original units
    (numeric metrics) and category labels, and passed to
    ``generate_insights``.

    Args:
        session_duration, pages_visited, ads_clicked, engagement_score,
        time_spent_per_page, click_through_rate, conversion_rate,
        frequency_of_visits, bounce_rate: Numeric session metrics.
        user_interests, device_type, time_of_day: Categorical session
            attributes.

    Returns:
        A three-line string with the predicted cluster, the cluster's
        characteristics and the actionable insights. If any argument is
        ``None``, a message naming the missing inputs is returned instead.
    """
    logging.info("Starting cluster prediction.")
    features = {
        'Session Duration': session_duration,
        'Pages Visited': pages_visited,
        'Ads Clicked': ads_clicked,
        'Engagement Score': engagement_score,
        'User Interests': user_interests,
        'Device Type': device_type,
        'Time of Day': time_of_day,
        'Time Spent per Page': time_spent_per_page,
        'Click Through Rate': click_through_rate,
        'Conversion Rate': conversion_rate,
        'Frequency of Visits': frequency_of_visits,
        'Bounce Rate': bounce_rate,
    }

    # A None here would otherwise surface as an exception inside the
    # pipeline (presumably what a cleared form field sends -- confirm).
    blank = [name for name, value in features.items() if value is None]
    if blank:
        message = f"Missing input values: {', '.join(blank)}."
        logging.error(message)
        return message

    input_df = pd.DataFrame({name: [value] for name, value in features.items()})
    logging.debug(f"Input DataFrame: {input_df}")

    cluster = pipeline.predict(input_df)[0]
    logging.info(f"Predicted cluster: {cluster}")

    fitted_preprocessor = pipeline.named_steps['preprocessor']
    centroid = pipeline.named_steps['cluster'].cluster_centers_[cluster]

    # Decode features for insights. The centroid holds the scaled numeric
    # block first and the one-hot block after it, so it splits at the number
    # of numeric features.
    num_features = ['Session Duration', 'Pages Visited', 'Ads Clicked', 'Engagement Score', 'Time Spent per Page', 'Click Through Rate', 'Conversion Rate', 'Frequency of Visits', 'Bounce Rate']
    cat_features = ['User Interests', 'Device Type', 'Time of Day']
    split = len(num_features)
    scaler = fitted_preprocessor.named_transformers_['num']
    encoder = fitted_preprocessor.named_transformers_['cat']
    original_num_values = scaler.inverse_transform([centroid[:split]])[0]
    original_cat_values = encoder.inverse_transform([centroid[split:]])[0]

    # Convert NumPy scalars to plain Python values so the text output does
    # not depend on NumPy's scalar repr (``np.float64(...)`` in NumPy 2).
    cluster_characteristics = {
        name: float(value) for name, value in zip(num_features, original_num_values)
    }
    cluster_characteristics.update(
        (name, value if value is None else str(value))
        for name, value in zip(cat_features, original_cat_values)
    )

    # Generate actionable insights from the unrounded values.
    insights = generate_insights(cluster_characteristics)

    # Round only for display, to hide floating-point noise left over from
    # un-scaling.
    displayed = {
        name: round(value, 4) if isinstance(value, float) else value
        for name, value in cluster_characteristics.items()
    }
    logging.info("Cluster prediction completed.")
    return f"Predicted Cluster: {cluster}\nCharacteristics: {displayed}\nActionable Insights: {insights}"
def ad_performance_analytics():
    """Report the mean CTR, conversion rate and bounce rate of ``data``.

    Reads the module-level ``data`` frame as it is at call time.

    Returns:
        Three newline-separated lines, one per metric, each formatted as a
        percentage with two decimals.
    """
    logging.info("Calculating ad performance analytics.")
    mean_ctr, mean_conversion, mean_bounce = (
        data[column].mean()
        for column in ('Click Through Rate', 'Conversion Rate', 'Bounce Rate')
    )
    logging.debug(f"Average CTR: {mean_ctr}, Average Conversion Rate: {mean_conversion}, Average Bounce Rate: {mean_bounce}")
    # Prepare the analytics report
    report = "\n".join((
        f"Average Click Through Rate: {mean_ctr:.2%}",
        f"Average Conversion Rate: {mean_conversion:.2%}",
        f"Average Bounce Rate: {mean_bounce:.2%}",
    ))
    logging.info("Ad performance analytics calculation completed.")
    return report
# Build the three-tab Gradio interface ("Upload Data", "Cluster Prediction",
# "Ad Performance Analytics") and launch it.
# NOTE(review): this section has lost its indentation. The nesting under each
# `with` statement must be restored before it can run; in particular, confirm
# which components belong inside `gr.Row()` below, since that affects layout.
with gr.Blocks() as demo:
# Tab 1: upload a CSV that replaces the sample data and retrains the model.
with gr.Tab("Upload Data"):
gr.Markdown("""
**Upload your data file in CSV format. Ensure it contains the following columns with appropriate data types:**
- User ID (int)
- Session Duration (int)
- Pages Visited (int)
- Ads Clicked (int)
- User Interests (str)
- Engagement Score (float)
- Device Type (str)
- Time of Day (str)
- Time Spent per Page (int)
- Click Through Rate (float)
- Conversion Rate (float)
- Frequency of Visits (int)
- Bounce Rate (float)
**Note:** You can upload your own data for analysis, or continue using the existing sample data for predictions by selecting the **'Cluster Prediction'** tab above.
""")
file_input = gr.File(label="Upload your CSV data file")
# Shows the status string returned by load_user_data.
upload_message = gr.Textbox()
# Any change of the file widget triggers validation and retraining.
file_input.change(load_user_data, inputs=file_input, outputs=upload_message)
# Tab 2: form for one session; the initial values equal the first row of the
# sample data except Engagement Score, which starts at 0.5.
with gr.Tab("Cluster Prediction"):
with gr.Row():
gr.Markdown("**This form allows you to input user session data to predict which cluster the user belongs to and provides actionable insights based on their behavior.**")
session_duration = gr.Number(label="Session Duration", value=300) # Set initial value
pages_visited = gr.Number(label="Pages Visited", value=5) # Set initial value
ads_clicked = gr.Number(label="Ads Clicked", value=2) # Set initial value
engagement_score = gr.Slider(0, 1, label="Engagement Score", value=0.5) # Set initial value
# The category choices below are the values that occur in the sample data.
# NOTE(review): they stay fixed after an upload, so categories that only
# exist in uploaded data cannot be selected here.
user_interests = gr.Dropdown(['technology', 'sports', 'arts'], label="User Interests", value='technology') # Set initial value
device_type = gr.Radio(['mobile', 'desktop', 'tablet'], label="Device Type", value='mobile') # Set initial value
time_of_day = gr.Radio(['morning', 'afternoon', 'evening'], label="Time of Day", value='morning') # Set initial value
time_spent_per_page = gr.Number(label="Time Spent per Page", value=30) # Set initial value
click_through_rate = gr.Slider(0, 1, step=0.01, label="Click Through Rate", value=0.1) # Set initial value
conversion_rate = gr.Slider(0, 1, step=0.01, label="Conversion Rate", value=0.05) # Set initial value
frequency_of_visits = gr.Number(label="Frequency of Visits", value=10) # Set initial value
bounce_rate = gr.Slider(0, 1, step=0.01, label="Bounce Rate", value=0.2) # Set initial value
predict_button = gr.Button("Predict")
output_textbox = gr.Textbox(label="Prediction Output", lines=4)
# The inputs list is in the same order as predict_cluster's parameters;
# keep the two in sync when adding or reordering fields.
predict_button.click(
predict_cluster,
inputs=[
session_duration, pages_visited, ads_clicked, engagement_score, user_interests, device_type,
time_of_day, time_spent_per_page, click_through_rate, conversion_rate, frequency_of_visits, bounce_rate
],
outputs=output_textbox
)
logging.info("Gradio predict button configured.")
# Tab 3: summary metrics computed from whatever `data` currently holds.
with gr.Tab("Ad Performance Analytics"):
gr.Markdown("""
**This form provides a summary of key performance metrics for ads.**
- **Average Click-Through Rate (CTR):** Measures the percentage of ad views that result in clicks. Higher values indicate more effective ad engagement.
- **Average Conversion Rate:** Indicates the percentage of clicks that convert into actions, such as purchases or sign-ups. This metric helps assess the effectiveness of ad targeting and the overall conversion potential.
- **Average Bounce Rate:** Reflects the percentage of single-page visits. Lower bounce rates suggest that the landing pages are relevant to the visitors' interests.
Understanding these metrics can help optimize ad strategies and improve overall campaign performance.
""")
analytics_button = gr.Button("Analyze Ad Performance")
analytics_output = gr.Textbox(label="Analytics Output", lines=3)
# No inputs are wired: ad_performance_analytics takes no arguments.
analytics_button.click(
ad_performance_analytics,
outputs=analytics_output
)
logging.info("Gradio analytics button configured.")
# NOTE(review): launch() presumably blocks when this file is run as a script,
# in which case the log line after it only appears once the server stops --
# confirm.
demo.launch()
logging.info("Gradio interface launched.")