Spaces:

ghostai1
/

internalRAGCX

Sleeping

App Files Files Community

ghostai1 committed on May 1, 2025

Commit

6ff6074

verified ·

1 Parent(s): 50a6c2d

Create app.py

Browse files

Files changed (1) hide show

app.py +264 -0

app.py ADDED Viewed

	@@ -0,0 +1,264 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import io
+import os
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+from reportlab.lib.utils import ImageReader
# Path of the call center logs CSV that analyze_data() loads.
# Defaults to "call_center_logs.csv" (a file uploaded next to app.py in the
# Space); set the CALL_CENTER_LOGS_CSV environment variable to point the app
# at a different file without editing the code.
CSV_FILE_PATH = os.environ.get("CALL_CENTER_LOGS_CSV", "call_center_logs.csv")
# Data cleanup function
def clean_data(df):
    """Filter unusable rows out of the call log and write the result to CSV.

    The filters run in this order, and each one is counted in the returned
    statistics:

    1. rows with a missing (or, for the two numeric columns, non-numeric)
       value in ``query``, ``resolution``, ``duration_minutes`` or
       ``satisfaction_score``;
    2. rows whose ``call_id`` repeats an earlier row;
    3. rows whose ``query`` or ``resolution`` is shorter than 5 characters;
    4. rows whose ``query`` contains two or more consecutive ``!``/``?``
       characters, or the word ``Invalid`` or ``N/A`` (case-insensitive);
    5. rows whose ``timestamp`` cannot be parsed by ``pd.to_datetime``.

    Missing ``language`` values are filled with ``'en'``.

    Args:
        df: DataFrame containing at least the columns ``call_id``, ``query``,
            ``resolution``, ``duration_minutes``, ``satisfaction_score``,
            ``timestamp`` and ``language``. It is not modified.

    Returns:
        A tuple ``(cleaned_df, cleanup_details, cleaned_path)`` where
        ``cleanup_details`` maps statistic names (``original``,
        ``nulls_removed``, ``duplicates_removed``, ``short_removed``,
        ``malformed_removed``, ``invalid_timestamps``, ``cleaned``,
        ``removed``) to integer counts and ``cleaned_path`` is the file the
        cleaned frame was written to.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    original_count = len(df)
    cleanup_details = {
        'original': original_count,
        'nulls_removed': 0,
        'duplicates_removed': 0,
        'short_removed': 0,
        'malformed_removed': 0,
        'invalid_timestamps': 0
    }

    # Work on a private copy: the caller's frame stays untouched and the
    # column assignments below never hit a filtered view
    # (SettingWithCopyWarning).
    df = df.copy()

    # Standardize language (fill missing with 'en')
    df['language'] = df['language'].fillna('en')

    # Coerce the numeric columns BEFORE the null filter so that values that
    # are present but not numeric (e.g. "abc") turn into NaN and are dropped
    # with the other incomplete rows instead of surviving as NaN.
    for numeric_column in ('duration_minutes', 'satisfaction_score'):
        df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

    # Remove nulls in critical columns
    critical_columns = ['query', 'resolution', 'duration_minutes', 'satisfaction_score']
    null_rows = df[critical_columns].isna().any(axis=1)
    cleanup_details['nulls_removed'] = int(null_rows.sum())
    df = df[~null_rows]

    # Remove duplicates based on call_id (the first occurrence is kept)
    duplicate_rows = df['call_id'].duplicated()
    cleanup_details['duplicates_removed'] = int(duplicate_rows.sum())
    df = df[~duplicate_rows]

    # Remove short queries / resolutions. astype(str) keeps the .str accessor
    # usable even when a text column was parsed as numbers.
    short_rows = (
        (df['query'].astype(str).str.len() < 5)
        | (df['resolution'].astype(str).str.len() < 5)
    )
    cleanup_details['short_removed'] = int(short_rows.sum())
    df = df[~short_rows]

    # Remove malformed queries. The group is non-capturing so pandas does not
    # warn about match groups in str.contains; the matched text is unchanged.
    malformed_rows = df['query'].astype(str).str.contains(
        r'[!?]{2,}|\b(?:Invalid|N/A)\b', regex=True, case=False, na=False
    )
    cleanup_details['malformed_removed'] = int(malformed_rows.sum())
    df = df[~malformed_rows]

    # Validate timestamps: rows that do not parse are dropped; the column
    # itself is left in its original representation.
    invalid_timestamps = pd.to_datetime(df['timestamp'], errors='coerce').isna()
    cleanup_details['invalid_timestamps'] = int(invalid_timestamps.sum())
    df = df[~invalid_timestamps]

    cleaned_count = len(df)
    cleanup_details['cleaned'] = cleaned_count
    cleanup_details['removed'] = original_count - cleaned_count

    # Save cleaned CSV for SageMaker/Azure AI
    cleaned_path = 'cleaned_call_center_logs.csv'
    df.to_csv(cleaned_path, index=False)
    return df, cleanup_details, cleaned_path
# Statistical plotting function
def plot_statistics(df):
    """Draw the three summary charts for ``df`` and save each one as a PNG.

    The charts are, in order: a 20-bin histogram with KDE overlay of
    ``duration_minutes``, a box plot of ``satisfaction_score`` per
    ``agent_id``, and a count of rows per ``language``.

    Returns:
        The three PNG file names, in the order listed above.
    """
    def _label_save_close(figure, axes, title, x_label, y_label, filename):
        # Shared tail of every chart: annotate, write to disk, free the figure.
        axes.set_title(title)
        axes.set_xlabel(x_label)
        axes.set_ylabel(y_label)
        figure.savefig(filename)
        plt.close(figure)
        return filename

    saved_files = []

    # Plot 1: Distribution of Call Durations
    figure, axes = plt.subplots(figsize=(10, 6))
    sns.histplot(df['duration_minutes'], bins=20, kde=True, color='skyblue', ax=axes)
    saved_files.append(_label_save_close(
        figure, axes,
        'Distribution of Call Durations',
        'Duration (minutes)',
        'Frequency',
        'duration_distribution.png',
    ))

    # Plot 2: Satisfaction Scores by Agent
    figure, axes = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='agent_id', y='satisfaction_score', data=df, color='lightblue', ax=axes)
    saved_files.append(_label_save_close(
        figure, axes,
        'Satisfaction Scores by Agent',
        'Agent ID',
        'Satisfaction Score',
        'satisfaction_by_agent.png',
    ))

    # Plot 3: Query Frequency by Language
    figure, axes = plt.subplots(figsize=(10, 6))
    sns.countplot(x='language', data=df, color='skyblue', ax=axes)
    saved_files.append(_label_save_close(
        figure, axes,
        'Query Frequency by Language',
        'Language',
        'Number of Queries',
        'query_by_language.png',
    ))

    return saved_files
# Generate PDF report
def generate_pdf_report(cleanup_details, plot_paths):
    """Write a letter-size PDF with the cleanup statistics and the plots.

    The first page carries the report title and one line per entry of
    ``cleanup_details``. Every path in ``plot_paths`` that exists on disk is
    then drawn 500 points wide with its aspect ratio preserved; a new page is
    started whenever the next image would reach below the bottom margin.
    Paths that do not exist are skipped.

    Returns:
        The file name of the PDF that was written.
    """
    pdf_path = 'data_analysis_report.pdf'
    page_width, page_height = letter
    left_edge = 50        # x of the title, the stats heading and the images
    page_margin = 50      # top offset of a fresh page and bottom limit
    image_width = 500
    line_step = 15

    pdf = canvas.Canvas(pdf_path, pagesize=letter)

    # Title
    pdf.setFont("Helvetica-Bold", 16)
    pdf.drawString(left_edge, page_height - page_margin, "Call Center Data Analysis Report")

    # Cleanup Stats
    pdf.setFont("Helvetica", 12)
    pdf.drawString(left_edge, page_height - 80, "Data Cleanup Statistics:")
    first_stat_y = page_height - 80 - 20
    for row, (stat_name, stat_value) in enumerate(cleanup_details.items()):
        stat_line = f"{stat_name.replace('_', ' ').title()}: {stat_value}"
        pdf.drawString(70, first_stat_y - line_step * row, stat_line)

    # Cursor sits below the last statistic, plus a 30-point gap before plots.
    cursor_y = first_stat_y - line_step * len(cleanup_details) - 30

    # Add Plots
    for image_path in plot_paths:
        if not os.path.exists(image_path):
            continue
        image = ImageReader(image_path)
        pixel_width, pixel_height = image.getSize()
        image_height = image_width * (pixel_height / float(pixel_width))
        if cursor_y - image_height < page_margin:
            # Not enough room left: continue at the top of a new page.
            pdf.showPage()
            cursor_y = page_height - page_margin
        pdf.drawImage(
            image,
            left_edge,
            cursor_y - image_height,
            width=image_width,
            height=image_height,
        )
        cursor_y -= image_height + 20

    pdf.save()
    return pdf_path
# Main analysis function
def analyze_data():
    """Run the load -> clean -> plot -> report pipeline for the Gradio UI.

    Reads the CSV at ``CSV_FILE_PATH``, cleans it with ``clean_data``, renders
    the charts with ``plot_statistics`` and builds the PDF with
    ``generate_pdf_report``.

    Returns:
        A 7-tuple, in the order of the outputs wired to the button:
        HTML table of the first 50 cleaned rows, the cleanup statistics as
        newline-separated text, the three plot image paths, and two
        ``gr.File`` components for the cleaned CSV and the PDF report.
        If any step raises, the tuple is instead
        ``("Error: <message>", "", None, None, None, None, None)``.
    """
    # Local imports: only the error path below needs these modules.
    import html
    import logging

    try:
        # Load the CSV
        df = pd.read_csv(CSV_FILE_PATH)
        # Clean the data
        cleaned_df, cleanup_details, cleaned_path = clean_data(df)
        # Generate statistical plots
        plot_paths = plot_statistics(cleaned_df)
        # Generate PDF report
        pdf_path = generate_pdf_report(cleanup_details, plot_paths)
        # Prepare cleanup stats for display
        cleanup_stats = "\n".join(
            f"{key.replace('_', ' ').title()}: {value}"
            for key, value in cleanup_details.items()
        )
        return (
            cleaned_df.head(50).to_html(),  # Display first 50 rows as a table
            cleanup_stats,
            plot_paths[0],  # Duration distribution
            plot_paths[1],  # Satisfaction by agent
            plot_paths[2],  # Query by language
            gr.File(value=cleaned_path, label="Download Cleaned CSV"),
            gr.File(value=pdf_path, label="Download PDF Report")
        )
    except Exception as e:
        # Top-level UI boundary: keep the app responsive, but record the full
        # traceback so the failure can be diagnosed from the logs.
        logging.getLogger(__name__).exception("Call center analysis failed")
        # The first output is rendered as HTML, so the message must be escaped;
        # otherwise text such as "'<' not supported ..." is parsed as markup.
        return f"Error: {html.escape(str(e))}", "", None, None, None, None, None
# Gradio interface
# Dark-theme stylesheet passed to gr.Blocks; "#app-container" and
# ".text-center" are attached to components below via elem_id / elem_classes.
custom_css = """
body {
    background: linear-gradient(135deg, #1a1a1a 0%, #2a2a2a 100%);
    color: #e0e0e0;
    font-family: 'Arial', sans-serif;
    display: flex;
    justify-content: center;
    align-items: center;
    min-height: 100vh;
    margin: 0;
}
.gr-box {
    background: #3a3a3a;
    border: 1px solid #4a4a4a;
    border-radius: 8px;
    padding: 20px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
}
.gr-button {
    background: #1e90ff;
    color: white;
    border-radius: 5px;
    padding: 12px 20px;
    margin: 8px 0;
    width: 100%;
    text-align: center;
    transition: background 0.3s ease;
    font-size: 16px;
}
.gr-button:hover {
    background: #1c86ee;
    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
}
.gr-textbox {
    background: #2f2f2f;
    color: #e0e0e0;
    border: 1px solid #4a4a4a;
    border-radius: 5px;
    margin-bottom: 15px;
    font-size: 16px;
    padding: 15px;
    min-height: 120px;
    width: 100%;
}
.gr-image {
    width: 100%;
    height: auto;
    max-height: 400px;
}
#app-container {
    max-width: 900px;
    width: 100%;
    padding: 20px;
    background: #252525;
    border-radius: 12px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);
}
.text-center {
    text-align: center;
    margin-bottom: 20px;
}
"""
with gr.Blocks(css=custom_css) as demo:
    with gr.Column(elem_id="app-container"):
        gr.Markdown("# Call Center Data Analysis", elem_classes="text-center")
        gr.Markdown("Analyze call center logs, view statistics, and export cleaned data for SageMaker/Azure AI.", elem_classes="text-center")
        # Button to trigger analysis
        analyze_button = gr.Button("Analyze Data")
        # Outputs, declared in the same order as the tuple analyze_data returns.
        # The table shows the first 50 rows of the CLEANED frame, so it is
        # labelled accordingly (it was previously mislabelled "Raw Data").
        raw_data_output = gr.HTML(label="Cleaned Data (First 50 Rows)")
        cleanup_stats_output = gr.Textbox(label="Data Cleanup Statistics")
        duration_plot_output = gr.Image(label="Distribution of Call Durations")
        satisfaction_plot_output = gr.Image(label="Satisfaction Scores by Agent")
        language_plot_output = gr.Image(label="Query Frequency by Language")
        csv_download = gr.File(label="Download Cleaned CSV")
        pdf_download = gr.File(label="Download PDF Report")
        # Connect the button to the analysis function; it takes no inputs and
        # fills all seven outputs in one call.
        analyze_button.click(
            fn=analyze_data,
            inputs=None,
            outputs=[
                raw_data_output,
                cleanup_stats_output,
                duration_plot_output,
                satisfaction_plot_output,
                language_plot_output,
                csv_download,
                pdf_download
            ]
        )
# Start the web server for the app.
demo.launch()