Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import io | |
| import os | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.utils import ImageReader | |
# Path of the raw call center logs CSV that analyze_data() loads with
# pd.read_csv. It is relative, so it resolves against the process working
# directory (the file is assumed to be uploaded to the Space).
CSV_FILE_PATH = "call_center_logs.csv"
| # Data cleanup function | |
def clean_data(df):
    """Clean raw call center logs and save the result as a CSV file.

    Rows are dropped in the order below; each counter is measured against the
    rows that survived the previous step:

    1. ``nulls_removed``: a missing ``query``, ``resolution``,
       ``duration_minutes`` or ``satisfaction_score``. Durations and scores
       that are not numeric are treated as missing.
    2. ``duplicates_removed``: a ``call_id`` already seen (first one is kept).
    3. ``short_removed``: ``query`` or ``resolution`` under 5 characters.
    4. ``malformed_removed``: ``query`` contains two or more consecutive
       ``!``/``?`` characters, or the word "Invalid" or "N/A" (any case).
    5. ``invalid_timestamps``: ``timestamp`` cannot be parsed by
       ``pd.to_datetime``. The column is only validated, not converted.

    Missing ``language`` values are then filled with ``'en'``.

    Args:
        df: DataFrame with the columns ``call_id``, ``query``, ``resolution``,
            ``duration_minutes``, ``satisfaction_score``, ``timestamp`` and
            ``language``. The caller's frame is not modified.

    Returns:
        A ``(cleaned_df, cleanup_details, cleaned_path)`` tuple, where
        ``cleanup_details`` maps counter names (plus ``original``, ``cleaned``
        and ``removed``) to ints and ``cleaned_path`` is the CSV that was
        written to the working directory.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    original_count = len(df)
    cleanup_details = {
        'original': original_count,
        'nulls_removed': 0,
        'duplicates_removed': 0,
        'short_removed': 0,
        'malformed_removed': 0,
        'invalid_timestamps': 0,
    }

    # Work on a copy so the column assignments below neither modify the
    # caller's frame nor trigger pandas' SettingWithCopyWarning.
    df = df.copy()

    # Coerce the numeric columns *before* the null check: values that cannot
    # be parsed become NaN here and must be dropped with the other nulls,
    # otherwise they would survive into the "cleaned" output.
    for column in ('duration_minutes', 'satisfaction_score'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Remove nulls in critical columns
    critical_columns = ['query', 'resolution', 'duration_minutes', 'satisfaction_score']
    null_rows = df[critical_columns].isna().any(axis=1)
    cleanup_details['nulls_removed'] = int(null_rows.sum())
    df = df[~null_rows]

    # Remove duplicates based on call_id (keeps the first occurrence)
    duplicate_rows = df['call_id'].duplicated()
    cleanup_details['duplicates_removed'] = int(duplicate_rows.sum())
    df = df[~duplicate_rows]

    # Remove rows whose query or resolution is too short to be meaningful
    short_rows = (df['query'].str.len() < 5) | (df['resolution'].str.len() < 5)
    cleanup_details['short_removed'] = int(short_rows.sum())
    df = df[~short_rows]

    # Remove malformed queries. The group is non-capturing so str.contains
    # does not warn about match groups; it matches exactly the same rows.
    malformed_rows = df['query'].str.contains(
        r'[!?]{2,}|\b(?:Invalid|N/A)\b', regex=True, case=False, na=False
    )
    cleanup_details['malformed_removed'] = int(malformed_rows.sum())
    df = df[~malformed_rows]

    # Drop rows whose timestamp cannot be parsed
    invalid_timestamps = pd.to_datetime(df['timestamp'], errors='coerce').isna()
    cleanup_details['invalid_timestamps'] = int(invalid_timestamps.sum())
    # Copy again: the filtered frame is assigned to on the next statement.
    df = df[~invalid_timestamps].copy()

    # Standardize language (fill missing with 'en')
    df['language'] = df['language'].fillna('en')

    cleaned_count = len(df)
    cleanup_details['cleaned'] = cleaned_count
    cleanup_details['removed'] = original_count - cleaned_count

    # Save cleaned CSV for SageMaker/Azure AI
    cleaned_path = 'cleaned_call_center_logs.csv'
    df.to_csv(cleaned_path, index=False)
    return df, cleanup_details, cleaned_path
| # Statistical plotting function | |
def plot_statistics(df):
    """Render the three summary charts as PNG files in the working directory.

    Charts, in order: a histogram of ``duration_minutes``, a box plot of
    ``satisfaction_score`` per ``agent_id``, and a count plot of ``language``.

    Args:
        df: DataFrame providing the columns named above.

    Returns:
        The list of PNG file names that were written, in chart order.
    """
    saved_files = []

    def _label_save_close(title, x_label, y_label, filename):
        # Shared tail of every chart: annotate the current figure, write it
        # to disk, then close it so figures do not pile up in pyplot.
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.savefig(filename)
        plt.close()
        saved_files.append(filename)

    # Chart 1: distribution of call durations
    plt.figure(figsize=(10, 6))
    sns.histplot(df['duration_minutes'], bins=20, kde=True, color='skyblue')
    _label_save_close(
        'Distribution of Call Durations',
        'Duration (minutes)',
        'Frequency',
        'duration_distribution.png',
    )

    # Chart 2: satisfaction scores by agent
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='agent_id', y='satisfaction_score', data=df, color='lightblue')
    _label_save_close(
        'Satisfaction Scores by Agent',
        'Agent ID',
        'Satisfaction Score',
        'satisfaction_by_agent.png',
    )

    # Chart 3: query frequency by language
    plt.figure(figsize=(10, 6))
    sns.countplot(x='language', data=df, color='skyblue')
    _label_save_close(
        'Query Frequency by Language',
        'Language',
        'Number of Queries',
        'query_by_language.png',
    )

    return saved_files
| # Generate PDF report | |
def generate_pdf_report(cleanup_details, plot_paths):
    """Write a letter-size PDF with the cleanup statistics and the plots.

    The first page carries the title and one line per ``cleanup_details``
    entry. Each existing plot image is then drawn 500 points wide with its
    aspect ratio preserved; a new page is started when the image would cross
    the 50-point bottom margin. Plot paths that do not exist are skipped.

    Args:
        cleanup_details: Mapping of statistic name to value.
        plot_paths: Iterable of image file paths to embed, in order.

    Returns:
        The path of the PDF that was written.
    """
    pdf_path = 'data_analysis_report.pdf'
    report = canvas.Canvas(pdf_path, pagesize=letter)
    page_height = letter[1]

    # Title
    report.setFont("Helvetica-Bold", 16)
    report.drawString(50, page_height - 50, "Call Center Data Analysis Report")

    # Cleanup statistics, one line per entry
    report.setFont("Helvetica", 12)
    cursor_y = page_height - 80
    report.drawString(50, cursor_y, "Data Cleanup Statistics:")
    cursor_y -= 20
    for stat_name, stat_value in cleanup_details.items():
        pretty_name = stat_name.replace('_', ' ').title()
        report.drawString(70, cursor_y, f"{pretty_name}: {stat_value}")
        cursor_y -= 15

    # Plots, stacked top to bottom below the statistics
    cursor_y -= 30
    target_width = 500
    for plot_path in plot_paths:
        if not os.path.exists(plot_path):
            continue
        image = ImageReader(plot_path)
        pixel_width, pixel_height = image.getSize()
        target_height = target_width * (pixel_height / float(pixel_width))
        if cursor_y - target_height < 50:
            # Not enough room left on this page: continue at the top of a new one.
            report.showPage()
            cursor_y = page_height - 50
        report.drawImage(
            image,
            50,
            cursor_y - target_height,
            width=target_width,
            height=target_height,
        )
        cursor_y -= target_height + 20

    report.save()
    return pdf_path
| # Main analysis function | |
def analyze_data():
    """Run the load, clean, plot and report pipeline for the Gradio UI.

    Returns:
        A 7-tuple matching the UI outputs: HTML table of the first 50 cleaned
        rows, cleanup statistics as newline-separated text, the three plot
        file paths, and two ``gr.File`` values for the cleaned CSV and the
        PDF report. If any step raises, the tuple is instead an
        ``"Error: ..."`` message, an empty string and five ``None`` values.
    """
    try:
        raw_df = pd.read_csv(CSV_FILE_PATH)
        cleaned_df, cleanup_details, cleaned_path = clean_data(raw_df)
        plot_paths = plot_statistics(cleaned_df)
        pdf_path = generate_pdf_report(cleanup_details, plot_paths)

        # One "Title Cased Key: value" line per statistic
        stat_lines = (
            f"{key.replace('_', ' ').title()}: {value}"
            for key, value in cleanup_details.items()
        )
        cleanup_stats = "\n".join(stat_lines)

        table_html = cleaned_df.head(50).to_html()
        duration_plot, satisfaction_plot, language_plot = plot_paths[:3]
        csv_file = gr.File(value=cleaned_path, label="Download Cleaned CSV")
        pdf_file = gr.File(value=pdf_path, label="Download PDF Report")
        return (
            table_html,
            cleanup_stats,
            duration_plot,
            satisfaction_plot,
            language_plot,
            csv_file,
            pdf_file,
        )
    except Exception as e:
        # Top-level UI boundary: show the failure in the first output
        # instead of letting the click handler crash.
        return f"Error: {str(e)}", "", None, None, None, None, None
| # Gradio interface | |
# Dark-theme stylesheet passed to gr.Blocks(css=...). The #app-container and
# .text-center rules target the elem_id / elem_classes set on the layout
# components in the Blocks definition.
# NOTE(review): the .gr-box / .gr-button / .gr-textbox / .gr-image selectors
# assume Gradio emits those class names; confirm against the installed version.
custom_css = """
body {
background: linear-gradient(135deg, #1a1a1a 0%, #2a2a2a 100%);
color: #e0e0e0;
font-family: 'Arial', sans-serif;
display: flex;
justify-content: center;
align-items: center;
min-height: 100vh;
margin: 0;
}
.gr-box {
background: #3a3a3a;
border: 1px solid #4a4a4a;
border-radius: 8px;
padding: 20px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
}
.gr-button {
background: #1e90ff;
color: white;
border-radius: 5px;
padding: 12px 20px;
margin: 8px 0;
width: 100%;
text-align: center;
transition: background 0.3s ease;
font-size: 16px;
}
.gr-button:hover {
background: #1c86ee;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
}
.gr-textbox {
background: #2f2f2f;
color: #e0e0e0;
border: 1px solid #4a4a4a;
border-radius: 5px;
margin-bottom: 15px;
font-size: 16px;
padding: 15px;
min-height: 120px;
width: 100%;
}
.gr-image {
width: 100%;
height: auto;
max-height: 400px;
}
#app-container {
max-width: 900px;
width: 100%;
padding: 20px;
background: #252525;
border-radius: 12px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);
}
.text-center {
text-align: center;
margin-bottom: 20px;
}
"""
# Single-page UI: one button runs analyze_data() and fills every output below.
with gr.Blocks(css=custom_css) as demo:
    with gr.Column(elem_id="app-container"):
        gr.Markdown("# Call Center Data Analysis", elem_classes="text-center")
        gr.Markdown(
            "Analyze call center logs, view statistics, and export cleaned data for SageMaker/Azure AI.",
            elem_classes="text-center",
        )

        # Button to trigger analysis
        analyze_button = gr.Button("Analyze Data")

        # Outputs, declared in the order analyze_data() returns its values.
        # The table shows the first 50 rows of the *cleaned* frame, so it is
        # labelled as such (it was previously mislabelled "Raw Data").
        raw_data_output = gr.HTML(label="Cleaned Data (First 50 Rows)")
        cleanup_stats_output = gr.Textbox(label="Data Cleanup Statistics")
        duration_plot_output = gr.Image(label="Distribution of Call Durations")
        satisfaction_plot_output = gr.Image(label="Satisfaction Scores by Agent")
        language_plot_output = gr.Image(label="Query Frequency by Language")
        csv_download = gr.File(label="Download Cleaned CSV")
        pdf_download = gr.File(label="Download PDF Report")

        # Connect the button to the analysis function
        analyze_button.click(
            fn=analyze_data,
            inputs=None,
            outputs=[
                raw_data_output,
                cleanup_stats_output,
                duration_plot_output,
                satisfaction_plot_output,
                language_plot_output,
                csv_download,
                pdf_download,
            ],
        )

# Start the Gradio server when the script is executed.
demo.launch()