Spaces:

TFI
/

K-Means_Clustering_Algorithm

Sleeping

App Files Community

tanish78 committed on Jul 28, 2024

Commit

60c5bda

verified ·

1 Parent(s): 0575dff

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -50

app.py CHANGED Viewed

@@ -38,6 +38,7 @@ categories_keywords = {
     "Miscellaneous": []
 }
 def categorize_question(question):
     words = question.split()
@@ -68,6 +69,7 @@ def categorize_question(question):
     return "Miscellaneous"
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
@@ -119,6 +121,9 @@ def preprocess_data(df):
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
     return df
 def cluster_data(df, num_clusters):
@@ -152,58 +157,89 @@ def generate_wordcloud(df):
     plt.figure(figsize=(15, 7))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
-    plt.show()
-def generate_barchart(df):
-    category_counts = df['Category'].value_counts().reset_index()
-    category_counts.columns = ['Category', 'Count']
-    fig = px.bar(category_counts, x='Category', y='Count', title='Number of Queries per Category', color='Count', color_continuous_scale='Viridis')
-    fig.show()
-def process_and_analyze(file, num_clusters):
-    df = pd.read_csv(file)
-    df = preprocess_data(df)
-    df, kmeans = cluster_data(df, num_clusters)
-    df['Category'] = df['texts'].apply(categorize_question)
-    df = df.sort_values(by=['Category', 'Cluster'])
-    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
-        temp_filename = tmp_file.name
-        df.to_csv(temp_filename, index=False)
-    generate_wordcloud(df)
-    generate_barchart(df)
-    return temp_filename
-def save_file(file):
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
-        temp_filename = tmp_file.name
-        with open(temp_filename, 'wb') as f:
-            f.write(file.read())
-    return temp_filename
-def process_and_return(file, num_clusters):
-    temp_filename = save_file(file)
-    output_filename = process_and_analyze(temp_filename, num_clusters)
-    with open(output_filename, 'rb') as f:
-        csv_bytes = BytesIO(f.read())
-    return csv_bytes
-iface = gr.Interface(
-    fn=process_and_return,
     inputs=[
-        gr.inputs.File(label="Upload CSV File"),
-        gr.inputs.Slider(2, 10, step=1, default=3, label="Number of Clusters")
     ],
-    outputs=gr.outputs.File(label="Processed CSV File"),
-    title="Query Categorization and Clustering",
-    description="Upload a CSV file containing the queries. This tool will categorize and cluster the queries, then return a processed CSV file."
 )
-iface.launch()

     "Miscellaneous": []
 }
 def categorize_question(question):
     words = question.split()
     return "Miscellaneous"
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
+    # Categorize the texts
+    df['Category'] = df['texts'].apply(categorize_question)
     return df
 def cluster_data(df, num_clusters):
     plt.figure(figsize=(15, 7))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
+    buf = BytesIO()
+    plt.savefig(buf, format='png')
+    buf.seek(0)
+    img = Image.open(buf)
+    return img
+def generate_bar_chart(df, num_clusters_to_display):
+    # Exclude common words from the top words
+    common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
+    top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
+    df_top_categories = df[df['Category'].isin(top_categories)]
+    category_top_words = df_top_categories.groupby('Category', observed=False)['texts'].apply(lambda x: ' '.join(x)).reset_index()
+    category_top_words['top_word'] = category_top_words['texts'].apply(lambda x: ' '.join([word for word in pd.Series(x.split()).value_counts().index if word not in common_words][:3]))
+    category_sizes = df_top_categories['Category'].value_counts().reset_index()
+    category_sizes.columns = ['Category', 'Count']
+    category_sizes = category_sizes.merge(category_top_words[['Category', 'top_word']], on='Category')
+    fig = px.bar(category_sizes, x='Category', y='Count', text='top_word', title='Category Frequency with Top Words')
+    fig.update_traces(textposition='outside')
+    fig.update_layout(xaxis_title='Category', yaxis_title='Frequency', showlegend=False)
+    buf = BytesIO()
+    fig.write_image(buf, format='png')
+    buf.seek(0)
+    img = Image.open(buf)
+    return img
+def main(file, num_clusters_to_display):
+    try:
+        df = pd.read_csv(file)
+        # Filter by 'Fallback Message shown'
+        df = df[df['Answer'] == 'Fallback Message shown']
+        df = preprocess_data(df)
+        df = df[df['Category'] != 'Miscellaneous']
+        # Get category sizes and sort by size in ascending order
+        category_sizes = df['Category'].value_counts().reset_index()
+        category_sizes.columns = ['Category', 'Count']
+        sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
+        sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
+        # Get the largest x categories as specified by num_clusters_to_display
+        largest_categories = sorted_categories[:num_clusters_to_display]
+        smallest_categories = sorted_categories_sm[:num_clusters_to_display]
+        # Filter the dataframe to include only the largest categories
+        filtered_df = df[df['Category'].isin(largest_categories)]
+        filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
+        # Sort the dataframe by Category
+        filtered_df = filtered_df.sort_values(by='Category')
+        filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
+        wordcloud_img = generate_wordcloud(filtered_cloud_df)
+        bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
+            filtered_df.to_csv(tmpfile.name, index=False)
+            csv_file_path = tmpfile.name
+        return csv_file_path, wordcloud_img, bar_chart_img
+    except Exception as e:
+        print(f"Error: {e}")
+        return str(e), None, None
+interface = gr.Interface(
+    fn=main,
     inputs=[
+        gr.File(label="Upload CSV File (.csv)"),
+        gr.Slider(label="Number of Categories to Display", minimum=1, maximum=15, step=1, value=5)
     ],
+    outputs=[
+        gr.File(label="Categorized Data CSV"),
+        gr.Image(label="Word Cloud"),
+        gr.Image(label="Bar Chart")
+    ],
+    title="Unanswered User Queries Categorization",
 )
+interface.launch(share=True)