Spaces:

TFI
/

K-Means_Clustering_Algorithm

Sleeping

App Files Community

tanish78 committed on Jul 28, 2024

Commit

0575dff

verified ·

1 Parent(s): 1762079

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -85

app.py CHANGED Viewed

@@ -38,7 +38,6 @@ categories_keywords = {
     "Miscellaneous": []
 }
 def categorize_question(question):
     words = question.split()
@@ -69,7 +68,6 @@ def categorize_question(question):
     return "Miscellaneous"
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
@@ -121,9 +119,6 @@ def preprocess_data(df):
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
-    # Categorize the texts
-    df['Category'] = df['texts'].apply(categorize_question)
     return df
 def cluster_data(df, num_clusters):
@@ -157,88 +152,58 @@ def generate_wordcloud(df):
     plt.figure(figsize=(15, 7))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
-    buf = BytesIO()
-    plt.savefig(buf, format='png')
-    buf.seek(0)
-    img = Image.open(buf)
-    return img
-def generate_bar_chart(df, num_clusters_to_display):
-    # Exclude common words from the top words
-    common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
-    top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
-    df_top_categories = df[df['Category'].isin(top_categories)]
-    category_top_words = df_top_categories.groupby('Category', observed=False)['texts'].apply(lambda x: ' '.join(x)).reset_index()
-    category_top_words['top_word'] = category_top_words['texts'].apply(lambda x: ' '.join([word for word in pd.Series(x.split()).value_counts().index if word not in common_words][:3]))
-    category_sizes = df_top_categories['Category'].value_counts().reset_index()
-    category_sizes.columns = ['Category', 'Count']
-    category_sizes = category_sizes.merge(category_top_words[['Category', 'top_word']], on='Category')
-    fig = px.bar(category_sizes, x='Category', y='Count', text='top_word', title='Category Frequency with Top Words')
-    fig.update_traces(textposition='outside')
-    fig.update_layout(xaxis_title='Category', yaxis_title='Frequency', showlegend=False)
-    buf = BytesIO()
-    fig.write_image(buf, format='png')
-    buf.seek(0)
-    img = Image.open(buf)
-    return img
-def main(file, num_clusters_to_display):
-    try:
-        df = pd.read_csv(file)
-        # Filter by 'Fallback Message shown'
-        df = df[df['Answer'] == 'Fallback Message shown']
-        df = preprocess_data(df)
-        df = df[df['Category'] != 'Miscellaneous']
-        # Get category sizes and sort by size in ascending order
-        category_sizes = df['Category'].value_counts().reset_index()
-        category_sizes.columns = ['Category', 'Count']
-        sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
-        sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
-        # Get the largest x categories as specified by num_clusters_to_display
-        largest_categories = sorted_categories[:num_clusters_to_display]
-        smallest_categories = sorted_categories_sm[:num_clusters_to_display]
-        # Filter the dataframe to include only the largest categories
-        filtered_df = df[df['Category'].isin(largest_categories)]
-        filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
-        # Sort the dataframe by Category
-        filtered_df = filtered_df.sort_values(by='Category')
-        filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
-        wordcloud_img = generate_wordcloud(filtered_cloud_df)
-        bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
-            filtered_df.to_csv(tmpfile.name, index=False)
-            csv_file_path = tmpfile.name
-        return csv_file_path, wordcloud_img, bar_chart_img
-    except Exception as e:
-        print(f"Error: {e}")
-        return str(e), None, None
-interface = gr.Interface(
-    fn=main,
     inputs=[
-        gr.File(label="Upload CSV File (.csv)"),
-        gr.Slider(label="Number of Categories to Display", minimum=1, maximum=15, step=1, value=5)
-    ],
-    outputs=[
-        gr.File(label="Categorized Data CSV"),
-        gr.Image(label="Word Cloud"),
-        gr.Image(label="Bar Chart")
     ],
-    title="Unanswered User Queries Categorization",
 )
-interface.launch(share=True)

     "Miscellaneous": []
 }
 def categorize_question(question):
     words = question.split()
     return "Miscellaneous"
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
     return df
 def cluster_data(df, num_clusters):
     plt.figure(figsize=(15, 7))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
+    plt.show()
+def generate_barchart(df):
+    """Show a bar chart of how many queries fall into each category.
+
+    Counts the values of ``df['Category']`` and plots one bar per category
+    with Plotly Express, coloured by count on the Viridis scale. The figure
+    is displayed with ``show()``; nothing is returned.
+    """
+    # One row per category: its label and the number of queries carrying it.
+    per_category = (
+        df['Category']
+        .value_counts()
+        .rename_axis('Category')
+        .reset_index(name='Count')
+    )
+    chart = px.bar(
+        per_category,
+        x='Category',
+        y='Count',
+        title='Number of Queries per Category',
+        color='Count',
+        color_continuous_scale='Viridis',
+    )
+    # NOTE(review): show() targets an interactive renderer; confirm it is
+    # meaningful where this app is hosted.
+    chart.show()
+def process_and_analyze(file, num_clusters):
+    """Run the whole pipeline on a CSV file and save the result as a new CSV.
+
+    Steps, in order: read ``file`` with pandas, clean it with
+    ``preprocess_data``, cluster it with ``cluster_data``, label every row
+    with ``categorize_question``, sort by category and then cluster, write
+    the table to a temporary ``.csv`` file, and render the word cloud and
+    the bar chart.
+
+    Args:
+        file: Path or file-like object accepted by ``pd.read_csv``.
+        num_clusters: Cluster count forwarded to ``cluster_data``.
+
+    Returns:
+        Path of the temporary CSV file holding the processed rows. It is
+        created with ``delete=False``, so removing it is up to the caller.
+    """
+    df = pd.read_csv(file)
+    df = preprocess_data(df)
+    # cluster_data returns the frame plus a second value that is not used here.
+    df, _ = cluster_data(df, num_clusters)
+    df['Category'] = df['texts'].apply(categorize_question)
+    df = df.sort_values(by=['Category', 'Cluster'])
+    # Reserve a path that ends in .csv, then close the handle before pandas
+    # writes to it: opening a NamedTemporaryFile by name a second time while
+    # it is still open is not portable across platforms.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
+        temp_filename = tmp_file.name
+    df.to_csv(temp_filename, index=False)
+    generate_wordcloud(df)
+    generate_barchart(df)
+    return temp_filename
+def save_file(file):
+    """Copy an uploaded file into a fresh temporary ``.csv`` file.
+
+    Args:
+        file: Either an object with a ``read()`` method (bytes or text) or a
+            filesystem path to the uploaded file.
+
+    Returns:
+        Path of the temporary copy. It is created with ``delete=False``, so
+        removing it is up to the caller.
+    """
+    if hasattr(file, 'read'):
+        payload = file.read()
+    else:
+        # Not file-like: treat it as a path on disk.
+        with open(file, 'rb') as source:
+            payload = source.read()
+    if isinstance(payload, str):
+        # A text-mode upload yields str; the temp file is opened in binary mode.
+        payload = payload.encode('utf-8')
+    # Write through the handle we already hold instead of opening the same
+    # path a second time while it is still open.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
+        tmp_file.write(payload)
+        temp_filename = tmp_file.name
+    return temp_filename
+def process_and_return(file, num_clusters):
+    """Gradio callback: process an uploaded CSV and return the result file.
+
+    The upload is first copied to a temporary file with ``save_file``, then
+    run through ``process_and_analyze``.
+
+    Args:
+        file: The uploaded file as delivered by the File input component.
+        num_clusters: Number of clusters chosen on the slider.
+
+    Returns:
+        Path of the processed CSV on disk. A path is returned rather than an
+        in-memory buffer because the File output component serves a file
+        from disk.
+    """
+    import os  # local import: the file's import block is outside this hunk
+
+    temp_filename = save_file(file)
+    try:
+        return process_and_analyze(temp_filename, num_clusters)
+    finally:
+        # The copy of the upload is only an intermediate; remove it so that
+        # repeated requests do not pile up temp files. Cleanup is best
+        # effort and must not mask an error raised by the pipeline.
+        try:
+            os.remove(temp_filename)
+        except OSError:
+            pass
+# Gradio UI: a CSV upload and a cluster-count slider in, one CSV file out.
+# Components are taken from the top-level ``gr`` namespace with ``value=`` for
+# the slider default; the ``gr.inputs`` / ``gr.outputs`` namespaces and the
+# ``default=`` keyword are deprecated.
+iface = gr.Interface(
+    fn=process_and_return,
     inputs=[
+        gr.File(label="Upload CSV File"),
+        gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")
     ],
+    outputs=gr.File(label="Processed CSV File"),
+    title="Query Categorization and Clustering",
+    description="Upload a CSV file containing the queries. This tool will categorize and cluster the queries, then return a processed CSV file."
 )
+iface.launch()