tanish78 commited on
Commit
29692d0
·
verified ·
1 Parent(s): c6ae83d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -25
app.py CHANGED
@@ -8,7 +8,6 @@ import re
8
  from io import BytesIO
9
 
10
  def preprocess_data(df):
11
-
12
  # Renaming the 'Queries' column to 'texts'
13
  df.rename(columns={'Queries': 'texts'}, inplace=True)
14
 
@@ -21,7 +20,6 @@ def preprocess_data(df):
21
  # Remove URL from text
22
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
23
 
24
-
25
  # Remove emojis from text
26
  def remove_emoji(string):
27
  emoji_pattern = re.compile("["
@@ -98,7 +96,6 @@ def preprocess_data(df):
98
  for phrase in remove_phrases:
99
  df['texts'] = df['texts'].str.replace(phrase, '')
100
 
101
-
102
  # Drop rows containing any general words from response and its variations
103
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
104
  "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am","i'm all set","ask a question","apply the survey",
@@ -128,12 +125,9 @@ def preprocess_data(df):
128
  df['texts'] = df['texts'].apply(lambda x: x.strip()) # Remove leading and trailing whitespaces
129
  df = df[df['texts'] != '']
130
 
131
-
132
-
133
  return df
134
 
135
  def cluster_data(df, num_clusters=5):
136
-
137
  # Vectorize the text data
138
  vectorizer = TfidfVectorizer(stop_words='english')
139
  X = vectorizer.fit_transform(df['texts'])
@@ -149,10 +143,8 @@ def cluster_data(df, num_clusters=5):
149
  df['PCA1'] = principal_components[:, 0]
150
  df['PCA2'] = principal_components[:, 1]
151
 
152
-
153
  return df
154
 
155
-
156
  def visualize_clusters(df):
157
  plt.figure(figsize=(10, 6))
158
  scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
@@ -162,31 +154,25 @@ def visualize_clusters(df):
162
  plt.ylabel('PCA Component 2')
163
  plt.show()
164
 
165
-
166
  def main(file, num_clusters):
167
  try:
168
  df = pd.read_excel(file)
169
  df = preprocess_data(df)
170
  df = cluster_data(df, num_clusters)
171
  visualize_clusters(df)
172
-
173
- output = BytesIO()
174
- df.to_csv(output, index=False)
175
- output.seek(0)
176
-
177
- return output
178
  except Exception as e:
179
- return str(e)
 
180
 
181
- interface = gr.Interface(
182
  fn=main,
183
- inputs=[
184
- gr.File(label="Upload Excel File (.xlsx)"),
185
- gr.Number(value=5, label="Number of Clusters")
186
- ],
187
- outputs=gr.File(label="Download Clustered Data as CSV"),
188
- title="Unanswered User Queries Clustering",
189
- description="Upload an Excel file (.xlsx)"
190
  )
191
 
192
- interface.launch()
 
 
8
  from io import BytesIO
9
 
10
  def preprocess_data(df):
 
11
  # Renaming the 'Queries' column to 'texts'
12
  df.rename(columns={'Queries': 'texts'}, inplace=True)
13
 
 
20
  # Remove URL from text
21
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
22
 
 
23
  # Remove emojis from text
24
  def remove_emoji(string):
25
  emoji_pattern = re.compile("["
 
96
  for phrase in remove_phrases:
97
  df['texts'] = df['texts'].str.replace(phrase, '')
98
 
 
99
  # Drop rows containing any general words from response and its variations
100
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
101
  "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am","i'm all set","ask a question","apply the survey",
 
125
  df['texts'] = df['texts'].apply(lambda x: x.strip()) # Remove leading and trailing whitespaces
126
  df = df[df['texts'] != '']
127
 
 
 
128
  return df
129
 
130
  def cluster_data(df, num_clusters=5):
 
131
  # Vectorize the text data
132
  vectorizer = TfidfVectorizer(stop_words='english')
133
  X = vectorizer.fit_transform(df['texts'])
 
143
  df['PCA1'] = principal_components[:, 0]
144
  df['PCA2'] = principal_components[:, 1]
145
 
 
146
  return df
147
 
 
148
  def visualize_clusters(df):
149
  plt.figure(figsize=(10, 6))
150
  scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
 
154
  plt.ylabel('PCA Component 2')
155
  plt.show()
156
 
 
157
def main(file, num_clusters):
    """Cluster user queries from an uploaded Excel file.

    Reads the spreadsheet, cleans the query text, K-Means-clusters it,
    renders the cluster scatter plot, and writes the clustered data to a
    CSV for download.

    Parameters
    ----------
    file : file-like or str
        The uploaded .xlsx file as handed over by the Gradio File input.
    num_clusters : int or float
        Number of clusters to form (Gradio's Number component delivers a
        float, so it is coerced to int here).

    Returns
    -------
    str
        Path to a temporary CSV file on success. On failure, the full
        traceback as a string so the error is visible in the UI.
    """
    try:
        import tempfile  # local import, matching the file's style for traceback below

        df = pd.read_excel(file)
        df = preprocess_data(df)
        # gr.Number yields a float; KMeans requires an integer n_clusters.
        df = cluster_data(df, int(num_clusters))
        visualize_clusters(df)

        # A gr.File output expects a filesystem path, not an in-memory
        # BytesIO object -- persist the CSV to a temp file and return its path.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".csv", delete=False, newline=""
        ) as tmp:
            df.to_csv(tmp, index=False)
            return tmp.name
    except Exception:
        # Best-effort error reporting: surface the full traceback in the UI
        # rather than swallowing the failure.
        import traceback
        return traceback.format_exc()
170
 
171
# Gradio UI: upload an .xlsx, pick the cluster count, download the clustered CSV.
# NOTE: the gr.inputs.* / gr.outputs.* namespaces were removed in Gradio 3.x;
# the top-level component classes (gr.File, gr.Number) are the supported API.
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.File(label="Upload an Excel File (.xlsx)"),
        # Default of 5 matches cluster_data's num_clusters=5 default.
        gr.Number(value=5, label="Number of Clusters"),
    ],
    outputs=gr.File(label="Download Clustered Data as CSV"),
    title="Unanswered User Queries Clustering",
    description="Upload an Excel file (.xlsx)",
)

if __name__ == "__main__":
    iface.launch()