Spaces:

TFI
/

K-Means_Clustering_Algorithm

Sleeping

App Files Community

tanish78 committed on Jul 10, 2024

Commit

6847e76

verified ·

1 Parent(s): ab5743a

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -33

app.py CHANGED Viewed

@@ -6,21 +6,14 @@ import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 import re
 from io import BytesIO
 def preprocess_data(df):
-    # Renaming the 'Queries' column to 'texts'
     df.rename(columns={'Queries': 'texts'}, inplace=True)
-    # Convert the 'texts' column to string
     df['texts'] = df['texts'].astype(str)
-    # Lowercase the 'texts' column
     df['texts'] = df['texts'].str.lower()
-    # Remove URL from text
     df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
-    # Remove emojis from text
     def remove_emoji(string):
         emoji_pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"
@@ -34,7 +27,6 @@ def preprocess_data(df):
     df['texts'] = df['texts'].apply(remove_emoji)
-    # Define synonyms
     custom_synonyms = {
         'application': ['form'],
         'apply': ['fill', 'applied'],
@@ -48,20 +40,17 @@ def preprocess_data(df):
         'interview': ["pi"]
     }
-    # Replace synonyms in the 'texts' column
     for original_word, synonym_list in custom_synonyms.items():
         for synonym in synonym_list:
-            pattern = r"\b" + synonym + r"\b(?!\s*\()"  # match whole word and exclude words in parentheses
             df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
-            pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()"  # match whole word followed by optional whitespace and "you"
             df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
-    # Define list of spam words or phrases
     spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
                  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
                  "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
-    # Remove any row that contains a spam phrase
     rows_to_remove = set()
     for spam_phrase in spam_list:
         pattern = r"\b" + re.escape(spam_phrase) + r"\b"
@@ -70,25 +59,21 @@ def preprocess_data(df):
     df = df.drop(rows_to_remove)
-    # Drop rows containing any greetings and its variations
     greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
     for greet_var in greet_variations:
         pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
         df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
-    # Drop rows containing any okay response and its variations
     okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
     for okay_var in okay_variations:
         pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
         df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
-    # Drop rows containing any yes response and its variations
     yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
     for yes_var in yes_variations:
         pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
         df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
-    # Remove specific phrases from the "texts" column
     remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
                       "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
                       "actually no","next steps","i'm a student alumni","i have questions"]
@@ -96,7 +81,6 @@ def preprocess_data(df):
     for phrase in remove_phrases:
         df['texts'] = df['texts'].str.replace(phrase, '')
-    # Drop rows containing any general words from response and its variations
     general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
                           "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
                           "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
@@ -109,35 +93,28 @@ def preprocess_data(df):
         return re.sub(r'[^\w\s]', '', text)
     df['texts'] = df['texts'].apply(remove_punctuations)
-    # Remove specific phrases from the "texts" column
     remove_morephrases = ["short reads 38 min","bite size  2 min","videos 28 min","long reads  8 min"]
     for phrase in remove_morephrases:
         df['texts'] = df['texts'].str.replace(phrase, '')
-    # Remove rows with phone numbers in the 'texts' column
     df = df[~df['texts'].str.contains(r'\b\d{10}\b')]
-    # Remove any leading or trailing whitespaces
     df['texts'] = df['texts'].str.strip()
-    # Remove blank rows
-    df['texts'] = df['texts'].apply(lambda x: x.strip())  # Remove leading and trailing whitespaces
     df = df[df['texts'] != '']
     return df
 def cluster_data(df, num_clusters=5):
-    # Vectorize the text data
     vectorizer = TfidfVectorizer(stop_words='english')
     X = vectorizer.fit_transform(df['texts'])
-    # Perform K-Means clustering
     kmeans = KMeans(n_clusters=num_clusters, random_state=0)
     kmeans.fit(X)
     df['Cluster'] = kmeans.labels_
-    # Perform PCA to reduce dimensions for visualization
     pca = PCA(n_components=2)
     principal_components = pca.fit_transform(X.toarray())
     df['PCA1'] = principal_components[:, 0]
@@ -161,12 +138,9 @@ def main(file, num_clusters):
         df = cluster_data(df, num_clusters)
         visualize_clusters(df)
-        # Save the DataFrame to a CSV file
-        output = BytesIO()
-        df.to_csv(output, index=False)
-        output.seek(0)
-        return output
     except Exception as e:
         return str(e)

 from sklearn.decomposition import PCA
 import re
 from io import BytesIO
+import tempfile
 def preprocess_data(df):
     df.rename(columns={'Queries': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str)
     df['texts'] = df['texts'].str.lower()
     df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
     def remove_emoji(string):
         emoji_pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"
     df['texts'] = df['texts'].apply(remove_emoji)
     custom_synonyms = {
         'application': ['form'],
         'apply': ['fill', 'applied'],
         'interview': ["pi"]
     }
     for original_word, synonym_list in custom_synonyms.items():
         for synonym in synonym_list:
+            pattern = r"\b" + synonym + r"\b(?!\s*\()"
             df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
+            pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()"
             df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
     spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
                  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
                  "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
     rows_to_remove = set()
     for spam_phrase in spam_list:
         pattern = r"\b" + re.escape(spam_phrase) + r"\b"
     df = df.drop(rows_to_remove)
     greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
     for greet_var in greet_variations:
         pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
         df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
     okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
     for okay_var in okay_variations:
         pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
         df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
     yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
     for yes_var in yes_variations:
         pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
         df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
     remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
                       "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
                       "actually no","next steps","i'm a student alumni","i have questions"]
     for phrase in remove_phrases:
         df['texts'] = df['texts'].str.replace(phrase, '')
     general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
                           "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
                           "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
         return re.sub(r'[^\w\s]', '', text)
     df['texts'] = df['texts'].apply(remove_punctuations)
     remove_morephrases = ["short reads 38 min","bite size  2 min","videos 28 min","long reads  8 min"]
     for phrase in remove_morephrases:
         df['texts'] = df['texts'].str.replace(phrase, '')
     df = df[~df['texts'].str.contains(r'\b\d{10}\b')]
     df['texts'] = df['texts'].str.strip()
+    df['texts'] = df['texts'].apply(lambda x: x.strip())
     df = df[df['texts'] != '']
     return df
 def cluster_data(df, num_clusters=5):
     vectorizer = TfidfVectorizer(stop_words='english')
     X = vectorizer.fit_transform(df['texts'])
     kmeans = KMeans(n_clusters=num_clusters, random_state=0)
     kmeans.fit(X)
     df['Cluster'] = kmeans.labels_
     pca = PCA(n_components=2)
     principal_components = pca.fit_transform(X.toarray())
     df['PCA1'] = principal_components[:, 0]
         df = cluster_data(df, num_clusters)
         visualize_clusters(df)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
+            df.to_csv(tmpfile.name, index=False)
+            return tmpfile.name
     except Exception as e:
         return str(e)