tanish78 committed on
Commit
6847e76
·
verified ·
1 Parent(s): ab5743a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -33
app.py CHANGED
@@ -6,21 +6,14 @@ import matplotlib.pyplot as plt
6
  from sklearn.decomposition import PCA
7
  import re
8
  from io import BytesIO
 
9
 
10
  def preprocess_data(df):
11
- # Renaming the 'Queries' column to 'texts'
12
  df.rename(columns={'Queries': 'texts'}, inplace=True)
13
-
14
- # Convert the 'texts' column to string
15
  df['texts'] = df['texts'].astype(str)
16
-
17
- # Lowercase the 'texts' column
18
  df['texts'] = df['texts'].str.lower()
19
-
20
- # Remove URL from text
21
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
22
 
23
- # Remove emojis from text
24
  def remove_emoji(string):
25
  emoji_pattern = re.compile("["
26
  u"\U0001F600-\U0001F64F"
@@ -34,7 +27,6 @@ def preprocess_data(df):
34
 
35
  df['texts'] = df['texts'].apply(remove_emoji)
36
 
37
- # Define synonyms
38
  custom_synonyms = {
39
  'application': ['form'],
40
  'apply': ['fill', 'applied'],
@@ -48,20 +40,17 @@ def preprocess_data(df):
48
  'interview': ["pi"]
49
  }
50
 
51
- # Replace synonyms in the 'texts' column
52
  for original_word, synonym_list in custom_synonyms.items():
53
  for synonym in synonym_list:
54
- pattern = r"\b" + synonym + r"\b(?!\s*\()" # match whole word and exclude words in parentheses
55
  df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
56
- pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()" # match whole word followed by optional whitespace and "you"
57
  df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
58
 
59
- # Define list of spam words or phrases
60
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
61
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
62
  "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
63
 
64
- # Remove any row that contains a spam phrase
65
  rows_to_remove = set()
66
  for spam_phrase in spam_list:
67
  pattern = r"\b" + re.escape(spam_phrase) + r"\b"
@@ -70,25 +59,21 @@ def preprocess_data(df):
70
 
71
  df = df.drop(rows_to_remove)
72
 
73
- # Drop rows containing any greetings and its variations
74
  greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
75
  for greet_var in greet_variations:
76
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
77
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
78
 
79
- # Drop rows containing any okay response and its variations
80
  okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
81
  for okay_var in okay_variations:
82
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
83
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
84
 
85
- # Drop rows containing any yes response and its variations
86
  yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
87
  for yes_var in yes_variations:
88
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
89
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
90
 
91
- # Remove specific phrases from the "texts" column
92
  remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
93
  "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
94
  "actually no","next steps","i'm a student alumni","i have questions"]
@@ -96,7 +81,6 @@ def preprocess_data(df):
96
  for phrase in remove_phrases:
97
  df['texts'] = df['texts'].str.replace(phrase, '')
98
 
99
- # Drop rows containing any general words from response and its variations
100
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
101
  "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
102
  "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
@@ -109,35 +93,28 @@ def preprocess_data(df):
109
  return re.sub(r'[^\w\s]', '', text)
110
  df['texts'] = df['texts'].apply(remove_punctuations)
111
 
112
- # Remove specific phrases from the "texts" column
113
  remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
114
 
115
  for phrase in remove_morephrases:
116
  df['texts'] = df['texts'].str.replace(phrase, '')
117
 
118
- # Remove rows with phone numbers in the 'texts' column
119
  df = df[~df['texts'].str.contains(r'\b\d{10}\b')]
120
 
121
- # Remove any leading or trailing whitespaces
122
  df['texts'] = df['texts'].str.strip()
123
 
124
- # Remove blank rows
125
- df['texts'] = df['texts'].apply(lambda x: x.strip()) # Remove leading and trailing whitespaces
126
  df = df[df['texts'] != '']
127
 
128
  return df
129
 
130
  def cluster_data(df, num_clusters=5):
131
- # Vectorize the text data
132
  vectorizer = TfidfVectorizer(stop_words='english')
133
  X = vectorizer.fit_transform(df['texts'])
134
 
135
- # Perform K-Means clustering
136
  kmeans = KMeans(n_clusters=num_clusters, random_state=0)
137
  kmeans.fit(X)
138
  df['Cluster'] = kmeans.labels_
139
 
140
- # Perform PCA to reduce dimensions for visualization
141
  pca = PCA(n_components=2)
142
  principal_components = pca.fit_transform(X.toarray())
143
  df['PCA1'] = principal_components[:, 0]
@@ -161,12 +138,9 @@ def main(file, num_clusters):
161
  df = cluster_data(df, num_clusters)
162
  visualize_clusters(df)
163
 
164
- # Save the DataFrame to a CSV file
165
- output = BytesIO()
166
- df.to_csv(output, index=False)
167
- output.seek(0)
168
-
169
- return output
170
  except Exception as e:
171
  return str(e)
172
 
 
6
  from sklearn.decomposition import PCA
7
  import re
8
  from io import BytesIO
9
+ import tempfile
10
 
11
  def preprocess_data(df):
 
12
  df.rename(columns={'Queries': 'texts'}, inplace=True)
 
 
13
  df['texts'] = df['texts'].astype(str)
 
 
14
  df['texts'] = df['texts'].str.lower()
 
 
15
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
16
 
 
17
  def remove_emoji(string):
18
  emoji_pattern = re.compile("["
19
  u"\U0001F600-\U0001F64F"
 
27
 
28
  df['texts'] = df['texts'].apply(remove_emoji)
29
 
 
30
  custom_synonyms = {
31
  'application': ['form'],
32
  'apply': ['fill', 'applied'],
 
40
  'interview': ["pi"]
41
  }
42
 
 
43
  for original_word, synonym_list in custom_synonyms.items():
44
  for synonym in synonym_list:
45
+ pattern = r"\b" + synonym + r"\b(?!\s*\()"
46
  df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
47
+ pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()"
48
  df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
49
 
 
50
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
51
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
52
  "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
53
 
 
54
  rows_to_remove = set()
55
  for spam_phrase in spam_list:
56
  pattern = r"\b" + re.escape(spam_phrase) + r"\b"
 
59
 
60
  df = df.drop(rows_to_remove)
61
 
 
62
  greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
63
  for greet_var in greet_variations:
64
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
65
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
66
 
 
67
  okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
68
  for okay_var in okay_variations:
69
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
70
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
71
 
 
72
  yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
73
  for yes_var in yes_variations:
74
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
75
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
76
 
 
77
  remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
78
  "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
79
  "actually no","next steps","i'm a student alumni","i have questions"]
 
81
  for phrase in remove_phrases:
82
  df['texts'] = df['texts'].str.replace(phrase, '')
83
 
 
84
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
85
  "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
86
  "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
 
93
  return re.sub(r'[^\w\s]', '', text)
94
  df['texts'] = df['texts'].apply(remove_punctuations)
95
 
 
96
  remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
97
 
98
  for phrase in remove_morephrases:
99
  df['texts'] = df['texts'].str.replace(phrase, '')
100
 
 
101
  df = df[~df['texts'].str.contains(r'\b\d{10}\b')]
102
 
 
103
  df['texts'] = df['texts'].str.strip()
104
 
105
+ df['texts'] = df['texts'].apply(lambda x: x.strip())
 
106
  df = df[df['texts'] != '']
107
 
108
  return df
109
 
110
  def cluster_data(df, num_clusters=5):
 
111
  vectorizer = TfidfVectorizer(stop_words='english')
112
  X = vectorizer.fit_transform(df['texts'])
113
 
 
114
  kmeans = KMeans(n_clusters=num_clusters, random_state=0)
115
  kmeans.fit(X)
116
  df['Cluster'] = kmeans.labels_
117
 
 
118
  pca = PCA(n_components=2)
119
  principal_components = pca.fit_transform(X.toarray())
120
  df['PCA1'] = principal_components[:, 0]
 
138
  df = cluster_data(df, num_clusters)
139
  visualize_clusters(df)
140
 
141
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
142
+ df.to_csv(tmpfile.name, index=False)
143
+ return tmpfile.name
 
 
 
144
  except Exception as e:
145
  return str(e)
146