tanish78 committed on
Commit
90eec1e
·
verified ·
1 Parent(s): ee4d135

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -13
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score
6
  import matplotlib.pyplot as plt
7
  from sklearn.decomposition import PCA
8
  import re
@@ -10,17 +10,12 @@ from io import BytesIO
10
  import tempfile
11
  import numpy as np
12
  from PIL import Image
13
- from nltk.stem import WordNetLemmatizer
14
- from sklearn.preprocessing import normalize
15
 
16
  def preprocess_data(df):
17
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
18
  df['texts'] = df['texts'].astype(str)
19
  df['texts'] = df['texts'].str.lower()
20
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
21
-
22
- lemmatizer = WordNetLemmatizer()
23
- df['texts'] = df['texts'].apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()]))
24
 
25
  def remove_emoji(string):
26
  emoji_pattern = re.compile("["
@@ -116,9 +111,8 @@ def preprocess_data(df):
116
  return df
117
 
118
  def cluster_data(df, num_clusters):
119
- vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.85, min_df=2)
120
  X = vectorizer.fit_transform(df['texts'])
121
- X = normalize(X)
122
 
123
  kmeans = KMeans(n_clusters=num_clusters, random_state=0)
124
  kmeans.fit(X)
@@ -205,17 +199,15 @@ def main(file, num_clusters_to_display):
205
  silhouette_avg = silhouette_score(X, kmeans.labels_)
206
  silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=15)
207
 
208
- davies_bouldin = davies_bouldin_score(X, kmeans.labels_)
209
-
210
  # Convert silhouette score to percentage
211
  silhouette_percentage = (silhouette_avg + 1) * 50
212
 
213
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
214
  df.to_csv(tmpfile.name, index=False)
215
- return tmpfile.name, silhouette_percentage, davies_bouldin, cluster_plot, silhouette_plot
216
  except Exception as e:
217
  print(f"Error: {e}")
218
- return str(e), None, None, None, None
219
 
220
  interface = gr.Interface(
221
  fn=main,
@@ -226,7 +218,6 @@ interface = gr.Interface(
226
  outputs=[
227
  gr.File(label="Clustered Data CSV"),
228
  gr.Number(label="Clustering Quality (%)"),
229
- gr.Number(label="Davies-Bouldin Index"),
230
  gr.Image(label="Cluster Plot"),
231
  gr.Image(label="Silhouette Plot")
232
  ],
 
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import silhouette_score, silhouette_samples
6
  import matplotlib.pyplot as plt
7
  from sklearn.decomposition import PCA
8
  import re
 
10
  import tempfile
11
  import numpy as np
12
  from PIL import Image
 
 
13
 
14
  def preprocess_data(df):
15
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
16
  df['texts'] = df['texts'].astype(str)
17
  df['texts'] = df['texts'].str.lower()
18
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
 
 
 
19
 
20
  def remove_emoji(string):
21
  emoji_pattern = re.compile("["
 
111
  return df
112
 
113
  def cluster_data(df, num_clusters):
114
+ vectorizer = TfidfVectorizer(stop_words='english')
115
  X = vectorizer.fit_transform(df['texts'])
 
116
 
117
  kmeans = KMeans(n_clusters=num_clusters, random_state=0)
118
  kmeans.fit(X)
 
199
  silhouette_avg = silhouette_score(X, kmeans.labels_)
200
  silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=15)
201
 
 
 
202
  # Convert silhouette score to percentage
203
  silhouette_percentage = (silhouette_avg + 1) * 50
204
 
205
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
206
  df.to_csv(tmpfile.name, index=False)
207
+ return tmpfile.name, silhouette_percentage, cluster_plot, silhouette_plot
208
  except Exception as e:
209
  print(f"Error: {e}")
210
+ return str(e), None, None, None
211
 
212
  interface = gr.Interface(
213
  fn=main,
 
218
  outputs=[
219
  gr.File(label="Clustered Data CSV"),
220
  gr.Number(label="Clustering Quality (%)"),
 
221
  gr.Image(label="Cluster Plot"),
222
  gr.Image(label="Silhouette Plot")
223
  ],