Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,13 +7,8 @@ from sklearn.decomposition import PCA
|
|
| 7 |
import re
|
| 8 |
from io import BytesIO
|
| 9 |
import tempfile
|
| 10 |
-
from datetime import datetime
|
| 11 |
|
| 12 |
def preprocess_data(df):
|
| 13 |
-
# Filter based on the 'Answer' column and the date
|
| 14 |
-
df = df[(df['Answer'] == 'Fallback Message shown') & (pd.to_datetime(df['Date'], dayfirst=True) > datetime(2024, 7, 1))]
|
| 15 |
-
|
| 16 |
-
# Rename and preprocess the 'Question Asked' column
|
| 17 |
df.rename(columns={'Question Asked': 'texts'}, inplace=True)
|
| 18 |
df['texts'] = df['texts'].astype(str)
|
| 19 |
df['texts'] = df['texts'].str.lower()
|
|
@@ -21,17 +16,17 @@ def preprocess_data(df):
|
|
| 21 |
|
| 22 |
def remove_emoji(string):
|
| 23 |
emoji_pattern = re.compile("["
|
| 24 |
-
u"\U0001F600-\U0001F64F"
|
| 25 |
-
u"\U0001F300-\U0001F5FF"
|
| 26 |
-
u"\U0001F680-\U0001F6FF"
|
| 27 |
-
u"\U0001F1E0-\U0001F1FF"
|
| 28 |
u"\U00002702-\U000027B0"
|
| 29 |
u"\U000024C2-\U0001F251"
|
| 30 |
"]+", flags=re.UNICODE)
|
| 31 |
-
return emoji_pattern.sub(r'', string)
|
| 32 |
|
| 33 |
df['texts'] = df['texts'].apply(remove_emoji)
|
| 34 |
-
|
| 35 |
custom_synonyms = {
|
| 36 |
'application': ['form'],
|
| 37 |
'apply': ['fill', 'applied'],
|
|
@@ -139,7 +134,11 @@ def visualize_clusters(df):
|
|
| 139 |
|
| 140 |
def main(file, num_clusters_to_display):
|
| 141 |
try:
|
| 142 |
-
df = pd.read_csv(file
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
df = preprocess_data(df)
|
| 144 |
df = cluster_data(df)
|
| 145 |
visualize_clusters(df)
|
|
@@ -153,7 +152,7 @@ def main(file, num_clusters_to_display):
|
|
| 153 |
filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
|
| 154 |
top_clusters = filtered_clusters[:num_clusters_to_display]
|
| 155 |
|
| 156 |
-
df = df[df['
|
| 157 |
df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
|
| 158 |
df = df.sort_values('Cluster')
|
| 159 |
|
|
@@ -167,11 +166,11 @@ interface = gr.Interface(
|
|
| 167 |
fn=main,
|
| 168 |
inputs=[
|
| 169 |
gr.File(label="Upload CSV File (.csv)"),
|
| 170 |
-
gr.Slider(
|
| 171 |
],
|
| 172 |
outputs=gr.File(label="Clustered Data CSV"),
|
| 173 |
title="Unanswered User Queries Clustering",
|
| 174 |
-
description="Upload a CSV file (.csv) and select the number of largest clusters to display (excluding cluster 0)"
|
| 175 |
)
|
| 176 |
|
| 177 |
-
interface.launch()
|
|
|
|
| 7 |
import re
|
| 8 |
from io import BytesIO
|
| 9 |
import tempfile
|
|
|
|
| 10 |
|
| 11 |
def preprocess_data(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
df.rename(columns={'Question Asked': 'texts'}, inplace=True)
|
| 13 |
df['texts'] = df['texts'].astype(str)
|
| 14 |
df['texts'] = df['texts'].str.lower()
|
|
|
|
| 16 |
|
| 17 |
def remove_emoji(string):
    """Return *string* with emoji and pictographic symbols removed.

    Non-string values (for example ``None`` or a float ``NaN``) are returned
    unchanged, so the function is safe to apply to a column that may still
    hold missing values.

    The previous character class ended with the range U+24C2-U+1F251. That
    single range spans most of the Basic Multilingual Plane, so it also
    deleted CJK, Hangul, fullwidth forms and other ordinary text. It is
    replaced here with the specific emoji/symbol ranges it was meant to cover.
    """
    if not isinstance(string, str):
        return string

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
        "\U0001F680-\U0001F6FF"  # transport and map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # emoji variation selector left behind by the above
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub("", string)
|
| 27 |
|
| 28 |
df['texts'] = df['texts'].apply(remove_emoji)
|
| 29 |
+
|
| 30 |
custom_synonyms = {
|
| 31 |
'application': ['form'],
|
| 32 |
'apply': ['fill', 'applied'],
|
|
|
|
| 134 |
|
| 135 |
def main(file, num_clusters_to_display):
|
| 136 |
try:
|
| 137 |
+
df = pd.read_csv(file)
|
| 138 |
+
|
| 139 |
+
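# Keep only rows whose Answer is 'Fallback Message shown' and whose date is after 1 July 2024 (2024-07-01).
# NOTE(review): the previous version parsed dates with dayfirst=True; confirm 'Date and Time' is not day-first before relying on the default parser.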
|
| 140 |
+
df = df[(df['Answer'] == 'Fallback Message shown') & (pd.to_datetime(df['Date and Time']) > '2024-07-01')]
|
| 141 |
+
|
| 142 |
df = preprocess_data(df)
|
| 143 |
df = cluster_data(df)
|
| 144 |
visualize_clusters(df)
|
|
|
|
| 152 |
filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
|
| 153 |
top_clusters = filtered_clusters[:num_clusters_to_display]
|
| 154 |
|
| 155 |
+
df = df[df['Cluster'].isin(top_clusters)]
|
| 156 |
df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
|
| 157 |
df = df.sort_values('Cluster')
|
| 158 |
|
|
|
|
| 166 |
fn=main,
|
| 167 |
inputs=[
|
| 168 |
gr.File(label="Upload CSV File (.csv)"),
|
| 169 |
+
gr.Slider(label="Number of Categories to Display", minimum=1, maximum=20, step=1, value=5)
|
| 170 |
],
|
| 171 |
outputs=gr.File(label="Clustered Data CSV"),
|
| 172 |
title="Unanswered User Queries Clustering",
|
| 173 |
+
description="Upload a CSV file (.csv) and select the number of largest clusters to display (excluding cluster 0)."
|
| 174 |
)
|
| 175 |
|
| 176 |
+
interface.launch(share=True)
|