Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -107,13 +107,8 @@ def preprocess_data(df):
|
|
| 107 |
|
| 108 |
return df
|
| 109 |
|
| 110 |
-
def
|
| 111 |
-
|
| 112 |
-
df.rename(columns={'Question': 'texts'}, inplace=True)
|
| 113 |
-
df['texts'] = df['texts'].astype(str)
|
| 114 |
-
return preprocess_data(df)
|
| 115 |
-
|
| 116 |
-
def cluster_data(df, num_clusters):
|
| 117 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 118 |
X = vectorizer.fit_transform(df['texts'])
|
| 119 |
|
|
@@ -139,18 +134,26 @@ def visualize_clusters(df):
|
|
| 139 |
|
| 140 |
def main(file, num_clusters_to_display):
|
| 141 |
try:
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
| 144 |
df = pd.read_excel(file)
|
| 145 |
-
|
| 146 |
-
elif file_ext == 'csv':
|
| 147 |
df = pd.read_csv(file)
|
| 148 |
-
df = preprocess_csv_data(df)
|
| 149 |
else:
|
| 150 |
-
return "Unsupported file
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
visualize_clusters(df)
|
| 155 |
|
| 156 |
cluster_sizes = df['Cluster'].value_counts()
|
|
@@ -166,21 +169,31 @@ def main(file, num_clusters_to_display):
|
|
| 166 |
df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
|
| 167 |
df = df.sort_values('Cluster')
|
| 168 |
|
|
|
|
| 169 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
|
| 170 |
df.to_csv(tmpfile.name, index=False)
|
|
|
|
|
|
|
| 171 |
return tmpfile.name
|
|
|
|
| 172 |
except Exception as e:
|
| 173 |
-
return str(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
interface = gr.Interface(
|
| 176 |
-
fn=
|
| 177 |
inputs=[
|
| 178 |
-
gr.File(label="Upload Excel or CSV File (.xlsx or .csv)"),
|
| 179 |
-
gr.Slider(1,
|
| 180 |
],
|
| 181 |
-
outputs=gr.File(label="
|
| 182 |
-
title="Unanswered User Queries Clustering"
|
| 183 |
-
description="Upload an Excel or CSV file and select the number of largest clusters to display (excluding cluster 0)"
|
| 184 |
)
|
| 185 |
|
| 186 |
-
interface.launch()
|
|
|
|
| 107 |
|
| 108 |
return df
|
| 109 |
|
| 110 |
+
def cluster_data(df):
|
| 111 |
+
num_clusters = 15 # Set the number of clusters to 15
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 113 |
X = vectorizer.fit_transform(df['texts'])
|
| 114 |
|
|
|
|
| 134 |
|
| 135 |
def main(file, num_clusters_to_display):
|
| 136 |
try:
|
| 137 |
+
# Determine the file type
|
| 138 |
+
file_extension = file.name.split('.')[-1].lower()
|
| 139 |
+
|
| 140 |
+
# Load the file
|
| 141 |
+
if file_extension == 'xlsx':
|
| 142 |
df = pd.read_excel(file)
|
| 143 |
+
elif file_extension == 'csv':
|
|
|
|
| 144 |
df = pd.read_csv(file)
|
|
|
|
| 145 |
else:
|
| 146 |
+
return "Unsupported file type. Please upload an Excel or CSV file."
|
| 147 |
+
|
| 148 |
+
# Process CSV file specifically
|
| 149 |
+
if file_extension == 'csv':
|
| 150 |
+
# Keep only rows where 'Answer' is 'Fallback Message shown'
|
| 151 |
+
df = df[df['Answer'] == 'Fallback Message shown']
|
| 152 |
+
# Focus on 'Query' column for text processing
|
| 153 |
+
df.rename(columns={'Query': 'texts'}, inplace=True)
|
| 154 |
+
|
| 155 |
+
df = preprocess_data(df)
|
| 156 |
+
df = cluster_data(df)
|
| 157 |
visualize_clusters(df)
|
| 158 |
|
| 159 |
cluster_sizes = df['Cluster'].value_counts()
|
|
|
|
| 169 |
df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
|
| 170 |
df = df.sort_values('Cluster')
|
| 171 |
|
| 172 |
+
# Save the resulting DataFrame to a CSV file
|
| 173 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
|
| 174 |
df.to_csv(tmpfile.name, index=False)
|
| 175 |
+
tmpfile.flush()
|
| 176 |
+
|
| 177 |
return tmpfile.name
|
| 178 |
+
|
| 179 |
except Exception as e:
|
| 180 |
+
return f"An error occurred: {str(e)}"
|
| 181 |
+
|
| 182 |
+
def upload_file(file, num_clusters_to_display):
|
| 183 |
+
result = main(file, num_clusters_to_display)
|
| 184 |
+
if result.endswith(".csv"):
|
| 185 |
+
return result
|
| 186 |
+
else:
|
| 187 |
+
return f"Error: {result}"
|
| 188 |
|
| 189 |
interface = gr.Interface(
|
| 190 |
+
fn=upload_file,
|
| 191 |
inputs=[
|
| 192 |
+
gr.inputs.File(label="Upload Excel or CSV File (.xlsx or .csv)", type="file"),
|
| 193 |
+
gr.inputs.Slider(minimum=1, maximum=20, step=1, default=5, label="Number of Categories to Display")
|
| 194 |
],
|
| 195 |
+
outputs=gr.outputs.File(label="Output CSV File"),
|
| 196 |
+
title="Unanswered User Queries Clustering"
|
|
|
|
| 197 |
)
|
| 198 |
|
| 199 |
+
interface.launch(debug=True)
|