Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -39,26 +39,19 @@ categories_keywords = {
|
|
| 39 |
}
|
| 40 |
|
| 41 |
def categorize_question(question):
|
| 42 |
-
# Split the question into words
|
| 43 |
words = question.split()
|
| 44 |
-
|
| 45 |
-
# Check if the question has only one word
|
| 46 |
if len(words) == 1:
|
| 47 |
single_word = words[0].lower()
|
| 48 |
-
# Check if the single word is in the Start of Conversation category
|
| 49 |
if any(single_word in keyword for keyword in categories_keywords["Start of Conversation"]):
|
| 50 |
return "Start of Conversation"
|
| 51 |
else:
|
| 52 |
return "End of Conversation"
|
| 53 |
|
| 54 |
-
# Categorization of other queries
|
| 55 |
for category, keywords in categories_keywords.items():
|
| 56 |
if any(keyword.lower() in question.lower() for keyword in keywords):
|
| 57 |
return category
|
| 58 |
return "Miscellaneous"
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
def preprocess_data(df):
|
| 63 |
df.rename(columns={'Question Asked': 'texts'}, inplace=True)
|
| 64 |
df['texts'] = df['texts'].astype(str).str.lower()
|
|
@@ -110,7 +103,6 @@ def preprocess_data(df):
|
|
| 110 |
df['texts'] = df['texts'].str.strip()
|
| 111 |
df = df[df['texts'] != '']
|
| 112 |
|
| 113 |
-
# Categorize the texts
|
| 114 |
df['Category'] = df['texts'].apply(categorize_question)
|
| 115 |
|
| 116 |
return df
|
|
@@ -153,7 +145,6 @@ def generate_wordcloud(df):
|
|
| 153 |
return img
|
| 154 |
|
| 155 |
def generate_bar_chart(df, num_clusters_to_display):
|
| 156 |
-
# Exclude common words from the top words
|
| 157 |
common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
|
| 158 |
|
| 159 |
top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
|
|
@@ -179,30 +170,15 @@ def main(file, num_clusters_to_display):
|
|
| 179 |
try:
|
| 180 |
df = pd.read_csv(file)
|
| 181 |
|
| 182 |
-
# Filter by 'Fallback Message shown'
|
| 183 |
df = df[df['Answer'] == 'Fallback Message shown']
|
| 184 |
|
| 185 |
df = preprocess_data(df)
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
category_sizes.columns = ['Category', 'Count']
|
| 190 |
-
sorted_categories = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
|
| 191 |
-
|
| 192 |
-
# Get the largest x categories as specified by num_clusters_to_display
|
| 193 |
-
largest_categories = sorted_categories[:num_clusters_to_display]
|
| 194 |
-
|
| 195 |
-
# Filter the dataframe to include only the largest categories
|
| 196 |
-
filtered_df = df[df['Category'].isin(largest_categories)]
|
| 197 |
-
|
| 198 |
-
# Sort the dataframe by Category
|
| 199 |
-
filtered_df = filtered_df.sort_values(by='Category')
|
| 200 |
-
|
| 201 |
-
wordcloud_img = generate_wordcloud(filtered_df)
|
| 202 |
-
bar_chart_img = generate_bar_chart(filtered_df, num_clusters_to_display)
|
| 203 |
|
| 204 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
|
| 205 |
-
|
| 206 |
csv_file_path = tmpfile.name
|
| 207 |
|
| 208 |
return csv_file_path, wordcloud_img, bar_chart_img
|
|
|
|
| 39 |
}
|
| 40 |
|
| 41 |
def categorize_question(question):
    """Assign a question to a category by keyword matching.

    A single-word question is treated as conversational filler: it is
    "Start of Conversation" when the word equals one of that category's
    keywords (compared case-insensitively) and "End of Conversation"
    otherwise. Any other question gets the first category, in the iteration
    order of ``categories_keywords``, that has a keyword occurring as a
    case-insensitive substring of the question. "Miscellaneous" is returned
    when no category matches.

    Args:
        question: The question text to classify.

    Returns:
        The name of the matching category as a string.
    """
    words = question.split()

    if len(words) == 1:
        single_word = words[0].lower()
        # Match whole keywords only. A substring test of the form
        # `single_word in keyword` runs in the wrong direction: it accepts
        # any fragment of a greeting (e.g. "h" or "el" for "hello") and
        # misses keywords that are not stored in lower case.
        greetings = {
            keyword.lower()
            for keyword in categories_keywords["Start of Conversation"]
        }
        if single_word in greetings:
            return "Start of Conversation"
        return "End of Conversation"

    # Lower-case the question once instead of once per keyword.
    lowered_question = question.lower()
    for category, keywords in categories_keywords.items():
        if any(keyword.lower() in lowered_question for keyword in keywords):
            return category
    return "Miscellaneous"
|
| 54 |
|
|
|
|
|
|
|
| 55 |
def preprocess_data(df):
|
| 56 |
df.rename(columns={'Question Asked': 'texts'}, inplace=True)
|
| 57 |
df['texts'] = df['texts'].astype(str).str.lower()
|
|
|
|
| 103 |
df['texts'] = df['texts'].str.strip()
|
| 104 |
df = df[df['texts'] != '']
|
| 105 |
|
|
|
|
| 106 |
df['Category'] = df['texts'].apply(categorize_question)
|
| 107 |
|
| 108 |
return df
|
|
|
|
| 145 |
return img
|
| 146 |
|
| 147 |
def generate_bar_chart(df, num_clusters_to_display):
|
|
|
|
| 148 |
common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
|
| 149 |
|
| 150 |
top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
|
|
|
|
| 170 |
try:
|
| 171 |
df = pd.read_csv(file)
|
| 172 |
|
|
|
|
| 173 |
df = df[df['Answer'] == 'Fallback Message shown']
|
| 174 |
|
| 175 |
df = preprocess_data(df)
|
| 176 |
|
| 177 |
+
wordcloud_img = generate_wordcloud(df)
|
| 178 |
+
bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
|
| 181 |
+
df.to_csv(tmpfile.name, index=False)
|
| 182 |
csv_file_path = tmpfile.name
|
| 183 |
|
| 184 |
return csv_file_path, wordcloud_img, bar_chart_img
|