Spaces:
Sleeping
Sleeping
Update classification.py
Browse files- classification.py +5 -34
classification.py
CHANGED
|
@@ -171,46 +171,17 @@ def process_categories(categories, model):
|
|
| 171 |
|
| 172 |
|
| 173 |
def match_categories(df, category_df, treshold=0.45):
    """Attach to each row the categories whose embedding is similar to it.

    For every entry of ``df['Embeddings']`` that is a ``torch.Tensor``, the
    cosine similarity against all ``category_df['Embeddings']`` entries is
    computed and the categories scoring strictly above ``treshold`` are kept.
    Rows whose embedding is not a tensor get ``np.nan`` in the category
    columns and the marker string ``'pas interessant'`` as their score.

    Args:
        df: DataFrame with an ``Embeddings`` column. Modified in place.
        category_df: DataFrame with ``Embeddings``, ``description``,
            ``experts`` and ``topic`` columns.
        treshold: Exclusive lower bound on the cosine similarity for a
            category to count as a match (spelling kept for callers).

    Returns:
        The same ``df`` object with the added columns ``Description``,
        ``Expert``, ``Topic`` and ``Score``; for tensor rows each cell holds
        a (possibly empty) list with one item per matching category.
    """
    categories_list, experts_list, topic_list, scores_list = [], [], [], []

    # The category matrix is identical for every row, so it is stacked at most
    # once -- and only when a tensor row actually needs it.
    category_matrix = None

    for ebd_content in df['Embeddings']:
        if not isinstance(ebd_content, torch.Tensor):
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
            continue

        if category_matrix is None:
            category_matrix = torch.stack(list(category_df['Embeddings']), dim=0)

        # Only the first row of the similarity matrix is used.
        # NOTE(review): assumes ebd_content is a single embedding -- confirm.
        cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
        hits = [pos for pos, score in enumerate(cos_scores) if score > treshold]

        # ``hits`` are positions in the stacked matrix, so look the category
        # rows up by position rather than by index label.
        categories_list.append([category_df['description'].iat[pos] for pos in hits])
        experts_list.append([category_df['experts'].iat[pos] for pos in hits])
        topic_list.append([category_df['topic'].iat[pos] for pos in hits])
        scores_list.append([float(cos_scores[pos]) for pos in hits])

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df
|
| 197 |
|
| 198 |
-
def flatten_nested_lists(nested_list):
    """Flatten a list of potentially nested lists into a single list.

    Only ``list`` instances are descended into; every other item (tuples,
    strings, ...) is kept as-is. The left-to-right order of the items is
    preserved.

    Args:
        nested_list: Iterable whose items may themselves be lists, nested to
            any depth.

    Returns:
        A new flat list containing every non-list item.
    """
    flattened_list = []
    # Explicit stack of iterators instead of recursion, so that very deep
    # nesting cannot hit the interpreter's recursion limit.
    pending = [iter(nested_list)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator resumes where it stopped once
                # the child is exhausted.
                pending.append(iter(item))
                break
            flattened_list.append(item)
        else:
            pending.pop()
    return flattened_list
|
| 207 |
-
|
| 208 |
def save_data(df, filename):
|
| 209 |
-
# Apply flattening and then join for the 'Expert' column
|
| 210 |
-
df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
|
| 211 |
-
df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
|
| 212 |
-
df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
|
| 213 |
-
df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
|
| 214 |
|
| 215 |
df = df.drop(columns=['Embeddings'])
|
| 216 |
new_filename = filename.replace(".", "_classified.")
|
|
|
|
| 171 |
|
| 172 |
|
| 173 |
def match_categories(df, category_df, treshold=0.45):
    """Add one score column per category topic to ``df``.

    A column named after each value of ``category_df['topic']`` is created
    and filled with ``0.0``. For every entry of ``df['Embeddings']`` that is
    a ``torch.Tensor``, the cosine similarity against all
    ``category_df['Embeddings']`` entries is computed, and each similarity
    strictly above ``treshold`` is written into that row's column for the
    matching topic. Rows whose embedding is not a tensor keep ``0.0``.

    Args:
        df: DataFrame with an ``Embeddings`` column. Modified in place.
        category_df: DataFrame with ``topic`` and ``Embeddings`` columns.
        treshold: Exclusive lower bound on the cosine similarity for a score
            to be recorded (spelling kept for callers).

    Returns:
        The same ``df`` object with the topic score columns added.
    """
    topics = category_df['topic'].tolist()
    for topic in topics:
        # Float zero so that similarity scores can be stored later without
        # changing the column dtype.
        df[topic] = 0.0
    topic_columns = [df.columns.get_loc(topic) for topic in topics]

    # The category matrix is identical for every row, so it is stacked at most
    # once -- and only when a tensor row actually needs it.
    category_matrix = None

    for row_pos, ebd_content in enumerate(df['Embeddings']):
        if not isinstance(ebd_content, torch.Tensor):
            continue

        if category_matrix is None:
            category_matrix = torch.stack(list(category_df['Embeddings']), dim=0)

        # Only the first row of the similarity matrix is used.
        # NOTE(review): assumes ebd_content is a single embedding -- confirm.
        cos_scores = util.cos_sim(ebd_content, category_matrix)[0]

        for cat_pos, score in enumerate(cos_scores):
            if score > treshold:
                # Both positions come from enumerate(), so write by position;
                # label-based .loc would hit the wrong cell (or add a row)
                # whenever an index is not the default 0..n-1 range.
                df.iat[row_pos, topic_columns[cat_pos]] = float(score)
    return df
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
def save_data(df, filename):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
df = df.drop(columns=['Embeddings'])
|
| 187 |
new_filename = filename.replace(".", "_classified.")
|