Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from transformers import AutoTokenizer, AutoModel | |
| from sentence_transformers import SentenceTransformer, util | |
| import numpy as np | |
| import torch | |
def load_data(file_obj):
    """Read an Excel workbook into a DataFrame.

    Args:
        file_obj: The Excel source, passed unchanged to ``pd.read_excel``
            (presumably the upload received from the Gradio UI -- confirm
            against the caller).

    Returns:
        The DataFrame produced by ``pd.read_excel`` with default options.
    """
    frame = pd.read_excel(file_obj)
    return frame
def initialize_models():
    """Create the sentence-embedding model used by the pipeline.

    Returns:
        A ``SentenceTransformer`` built from the "all-mpnet-base-v2" name.
    """
    return SentenceTransformer("all-mpnet-base-v2")
def generate_embeddings(df, model, Column):
    """Embed the text of one column and store it in ``df['Embeddings']``.

    For every row whose ``Column`` value is a string, the text is encoded
    with ``model.encode(..., convert_to_tensor=True)``. When the frame has a
    ``"Title"`` column and the row's title is a string, the title is
    prepended to the text (separated by a newline) before encoding. Rows
    whose ``Column`` value is not a string receive ``np.nan`` instead.

    Args:
        df: DataFrame to process; it is modified in place.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
        Column: Name of the column holding the text to embed.

    Returns:
        The same ``df`` object with an added (or overwritten)
        ``"Embeddings"`` column.

    Raises:
        KeyError: If ``Column`` is not a column of ``df``.
    """
    # Whether a title column exists cannot change between rows: check once.
    has_title = 'Title' in df.columns

    embeddings_list = []
    for index, row in df.iterrows():
        text = row[Column]
        # isinstance (instead of an exact type comparison) also accepts
        # str subclasses.
        if not isinstance(text, str):
            embeddings_list.append(np.nan)
            continue

        print(index)  # progress trace, kept from the original behaviour
        title = row["Title"] if has_title else None
        content = title + "\n" + text if isinstance(title, str) else text
        embeddings_list.append(model.encode(content, convert_to_tensor=True))

    df['Embeddings'] = embeddings_list
    return df
def process_categories(categories, model):
    """Build the category table and attach an embedding per description.

    Args:
        categories: Data accepted by ``pd.DataFrame`` that yields a
            ``'description'`` column.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        The DataFrame built from ``categories`` with an extra
        ``'Embeddings'`` column holding the encoded descriptions.
    """
    def _embed_description(category_row):
        # Encode the description text of a single category row.
        return model.encode(category_row['description'], convert_to_tensor=True)

    category_table = pd.DataFrame(categories)
    category_table['Embeddings'] = category_table.apply(_embed_description, axis=1)
    return category_table
def match_categories(df, category_df, treshold=0.45):
    """Attach to every row the categories whose embedding is similar enough.

    Each tensor in ``df['Embeddings']`` is compared (cosine similarity via
    ``util.cos_sim``) with all embeddings in ``category_df['Embeddings']``.
    Categories whose score is strictly greater than ``treshold`` are kept.

    Args:
        df: DataFrame with an ``'Embeddings'`` column; modified in place.
        category_df: DataFrame with ``'description'``, ``'experts'``,
            ``'topic'`` and ``'Embeddings'`` columns.
        treshold: Minimum (exclusive) cosine similarity for a match.

    Returns:
        The same ``df`` with four added columns:
        ``"Description"``, ``"Expert"``, ``"Topic"`` -- lists of the matching
        categories' values, or ``np.nan`` when the row has no tensor
        embedding; ``"Score"`` -- list of the matching scores as floats, or
        the string ``'pas interessant'`` when the row has no tensor
        embedding.
    """
    # The indices derived from the similarity vector are positions, so the
    # category values are read positionally instead of through label-based
    # ``.loc`` (which is wrong whenever category_df has a non-default index).
    descriptions = list(category_df['description'])
    experts = list(category_df['experts'])
    topics = list(category_df['topic'])
    category_embeddings = list(category_df['Embeddings'])

    # The category matrix is the same for every row: build it once. With no
    # categories there is nothing to stack and nothing can match.
    category_matrix = (
        torch.stack(category_embeddings, dim=0) if category_embeddings else None
    )

    categories_list, experts_list, topic_list, scores_list = [], [], [], []
    for ebd_content in df['Embeddings']:
        if not isinstance(ebd_content, torch.Tensor):
            # Row without an embedding (its text was missing).
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
            continue

        if category_matrix is None:
            cos_scores = None
            high_score_indices = []
        else:
            cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
            high_score_indices = (
                torch.nonzero(cos_scores > treshold).flatten().tolist()
            )

        # Collect description, experts, topic and score of every match.
        categories_list.append([descriptions[i] for i in high_score_indices])
        experts_list.append([experts[i] for i in high_score_indices])
        topic_list.append([topics[i] for i in high_score_indices])
        scores_list.append([float(cos_scores[i]) for i in high_score_indices])

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df
def flatten_nested_lists(nested_list):
    """Return a flat list with the leaves of arbitrarily nested lists.

    Only ``list`` instances are descended into; every other item (tuples
    and strings included) is kept as a single element, in original order.
    """
    def _leaves(items):
        # Yield non-list items in order, descending into nested lists.
        for entry in items:
            if isinstance(entry, list):
                yield from _leaves(entry)
            else:
                yield entry

    return list(_leaves(nested_list))
def save_data(df, filename):
    """Write the classified rows to a new Excel file next to the input.

    List cells of the ``'Expert'``, ``'Description'``, ``'Topic'`` and
    ``'Score'`` columns are collapsed into comma-separated strings; these
    four columns are overwritten on the ``df`` that was passed in. The
    ``'Embeddings'`` column is left out of the written file only.

    Args:
        df: DataFrame holding the ``'Expert'``, ``'Description'``,
            ``'Topic'``, ``'Score'`` and ``'Embeddings'`` columns.
        filename: Path of the input file; the output path is derived from it.

    Returns:
        The output path: ``filename`` with ``"_classified"`` inserted just
        before the file extension (``data.xlsx`` -> ``data_classified.xlsx``).
    """
    import os  # local import: `os` is not among the file-level imports

    # 'Expert' cells may hold nested lists, so flatten before joining.
    df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
    df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)

    exportable = df.drop(columns=['Embeddings'])

    # Split off only the final extension. Replacing every "." in the path
    # would also rewrite dots in directory names or in the file stem
    # (e.g. "./out/v1.2/data.xlsx").
    root, extension = os.path.splitext(filename)
    new_filename = root + "_classified" + extension

    exportable.to_excel(new_filename, index=False)
    return new_filename
def classification(column, file_path, categories, treshold):
    """Run the whole pipeline: load, embed, match categories, export.

    Args:
        column: Name of the text column to embed.
        file_path: Excel source; passed to ``load_data`` and reused by
            ``save_data`` to derive the output file name.
        categories: Category definitions passed to ``process_categories``.
        treshold: Similarity threshold forwarded to ``match_categories``.

    Returns:
        A tuple ``(output_path, df)``: the value returned by ``save_data``
        and the processed DataFrame.
    """
    frame = load_data(file_path)
    encoder = initialize_models()

    # Embed the rows and the categories with the same model, then match them.
    frame = generate_embeddings(frame, encoder, column)
    category_frame = process_categories(categories, encoder)
    frame = match_categories(frame, category_frame, treshold=treshold)

    output_path = save_data(frame, file_path)
    return output_path, frame