# NOTE(review): the lines "Spaces / Sleeping / Sleeping" below this script's
# original header were Hugging Face Space page chrome captured during
# scraping, not part of the program; they are not valid Python.
import os
import re

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
def save_embeddings(embeddings, file_name):
    """Persist embeddings as a raw ``.npy`` dump and as a FAISS index.

    Parameters
    ----------
    embeddings : torch.Tensor or np.ndarray
        2-D array of shape (n_texts, dim). A PyTorch tensor is moved to
        CPU and converted; a NumPy array is used as-is.
    file_name : str
        Path for the FAISS index file. The raw (un-normalized) embeddings
        are written next to it as ``f"{file_name}_embeddings.npy"``.

    Returns
    -------
    faiss.Index
        The populated ``IndexFlatL2`` over L2-normalized vectors, so L2
        distance on it is monotonically related to cosine similarity.
    """
    # Accept either a torch tensor or an already-converted ndarray.
    if hasattr(embeddings, "cpu"):
        embeddings = embeddings.cpu().numpy()
    # FAISS requires contiguous float32 input.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    dimension = embeddings.shape[1]
    # faiss.write_index / np.save fail if the target directory is missing.
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Save the raw vectors BEFORE normalization: normalize_L2 works in place.
    np.save(f"{file_name}_embeddings.npy", embeddings)
    index = faiss.IndexFlatL2(dimension)
    faiss.normalize_L2(embeddings)  # in place; unit vectors make L2 ~ cosine
    index.add(embeddings)
    faiss.write_index(index, file_name)
    return index
def normalize_embeddings(embeddings):
    """Return *embeddings* as a NumPy array with L2-normalized (unit) rows.

    Accepts a PyTorch tensor (moved to CPU and converted) or a NumPy array
    (used as-is; the original only handled tensors and raised
    ``AttributeError`` on an ndarray). Normalization is done in place by
    ``faiss.normalize_L2``.
    """
    if hasattr(embeddings, "cpu"):
        embeddings = embeddings.cpu().numpy()
    faiss.normalize_L2(embeddings)
    return embeddings
def train_model(model_name):
    """Load a pre-trained SentenceTransformer by name or path.

    Despite the name, no training happens here — the model weights are
    simply downloaded/loaded and returned ready for ``encode``.
    """
    return SentenceTransformer(model_name)
def get_embeddings(model, texts):
    """Encode *texts* with *model*, returning embeddings as a PyTorch tensor."""
    encoded = model.encode(texts, convert_to_tensor=True)
    return encoded
def load_data(file_path):
    """Read the CSV at *file_path* into a pandas DataFrame."""
    return pd.read_csv(file_path)
def clean_text(text):
    """Normalize an utterance: lowercase, drop punctuation and digits, strip.

    Non-string input (e.g. a NaN float produced by ``pd.read_csv`` for an
    empty cell) is coerced to ``""`` instead of raising ``AttributeError``.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # keep word chars and whitespace only
    text = re.sub(r"\d+", "", text)      # remove digit runs
    return text.strip()
def preprocess_data(data):
    """Clean every entry of the 'utterance' column in place and return *data*."""
    data["utterance"] = data["utterance"].map(clean_text)
    return data
# ---------------------------------------------------------------------------
# Script entry: load the intent dataset, embed every utterance with each
# candidate model, and persist each result as a FAISS index + raw .npy dump.
# ---------------------------------------------------------------------------

# Load and preprocess data.
data_file_path = r"C:\Users\serban.tica\Documents\Intent_detection\data\Pager_Intents_Recent.csv"
data = load_data(data_file_path)
data = preprocess_data(data)

# Candidate embedding models to evaluate (display name -> HF model id).
# Add entries here to benchmark more models, e.g.:
#   "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
#   "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
#   "all-distilroberta-v1": "sentence-transformers/all-distilroberta-v1",
#   "Romanian BERT": "dumitrescustefan/bert-base-romanian-cased-v1",
models = {
    "multilingual-e5-small": "intfloat/multilingual-e5-small",
}

# faiss.write_index / np.save fail if the output directory does not exist.
os.makedirs("embeddings", exist_ok=True)

# The utterance list is identical for every model; build it once.
texts = data["utterance"].tolist()

# Generate and save embeddings for each model.
for model_name, model_path in models.items():
    print(f"Processing model: {model_name}")
    model = train_model(model_path)
    embeddings = get_embeddings(model, texts)
    save_embeddings(embeddings, file_name=f"embeddings/{model_name}_vector_db.index")