Spaces:
Sleeping
Sleeping
| from sentence_transformers import SentenceTransformer | |
| from A_Preprocess import load_pdf_data | |
| from E_Model_utils import get_embeddings | |
| import numpy as np | |
| import faiss | |
# Build a FAISS index over sentence embeddings of the intent utterances,
# persist it to disk, reload it, and run an example nearest-neighbour query.

# Load and preprocess data.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists; consider making it configurable.
data_file_path = r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data\Pager_Intents_Cleaned.csv'
data = load_pdf_data(data_file_path)
sentences = data['utterance'].tolist()

# The short model name is defined once: it selects the model and is reused
# below for the index file name.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(f'sentence-transformers/{model_name}')

embeddings = get_embeddings(model, sentences)
print(f'Embeddings shape: {embeddings.shape}.')

# Convert embeddings to a float32 NumPy array before handing them to FAISS.
embeddings = np.array(embeddings).astype('float32')

# Create a flat L2-distance index sized to the embedding dimension and fill it.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save the index, then load it back (as a later consumer would) so the query
# below runs against the persisted copy.
index_path = f"{model_name}_faiss.index"
faiss.write_index(index, index_path)
index = faiss.read_index(index_path)

# Example query: find the 5 nearest neighbours of a free-text utterance.
# The text is embedded with the same helper as the corpus so both live in the
# same vector space. Previously the query string was overwritten by
# embeddings[0] and never actually used.
query_text = 'cat am de platit la factura'
top_k = 5
# assumes get_embeddings returns one row per input sentence -- the reshape
# guarantees the (1, dim) layout that index.search is given here.
query_embedding = (
    np.array(get_embeddings(model, [query_text]))
    .astype('float32')
    .reshape(1, -1)
)
D, I = index.search(query_embedding, top_k)  # D: distances, I: indices
print("Indices of nearest neighbors:", I)
print("Distances of nearest neighbors:", D)