# NOTE(review): the original paste carried Hugging Face Spaces page residue
# ("Spaces:" / "Sleeping") and markdown-table pipes around every line;
# that residue is recorded here as a comment so the file stays valid Python.
"""Build and persist a FAISS index of sentence embeddings for intent data.

Loads utterances from a project CSV, embeds them with the
``intfloat/multilingual-e5-small`` transformer using batched inference
(to keep peak memory low), and saves the resulting FAISS index under a
name derived from the data file and the model.
"""
import os
import sys
from pathlib import Path

# The project root must be importable BEFORE the sibling-module imports
# below; the original script appended it afterwards, which only worked
# when the current working directory happened to be correct.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import numpy as np
from sentence_transformers import SentenceTransformer  # kept for optional model swaps
from transformers import AutoTokenizer, AutoModel

from A_Preprocess import load_pdf_data
from E_Faiss_utils import save_faiss_embeddings_index
from E_Model_utils import (
    batch_process_transformes_embeddings,
    get_embeddings,
    get_transformes_embeddings,
)

BASE_DIR = Path(__file__).resolve().parent.parent

# Load and preprocess the utterance data.
file_name = 'InvoiceDetailsExplanation.csv'
data_file_path = BASE_DIR / "data" / file_name
data = load_pdf_data(str(data_file_path))
sentences = data['utterance'].tolist()

# Tokenizer may be fetched from the hub; the model weights must already be
# cached locally (local_files_only=True avoids a network download).
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-small", local_files_only=True)
model_name = 'multilingual-e5-small'

# Alternative backbones (e.g. SentenceTransformer models such as
# 'paraphrase-multilingual-MiniLM-L12-v2' with get_embeddings(), or the
# Romanian BERT 'dumitrescustefan/bert-base-romanian-cased-v1' with
# get_transformes_embeddings()) can be swapped in here; remember to update
# model_name so the saved index file is named consistently.

# Batch processing keeps memory bounded compared with embedding every
# sentence in a single forward pass.
embeddings = batch_process_transformes_embeddings(
    sentences, model, tokenizer, batch_size=16, max_length=128
)
save_faiss_embeddings_index(embeddings, file_name=f"{file_name}_{model_name}_vector_db.index")
print(f'Embeddings shape: {embeddings.shape}.')