Spaces:
Sleeping
Sleeping
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
| import sys | |
| import glob | |
| import torch | |
| import pandas as pd | |
| from tqdm import tqdm | |
| parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) | |
| sys.path.append(parent_dir) | |
| import functions as fn | |
def get_embeddings(chunk_size, chunk_overlap, model_name, input_path='docs/*.txt', output_path='embeddings/embeddings.xlsx'):
    """Chunk every text file matched by *input_path*, embed each chunk, and save to Excel.

    Parameters
    ----------
    chunk_size : int
        Maximum characters per chunk (the splitter measures length with ``len``).
    chunk_overlap : int
        Characters shared between consecutive chunks.
    model_name : str
        SentenceTransformer model identifier (e.g. a Hugging Face model id).
    input_path : str, optional
        Glob pattern selecting the input text files.
    output_path : str, optional
        Destination ``.xlsx`` file; its parent directory is created if missing.

    Returns
    -------
    pandas.DataFrame
        One row per chunk: the embedding components as numbered columns plus
        ``segment_content``, ``file_name`` and ``model_name`` columns.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # Collect chunks from every matched file, remembering each chunk's source file
    # (file_names stays index-aligned with all_splitted_text).
    all_splitted_text = []
    file_names = []
    for file in glob.glob(input_path):
        text = fn.load_text(file)
        splitted_text = text_splitter.create_documents([text])
        all_splitted_text.extend(splitted_text)
        file_names.extend([os.path.basename(file)] * len(splitted_text))

    # Load the (potentially large) embedding model once, after chunking.
    model = SentenceTransformer(model_name)

    embeddings_list = []
    content_list = []
    # total= lets tqdm show a real percentage (zip alone has no length).
    for segment in tqdm(all_splitted_text, total=len(all_splitted_text),
                        desc="Procesando segmentos"):
        embeddings_list.append(model.encode(segment.page_content))
        content_list.append(segment.page_content)

    embeddings_df = pd.DataFrame(embeddings_list)
    embeddings_df['segment_content'] = content_list
    # file_names is already one entry per chunk, in the same order.
    embeddings_df['file_name'] = file_names
    # Scalar broadcast: every row used the same model.
    embeddings_df['model_name'] = model_name

    # Bug fix: create the output directory if needed, otherwise
    # DataFrame.to_excel raises FileNotFoundError on a fresh checkout.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    embeddings_df.to_excel(output_path, index=False)
    return embeddings_df
| if __name__ == "__main__": | |
| current_dir = os.getcwd() | |
| get_embeddings(chunk_size=512, chunk_overlap=100, model_name='intfloat/multilingual-e5-large') | |