import spaces
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

from src.utils.constants import EMBEDDING_MODEL_NAME

# Load the embedding model once at module import so every call reuses it.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
|
|
# `spaces.GPU` requests a ZeroGPU device on Hugging Face Spaces for up to
# `duration` seconds while this function runs.
@spaces.GPU(duration=10)
def embed_splitted_docs(splitted_docs):
    """
    Encode the given list of documents using the specified embedding model.

    Parameters
    ----------
    splitted_docs : list of str
        The list of documents to be embedded.

    Returns
    -------
    embeddings : numpy.ndarray
        The embeddings of the given documents, one row per document.
    """
    embeddings = embedding_model.encode(splitted_docs, show_progress_bar=True)
    return embeddings
|
|
def split_corpus(corpus, chunk_size=1000):
    """
    Split a given corpus into chunks of a given size.

    Parameters
    ----------
    corpus : list of str
        The corpus to be split.
    chunk_size : int, default=1000
        The maximum size, in characters, of each chunk split from the corpus.

    Returns
    -------
    list of str
        The list of chunks (split documents) from the corpus.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=0, add_start_index=True
    )
    # `create_documents` returns Document objects; keep only their raw text.
    splitted_docs = splitter.create_documents(corpus)
    splitted_docs = [doc.page_content for doc in splitted_docs]

    return splitted_docs
|
|
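if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline: the sample corpus below
    # is made up for illustration, and EMBEDDING_MODEL_NAME is assumed to name
    # a valid sentence-transformers checkpoint in src.utils.constants.
    # (`spaces.GPU` should be a no-op outside a ZeroGPU Space, so this can
    # also run locally on CPU.)
    sample_corpus = [
        "LangChain's recursive splitter breaks long texts on natural boundaries.",
        "Sentence-transformers models map each chunk to a dense vector.",
    ]
    chunks = split_corpus(sample_corpus, chunk_size=200)
    vectors = embed_splitted_docs(chunks)
    print(f"{len(chunks)} chunks -> embeddings of shape {vectors.shape}")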