Bertopic / src /scripts /nlp_processing.py
Dopler47's picture
increased GPU duration
66d14d1
import spaces
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from src.utils.constants import EMBEDDING_MODEL_NAME
# Shared model instance: built once when this module is imported and reused by
# embed_splitted_docs on every call, rather than being re-created per request.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
@spaces.GPU(duration=10)
def embed_splitted_docs(splitted_docs):
    """
    Compute embeddings for a list of text chunks.

    The chunks are encoded with the module-level ``embedding_model``; a
    progress bar is displayed while encoding runs.

    Parameters
    ----------
    splitted_docs : List of str
        The text chunks to embed.

    Returns
    -------
    embeddings
        The value returned by ``embedding_model.encode`` for the given
        chunks (presumably a numpy array with one row per chunk, which is
        the sentence-transformers default -- confirm against the installed
        version).

    Notes
    -----
    The function is wrapped in ``spaces.GPU(duration=10)``; the duration is
    presumably the GPU time budget, in seconds, granted per call -- verify
    against the Hugging Face ZeroGPU documentation.
    """
    return embedding_model.encode(splitted_docs, show_progress_bar=True)
def split_corpus(corpus, chunk_size=1000):
    """
    Break every text of a corpus into smaller, non-overlapping chunks.

    Parameters
    ----------
    corpus : List of str
        The texts to be split.
    chunk_size : int, default=1000
        Value forwarded as ``chunk_size`` to the
        ``RecursiveCharacterTextSplitter``.

    Returns
    -------
    List of str
        The text content of every chunk produced by the splitter, in the
        order the splitter returned them.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
        add_start_index=True,
    )
    # Only the text of each produced document is kept; any metadata attached
    # by the splitter is dropped.
    return [chunk.page_content for chunk in chunker.create_documents(corpus)]