File size: 1,418 Bytes
ff54a63
 
c6607a8
ff54a63
947d516
 
c6607a8
ff54a63
c6607a8
66d14d1
c6607a8
ff54a63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import spaces
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

from src.utils.constants import EMBEDDING_MODEL_NAME

# Module-level SentenceTransformer instance, built once when this module is
# imported from the model name configured in EMBEDDING_MODEL_NAME. It is the
# model that embed_splitted_docs() encodes with.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


# NOTE(review): duration=10 is presumably the GPU time budget (in seconds)
# requested for each call -- confirm against the `spaces` package docs.
@spaces.GPU(duration=10)
def embed_splitted_docs(splitted_docs):
    """
    Compute embeddings for a batch of text chunks.

    The encoding is delegated to the module-level ``embedding_model``, with
    its progress bar enabled.

    Parameters
    ----------
    splitted_docs : List of str
        The text chunks to embed.

    Returns
    -------
    embeddings
        Whatever ``embedding_model.encode`` returns for ``splitted_docs``
        (presumably one embedding vector per chunk -- confirm against the
        sentence-transformers documentation).
    """
    return embedding_model.encode(splitted_docs, show_progress_bar=True)


def split_corpus(corpus, chunk_size=1000):
    """
    Split every document of a corpus into chunks of bounded size.

    Parameters
    ----------
    corpus : Iterable of str
        The documents to be split. Each document is split independently.
    chunk_size : int, default=1000
        Forwarded to ``RecursiveCharacterTextSplitter`` as its ``chunk_size``
        (the upper bound on a chunk's length, as measured by the splitter's
        default length function).

    Returns
    -------
    List of str
        The chunks of all documents as one flat list, in corpus order.
    """
    # Chunks do not overlap. ``add_start_index`` is deliberately not set: it
    # only populates Document metadata, and this function has always returned
    # the bare chunk text.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=0
    )

    # split_text() yields the chunk strings directly, so there is no need to
    # build intermediate Document objects just to read their page_content.
    return [
        chunk
        for document in corpus
        for chunk in splitter.split_text(document)
    ]