File size: 1,302 Bytes
dd7f0a1
78362e2
 
 
dd7f0a1
78362e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings


def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf", 
        loader_cls=PyPDFLoader,
        show_progress=True
        )
    
    documents = loader.load()
    return documents

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs : List[Document]= []
    for doc in docs:
        source = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={
                    "source": source}))
    return minimal_docs

def text_splitter(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20
        )
    text_chunks=text_splitter.split_documents(minimal_docs)
    return text_chunks

def download_embeddings():

    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings= HuggingFaceEmbeddings(
        model_name=model_name,
       
    )
    return embeddings
embeddings = download_embeddings()