from typing import List

import torch
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.schema import Document
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq


def load_file(path) -> List[Document]:
    """Load every PDF directly inside *path* as LangChain documents.

    Note: the glob is ``*.pdf`` (non-recursive) — PDFs in subdirectories
    are not picked up.

    Args:
        path: Directory to scan for ``*.pdf`` files.

    Returns:
        One ``Document`` per PDF page, as produced by ``PyPDFLoader``.
    """
    loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
    return loader.load()


def filtering(documents) -> List[Document]:
    """Rebuild documents keeping only the ``source`` metadata key.

    PyPDFLoader attaches extra metadata (page numbers, etc.); this strips
    everything except the originating file path so the vector store stays
    lean.

    Args:
        documents: Documents as returned by :func:`load_file`.

    Returns:
        New ``Document`` objects with identical page content and
        ``{"source": ...}`` as the only metadata.
    """
    # Comprehension instead of loop+append: pure construction (PERF401).
    return [
        Document(
            page_content=doc.page_content,
            metadata={"source": doc.metadata.get("source")},
        )
        for doc in documents
    ]


def chunking(docs) -> List[Document]:
    """Split documents into overlapping chunks for embedding.

    Args:
        docs: Documents to split (e.g. output of :func:`filtering`).

    Returns:
        Chunked documents: at most 1000 characters each, with a
        200-character overlap to preserve context across boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_documents(docs)


def download_embeddings() -> HuggingFaceEmbeddings:
    """Construct the BGE-small sentence-embedding model.

    Places the model on CUDA when a GPU is available, otherwise CPU.

    Returns:
        A ``HuggingFaceEmbeddings`` instance wrapping
        ``BAAI/bge-small-en-v1.5``.
    """
    return HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    )