from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
# NOTE: this import rebinds HuggingFaceEmbeddings, so the langchain_huggingface
# class is the one actually used below. Keep it after the langchain_community
# import to preserve that behavior.
from langchain_huggingface import HuggingFaceEmbeddings


# Extract data from the PDF files
def load_pdf_file(data: str, glob: str = "*.pdf") -> list:
    """Load the PDF files found in a directory.

    Builds a ``DirectoryLoader`` over ``data`` that selects files matching
    ``glob`` and reads each one with ``PyPDFLoader``.

    Args:
        data: Path of the directory to scan.
        glob: File pattern handed to ``DirectoryLoader``. Defaults to
            ``"*.pdf"``, the value this function has always used.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files
        (presumably a list of LangChain documents -- confirm against the
        installed LangChain version).
    """
    loader = DirectoryLoader(data, glob=glob, loader_cls=PyPDFLoader)
    return loader.load()


# Split the data into text chunks
def text_split(extracted_data, chunk_size: int = 500, chunk_overlap: int = 20) -> list:
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, e.g. the value returned by
            ``load_pdf_file``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


# Load the embeddings model from HuggingFace
def download_hugging_face_embeddings(
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
):
    """Create the HuggingFace embeddings object used by the application.

    Progress and failure messages are printed to stdout.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``'sentence-transformers/all-MiniLM-L6-v2'``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.

    Raises:
        Exception: Any error raised while constructing the embeddings object
            is reported with ``print`` and then re-raised unchanged.
    """
    try:
        print("Starting to load embedding model...")
        embeddings = HuggingFaceEmbeddings(model_name=model_name)
        print("Embedding model loaded successfully")
        return embeddings
    except Exception as e:
        # Report the failure, then let the caller decide how to handle it.
        print(f"Error loading embedding model: {e}")
        raise