Spaces:
Sleeping
Sleeping
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from sentence_transformers import SentenceTransformer | |
| from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| #Extract Data From the PDF File | |
def load_pdf_file(data):
    """Load the PDF documents found under a directory.

    A ``DirectoryLoader`` is created for ``data`` with the ``*.pdf`` glob,
    and each matching file is read with ``PyPDFLoader``.

    Args:
        data: Directory location handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()
| #Split the Data into Text Chunks | |
def text_split(extracted_data, *, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    # Both sizes are keyword-only so a call site cannot silently swap them.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)
| #Download the Embeddings from HuggingFace | |
def download_hugging_face_embeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
):
    """Create the HuggingFace embedding model used by the pipeline.

    Progress and failure messages are printed to stdout.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, the model
            that used to be hard-coded here, so zero-argument calls behave
            as before.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.

    Raises:
        Exception: Any error raised while constructing the embeddings is
            printed and then re-raised unchanged.
    """
    try:
        print("Starting to load embedding model...")
        embeddings = HuggingFaceEmbeddings(model_name=model_name)
        print("Embedding model loaded successfully")
        return embeddings
    except Exception as e:
        # Report the failure, then let the caller see the original error.
        print(f"Error loading embedding model: {e}")
        raise