Spaces:
Sleeping
Sleeping
File size: 1,209 Bytes
79338e8 249af48 79338e8 1c6c420 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
# Load every PDF file found in a directory
def load_pdf_file(data):
    """Load the PDF files matched by ``*.pdf`` under ``data``.

    Args:
        data: Location handed to ``DirectoryLoader`` as its first argument
            (presumably a directory path -- confirm against callers).

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being read with ``PyPDFLoader``.
    """
    # Build the loader and load in one step; no intermediate state is needed.
    return DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this function previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this function previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
# Load the embedding model from HuggingFace
def download_hugging_face_embeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
):
    """Create the ``HuggingFaceEmbeddings`` object used for embedding text.

    Progress and failure messages are printed to stdout.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``'sentence-transformers/all-MiniLM-L6-v2'``, the
            value this function previously hard-coded.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.

    Raises:
        Exception: Any exception raised while constructing the embeddings
            object is printed and then re-raised unchanged.
    """
    try:
        print("Starting to load embedding model...")
        embeddings = HuggingFaceEmbeddings(model_name=model_name)
        print("Embedding model loaded successfully")
        return embeddings
    except Exception as e:
        # Report the failure, then propagate it so callers still see the
        # original exception and traceback.
        print(f"Error loading embedding model: {e}")
        raise