USMLEPrepAI / src /helper.py
Nahiyan14's picture
Upload 8 files
dacda30 verified
raw
history blame contribute delete
979 Bytes
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF documents found in a directory.

    Args:
        data: Directory path passed to ``DirectoryLoader``; files matching
            the ``*.pdf`` glob are parsed with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files
        (presumably a list of LangChain documents -- confirm against the
        installed LangChain version).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping text chunks.

    Args:
        extracted_data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents`` (e.g. the
            output of ``load_pdf_file``).
        chunk_size: Maximum size of each chunk, forwarded to the splitter.
            Defaults to 500, the value this helper previously hard-coded.
        chunk_overlap: Amount of overlap between consecutive chunks,
            forwarded to the splitter. Defaults to 20, the value this
            helper previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)
#Download the Embeddings from HuggingFace
def download_hugging_face_embeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
):
    """Build a HuggingFace embeddings object for the given model.

    Args:
        model_name: Identifier of the sentence-transformers model to use.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, the
            model this helper previously hard-coded.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    # NOTE: the default model returns 384-dimensional vectors; any vector
    # store index built on these embeddings must use a matching dimension,
    # so switching models may require re-creating the index.
    return HuggingFaceEmbeddings(model_name=model_name)