Spaces:
Runtime error
Runtime error
| from langchain_community.document_loaders import PyPDFDirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from huggingface_hub import hf_hub_download | |
| # loading the data | |
| def load_data(path): | |
| loader = PyPDFDirectoryLoader(path) | |
| extracted_data = loader.load() | |
| return extracted_data | |
| #Create text chunks | |
| def text_split(extracted_data): | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20) | |
| text_chunks = text_splitter.split_documents(extracted_data) | |
| return text_chunks | |
| #download embedding model | |
| def download_hf_embeddings(): | |
| embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") | |
| return embeddings | |
| # downloading any pdf on web | |
| import os | |
| import requests | |
| def download_pdf(url): | |
| if not os.path.exists('data'): | |
| os.makedirs('data') | |
| pdf_url = url | |
| # Get the filename from the URL | |
| filename = pdf_url.split("/")[-1] | |
| # Full path where the PDF will be saved | |
| save_path = os.path.join('data', filename) | |
| # Download the PDF | |
| response = requests.get(pdf_url) | |
| # Check if the request was successful | |
| if response.status_code == 200: | |
| # Write the content to a file | |
| with open(save_path, 'wb') as file: | |
| file.write(response.content) | |
| print(f"PDF downloaded and saved to {save_path}") | |
| else: | |
| print(f"Failed to download PDF. Status code: {response.status_code}") | |
| def download_hf_model(model_name_or_path, model_basename): | |
| model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename) | |
| return model_path | |