Spaces:
Sleeping
Sleeping
| import os | |
| import yaml | |
| import requests | |
| from pathlib import Path | |
| from langchain.document_loaders import PyPDFLoader | |
def get_pdf_from_url(arxiv_id: str, save_dir: str) -> str:
    """
    Download a PDF from arXiv given an ID, unless it is already on disk.

    The file is stored as ``<save_dir>/<arxiv_id>.pdf``. If that file already
    exists it is treated as a cache hit and no network request is made.

    Args:
        arxiv_id (str): The arXiv identifier, e.g. ``"1706.03762"``.
        save_dir (str): Directory in which the PDF is stored; created if missing.

    Returns:
        str: Path to the downloaded (or existing) PDF.

    Raises:
        requests.HTTPError: If arXiv answers with an error status code.
        requests.RequestException: On connection problems or timeouts.
    """
    os.makedirs(save_dir, exist_ok=True)
    pdf_path = os.path.join(save_dir, f"{arxiv_id}.pdf")

    # Cache hit: reuse the file from a previous call.
    if os.path.exists(pdf_path):
        return pdf_path

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Old-style arXiv IDs (e.g. "hep-th/9901001") contain a slash, which puts
    # the target file in a sub-directory of save_dir that may not exist yet.
    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    # The freshly downloaded path must be returned as well, not only the
    # cached one; otherwise first-time callers receive None.
    return pdf_path
def load_config(config_path: str="./configs/pipeline.yaml") -> dict:
    """
    Read a YAML configuration file and hand back its parsed contents.

    Args:
        config_path (str): Location of the YAML file to read.

    Returns:
        dict: The value produced by ``yaml.safe_load`` for the file.

    Raises:
        FileNotFoundError: If nothing exists at ``config_path``.
    """
    yaml_file = Path(config_path)
    if yaml_file.exists():
        with open(yaml_file, 'r') as stream:
            return yaml.safe_load(stream)
    # Reached only when the file is missing.
    raise FileNotFoundError(f"Configuration file {yaml_file} does not exist.")
def extract_text_from_pdf(pdf_path: str) -> list:
    """
    Load a PDF with LangChain's ``PyPDFLoader`` and return the loaded documents.

    Despite the function name, the result is not a single string: the value
    returned is exactly what ``PyPDFLoader.load()`` produces. According to the
    LangChain documentation that is a list of ``Document`` objects whose text
    lives in ``page_content``; callers that need plain text should join those
    fields themselves.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        list: The documents loaded from the PDF.
    """
    loader = PyPDFLoader(pdf_path)
    return loader.load()