import os
from pathlib import Path

import requests
import yaml
from langchain.document_loaders import PyPDFLoader


def get_pdf_from_url(arxiv_id: str, save_dir: str, timeout: float = 30.0) -> str:
    """
    Download a PDF from arXiv given an ID, unless it is already downloaded.

    Args:
        arxiv_id (str): The arXiv identifier; used both in the download URL
            and as the local file name (``<arxiv_id>.pdf``).
        save_dir (str): Directory in which the PDF is stored. Created if it
            does not exist.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        str: Path to the downloaded (or existing) PDF.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    os.makedirs(save_dir, exist_ok=True)
    pdf_path = os.path.join(save_dir, f"{arxiv_id}.pdf")

    # A file at the target path is treated as a cache hit; nothing is fetched.
    if os.path.exists(pdf_path):
        return pdf_path

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # Without a timeout, requests may block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Write to a temporary sibling and rename afterwards, so an interrupted
    # write never leaves a truncated file that later calls would mistake for
    # a valid cached PDF.
    tmp_path = f"{pdf_path}.part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        os.replace(tmp_path, pdf_path)
    finally:
        # Only present here if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return pdf_path


def load_config(config_path: str = "./configs/pipeline.yaml") -> dict:
    """
    Load a YAML configuration file and return its contents as a dictionary.

    Args:
        config_path (str): The path to the YAML configuration file.

    Returns:
        dict: The contents of the configuration file. An empty file yields
            an empty dictionary.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Configuration file {path} does not exist.")

    # Explicit encoding keeps parsing independent of the platform locale.
    with open(path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # safe_load returns None for an empty document; honour the dict contract.
    if config is None:
        return {}
    return config


def extract_text_from_pdf(pdf_path: str) -> list:
    """
    Load a PDF file with ``PyPDFLoader`` and return the loaded documents.

    Note that, despite the function name, the result is whatever
    ``PyPDFLoader.load()`` returns rather than a single string. Per the
    LangChain documentation this is a list of ``Document`` objects whose
    text is available as ``page_content`` (presumably one per page; verify
    against the installed LangChain version).

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        list: The documents produced by ``PyPDFLoader.load()``.
    """
    loader = PyPDFLoader(pdf_path)
    return loader.load()