# ScholarBot / src/utils.py
# (page-header residue from the repository web view: "vinny4's picture",
#  commit "initial commit", 9c37331)
import os
import yaml
import requests
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
def get_pdf_from_url(arxiv_id: str, save_dir: str) -> str:
    """
    Download a PDF from arXiv given an ID, unless already downloaded.

    Args:
        arxiv_id (str): The arXiv identifier, used both in the download URL
            and as the local file name (``<arxiv_id>.pdf``).
        save_dir (str): Directory in which the PDF is stored.

    Returns:
        str: Path to the downloaded (or existing) PDF.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    pdf_path = os.path.join(save_dir, f"{arxiv_id}.pdf")

    # Cache hit: a previous call already stored this paper.
    if os.path.exists(pdf_path):
        return pdf_path

    # Create the parent of the target file rather than just save_dir, so that
    # old-style IDs containing a slash (e.g. "hep-th/9901001") also work.
    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    with open(pdf_path, "wb") as f:
        f.write(response.content)

    # The original fell off the end here and returned None after a download.
    return pdf_path
def load_config(config_path: str="./configs/pipeline.yaml") -> dict:
    """
    Load a YAML configuration file and return its contents as a dictionary.

    Args:
        config_path (str): The path to the YAML configuration file.

    Returns:
        dict: The contents of the configuration file. An empty file yields
            an empty dictionary.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    config_path = Path(config_path)
    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file {config_path} does not exist.")

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open(config_path, 'r', encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # safe_load returns None for an empty document; honour the dict contract.
    return {} if config is None else config
def extract_text_from_pdf(pdf_path: str) -> list:
    """
    Load a PDF file with ``PyPDFLoader`` and return the loaded documents.

    Note:
        Despite the function name, this does not return a single string. It
        returns whatever ``PyPDFLoader(...).load()`` produces, unchanged
        (per the LangChain documentation, a list of ``Document`` objects,
        one per page, with the text in ``page_content``). The previous
        ``-> str`` annotation did not match this behaviour.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        list: The documents loaded from the PDF.
    """
    loader = PyPDFLoader(pdf_path)
    return loader.load()