Spaces:

vinny4
/

ScholarBot

Sleeping

ScholarBot / src /utils.py

initial commit

9c37331 7 months ago

1.61 kB

	import os
	import yaml
	import requests
	from pathlib import Path
	from langchain.document_loaders import PyPDFLoader

	def get_pdf_from_url(arxiv_id: str, save_dir: str) -> str:
	    """
	    Download a paper's PDF from arXiv, reusing a previously saved copy.

	    The file is stored as ``<save_dir>/<arxiv_id>.pdf``. If that file already
	    exists, no network request is made.

	    Args:
	        arxiv_id (str): The arXiv identifier used to build the download URL.
	        save_dir (str): Directory to store the PDF in; created if missing.

	    Returns:
	        str: Path to the downloaded (or existing) PDF.

	    Raises:
	        requests.HTTPError: If arXiv responds with an error status code.
	        requests.Timeout: If the server does not respond in time.
	    """
	    os.makedirs(save_dir, exist_ok=True)
	    pdf_path = os.path.join(save_dir, f"{arxiv_id}.pdf")

	    # Cache hit: an earlier call already saved this paper.
	    if os.path.exists(pdf_path):
	        return pdf_path

	    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

	    # Without an explicit timeout, requests waits indefinitely on a stalled
	    # connection, which would hang the whole pipeline.
	    response = requests.get(url, timeout=30)
	    response.raise_for_status()

	    with open(pdf_path, "wb") as f:
	        f.write(response.content)

	    # Return the path on the fresh-download branch too, so callers get the
	    # same result whether or not the file was cached.
	    return pdf_path

	def load_config(config_path: str = "./configs/pipeline.yaml") -> dict:
	    """
	    Load a YAML configuration file and return its contents as a dictionary.

	    Args:
	        config_path (str): The path to the YAML configuration file.

	    Returns:
	        dict: The contents of the configuration file. An empty file yields
	        an empty dictionary.

	    Raises:
	        FileNotFoundError: If ``config_path`` does not exist.
	    """
	    path = Path(config_path)
	    if not path.exists():
	        raise FileNotFoundError(f"Configuration file {path} does not exist.")

	    # Read as UTF-8 explicitly so non-ASCII values decode the same way on
	    # every platform instead of depending on the locale's default encoding.
	    with path.open("r", encoding="utf-8") as file:
	        config = yaml.safe_load(file)

	    # safe_load returns None for an empty document; honour the dict contract.
	    if config is None:
	        return {}

	    return config

	def extract_text_from_pdf(pdf_path: str) -> list:
	    """
	    Load a PDF file with LangChain's ``PyPDFLoader``.

	    Note that, despite the function name, the result is not a single string:
	    the value returned by ``PyPDFLoader.load()`` is passed through unchanged.

	    Args:
	        pdf_path (str): The path to the PDF file.

	    Returns:
	        list: The documents produced by ``PyPDFLoader(pdf_path).load()``
	        (per the LangChain docs, one ``Document`` per page with the text in
	        ``page_content`` -- confirm against the installed version).
	    """
	    loader = PyPDFLoader(pdf_path)
	    documents = loader.load()

	    return documents