import os

import requests
from pdfplumber import open as pdf_open

def download_pdf(url, paper_id):
    """Download a PDF into the downloads/ directory and return its path."""
    # Sanitize the identifier so it is safe to use as a filename.
    paper_id = paper_id.replace("/", "-")
    directory = "downloads"
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"{paper_id}.pdf")
    try:
        # A browser-like User-Agent avoids rejection by some servers.
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"},
            timeout=30,
        )
        response.raise_for_status()
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"Saved PDF to: {file_path}")
    except Exception as e:
        print(f"Error downloading PDF: {e}")
        return None
    return file_path

def extract_text_from_pdf(url, paper_id):
    """Download a PDF, extract its text, and delete the local copy."""
    pdf_path = download_pdf(url, paper_id)
    if not pdf_path or not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return ""
    try:
        with pdf_open(pdf_path) as pdf:
            # Concatenate text from every page; pages with no extractable
            # text return None, so fall back to an empty string.
            all_text = " ".join(page.extract_text() or "" for page in pdf.pages)
        # Keep only the span between the abstract and the references when
        # both markers are present (the search is case-sensitive).
        start_index = all_text.find("ABSTRACT")
        end_index = all_text.find("REFERENCES")
        if start_index != -1 and end_index != -1 and start_index < end_index:
            research_paper_text = all_text[start_index:end_index]
        else:
            research_paper_text = all_text
    except Exception as e:
        print(f"Error processing PDF: {e}")
        research_paper_text = ""
    finally:
        # Clean up the downloaded file whether or not extraction succeeded.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
    return research_paper_text
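
As a usage sketch, the two helpers can be driven like this; the arXiv URL and paper identifier below are hypothetical placeholders chosen for illustration, not values from the original:

# Usage sketch: the URL and paper ID are assumed placeholders.
if __name__ == "__main__":
    text = extract_text_from_pdf(
        "https://arxiv.org/pdf/1706.03762",  # hypothetical example URL
        "1706.03762",                        # hypothetical example identifier
    )
    print(text[:500])  # preview the first 500 extracted characters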