Spaces:

Corin1998
/

PR_IRminiSaaS

Sleeping

Update app/ingest.py

c16bedc verified 6 months ago

882 Bytes

	import io
	import logging
	from typing import Optional

	import requests
	import trafilatura
	from bs4 import BeautifulSoup
	from pypdf import PdfReader

	# User-Agent header value sent by the requests-based fallback in extract_from_url.
	USER_AGENT = "Mozilla/5.0 (compatible; PRIRBot/1.0)"

	def extract_from_pdf(file_bytes: bytes) -> str:
	    """Extract the text of every readable page of a PDF.

	    Args:
	        file_bytes: Raw content of the PDF document.

	    Returns:
	        The text of each page joined with newlines, in page order. A page
	        that yields no text contributes an empty string; a page whose
	        extraction raises is skipped (and logged).
	    """
	    reader = PdfReader(io.BytesIO(file_bytes))
	    texts = []
	    for page_number, page in enumerate(reader.pages, start=1):
	        try:
	            texts.append(page.extract_text() or "")
	        except Exception:
	            # Best-effort: one unreadable page must not abort the whole
	            # document, but record it so the gap in the output is traceable.
	            logging.getLogger(__name__).warning(
	                "Skipping PDF page %d: text extraction failed",
	                page_number,
	                exc_info=True,
	            )
	    return "\n".join(texts)

	def extract_from_url(url: str) -> str:
	    """Fetch a web page and return its readable text.

	    trafilatura is tried first. When it cannot download the page or
	    extracts nothing, the page is fetched with requests and reduced to
	    text with BeautifulSoup.

	    Args:
	        url: Address of the page to ingest.

	    Returns:
	        The extracted text of the page.

	    Raises:
	        requests.RequestException: If the fallback request fails or the
	            server answers with an HTTP error status.
	    """
	    downloaded = trafilatura.fetch_url(url)
	    if downloaded:
	        txt = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
	        if txt:
	            return txt

	    # Fallback: plain tag stripping with BeautifulSoup.
	    resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=20)
	    # Never ingest the body of a 4xx/5xx error page as document text.
	    resp.raise_for_status()

	    # requests falls back to ISO-8859-1 for text/* responses that declare no
	    # charset, which garbles UTF-8 pages when reading resp.text. Trust the
	    # header only when it actually names a charset; otherwise hand
	    # BeautifulSoup the raw bytes so it can detect the encoding itself
	    # (e.g. from a <meta charset> tag).
	    content_type = resp.headers.get("Content-Type", "")
	    declared = resp.encoding if "charset" in content_type.lower() else None
	    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared)
	    return soup.get_text("\n")