Spaces:
Sleeping
Sleeping
File size: 882 Bytes
6bf21bd c16bedc 6bf21bd c16bedc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | from typing import Optional
from pypdf import PdfReader
import io
import trafilatura
import requests
from bs4 import BeautifulSoup
# User-Agent header value sent by the plain `requests` fallback in
# extract_from_url.
USER_AGENT = "Mozilla/5.0 (compatible; PRIRBot/1.0)"
def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Pages are processed in document order and their texts are joined with
    newlines. A page that yields no text contributes an empty string.
    Extraction is best-effort: a page whose extraction raises is skipped
    (with a logged warning) instead of aborting the whole document.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The page texts joined by ``"\\n"``; an empty string if there are no
        pages or no page could be extracted.

    Exceptions raised while opening the document itself (``PdfReader``)
    are not caught and propagate to the caller.
    """
    # Function-scope import so this block stays self-contained.
    import logging

    logger = logging.getLogger(__name__)

    reader = PdfReader(io.BytesIO(file_bytes))
    texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            # extract_text() may return a falsy value; normalize to "".
            texts.append(page.extract_text() or "")
        except Exception:
            # Deliberately broad: one malformed page must not discard the
            # text of the others. Record which page was dropped, and why,
            # instead of swallowing the failure silently.
            logger.warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
    return "\n".join(texts)
def extract_from_url(url: str) -> str:
    """Fetch *url* and return its text content.

    trafilatura is tried first (comments excluded, tables included). If it
    cannot download the page or extracts nothing, the page is downloaded
    with ``requests`` and the text nodes of the parsed HTML are returned,
    separated by newlines.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text. May be an empty string when the fallback page
        contains no text.

    Exceptions raised by ``requests.get`` in the fallback path are not
    caught and propagate to the caller.
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        text = trafilatura.extract(
            downloaded, include_comments=False, include_tables=True
        )
        if text:
            return text

    # Fallback: download the page ourselves and dump its text nodes.
    # NOTE(review): the HTTP status is not checked here, so the body of an
    # error response (404/403/...) is returned as if it were page content.
    # Confirm whether callers would prefer resp.raise_for_status().
    resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=20)

    # Parse the raw bytes rather than resp.text: when the Content-Type
    # header carries no charset, requests decodes text/* as ISO-8859-1,
    # which garbles UTF-8 pages. Given bytes, BeautifulSoup can honour the
    # document's own <meta charset> declaration. A charset that the server
    # did declare in the header is still passed along as the first guess.
    content_type = resp.headers.get("Content-Type", "")
    declared_encoding = (
        resp.encoding if "charset" in content_type.lower() else None
    )
    soup = BeautifulSoup(
        resp.content, "html.parser", from_encoding=declared_encoding
    )
    return soup.get_text("\n")
|