Spaces:
Sleeping
Sleeping
| from pypdf import PdfReader | |
| import io | |
| import trafilatura | |
| import requests | |
| from bs4 import BeautifulSoup | |
# User-Agent header value sent by the requests-based fallback download in
# extract_from_url.
UA = "Mozilla/5.0 (compatible; PRIRBot/1.0)"
def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Extraction is best-effort per page: a page whose text extraction raises
    is skipped (and logged) instead of aborting the whole document.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        The text of the successfully processed pages joined with newlines.
        A page for which ``extract_text()`` returns nothing contributes an
        empty string; a page whose extraction raised contributes nothing.

    Raises:
        Whatever ``PdfReader`` raises when the document itself cannot be
        opened; only per-page extraction errors are handled here.
    """
    # Local import so this function does not depend on a file-level
    # ``import logging``.
    import logging

    reader = PdfReader(io.BytesIO(file_bytes))
    texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            texts.append(page.extract_text() or "")
        except Exception:
            # Deliberately broad: one malformed page must not lose the rest
            # of the document. Record the failure rather than hiding it so
            # that missing text can be traced back to a specific page.
            logging.getLogger(__name__).warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
    return "\n".join(texts)
def extract_from_url(url: str) -> str:
    """Download a web page and return its readable text.

    The page is first fetched and processed with trafilatura (comments
    excluded, tables included). If that download or extraction yields
    nothing, the page is fetched again with ``requests`` and the text of the
    parsed HTML is returned instead.

    Args:
        url: Address of the page to download.

    Returns:
        The text extracted by trafilatura, or, as a fallback, the text nodes
        of the HTML document separated by newlines.

    Raises:
        requests.RequestException: If the fallback download fails (for
            example a connection error or the 20-second timeout).
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        txt = trafilatura.extract(
            downloaded, include_comments=False, include_tables=True
        )
        if txt:
            return txt

    # Fallback: plain HTTP download followed by generic HTML-to-text.
    # NOTE(review): the HTTP status is not checked, so the body of an error
    # page (404, 403, ...) is returned as if it were content -- confirm
    # whether callers would prefer ``resp.raise_for_status()`` here.
    resp = requests.get(url, headers={"User-Agent": UA}, timeout=20)
    soup = BeautifulSoup(resp.text, "html.parser")

    # get_text() also returns the contents of <script> and <style> elements,
    # which would put JavaScript/CSS source into the extracted text; remove
    # those elements before collecting the text.
    for tag in soup(["script", "style"]):
        tag.decompose()

    return soup.get_text("\n")