Spaces:

Corin1998
/

PR_IRminiSaaS

Sleeping

Corin1998 committed on Aug 30, 2025

Commit

bf66cf0

verified ·

1 Parent(s): 1323e14

Create ingest_utils.py

Files changed (1) hide show

ingest_utils.py ADDED Viewed

+from pypdf import PdfReader
+import io
+import trafilatura
+import requests
+from bs4 import BeautifulSoup
+UA = "Mozilla/5.0 (compatible; PRIRBot/1.0)"  # User-Agent header sent by the requests fallback in extract_from_url
+def extract_from_pdf(file_bytes: bytes) -> str:
+    reader = PdfReader(io.BytesIO(file_bytes))
+    texts = []
+    for p in reader.pages:
+        try:
+            texts.append(p.extract_text() or "")
+        except Exception:
+            pass
+    return "\n".join(texts)
+def extract_from_url(url: str) -> str:
+    downloaded = trafilatura.fetch_url(url)
+    if downloaded:
+        txt = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
+        if txt:
+            return txt
+    resp = requests.get(url, headers={"User-Agent": UA}, timeout=20)
+    soup = BeautifulSoup(resp.text, "html.parser")
+    return soup.get_text("\n")