Corin1998 committed on
Commit
bf66cf0
·
verified ·
1 Parent(s): 1323e14

Create ingest_utils.py

Browse files
Files changed (1) hide show
  1. ingest_utils.py +27 -0
ingest_utils.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ import io
3
+ import trafilatura
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+
7
# User-Agent header sent by the plain-HTTP fallback in extract_from_url.
UA = "Mozilla/5.0 (compatible; PRIRBot/1.0)"
8
+
9
def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The text of each page joined with newlines. A page that yields no
        text contributes an empty string; a page whose extraction raises is
        skipped (and logged) so one bad page does not lose the whole document.
    """
    import logging

    log = logging.getLogger(__name__)
    reader = PdfReader(io.BytesIO(file_bytes))
    texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            texts.append(page.extract_text() or "")
        except Exception:
            # Deliberately best-effort: keep going, but leave a trace instead
            # of silently dropping the page.
            log.warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
    return "\n".join(texts)
18
+
19
def extract_from_url(url: str) -> str:
    """Fetch a web page and return its readable text.

    trafilatura is tried first (comments excluded, tables included). If it
    cannot download the page or extracts nothing, the page is fetched with
    requests and all text nodes are returned via BeautifulSoup.

    Args:
        url: Address of the page to ingest.

    Returns:
        The extracted text.

    Raises:
        requests.RequestException: If the fallback request fails, including
            an HTTP 4xx/5xx status (raised as requests.HTTPError).
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        txt = trafilatura.extract(
            downloaded, include_comments=False, include_tables=True
        )
        if txt:
            return txt

    # Fallback: plain HTTP fetch with an explicit User-Agent.
    resp = requests.get(url, headers={"User-Agent": UA}, timeout=20)
    # Do not ingest the body of an error page as if it were real content.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup.get_text("\n")