Corin1998 committed on
Commit
c16bedc
·
verified ·
1 Parent(s): f67c66f

Update app/ingest.py

Browse files
Files changed (1) hide show
  1. app/ingest.py +3 -6
app/ingest.py CHANGED
@@ -5,21 +5,18 @@ import trafilatura
5
  import requests
6
  from bs4 import BeautifulSoup
7
 
8
-
9
  USER_AGENT = "Mozilla/5.0 (compatible; PRIRBot/1.0)"
10
 
11
-
12
  def extract_from_pdf(file_bytes: bytes) -> str:
13
  reader = PdfReader(io.BytesIO(file_bytes))
14
  texts = []
15
  for page in reader.pages:
16
  try:
17
  texts.append(page.extract_text() or "")
18
- except Exception:
19
- pass
20
  return "\n".join(texts)
21
 
22
-
23
  def extract_from_url(url: str) -> str:
24
  downloaded = trafilatura.fetch_url(url)
25
  if downloaded:
@@ -29,4 +26,4 @@ def extract_from_url(url: str) -> str:
29
  # fallback: simple soup
30
  resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=20)
31
  soup = BeautifulSoup(resp.text, "html.parser")
32
- return soup.get_text("\n")
 
5
  import requests
6
  from bs4 import BeautifulSoup
7
 
 
8
  USER_AGENT = "Mozilla/5.0 (compatible; PRIRBot/1.0)"
9
 
 
10
  def extract_from_pdf(file_bytes: bytes) -> str:
11
  reader = PdfReader(io.BytesIO(file_bytes))
12
  texts = []
13
  for page in reader.pages:
14
  try:
15
  texts.append(page.extract_text() or "")
16
+ except Exception:
17
+ pass
18
  return "\n".join(texts)
19
 
 
20
  def extract_from_url(url: str) -> str:
21
  downloaded = trafilatura.fetch_url(url)
22
  if downloaded:
 
26
  # fallback: simple soup
27
  resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=20)
28
  soup = BeautifulSoup(resp.text, "html.parser")
29
+ return soup.get_text("\n")