File size: 804 Bytes
bf66cf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import io
import logging

import requests
import trafilatura
from bs4 import BeautifulSoup
from pypdf import PdfReader

# User-Agent header value sent by the plain `requests` fallback in
# extract_from_url.
UA = "Mozilla/5.0 (compatible; PRIRBot/1.0)"

def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Extraction is best-effort per page: a page whose text extraction raises
    is skipped (and reported through the module logger) so that one bad page
    does not discard the rest of the document. Errors raised while opening
    the document itself are not caught here and propagate to the caller.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted page texts joined with newlines. A page that yields no
        text contributes an empty string; a page that fails is omitted.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            texts.append(page.extract_text() or "")
        except Exception:
            # Deliberately broad: keep going on any per-page failure, but
            # leave a trace so incomplete output can be diagnosed.
            logging.getLogger(__name__).warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
    return "\n".join(texts)

def extract_from_url(url: str) -> str:
    """Fetch a web page and return its readable text.

    The page is first fetched and extracted with trafilatura (comments
    excluded, tables included). If trafilatura downloads nothing or extracts
    no text, the page is fetched again with `requests` and all text nodes of
    the HTML are returned, separated by newlines.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text of the page.

    Raises:
        requests.RequestException: If the fallback request fails (for
            example on a connection error or after the 20 second timeout).
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        txt = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
        if txt:
            return txt

    # NOTE(review): the HTTP status is not checked, so the body of an error
    # page (4xx/5xx) is returned as if it were content -- confirm whether
    # callers rely on that before adding raise_for_status().
    resp = requests.get(url, headers={"User-Agent": UA}, timeout=20)

    # Parse the raw bytes rather than resp.text: when the server declares no
    # charset, requests falls back to ISO-8859-1 for text/* responses, which
    # garbles UTF-8 pages. Only trust the encoding when the header actually
    # declares one; otherwise let BeautifulSoup sniff it from the document.
    content_type = resp.headers.get("Content-Type", "")
    declared_encoding = resp.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared_encoding)
    return soup.get_text("\n")