chatbot-gitconnect / app /services /pdf_service.py
quantumbit's picture
preprocessing endpoint fixed and chat endpoint has been updated to specific results-sem wise
d0220ae
import io
import time
import urllib.request
import requests
from requests.adapters import HTTPAdapter
from pypdf import PdfReader
from urllib3.util import Retry
def _extract_pdf_text(content: bytes) -> str:
    """Return the text of every non-blank page in *content*, joined by blank lines."""
    reader = PdfReader(io.BytesIO(content))
    page_texts = (page.extract_text() or "" for page in reader.pages)
    return "\n\n".join(text for text in page_texts if text.strip()).strip()


def fetch_pdf_text(
    pdf_url: str,
    timeout: int = 60,
    max_retries: int = 3,
    backoff_sec: float = 1.5,
) -> str:
    """Download the PDF at *pdf_url* and return its extracted text.

    Each attempt first tries a ``requests`` session (which itself retries
    transient failures through the mounted ``Retry`` policy) and, if that
    fails for any reason, falls back to ``urllib``. Between attempts the
    function sleeps ``backoff_sec * 2 ** attempt`` seconds.

    Args:
        pdf_url: ``http://`` or ``https://`` URL of the PDF.
        timeout: Base timeout in seconds. The connect timeout is a third of
            it clamped to 10-30 seconds; the read timeout is never below 60.
        max_retries: Number of outer attempts; also used as the retry budget
            of the session's ``Retry`` policy.
        backoff_sec: Base delay for both the ``Retry`` policy and the sleep
            between outer attempts.

    Returns:
        The text of all non-blank pages separated by blank lines, stripped.

    Raises:
        RuntimeError: If the URL is not http(s), or if every attempt failed.
            In the latter case the last underlying error is chained as
            ``__cause__``.
    """
    # Reject anything but http(s) up front: the urllib fallback below would
    # otherwise happily open file:// (local files) or ftp:// URLs, and a
    # malformed URL would burn through every retry and sleep before failing.
    if not pdf_url.strip().lower().startswith(("http://", "https://")):
        raise RuntimeError(
            f"Failed to fetch PDF: unsupported URL scheme "
            f"(only http/https are allowed): {pdf_url!r}"
        )

    last_exc: Exception | None = None
    connect_timeout = min(max(int(timeout / 3), 10), 30)
    read_timeout = max(timeout, 60)

    retry_cfg = Retry(
        total=max_retries,
        connect=max_retries,
        read=max_retries,
        backoff_factor=backoff_sec,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET", "HEAD"]),
    )
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "application/pdf,*/*;q=0.8",
    }

    # The context manager closes the session (and its connection pool) on
    # every exit path, including the early returns below.
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=retry_cfg)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        for attempt in range(max_retries):
            try:
                response = session.get(
                    pdf_url,
                    headers=headers,
                    timeout=(connect_timeout, read_timeout),
                )
                response.raise_for_status()
                return _extract_pdf_text(response.content)
            except Exception as exc:
                # Deliberately broad: network, HTTP and PDF-parsing errors
                # all route to the fallback and, eventually, the next attempt.
                last_exc = exc

            # Fallback path: some hosts behave better with urllib defaults.
            try:
                req = urllib.request.Request(
                    pdf_url,
                    headers={"User-Agent": headers["User-Agent"]},
                )
                with urllib.request.urlopen(req, timeout=read_timeout) as resp:
                    content = resp.read()
                return _extract_pdf_text(content)
            except Exception as fallback_exc:
                last_exc = fallback_exc

            # No point sleeping after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(backoff_sec * (2 ** attempt))

    raise RuntimeError(
        f"Failed to fetch PDF after {max_retries} attempts: {last_exc}"
    ) from last_exc
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 150) -> list[str]:
    """Split *text* into overlapping, whitespace-normalised chunks.

    All runs of whitespace are collapsed to single spaces before slicing.
    Consecutive chunks start ``chunk_size - overlap`` characters apart (at
    least 1), so each chunk repeats the last ``overlap`` characters of the
    previous one.

    Args:
        text: Input text; blank or whitespace-only input yields ``[]``.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks. Negative
            values are treated as 0 so that no text is skipped between chunks.

    Returns:
        The list of chunks in order; the last chunk may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not text.strip():
        return []

    clean_text = " ".join(text.split())
    text_len = len(clean_text)
    # A negative overlap would make the step exceed chunk_size and silently
    # drop the characters in between; an overlap >= chunk_size would stall,
    # hence the floor of 1 on the step.
    step = max(chunk_size - max(overlap, 0), 1)

    chunks: list[str] = []
    for start in range(0, text_len, step):
        end = start + chunk_size
        chunks.append(clean_text[start:end])
        # Stop as soon as a chunk reaches the end, so no trailing chunk made
        # purely of already-emitted overlap is produced.
        if end >= text_len:
            break
    return chunks