Spaces:

Bogdan555
/

grantforge-api

Sleeping

grantforge-api / backend /rag_pipeline /pdf_parser.py

GrantForge Bot

Deploy to Hugging Face

afd56bc 3 days ago

9.65 kB

	"""
	LlamaParse + Hierarchical Chunking — serce pipeline RAG dla GrantForge AI.

	FAZA 2: Zaawansowane parsowanie PDF dokumentów prawnych (regulaminy dotacji,
	wytyczne MFiPR, załączniki KOP) z zachowaniem struktury tabelarycznej.

	Architektura failover:
	1. LlamaParse API (LLAMA_CLOUD_API_KEY) — najlepsza jakość, zachowa tabele i listy
	2. PyPDF2 + struktura heurystyczna (pypdf) — bez klucza API
	3. Unstructured — dla trudnych skanów

	Zgodność: FAZA 2 planu Enterprise (LlamaParse dla dokumentów prawnych).
	"""

	import os
	import asyncio
	import tempfile
	import logging
	from typing import Optional
	from tenacity import retry, stop_after_attempt, wait_exponential

	logger = logging.getLogger(__name__)


	# ──────────────────────────────────────────────────────────────────────────────
	# Downloader PDF (z retry)
	# ──────────────────────────────────────────────────────────────────────────────


	@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=8))
	async def download_pdf(url: str) -> Optional[str]:
	    """Download a PDF into a temporary file and return that file's path.

	    The call is wrapped in a tenacity ``retry`` configured for at most three
	    attempts with exponential waits bounded by ``min=2`` and ``max=8``.
	    Every failed attempt is logged and re-raised so the decorator sees it.

	    Args:
	        url: Address of the PDF. Redirects are followed; the request times
	            out after 45 seconds.

	    Returns:
	        Path of a freshly created ``.pdf`` temporary file. The caller owns
	        the file and is responsible for deleting it.

	    Raises:
	        Exception: Any error from the HTTP request (including a non-2xx
	            status via ``raise_for_status``) or from writing the file. How
	            the error surfaces after the last attempt is decided by the
	            tenacity decorator.
	    """
	    import httpx

	    try:
	        async with httpx.AsyncClient(follow_redirects=True, timeout=45.0) as client:
	            response = await client.get(url)
	            response.raise_for_status()
	            payload = response.content

	        fd, temp_path = tempfile.mkstemp(suffix=".pdf")
	        try:
	            with os.fdopen(fd, "wb") as f:
	                f.write(payload)
	        except Exception:
	            # A failed write would otherwise leave a partial temp file behind
	            # on every retry attempt; remove it before propagating the error.
	            try:
	                os.unlink(temp_path)
	            except OSError:
	                # Best effort only: the original write error is the one to report.
	                pass
	            raise

	        logger.info(f"[PDF] Pobrano: {url} ({len(payload) / 1024:.1f} KB)")
	        return temp_path
	    except Exception as e:
	        logger.error(f"[PDF] Błąd pobierania {url}: {e}")
	        raise


	# ──────────────────────────────────────────────────────────────────────────────
	# WARSTWA 1: LlamaParse (najlepsza jakość — zachowuje tabele, paragrafy, §)
	# ──────────────────────────────────────────────────────────────────────────────

	# Natural-language instruction handed to LlamaParse through the
	# `parsing_instruction` argument in _parse_llamaparse_sync. The page-break
	# marker named in rule 4 is the same text used for `page_separator` there.
	# This is a runtime string sent to the API, so its wording is part of the
	# program's behaviour.
	_LLAMAPARSE_INSTRUCTION = """
	Parsing a Polish-language legal document related to EU grant programs
	(dotacje europejskie, fundusze strukturalne).

	Rules:
	1. Preserve ALL paragraph headers (§ 1, Art. 1, Rozdział I, etc.)
	2. Preserve tables exactly (budget tables, timeline tables, criteria scoring)
	3. Preserve numbered lists and bullet points with their hierarchy
	4. Mark page breaks as: <!-- PAGE_BREAK -->
	5. If a section header spans multiple lines, merge them on one line
	6. Do NOT skip footnotes — mark as [Przypis N]: text
	7. Polish legal abbreviations must remain unchanged (MFiPR, PARP, NCBR, UE, IOB)
	"""


	def _parse_llamaparse_sync(file_path: str) -> str:
	    """Parse a PDF with LlamaParse and return the result as one Markdown string.

	    This is a blocking call; the async entry points in this module run it
	    through ``asyncio.to_thread``.

	    Args:
	        file_path: Path of the PDF file to parse.

	    Returns:
	        The ``text`` of every document returned by LlamaParse, joined with
	        blank lines.

	    Raises:
	        ImportError: If ``llama_parse`` is not installed (checked first).
	        EnvironmentError: If ``LLAMA_CLOUD_API_KEY`` is unset or empty.
	    """
	    from llama_parse import LlamaParse

	    cloud_key = os.environ.get("LLAMA_CLOUD_API_KEY")
	    if not cloud_key:
	        raise EnvironmentError("LLAMA_CLOUD_API_KEY nie skonfigurowany.")

	    logger.info("[LlamaParse] Uruchamianie parsowania PDF (warstwa 1)...")

	    parser_options = {
	        "api_key": cloud_key,
	        "result_type": "markdown",
	        "verbose": False,
	        # Document language: Polish.
	        "language": "pl",
	        "parsing_instruction": _LLAMAPARSE_INSTRUCTION,
	        "page_separator": "\n<!-- PAGE_BREAK -->\n",
	        # Intended to drop watermarks / slanted footer text.
	        "skip_diagonal_text": True,
	        # Keep the API-side cache so the same PDF is not parsed twice.
	        "invalidate_cache": False,
	        # NOTE(review): the original comment says this keeps the column
	        # layout for tables, but the flag is False — confirm against the
	        # LlamaParse docs that this is the intended setting.
	        "do_not_unroll_columns": False,
	    }
	    parsed_docs = LlamaParse(**parser_options).load_data(file_path)

	    markdown = "\n\n".join([doc.text for doc in parsed_docs])
	    logger.info(f"[LlamaParse] Sukces — {len(parsed_docs)} stron, {len(markdown)} znaków.")
	    return markdown


	# ──────────────────────────────────────────────────────────────────────────────
	# WARSTWA 2: PyPDF (fallback bez klucza API)
	# ──────────────────────────────────────────────────────────────────────────────


	def _parse_pypdf_sync(file_path: str) -> str:
	    """Extract plain text from a PDF with pypdf, one marked section per page.

	    Pages whose extracted text is empty or whitespace-only are skipped.
	    Every kept page is prefixed with a ``<!-- PAGE n -->`` marker (1-based)
	    and the pages are joined with blank lines. Works without an API key.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The combined text; an empty string when no page yields any text.

	    Raises:
	        ImportError: Logged and re-raised when an import fails inside the
	            body (typically because ``pypdf`` is not installed).
	    """
	    try:
	        from pypdf import PdfReader

	        pdf = PdfReader(file_path)
	        sections = []
	        for page_no, page in enumerate(pdf.pages, start=1):
	            extracted = page.extract_text() or ""
	            if not extracted.strip():
	                continue
	            sections.append(f"<!-- PAGE {page_no} -->\n{extracted}")

	        combined = "\n\n".join(sections)
	        logger.info(
	            f"[PyPDF] Sparsowano {len(pdf.pages)} stron, {len(combined)} znaków."
	        )
	        return combined
	    except ImportError:
	        logger.warning("[PyPDF] pypdf nie zainstalowany — próba z unstructured.")
	        raise

	# ──────────────────────────────────────────────────────────────────────────────
	# WARSTWA 3: Unstructured (fallback dla skanów)
	# ──────────────────────────────────────────────────────────────────────────────


	def _parse_unstructured_sync(file_path: str) -> str:
	    """Placeholder for the third parsing layer; it is disabled and always raises.

	    The former implementation (kept out of the build because the dependency
	    is too heavy) called ``partition_pdf(filename=file_path)`` from
	    ``unstructured.partition.pdf`` and joined ``str()`` of each returned
	    element with blank lines.

	    Args:
	        file_path: Path of the PDF file; currently unused.

	    Raises:
	        ImportError: Always, after logging that the fallback is disabled.
	    """
	    logger.info("[Unstructured] Fallback parsowania wyłączony (zbyt ciężka zależność).")
	    disabled_reason = "Unstructured.partition is disabled for performance reasons."
	    raise ImportError(disabled_reason)


	# ──────────────────────────────────────────────────────────────────────────────
	# Orkiestrator — waterfall failover
	# ──────────────────────────────────────────────────────────────────────────────


	async def parse_pdf_from_url(url: str, **kwargs) -> dict:
	    """Download a PDF and parse it with the first parser layer that succeeds.

	    Order of attempts: LlamaParse (only when ``LLAMA_CLOUD_API_KEY`` is
	    set), then pypdf, then Unstructured. Each blocking parser runs in a
	    worker thread through ``asyncio.to_thread``. The downloaded temporary
	    file is removed before returning, whatever the outcome.

	    Args:
	        url: Address of the PDF to fetch.
	        **kwargs: Accepted for call compatibility; not used here.

	    Returns:
	        A dict with two keys: ``"text"`` (the parsed text, ``""`` on
	        failure) and ``"parser"``, one of ``"llamaparse"``, ``"pypdf"``,
	        ``"unstructured"``, ``"failed_download"`` or ``"error"``.
	        This function does not raise for download or parsing failures.
	    """
	    try:
	        file_path = await download_pdf(url)
	    except Exception as e:
	        logger.error(f"[PDF] Nie udało się pobrać PDF: {e}")
	        return {"text": "", "parser": "failed_download"}

	    # download_pdf is annotated Optional[str]; handle a missing path
	    # explicitly instead of letting it fail inside the parsers and cleanup.
	    if not file_path:
	        logger.error(f"[PDF] Nie udało się pobrać PDF: brak ścieżki pliku dla {url}")
	        return {"text": "", "parser": "failed_download"}

	    try:
	        # Layer 1: LlamaParse (best quality), only when an API key is set.
	        if os.environ.get("LLAMA_CLOUD_API_KEY"):
	            try:
	                text = await asyncio.to_thread(_parse_llamaparse_sync, file_path)
	                return {"text": text, "parser": "llamaparse"}
	            except Exception as e:
	                logger.warning(f"[LlamaParse] Nieudane ({e}) — fallback PyPDF.")

	        # Layer 2: pypdf (offline).
	        # NOTE(review): an empty extraction (e.g. a scanned PDF) is returned
	        # as a success and never reaches layer 3 — confirm this is intended.
	        try:
	            text = await asyncio.to_thread(_parse_pypdf_sync, file_path)
	            return {"text": text, "parser": "pypdf"}
	        except Exception as e:
	            logger.warning(f"[PyPDF] Nieudane ({e}) — fallback Unstructured.")

	        # Layer 3: Unstructured (scanned PDFs).
	        text = await asyncio.to_thread(_parse_unstructured_sync, file_path)
	        return {"text": text, "parser": "unstructured"}

	    except Exception as e:
	        logger.error(f"[PDF] Wszystkie parsery zawiodły dla {url}: {e}")
	        return {"text": "", "parser": "error"}
	    finally:
	        # Cleanup stays best-effort (it must never mask the result), but a
	        # failure is now reported instead of being silently swallowed.
	        try:
	            os.unlink(file_path)
	        except OSError as e:
	            logger.warning(
	                f"[PDF] Nie udało się usunąć pliku tymczasowego {file_path}: {e}"
	            )


	async def parse_pdf_from_file(file_path: str, **kwargs) -> dict:
	    """Parse a local PDF (used for user uploads) with the same parser waterfall.

	    Order of attempts: LlamaParse (only when ``LLAMA_CLOUD_API_KEY`` is
	    set), then pypdf, then Unstructured. Each blocking parser runs in a
	    worker thread through ``asyncio.to_thread``. Unlike
	    ``parse_pdf_from_url``, this function never deletes ``file_path``.

	    Args:
	        file_path: Path of the PDF file to parse.
	        **kwargs: Accepted for call compatibility; not used here.

	    Returns:
	        A dict with two keys: ``"text"`` (the parsed text, ``""`` on
	        failure) and ``"parser"``, one of ``"llamaparse"``, ``"pypdf"``,
	        ``"unstructured"`` or ``"error"``. Parsing failures are logged,
	        not raised.
	    """
	    try:
	        # Layer 1: LlamaParse, only when an API key is set.
	        if os.environ.get("LLAMA_CLOUD_API_KEY"):
	            try:
	                text = await asyncio.to_thread(_parse_llamaparse_sync, file_path)
	                return {"text": text, "parser": "llamaparse"}
	            except Exception as e:
	                logger.warning(f"[LlamaParse] Błąd upload: {e} — fallback PyPDF.")

	        # Layer 2: pypdf. The failure is now logged before falling back;
	        # previously it was swallowed without any trace.
	        try:
	            text = await asyncio.to_thread(_parse_pypdf_sync, file_path)
	            return {"text": text, "parser": "pypdf"}
	        except Exception as e:
	            logger.warning(f"[PyPDF] Błąd upload: {e} — fallback Unstructured.")

	        # Layer 3: Unstructured; a failure here goes to the outer handler.
	        text = await asyncio.to_thread(_parse_unstructured_sync, file_path)
	        return {"text": text, "parser": "unstructured"}
	    except Exception as e:
	        logger.error(f"[PDF] Parsowanie pliku {file_path} nieudane: {e}")
	        return {"text": "", "parser": "error"}