Spaces:

FauzanAriyatmoko
/

LLM-ChatBot-Document

Running

App Files Files Community

LLM-ChatBot-Document / utils /pdf_processor.py

FauzanAriyatmoko

feat: Implement an interactive document viewer with citation highlighting and structured PDF text extraction.

a86d063 15 days ago

raw

history blame contribute delete

10.4 kB

	"""
	PDF Processing utilities for extracting and chunking text from PDF files
	"""
	import os
	from typing import List, Dict
	import PyPDF2
	import pdfplumber
	try:
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	except ImportError:
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from config.model_config import config

	class PDFProcessor:
	    """Extract, structure, chunk and preview the text of PDF files.

	    The processor can return plain text, a page/paragraph structure with
	    character offsets into one concatenated ``full_text``, chunks annotated
	    with the paragraphs and pages they come from, and an HTML preview in
	    which every paragraph has an ``id`` usable for citation highlighting.
	    """

	    # Paragraphs whose stripped length does not exceed this are dropped as
	    # fragments when building the structure.
	    MIN_PARAGRAPH_CHARS = 20

	    def __init__(self):
	        """Create the text splitter from the project chunking configuration."""
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=config.CHUNK_SIZE,
	            chunk_overlap=config.CHUNK_OVERLAP,
	            length_function=len,
	            separators=["\n\n", "\n", " ", ""],
	        )

	    def extract_text_from_pdf(self, pdf_path: str, method: str = "pdfplumber") -> str:
	        """
	        Extract plain text from a PDF file.

	        The requested backend is tried first; if it raises, the error is
	        printed and the other backend is tried once.

	        Args:
	            pdf_path: Path to PDF file
	            method: Extraction method ('pypdf2' or 'pdfplumber'); any value
	                other than 'pdfplumber' selects PyPDF2

	        Returns:
	            Extracted text as string

	        Raises:
	            Exception: whatever the fallback backend raises when it fails too
	        """
	        if method == "pdfplumber":
	            primary = self._extract_with_pdfplumber
	            fallback = self._extract_with_pypdf2
	        else:
	            primary = self._extract_with_pypdf2
	            fallback = self._extract_with_pdfplumber

	        try:
	            return primary(pdf_path)
	        except Exception as e:
	            print(f"Error extracting text from {pdf_path}: {e}")
	            # Fallback to alternative method
	            return fallback(pdf_path)

	    def _extract_with_pypdf2(self, pdf_path: str) -> str:
	        """Extract text using PyPDF2; every page contributes one trailing newline."""
	        with open(pdf_path, 'rb') as file:
	            pdf_reader = PyPDF2.PdfReader(file)
	            # "or ''" keeps a page that yields no text from raising TypeError.
	            return "".join(
	                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
	            )

	    def _extract_with_pdfplumber(self, pdf_path: str) -> str:
	        """Extract text using pdfplumber, skipping pages that yield no text."""
	        with pdfplumber.open(pdf_path) as pdf:
	            page_texts = [page.extract_text() for page in pdf.pages]
	        return "".join(page_text + "\n" for page_text in page_texts if page_text)

	    def chunk_text(self, text: str) -> List[str]:
	        """
	        Split text into chunks with the configured splitter.

	        Args:
	            text: Input text to chunk

	        Returns:
	            List of text chunks
	        """
	        return self.text_splitter.split_text(text)

	    def _structure_page_texts(self, page_texts) -> Dict:
	        """Build the pages/paragraphs/full_text structure from per-page text.

	        Args:
	            page_texts: Iterable of page texts in page order; falsy entries
	                (pages without text) are skipped but still consume a page
	                number

	        Returns:
	            Dictionary with "pages", "paragraphs" and "full_text"
	        """
	        pages = []
	        paragraphs = []
	        text_parts = []
	        offset = 0  # length of the full text assembled so far

	        for page_num, page_text in enumerate(page_texts, start=1):
	            if not page_text:
	                continue

	            page_paragraphs = []
	            # Paragraphs are separated by a blank line in the page text.
	            for para_text in page_text.split('\n\n'):
	                para_text = para_text.strip()
	                if len(para_text) <= self.MIN_PARAGRAPH_CHARS:
	                    continue  # Ignore very short fragments
	                paragraph_data = {
	                    "id": f"para_{len(paragraphs) + 1}",
	                    "page": page_num,
	                    "text": para_text,
	                    "char_start": offset,
	                    "char_end": offset + len(para_text),
	                }
	                page_paragraphs.append(paragraph_data)
	                paragraphs.append(paragraph_data)
	                text_parts.append(para_text + "\n\n")
	                offset += len(para_text) + 2

	            pages.append({
	                "page_num": page_num,
	                "text": page_text,
	                "paragraphs": page_paragraphs,
	            })

	        return {
	            "pages": pages,
	            "paragraphs": paragraphs,
	            "full_text": "".join(text_parts),
	        }

	    def extract_with_structure(self, pdf_path: str) -> Dict:
	        """
	        Extract text with page and paragraph structure.

	        Each paragraph records its page and its half-open character span
	        [char_start, char_end) inside "full_text", where paragraphs are
	        joined with a blank line.

	        If structured extraction raises, the error is printed and the plain
	        text from extract_text_from_pdf is returned as a single paragraph on
	        a single pseudo-page, so that pages, paragraphs and full_text always
	        describe the same text.

	        Args:
	            pdf_path: Path to PDF file

	        Returns:
	            Dictionary with structured content including pages and paragraphs
	        """
	        try:
	            with pdfplumber.open(pdf_path) as pdf:
	                # The generator is consumed inside the with-block, while the
	                # document is still open.
	                return self._structure_page_texts(
	                    page.extract_text() for page in pdf.pages
	                )
	        except Exception as e:
	            print(f"Error extracting structured content: {e}")

	        # Fallback to simple extraction. The result is rebuilt from scratch
	        # so nothing from the failed attempt leaks into it.
	        text = self.extract_text_from_pdf(pdf_path)
	        paragraph_data = {
	            "id": "para_1",
	            "page": 1,
	            "text": text,
	            "char_start": 0,
	            "char_end": len(text),
	        }
	        return {
	            "pages": [{
	                "page_num": 1,
	                "text": text,
	                "paragraphs": [paragraph_data],
	            }],
	            "paragraphs": [paragraph_data],
	            "full_text": text,
	        }

	    def generate_html_preview(self, structured_content: Dict, filename: str) -> str:
	        """
	        Generate HTML representation of PDF for viewer.

	        The file name and all extracted text are HTML-escaped before being
	        inserted, because both originate from the processed document and
	        must not be able to inject markup into the viewer.

	        Args:
	            structured_content: Structured content from extract_with_structure
	            filename: Name of the PDF file

	        Returns:
	            HTML string
	        """
	        # Imported locally so the block does not depend on a module-level import.
	        from html import escape

	        safe_name = escape(str(filename))
	        page_count = len(structured_content['pages'])
	        paragraph_count = len(structured_content['paragraphs'])

	        parts = [
	            "",
	            f'<div class="document-content" data-filename="{safe_name}">',
	            '<div class="document-header">',
	            f'<h3>📄 {safe_name}</h3>',
	            f'<p class="doc-meta">{page_count} halaman • {paragraph_count} paragraf</p>',
	            '</div>',
	        ]

	        for page in structured_content["pages"]:
	            page_num = escape(str(page['page_num']))
	            parts.append(f'<div class="pdf-page" data-page="{page_num}">')
	            parts.append(f'<div class="page-number">Halaman {page_num}</div>')

	            for para in page["paragraphs"]:
	                para_id = escape(str(para['id']))
	                para_page = escape(str(para['page']))
	                parts.append(
	                    f'<p class="paragraph" id="{para_id}" data-page="{para_page}">'
	                )
	                parts.append(escape(str(para['text'])))
	                parts.append('</p>')

	            parts.append('</div>')

	        parts.append('</div>')
	        return "\n".join(parts)

	    @staticmethod
	    def _locate_chunk(full_text: str, chunk_text: str, cursor: int):
	        """Find chunk_text in full_text, preferring matches at or after cursor.

	        Returns:
	            Tuple (start, next_cursor); start is -1 when the chunk does not
	            occur in full_text at all
	        """
	        start = full_text.find(chunk_text, cursor)
	        if start != -1:
	            # Chunks are located in document order, so the next one cannot
	            # start before this one.
	            return start, start + 1
	        # Last resort: accept any occurrence and leave the cursor where it is.
	        return full_text.find(chunk_text), cursor

	    def chunk_text_with_metadata(self, structured_content: Dict) -> List[Dict]:
	        """
	        Split text into chunks with metadata about source location.

	        Chunks are located by searching forward through the full text, so a
	        chunk whose text occurs more than once is attributed to its own
	        occurrence rather than always to the first one. A chunk that cannot
	        be located gets char_start == char_end == -1 and no paragraphs or
	        pages.

	        Args:
	            structured_content: Structured content from extract_with_structure

	        Returns:
	            List of dictionaries with chunk text and metadata
	        """
	        full_text = structured_content["full_text"]
	        paragraphs = structured_content["paragraphs"]

	        # Get chunks from the splitter
	        text_chunks = self.text_splitter.split_text(full_text)

	        chunks_with_metadata = []
	        cursor = 0

	        for i, chunk_text in enumerate(text_chunks):
	            chunk_start, cursor = self._locate_chunk(full_text, chunk_text, cursor)

	            if chunk_start == -1:
	                chunk_end = -1
	                overlapping = []
	            else:
	                chunk_end = chunk_start + len(chunk_text)
	                # Both spans are half-open, so touching ends do not overlap.
	                overlapping = [
	                    para for para in paragraphs
	                    if chunk_start < para["char_end"]
	                    and para["char_start"] < chunk_end
	                ]

	            chunks_with_metadata.append({
	                "text": chunk_text,
	                "chunk_index": i,
	                "paragraph_ids": [para["id"] for para in overlapping],
	                "pages": sorted({para["page"] for para in overlapping}),
	                "char_start": chunk_start,
	                "char_end": chunk_end,
	            })

	        return chunks_with_metadata

	    def process_pdf(self, pdf_path: str) -> Dict:
	        """
	        Complete processing pipeline: extract and chunk PDF with structure.

	        Args:
	            pdf_path: Path to PDF file

	        Returns:
	            Dictionary with filename, text, chunks, and structured content

	        Raises:
	            ValueError: If no non-whitespace text could be extracted
	        """
	        filename = os.path.basename(pdf_path)

	        # Extract structured content
	        structured_content = self.extract_with_structure(pdf_path)
	        full_text = structured_content["full_text"]

	        if not full_text.strip():
	            raise ValueError(f"No text extracted from {filename}")

	        # Generate HTML preview
	        html_preview = self.generate_html_preview(structured_content, filename)

	        # Chunk text with metadata
	        chunks_with_metadata = self.chunk_text_with_metadata(structured_content)

	        # Extract just the text for backward compatibility
	        chunks = [c["text"] for c in chunks_with_metadata]

	        return {
	            "filename": filename,
	            "full_text": full_text,
	            "chunks": chunks,
	            "chunks_metadata": chunks_with_metadata,
	            "structured_content": structured_content,
	            "html_preview": html_preview,
	            "num_chunks": len(chunks),
	            "total_chars": len(full_text),
	            "num_pages": len(structured_content["pages"]),
	            "num_paragraphs": len(structured_content["paragraphs"]),
	        }

	    def get_pdf_info(self, pdf_path: str) -> Dict:
	        """
	        Get metadata about PDF file.

	        The page count stays 0 when the file cannot be parsed; the parsing
	        error is printed rather than raised. The file size is read outside
	        that guard.

	        Args:
	            pdf_path: Path to PDF file

	        Returns:
	            Dictionary with PDF metadata (filename, file_size, num_pages)
	        """
	        info = {
	            "filename": os.path.basename(pdf_path),
	            "file_size": os.path.getsize(pdf_path),
	            "num_pages": 0,
	        }

	        try:
	            with open(pdf_path, 'rb') as file:
	                pdf_reader = PyPDF2.PdfReader(file)
	                info["num_pages"] = len(pdf_reader.pages)
	        except Exception as e:
	            print(f"Error getting PDF info: {e}")

	        return info