# Spaces: FauzanAriyatmoko/LLM-ChatBot-Document (Running)
# File size: 10,386 Bytes

"""
PDF Processing utilities for extracting and chunking text from PDF files
"""
import html
import os
from typing import Dict, List

import pdfplumber
import PyPDF2

try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

from config.model_config import config

class PDFProcessor:
    """Handle PDF text extraction, structuring, chunking and HTML preview.

    Text is extracted with pdfplumber or PyPDF2 and split into chunks by a
    ``RecursiveCharacterTextSplitter`` configured from ``config.CHUNK_SIZE``
    and ``config.CHUNK_OVERLAP``.
    """

    def __init__(self):
        """Create the text splitter shared by all chunking methods."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            length_function=len,
            # Try paragraph breaks first, then lines, then words, then characters
            separators=["\n\n", "\n", " ", ""]
        )

    def extract_text_from_pdf(self, pdf_path: str, method: str = "pdfplumber") -> str:
        """
        Extract text from PDF file

        The requested extractor is tried first; if it raises, the error is
        printed and the other extractor is tried. Errors raised by that
        second attempt propagate to the caller.

        Args:
            pdf_path: Path to PDF file
            method: Extraction method ('pypdf2' or 'pdfplumber'). Any value
                other than 'pdfplumber' selects PyPDF2.

        Returns:
            Extracted text as string
        """
        if method == "pdfplumber":
            primary, fallback = self._extract_with_pdfplumber, self._extract_with_pypdf2
        else:
            primary, fallback = self._extract_with_pypdf2, self._extract_with_pdfplumber

        try:
            return primary(pdf_path)
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            # Fallback to alternative method
            return fallback(pdf_path)

    def _extract_with_pypdf2(self, pdf_path: str) -> str:
        """Extract text using PyPDF2, emitting one newline-terminated entry per page"""
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Defensive `or ""`: a page without extractable text must not
            # abort the whole extraction with a TypeError on `None + "\n"`.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )

    def _extract_with_pdfplumber(self, pdf_path: str) -> str:
        """Extract text using pdfplumber (better for complex PDFs)

        Pages for which no text is returned are skipped entirely.
        """
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            return "".join(page_text + "\n" for page_text in page_texts if page_text)

    def chunk_text(self, text: str) -> List[str]:
        """
        Split text into chunks

        Args:
            text: Input text to chunk

        Returns:
            List of text chunks
        """
        return self.text_splitter.split_text(text)

    def extract_with_structure(self, pdf_path: str) -> Dict:
        """
        Extract text with page and paragraph structure

        Each page's text is split on blank lines; fragments of 20 characters
        or fewer are dropped. Kept paragraphs are numbered ``para_1``,
        ``para_2``, ... across the whole document and joined with a blank
        line to form ``full_text``. ``char_start``/``char_end`` are the
        paragraph's half-open offsets into ``full_text``.

        If structured extraction raises, the error is printed and the whole
        document is extracted via ``extract_text_from_pdf`` and returned as a
        single paragraph on a single synthetic page, so that ``pages`` and
        ``paragraphs`` stay consistent with each other.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with structured content including pages and paragraphs
        """
        try:
            pages = []
            paragraphs = []
            text_parts = []
            offset = 0  # length of the full text assembled so far

            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if not page_text:
                        continue

                    # Split into paragraphs on blank lines (double newline)
                    page_paragraphs = []
                    for para_text in page_text.split('\n\n'):
                        para_text = para_text.strip()
                        if len(para_text) <= 20:  # Ignore very short fragments
                            continue

                        paragraph_data = {
                            "id": f"para_{len(paragraphs) + 1}",
                            "page": page_num,
                            "text": para_text,
                            "char_start": offset,
                            "char_end": offset + len(para_text)
                        }
                        page_paragraphs.append(paragraph_data)
                        paragraphs.append(paragraph_data)
                        text_parts.append(para_text + "\n\n")
                        offset += len(para_text) + 2  # +2 for the "\n\n" separator

                    pages.append({
                        "page_num": page_num,
                        "text": page_text,
                        "paragraphs": page_paragraphs
                    })

            return {
                "pages": pages,
                "paragraphs": paragraphs,
                "full_text": "".join(text_parts)
            }

        except Exception as e:
            print(f"Error extracting structured content: {e}")
            # Fallback to simple extraction. Partial results from the failed
            # attempt are discarded so pages and paragraphs cannot disagree.
            text = self.extract_text_from_pdf(pdf_path)
            fallback_paragraph = {
                "id": "para_1",
                "page": 1,
                "text": text,
                "char_start": 0,
                "char_end": len(text)
            }
            return {
                "pages": [{
                    "page_num": 1,
                    "text": text,
                    "paragraphs": [fallback_paragraph]
                }],
                "paragraphs": [fallback_paragraph],
                "full_text": text
            }

    def generate_html_preview(self, structured_content: Dict, filename: str) -> str:
        """
        Generate HTML representation of PDF for viewer

        The filename and all values taken from ``structured_content`` are
        HTML-escaped before interpolation, because both come from the
        uploaded document and must not be able to inject markup.

        Args:
            structured_content: Structured content from extract_with_structure
            filename: Name of the PDF file

        Returns:
            HTML string
        """
        safe_name = html.escape(str(filename))
        page_count = len(structured_content['pages'])
        paragraph_count = len(structured_content['paragraphs'])

        parts = [f"""
        <div class="document-content" data-filename="{safe_name}">
            <div class="document-header">
                <h3>📄 {safe_name}</h3>
                <p class="doc-meta">{page_count} halaman • {paragraph_count} paragraf</p>
            </div>
        """]

        for page in structured_content["pages"]:
            page_num = html.escape(str(page['page_num']))
            parts.append(f"""
            <div class="pdf-page" data-page="{page_num}">
                <div class="page-number">Halaman {page_num}</div>
            """)

            for para in page["paragraphs"]:
                para_id = html.escape(str(para['id']))
                para_page = html.escape(str(para['page']))
                para_text = html.escape(str(para['text']))
                parts.append(f"""
                <p class="paragraph" id="{para_id}" data-page="{para_page}">
                    {para_text}
                </p>
                """)

            parts.append("</div>")

        parts.append("</div>")
        return "".join(parts)

    def chunk_text_with_metadata(self, structured_content: Dict) -> List[Dict]:
        """
        Split text into chunks with metadata about source location

        Chunks are located in ``full_text`` by searching forward from just
        after the previous chunk's start, so repeated text (headers,
        boilerplate) is attributed to its own occurrence instead of always
        the first one. A chunk that cannot be located gets ``char_start`` and
        ``char_end`` of -1 and empty paragraph/page lists.

        Args:
            structured_content: Structured content from extract_with_structure

        Returns:
            List of dictionaries with chunk text and metadata
        """
        full_text = structured_content["full_text"]
        paragraphs = structured_content["paragraphs"]

        # Get chunks from the splitter
        text_chunks = self.text_splitter.split_text(full_text)

        chunks_with_metadata = []
        search_from = 0

        for i, chunk_text in enumerate(text_chunks):
            chunk_start = full_text.find(chunk_text, search_from)
            if chunk_start != -1:
                # Overlapping chunks start inside the previous chunk, so only
                # step one character past the previous start.
                search_from = chunk_start + 1
            else:
                # Not found ahead of the cursor: retry over the whole text
                chunk_start = full_text.find(chunk_text)

            related_paragraphs = []
            related_pages = set()

            if chunk_start == -1:
                chunk_end = -1
            else:
                chunk_end = chunk_start + len(chunk_text)
                for para in paragraphs:
                    # Half-open interval overlap: [chunk_start, chunk_end)
                    # against [char_start, char_end)
                    if chunk_start < para["char_end"] and para["char_start"] < chunk_end:
                        related_paragraphs.append(para["id"])
                        related_pages.add(para["page"])

            chunks_with_metadata.append({
                "text": chunk_text,
                "chunk_index": i,
                "paragraph_ids": related_paragraphs,
                "pages": sorted(related_pages),
                "char_start": chunk_start,
                "char_end": chunk_end
            })

        return chunks_with_metadata

    def process_pdf(self, pdf_path: str) -> Dict:
        """
        Complete processing pipeline: extract and chunk PDF with structure

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with filename, text, chunks, and structured content

        Raises:
            ValueError: If the extracted text is empty or whitespace only
        """
        filename = os.path.basename(pdf_path)

        # Extract structured content
        structured_content = self.extract_with_structure(pdf_path)
        full_text = structured_content["full_text"]

        if not full_text.strip():
            raise ValueError(f"No text extracted from {filename}")

        # Generate HTML preview
        html_preview = self.generate_html_preview(structured_content, filename)

        # Chunk text with metadata
        chunks_with_metadata = self.chunk_text_with_metadata(structured_content)

        # Extract just the text for backward compatibility
        chunks = [c["text"] for c in chunks_with_metadata]

        return {
            "filename": filename,
            "full_text": full_text,
            "chunks": chunks,
            "chunks_metadata": chunks_with_metadata,
            "structured_content": structured_content,
            "html_preview": html_preview,
            "num_chunks": len(chunks),
            "total_chars": len(full_text),
            "num_pages": len(structured_content["pages"]),
            "num_paragraphs": len(structured_content["paragraphs"])
        }

    def get_pdf_info(self, pdf_path: str) -> Dict:
        """
        Get metadata about PDF file

        The page count is best-effort: if the file cannot be read as a PDF
        the error is printed and ``num_pages`` stays 0. The file-size lookup
        is not guarded, so its error propagates if the path cannot be
        stat'ed.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with PDF metadata
        """
        info = {
            "filename": os.path.basename(pdf_path),
            "file_size": os.path.getsize(pdf_path),
            "num_pages": 0
        }

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                info["num_pages"] = len(pdf_reader.pages)
        except Exception as e:
            print(f"Error getting PDF info: {e}")

        return info