"""
PDF Processor Module
Handles PDF text extraction and chunking for RAG pipeline
"""
import logging
from typing import List, Optional
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
class PDFProcessor:
    """Handles PDF processing, text extraction, and chunking."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize PDF processor.

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        # NOTE(review): basicConfig mutates the root logger -- a side effect
        # library code normally avoids; kept here for backward compatibility.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a PDF file, page by page.

        Each page's text is prefixed with a "--- Page N ---" marker.
        Pages that fail to extract are skipped with a warning.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Extracted text as a single string (empty if no page yielded text)

        Raises:
            Exception: re-raised after logging if the file cannot be
                opened or parsed as a PDF.
        """
        try:
            self.logger.info("Extracting text from: %s", pdf_path)
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                # Accumulate per-page fragments and join once at the end --
                # avoids the quadratic cost of repeated `text +=` in a loop.
                parts: List[str] = []
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text:
                            parts.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
                    except Exception as e:
                        # Best-effort: a single bad page must not abort the
                        # whole document.
                        self.logger.warning(
                            "Error extracting text from page %d: %s", page_num + 1, e
                        )
                        continue
                text = "".join(parts)
                self.logger.info("Extracted %d characters from PDF", len(text))
                return text
        except Exception as e:
            self.logger.error("Error reading PDF file %s: %s", pdf_path, e)
            raise

    def split_text_into_chunks(self, text: str) -> List[Document]:
        """
        Split text into chunks using the LangChain text splitter.

        Args:
            text: Text to split

        Returns:
            List of Document objects, each carrying a chunk of the input

        Raises:
            Exception: re-raised after logging if the splitter fails.
        """
        try:
            self.logger.info("Splitting text into chunks")
            # Wrap the raw text in a single Document so the splitter can
            # propagate metadata onto every chunk it produces.
            documents = [Document(page_content=text, metadata={"source": "pdf"})]
            chunks = self.text_splitter.split_documents(documents)
            self.logger.info("Created %d text chunks", len(chunks))
            return chunks
        except Exception as e:
            self.logger.error("Error splitting text: %s", e)
            raise

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """
        Complete PDF processing pipeline: extract, split, annotate.

        Args:
            pdf_path: Path to PDF file

        Returns:
            List of Document chunks with `source` and `chunk_size` metadata;
            empty list if the PDF contained no extractable text.

        Raises:
            Exception: re-raised after logging on any extraction/split error.
        """
        try:
            text = self.extract_text_from_pdf(pdf_path)
            if not text.strip():
                self.logger.warning("No text extracted from PDF")
                return []
            chunks = self.split_text_into_chunks(text)
            # Overwrite the generic "pdf" source with the real path and
            # record each chunk's size for downstream statistics.
            for chunk in chunks:
                chunk.metadata["source"] = pdf_path
                chunk.metadata["chunk_size"] = len(chunk.page_content)
            return chunks
        except Exception as e:
            self.logger.error("Error processing PDF %s: %s", pdf_path, e)
            raise

    def get_chunk_stats(self, chunks: List[Document]) -> dict:
        """
        Get statistics about the chunks.

        Args:
            chunks: List of Document chunks

        Returns:
            Dictionary with keys `total_chunks`, `avg_chunk_size`
            (rounded to 2 decimals), and `total_characters`.
        """
        if not chunks:
            return {"total_chunks": 0, "avg_chunk_size": 0, "total_characters": 0}
        total_chars = sum(len(chunk.page_content) for chunk in chunks)
        avg_size = total_chars / len(chunks)
        return {
            "total_chunks": len(chunks),
            "avg_chunk_size": round(avg_size, 2),
            "total_characters": total_chars
        }