"""
Example: Document Processing Pipeline

Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
"""
| |
|
import asyncio
from collections import Counter
from pathlib import Path

from loguru import logger

from src.document.pipeline import (
    PipelineConfig,
    DocumentProcessor,
    process_document,
)
from src.document.ocr import OCRConfig
| |
|
| |
|
def example_basic_processing():
    """Run the basic document-processing example and print a summary.

    Looks for ``./data/sample.pdf``; if it is missing, prints a hint and
    returns without building the pipeline. Otherwise processes the file with
    a ``DocumentProcessor`` configured with the ``paddleocr`` OCR engine,
    ``render_dpi=300`` and ``max_pages=5``, then prints the document
    metadata (filename, pages, chunks, characters, average OCR confidence)
    followed by up to three sample chunks.

    Returns:
        None. All output goes to stdout.
    """
    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Check for the input first so the pipeline is not constructed when
    # there is nothing to process.
    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf to run this example")
        return

    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,
    )
    processor = DocumentProcessor(config)
    result = processor.process(sample_doc)

    metadata = result.metadata
    print(f"\nDocument: {metadata.filename}")
    print(f"Pages: {metadata.num_pages}")
    print(f"Chunks: {metadata.total_chunks}")
    print(f"Characters: {metadata.total_characters}")
    print(f"OCR Confidence: {metadata.ocr_confidence_avg:.2%}")

    print("\n--- Sample Chunks ---")
    preview_limit = 200
    for number, chunk in enumerate(result.chunks[:3], start=1):
        # chunk.page is shown 1-based; presumably it is stored 0-based.
        print(
            f"\n[Chunk {number}] Type: {chunk.chunk_type.value}, "
            f"Page: {chunk.page+1}"
        )

        # Only mark the preview as truncated when text was actually cut off.
        preview = chunk.text[:preview_limit]
        if len(chunk.text) > preview_limit:
            preview += "..."
        print(f"Text: {preview}")

        bbox = chunk.bbox
        print(
            f"BBox: ({bbox.x_min:.0f}, {bbox.y_min:.0f}) -> "
            f"({bbox.x_max:.0f}, {bbox.y_max:.0f})"
        )
| |
|
| |
|
def example_with_layout():
    """Run the layout-analysis example and print region statistics.

    Looks for ``./data/sample.pdf``; if it is missing, prints a notice and
    returns. Otherwise processes the file with the ``paddleocr`` OCR engine,
    a ``rule_based`` layout method and ``include_layout_regions=True``, then
    prints how many regions of each layout type were found and, when table
    regions exist, the page, position and size of up to two of them.

    Returns:
        None. All output goes to stdout.
    """
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    # Bail out before importing the layout module or building the pipeline
    # when there is no document to analyse.
    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    from src.document.layout import LayoutConfig, LayoutType

    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )
    processor = DocumentProcessor(config)
    result = processor.process(sample_doc)

    # Tally regions per layout type.
    layout_counts = Counter(
        region.layout_type.value for region in result.layout_regions
    )

    print("\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f" {layout_type}: {count} regions")

    tables = [
        region
        for region in result.layout_regions
        if region.layout_type == LayoutType.TABLE
    ]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        # Show details for at most the first two tables.
        for number, table in enumerate(tables[:2], start=1):
            bbox = table.bbox
            print(f"\nTable {number}: Page {table.page+1}")
            print(f" Position: ({bbox.x_min:.0f}, {bbox.y_min:.0f})")
            print(f" Size: {bbox.width:.0f} x {bbox.height:.0f}")
| |
|
| |
|
def example_convenience_function():
    """Run the example that uses the ``process_document`` convenience function.

    Looks for ``./data/sample.pdf``; if it is missing, prints a notice and
    returns. Otherwise calls ``process_document`` on it and prints the
    filename, the number of chunks, and a preview of the full text. The
    preview is the first 500 characters followed by ``"..."`` when the text
    is longer than 500 characters, or the whole text otherwise.

    Returns:
        None. All output goes to stdout.
    """
    print("\n" + "=" * 50)
    print("Using Convenience Function")
    print("=" * 50)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = process_document(sample_doc)

    print(f"Processed: {result.metadata.filename}")
    print(f"Chunks: {len(result.chunks)}")
    print("\nFull text preview:")

    # Explicit branch rather than a conditional expression inside print(),
    # so the truncation rule does not hinge on operator precedence.
    preview_limit = 500
    full_text = result.full_text
    if len(full_text) > preview_limit:
        print(full_text[:preview_limit] + "...")
    else:
        print(full_text)
| |
|
| |
|
if __name__ == "__main__":
    # Run every example in sequence when the file is executed as a script.
    for run_example in (
        example_basic_processing,
        example_with_layout,
        example_convenience_function,
    ):
        run_example()
| |
|