---
# SPARKNET Document Processing Configuration
# ===========================================
#
# NOTE(review): nesting below was reconstructed from the section comments
# ("... Configuration" = top-level key, "... settings" = nested mapping).
# Confirm it against the schema the config loader expects.

# OCR Configuration
ocr:
  # Engine selection: "paddleocr" (default) or "tesseract"
  engine: paddleocr

  # PaddleOCR settings
  paddleocr:
    lang: en
    use_gpu: false
    det_db_thresh: 0.3
    det_db_box_thresh: 0.5
    rec_algorithm: CRNN
    show_log: false

  # Tesseract settings
  tesseract:
    lang: eng
    config: "--psm 3"  # Page segmentation mode
    oem: 3  # OCR Engine mode (LSTM)

  # Preprocessing
  preprocessing:
    deskew: true
    denoise: false
    contrast_enhance: false

# Layout Detection Configuration
layout:
  # Detection method: "rule_based" (default) or "model_based"
  method: rule_based

  # Rule-based settings
  rule_based:
    merge_threshold: 20  # Pixels to merge nearby regions
    column_detection: true
    min_region_area: 100

  # Confidence thresholds
  thresholds:
    text: 0.5
    title: 0.7
    table: 0.6
    figure: 0.6
    list: 0.5

# Reading Order Configuration
reading_order:
  # Reconstruction method: "rule_based" (default)
  method: rule_based

  # Column detection
  column_gap_threshold: 50  # Minimum gap between columns
  reading_direction: ltr  # Left-to-right

  # Line grouping
  line_height_tolerance: 0.5

# Chunking Configuration
chunking:
  # Chunk size limits
  target_size: 512  # Target tokens per chunk
  max_size: 1024  # Maximum tokens per chunk
  min_size: 50  # Minimum tokens per chunk

  # Overlap for context
  overlap_size: 50  # Tokens to overlap between chunks

  # Semantic chunking
  semantic_boundaries: true
  respect_paragraphs: true
  respect_sections: true

# Grounding/Evidence Configuration
grounding:
  # Image cropping for evidence
  include_images: true
  crop_padding: 10  # Pixels around regions
  max_image_size: 512
  image_format: PNG  # PNG or JPEG
  image_quality: 85  # JPEG quality

  # Snippet settings
  max_snippet_length: 200
  include_context: true

# Pipeline Configuration
pipeline:
  # PDF rendering
  render_dpi: 300

  # Caching
  enable_caching: true
  cache_directory: ./data/cache

  # Processing options
  parallel_pages: false
  max_pages: null  # Limit pages (null for all)

  # Output options
  include_ocr_regions: true
  include_layout_regions: true
  generate_full_text: true

# Validation Configuration
validation:
  # Critic settings
  critic:
    confidence_threshold: 0.7
    evidence_required: true
    strict_mode: false
    max_fields_per_request: 10

  # Verifier settings
  verifier:
    fuzzy_match: true
    case_sensitive: false
    min_match_ratio: 0.6
    strong_threshold: 0.9
    moderate_threshold: 0.7
    weak_threshold: 0.5

# LLM Configuration for DocumentAgent
agent:
  # Ollama settings
  ollama_base_url: http://localhost:11434
  default_model: llama3.2:3b

  # Model routing by complexity
  model_routing:
    simple: llama3.2:1b
    standard: llama3.2:3b
    complex: llama3.1:8b
    analysis: llama3.1:70b  # For heavy analysis (optional)

  # Agent behavior
  max_iterations: 10
  temperature: 0.1
  timeout: 120  # Seconds

# Logging Configuration
logging:
  level: INFO  # DEBUG, INFO, WARNING, ERROR
  format: "{time} | {level} | {message}"
  file: null  # Log file path (null for stderr only)