# SPARKNET Document Processing Configuration
# ===========================================

# OCR Configuration
# Chooses the OCR engine and groups per-engine settings together with
# image preprocessing switches.
ocr:
  # Engine selection: "paddleocr" (default) or "tesseract"
  engine: paddleocr

  # PaddleOCR settings
  # NOTE(review): key names mirror PaddleOCR constructor arguments; presumably
  # they are passed through unchanged — confirm against the loader.
  paddleocr:
    lang: en
    use_gpu: false
    # Detection thresholds, named after PaddleOCR's DB text detector options
    # (binarization threshold and box score threshold) — verify against the
    # PaddleOCR version in use.
    det_db_thresh: 0.3
    det_db_box_thresh: 0.5
    rec_algorithm: CRNN
    show_log: false

  # Tesseract settings
  tesseract:
    lang: eng
    # --psm 3 is Tesseract's default page segmentation mode: fully automatic
    # page segmentation without orientation/script detection (OSD).
    config: "--psm 3"
    # OEM 3 selects the default engine based on what is available; it is not
    # LSTM-only (that is OEM 1).
    oem: 3

  # Preprocessing
  # Only deskew is enabled here; denoise and contrast enhancement are off.
  preprocessing:
    deskew: true
    denoise: false
    contrast_enhance: false

# Layout Detection Configuration
# Selects the layout detection method and sets per-region-type confidence
# thresholds.
layout:
  # Detection method: "rule_based" (default) or "model_based"
  # Only rule-based settings are defined in this section.
  method: rule_based

  # Rule-based settings
  rule_based:
    # Distance in pixels used when merging nearby regions
    merge_threshold: 20
    column_detection: true
    # presumably measured in square pixels — TODO confirm
    min_region_area: 100

  # Confidence thresholds, one per region type (0-1 scale).
  # presumably detections scoring below the value are discarded — TODO confirm
  thresholds:
    text: 0.5
    title: 0.7
    table: 0.6
    figure: 0.6
    list: 0.5

# Reading Order Configuration
# Controls how detected regions are sequenced into reading order.
reading_order:
  # Reconstruction method: "rule_based" (default); no other method is
  # listed here.
  method: rule_based

  # Column detection
  # Minimum gap between columns; presumably in pixels — TODO confirm
  column_gap_threshold: 50
  reading_direction: ltr  # Left-to-right

  # Line grouping
  # NOTE(review): presumably a fraction of line height within which regions
  # are grouped onto the same line — confirm against the consumer.
  line_height_tolerance: 0.5

# Chunking Configuration
# Token-based size limits and boundary rules for splitting text into chunks.
chunking:
  # Chunk size limits, in tokens (min_size < target_size < max_size)
  target_size: 512  # Target tokens per chunk
  max_size: 1024  # Maximum tokens per chunk
  min_size: 50  # Minimum tokens per chunk

  # Overlap for context
  overlap_size: 50  # Tokens shared between adjacent chunks

  # Semantic chunking
  # All three boundary switches are enabled.
  semantic_boundaries: true
  respect_paragraphs: true
  respect_sections: true

# Grounding/Evidence Configuration
# Settings for the image crops and text snippets attached as evidence.
grounding:
  # Image cropping for evidence
  include_images: true
  crop_padding: 10  # Pixels of padding around cropped regions
  # presumably the maximum crop dimension in pixels — TODO confirm
  max_image_size: 512
  image_format: PNG  # PNG or JPEG
  # JPEG quality; presumably unused while image_format is PNG — TODO confirm
  image_quality: 85

  # Snippet settings
  # presumably a character count — TODO confirm the unit
  max_snippet_length: 200
  include_context: true

# Pipeline Configuration
# Rendering, caching, page-processing and output switches for the pipeline.
pipeline:
  # PDF rendering
  render_dpi: 300  # Resolution in dots per inch

  # Caching
  enable_caching: true
  cache_directory: ./data/cache  # Relative path

  # Processing options
  parallel_pages: false
  max_pages: null  # Page limit (null processes all pages)

  # Output options
  # All three output switches are enabled.
  include_ocr_regions: true
  include_layout_regions: true
  generate_full_text: true

# Validation Configuration
# Settings for the two validation components: critic and verifier.
validation:
  # Critic settings
  critic:
    confidence_threshold: 0.7
    evidence_required: true
    strict_mode: false
    max_fields_per_request: 10

  # Verifier settings
  verifier:
    fuzzy_match: true
    case_sensitive: false
    min_match_ratio: 0.6
    # Match-strength tiers in descending order: strong > moderate > weak.
    # min_match_ratio (0.6) lies between the weak and moderate tiers.
    strong_threshold: 0.9
    moderate_threshold: 0.7
    weak_threshold: 0.5

# LLM Configuration for DocumentAgent
# Ollama endpoint, model selection and request limits.
agent:
  # Ollama settings
  ollama_base_url: http://localhost:11434  # Ollama server on this machine
  default_model: llama3.2:3b  # Same model as the "standard" route below

  # Model routing by complexity
  # Maps a complexity tier to an Ollama model tag, smallest to largest.
  model_routing:
    simple: llama3.2:1b
    standard: llama3.2:3b
    complex: llama3.1:8b
    analysis: llama3.1:70b  # For heavy analysis (optional)

  # Agent behavior
  max_iterations: 10
  temperature: 0.1
  timeout: 120  # Seconds

# Logging Configuration
# Log level, message template and optional log file.
logging:
  level: INFO  # One of DEBUG, INFO, WARNING, ERROR
  # Brace-style template with time, level and message fields.
  # NOTE(review): looks like a loguru-style format string — confirm.
  format: "{time} | {level} | {message}"
  file: null  # Log file path (null for stderr only)