# Parser backends, keyed by backend name. Every entry carries an `enabled`
# flag; in this file only `text` and `pymupdf` are switched on.
parsers:
  text:
    enabled: true
  pymupdf:
    enabled: true
  docling:
    enabled: false
    # Docling-specific switches. NOTE(review): their exact effect is defined
    # by the consumer of this file, not here — confirm before changing.
    do_ocr: false
    do_table_structure: false
    force_backend_text: true
  # marker, mineru, olmocr and paddleocr share one shape:
  #   command          null in this file, i.e. no command is configured
  #   timeout_seconds  per-backend limit (300 for marker, 600 for the rest)
  #   output_args      argument string containing the literal `{output_dir}`
  #                    token — presumably substituted by the consumer; verify
  #   extra_args       empty string in this file
  marker:
    enabled: false
    command: null
    timeout_seconds: 300
    # Unlike the other three, marker also passes `--output_format markdown`.
    output_args: "--output_dir {output_dir} --output_format markdown"
    extra_args: ""
  mineru:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  olmocr:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  paddleocr:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  unstructured:
    enabled: false

# Per-page routing settings.
# NOTE(review): the scale and comparison direction of the `*_threshold`
# values are not defined in this file; all of them lie between 0 and 1 —
# confirm against the routing code before tuning.
routing:
  run_multiple_on_hard_pages: true
  max_primary_parsers_per_page: 2
  hard_page_threshold: 0.65
  scanned_text_threshold: 0.40
  table_density_threshold: 0.25
  formula_density_threshold: 0.15
  figure_density_threshold: 0.20

# Repair stage settings: a master switch, an iteration cap, GPU escalation
# controls, and one boolean per repair kind.
repair:
  enabled: true
  max_iterations: 3
  # Plan and dry-run GPU escalations for verification failures.
  gpu_escalation: true
  # Actually invoke the configured GPU/VLM backend on flagged regions.
  # Defaults to false to avoid surprise model downloads on local runs;
  # set to true on the Space once the GPU models are warm.
  execute_gpu_escalations: false
  # Individual repair kinds; all four are on in this file.
  # NOTE(review): presumably these are ignored when `enabled` is false —
  # confirm against the consumer.
  table_repair: true
  reading_order_repair: true
  figure_repair: true
  ocr_repair: true

# GPU backend settings: provider details, per-document budgets, the model
# table, and the mapping from task names to model roles.
gpu:
  backend: transformers
  provider: huggingface_spaces
  # Same value as `deployment.gpu_models_target` elsewhere in this file.
  space_name: zeroshotGPU
  batch_pages: true
  validate_tasks: true
  # NOTE(review): each model below also sets its own `max_batch_size`
  # (1 or 16); which value takes precedence is not defined here — confirm.
  max_batch_size: 4
  # Per-document budgets.
  max_gpu_seconds_per_doc: 120
  max_vlm_calls_per_doc: 30
  # Model table keyed by role. `vlm`, `ocr` and `table` all point at the same
  # `model_id` and differ only in `task`; `embedding` uses a separate model.
  models:
    vlm:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: image-text-to-text
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    ocr:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: document-ocr
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    table:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: table-repair
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    embedding:
      model_id: jinaai/jina-embeddings-v3
      task: retrieval.passage
      device: auto
      dtype: bfloat16
      max_batch_size: 16
  # Task name -> role. Every value here is a key under `models` above
  # (`vlm`, `ocr`, `table`); no task maps to `embedding`.
  task_model_roles:
    vlm_route_repair: vlm
    ocr_page: ocr
    table_vlm_repair: table
    figure_description: vlm

# PDF rendering and asset extraction settings.
pdf:
  render_pages: true
  render_dpi: 150
  crop_tables: true
  crop_figures: true
  # NOTE(review): a relative name; the base directory it is resolved against
  # is not defined in this file — confirm against the consumer.
  asset_dir: assets

# Quality gate: a numeric acceptance threshold plus a list of failure names
# marked as blocking.
# NOTE(review): how the threshold and the blocking list combine (e.g. whether
# a blocking failure overrides a passing score) is not defined here — confirm.
quality:
  accept_threshold: 0.88
  blocking_failures:
    - empty_page
    - invalid_table
    - missing_text_coverage
    - reading_order_failure

# Chunking settings: planner choice, token sizing, optional strategies and
# the ordered strategy ladder.
chunking:
  enabled: true
  planner: agentic
  # `recursive_structure` is also the second entry of `strategy_ladder` below.
  baseline_strategy: recursive_structure
  # Token sizing for chunks and for parent chunks.
  target_tokens: 512
  min_tokens: 120
  overlap_ratio: 0.15
  parent_child: true
  parent_target_tokens: 1600
  page_level_for_paginated_docs: true
  table_chunks: true
  figure_chunks: true
  contextual_prefix: false
  contextual_retrieval: false
  semantic_similarity_threshold: 0.18
  # Caps on proposition output.
  max_propositions_per_source: 8
  max_proposition_chunks: 64
  # Optional strategies; all off in this file. Several share a name with a
  # `strategy_ladder` entry (`contextual_retrieval`, `late_chunking`,
  # `semantic_chunking`, `vision_guided`).
  # NOTE(review): presumably these flags gate the matching ladder entries —
  # confirm against the planner.
  semantic_chunking: false
  late_chunking: false
  vision_guided: false
  agentic_proposition_chunking: false
  # Sequence of strategy names.
  # NOTE(review): the list order may be significant to the consumer; do not
  # reorder without checking.
  strategy_ladder:
    - fixed_token_baseline
    - recursive_structure
    - metadata_enriched
    - parent_child
    - contextual_retrieval
    - late_chunking
    - semantic_chunking
    - vision_guided
    - agentic_proposition

# Benchmark settings.
benchmarks:
  retriever:
    # `backend` is `lexical` (the default; model-free TF-IDF) or `embedding`
    # (sentence-transformers). The `embedding` backend takes `model_id` and
    # `task` from `gpu.models.embedding` unless they are overridden here, and
    # requires `pip install sentence-transformers`.
    backend: lexical
    # Both are null in this file, i.e. no override is set.
    model_id: null
    task: null

# Deployment target. `target` matches `gpu.provider` and `gpu_models_target`
# matches `gpu.space_name` elsewhere in this file.
# NOTE(review): whether the consumer requires these pairs to stay in sync is
# not defined here — confirm before changing one side only.
deployment:
  target: huggingface_spaces
  gpu_models_target: zeroshotGPU