Spaces:
Running on Zero
Running on Zero
| parsers: | |
| text: | |
| enabled: true | |
| pymupdf: | |
| enabled: true | |
| docling: | |
| enabled: false | |
| do_ocr: false | |
| do_table_structure: false | |
| force_backend_text: true | |
| marker: | |
| enabled: false | |
| command: null | |
| timeout_seconds: 300 | |
| output_args: "--output_dir {output_dir} --output_format markdown" | |
| extra_args: "" | |
| mineru: | |
| enabled: false | |
| command: null | |
| timeout_seconds: 600 | |
| output_args: "--output_dir {output_dir}" | |
| extra_args: "" | |
| olmocr: | |
| enabled: false | |
| command: null | |
| timeout_seconds: 600 | |
| output_args: "--output_dir {output_dir}" | |
| extra_args: "" | |
| paddleocr: | |
| enabled: false | |
| command: null | |
| timeout_seconds: 600 | |
| output_args: "--output_dir {output_dir}" | |
| extra_args: "" | |
| unstructured: | |
| enabled: false | |
| routing: | |
| run_multiple_on_hard_pages: true | |
| max_primary_parsers_per_page: 2 | |
| hard_page_threshold: 0.65 | |
| scanned_text_threshold: 0.40 | |
| table_density_threshold: 0.25 | |
| formula_density_threshold: 0.15 | |
| figure_density_threshold: 0.20 | |
| repair: | |
| enabled: true | |
| max_iterations: 3 | |
| # Plan and dry-run GPU escalations for verification failures. | |
| gpu_escalation: true | |
| # Actually invoke the configured GPU/VLM backend on flagged regions. | |
| # Defaults to false to avoid surprise model downloads on local runs; | |
| # set to true on the Space once the GPU models are warm. | |
| execute_gpu_escalations: false | |
| table_repair: true | |
| reading_order_repair: true | |
| figure_repair: true | |
| ocr_repair: true | |
| gpu: | |
| backend: transformers | |
| provider: huggingface_spaces | |
| space_name: zeroshotGPU | |
| batch_pages: true | |
| validate_tasks: true | |
| max_batch_size: 4 | |
| max_gpu_seconds_per_doc: 120 | |
| max_vlm_calls_per_doc: 30 | |
| models: | |
| vlm: | |
| model_id: Qwen/Qwen2.5-VL-3B-Instruct | |
| task: image-text-to-text | |
| device: auto | |
| dtype: bfloat16 | |
| max_batch_size: 1 | |
| ocr: | |
| model_id: Qwen/Qwen2.5-VL-3B-Instruct | |
| task: document-ocr | |
| device: auto | |
| dtype: bfloat16 | |
| max_batch_size: 1 | |
| table: | |
| model_id: Qwen/Qwen2.5-VL-3B-Instruct | |
| task: table-repair | |
| device: auto | |
| dtype: bfloat16 | |
| max_batch_size: 1 | |
| embedding: | |
| model_id: jinaai/jina-embeddings-v3 | |
| task: retrieval.passage | |
| device: auto | |
| dtype: bfloat16 | |
| max_batch_size: 16 | |
| task_model_roles: | |
| vlm_route_repair: vlm | |
| ocr_page: ocr | |
| table_vlm_repair: table | |
| figure_description: vlm | |
| pdf: | |
| render_pages: true | |
| render_dpi: 150 | |
| crop_tables: true | |
| crop_figures: true | |
| asset_dir: assets | |
| quality: | |
| accept_threshold: 0.88 | |
| blocking_failures: | |
| - empty_page | |
| - invalid_table | |
| - missing_text_coverage | |
| - reading_order_failure | |
| chunking: | |
| enabled: true | |
| planner: agentic | |
| baseline_strategy: recursive_structure | |
| target_tokens: 512 | |
| min_tokens: 120 | |
| overlap_ratio: 0.15 | |
| parent_child: true | |
| parent_target_tokens: 1600 | |
| page_level_for_paginated_docs: true | |
| table_chunks: true | |
| figure_chunks: true | |
| contextual_prefix: false | |
| contextual_retrieval: false | |
| semantic_similarity_threshold: 0.18 | |
| max_propositions_per_source: 8 | |
| max_proposition_chunks: 64 | |
| semantic_chunking: false | |
| late_chunking: false | |
| vision_guided: false | |
| agentic_proposition_chunking: false | |
| strategy_ladder: | |
| - fixed_token_baseline | |
| - recursive_structure | |
| - metadata_enriched | |
| - parent_child | |
| - contextual_retrieval | |
| - late_chunking | |
| - semantic_chunking | |
| - vision_guided | |
| - agentic_proposition | |
| benchmarks: | |
| retriever: | |
| # `lexical` (default, model-free TF-IDF) or `embedding` (sentence-transformers). | |
| # The `embedding` backend takes model_id and task from gpu.models.embedding | |
| # unless they are overridden here; it requires `pip install sentence-transformers`. | |
| backend: lexical | |
| model_id: null | |
| task: null | |
| deployment: | |
| target: huggingface_spaces | |
| gpu_models_target: zeroshotGPU | |