Spaces:
Paused
Paused
Commit
·
0699c5f
1
Parent(s):
333c083
Reduce thinking model max_new_tokens to fix slow inference
Root cause: max_new_tokens=32768 caused ~27 min inference time,
appearing as a "hang". Reduced to 8192 for ~7 min thinking stage.
The device mismatch warning is expected behavior - transformers
handles routing internally for device_map="auto" models.
Also includes ruff auto-fixes for unused imports.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- app.py +1 -1
- config/inference.py +1 -1
- pipeline/calculations.py +1 -1
- pipeline/main.py +1 -1
- rag/index_builder.py +2 -2
- scripts/qwen3_vl/qwen3_vl_reranker.py +1 -1
- ui/components.py +1 -2
- ui/tabs/results.py +0 -1
app.py
CHANGED
|
@@ -16,7 +16,7 @@ import logging
|
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
from models.loader import get_models
|
| 19 |
-
from ui.state import SessionState, create_new_session
|
| 20 |
from ui.storage import get_head_html
|
| 21 |
from ui.tabs import room, images, observations, results
|
| 22 |
from ui import samples
|
|
|
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
from models.loader import get_models
|
| 19 |
+
from ui.state import SessionState, create_new_session
|
| 20 |
from ui.storage import get_head_html
|
| 21 |
from ui.tabs import room, images, observations, results
|
| 22 |
from ui import samples
|
config/inference.py
CHANGED
|
@@ -15,7 +15,7 @@ class ThinkingInferenceConfig:
|
|
| 15 |
Used for deep analysis with <think> chains.
|
| 16 |
"""
|
| 17 |
|
| 18 |
-
max_new_tokens: int = 32768
|
| 19 |
temperature: float = 0.6 # Per Qwen3-VL GitHub docs
|
| 20 |
top_p: float = 0.95
|
| 21 |
top_k: int = 20
|
|
|
|
| 15 |
Used for deep analysis with <think> chains.
|
| 16 |
"""
|
| 17 |
|
| 18 |
+
max_new_tokens: int = 8192 # Balanced for reasoning + reasonable time (~7 min)
|
| 19 |
temperature: float = 0.6 # Per Qwen3-VL GitHub docs
|
| 20 |
top_p: float = 0.95
|
| 21 |
top_k: int = 20
|
pipeline/calculations.py
CHANGED
|
@@ -10,7 +10,7 @@ Implements deterministic calculations from FDAM v4.0.1:
|
|
| 10 |
import logging
|
| 11 |
import math
|
| 12 |
from dataclasses import dataclass, field
|
| 13 |
-
from typing import Literal
|
| 14 |
|
| 15 |
from ui.state import SessionState
|
| 16 |
|
|
|
|
| 10 |
import logging
|
| 11 |
import math
|
| 12 |
from dataclasses import dataclass, field
|
| 13 |
+
from typing import Literal
|
| 14 |
|
| 15 |
from ui.state import SessionState
|
| 16 |
|
pipeline/main.py
CHANGED
|
@@ -326,7 +326,7 @@ class FDAMPipeline:
|
|
| 326 |
logger.info("=" * 60)
|
| 327 |
logger.info("PIPELINE EXECUTION SUMMARY")
|
| 328 |
logger.info("=" * 60)
|
| 329 |
-
logger.info(
|
| 330 |
logger.info(f"Total execution time: {total_time:.2f}s")
|
| 331 |
logger.info(f"Images analyzed: {len(vision_results)}")
|
| 332 |
logger.info(f"Dispositions generated: {len(dispositions)}")
|
|
|
|
| 326 |
logger.info("=" * 60)
|
| 327 |
logger.info("PIPELINE EXECUTION SUMMARY")
|
| 328 |
logger.info("=" * 60)
|
| 329 |
+
logger.info("Success: True")
|
| 330 |
logger.info(f"Total execution time: {total_time:.2f}s")
|
| 331 |
logger.info(f"Images analyzed: {len(vision_results)}")
|
| 332 |
logger.info(f"Dispositions generated: {len(dispositions)}")
|
rag/index_builder.py
CHANGED
|
@@ -9,7 +9,7 @@ Usage:
|
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
|
| 12 |
-
from rag.chunker import SemanticChunker
|
| 13 |
from rag.vectorstore import ChromaVectorStore
|
| 14 |
|
| 15 |
|
|
@@ -160,7 +160,7 @@ def build_index(rebuild: bool = False) -> dict:
|
|
| 160 |
|
| 161 |
# Print collection stats
|
| 162 |
collection_stats = vectorstore.get_stats()
|
| 163 |
-
print(
|
| 164 |
print(f" Total chunks in DB: {collection_stats['total_chunks']}")
|
| 165 |
print(f" Categories: {collection_stats['categories']}")
|
| 166 |
print(f" Priorities: {collection_stats['priorities']}")
|
|
|
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
|
| 12 |
+
from rag.chunker import SemanticChunker
|
| 13 |
from rag.vectorstore import ChromaVectorStore
|
| 14 |
|
| 15 |
|
|
|
|
| 160 |
|
| 161 |
# Print collection stats
|
| 162 |
collection_stats = vectorstore.get_stats()
|
| 163 |
+
print("\nCollection stats:")
|
| 164 |
print(f" Total chunks in DB: {collection_stats['total_chunks']}")
|
| 165 |
print(f" Categories: {collection_stats['categories']}")
|
| 166 |
print(f" Priorities: {collection_stats['priorities']}")
|
scripts/qwen3_vl/qwen3_vl_reranker.py
CHANGED
|
@@ -9,7 +9,7 @@ import numpy as np
|
|
| 9 |
import logging
|
| 10 |
|
| 11 |
from PIL import Image
|
| 12 |
-
from typing import List,
|
| 13 |
from qwen_vl_utils import process_vision_info
|
| 14 |
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
|
| 15 |
|
|
|
|
| 9 |
import logging
|
| 10 |
|
| 11 |
from PIL import Image
|
| 12 |
+
from typing import List, Dict, Any
|
| 13 |
from qwen_vl_utils import process_vision_info
|
| 14 |
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
|
| 15 |
|
ui/components.py
CHANGED
|
@@ -3,8 +3,7 @@
|
|
| 3 |
Provides helper functions for common Gradio UI patterns.
|
| 4 |
"""
|
| 5 |
|
| 6 |
-
|
| 7 |
-
from typing import Callable, Optional
|
| 8 |
|
| 9 |
from .state import SessionState, AssessmentHistory
|
| 10 |
|
|
|
|
| 3 |
Provides helper functions for common Gradio UI patterns.
|
| 4 |
"""
|
| 5 |
|
| 6 |
+
from typing import Optional
|
|
|
|
| 7 |
|
| 8 |
from .state import SessionState, AssessmentHistory
|
| 9 |
|
ui/tabs/results.py
CHANGED
|
@@ -10,7 +10,6 @@ import tempfile
|
|
| 10 |
|
| 11 |
from ui.state import SessionState
|
| 12 |
from ui.components import create_stats_dict, create_progress_html, image_store
|
| 13 |
-
from config.settings import settings
|
| 14 |
from pipeline import FDAMPipeline, PipelineResult, PDFGenerator
|
| 15 |
|
| 16 |
|
|
|
|
| 10 |
|
| 11 |
from ui.state import SessionState
|
| 12 |
from ui.components import create_stats_dict, create_progress_html, image_store
|
|
|
|
| 13 |
from pipeline import FDAMPipeline, PipelineResult, PDFGenerator
|
| 14 |
|
| 15 |
|