KinetoLabs Claude Opus 4.5 committed on
Commit
0699c5f
·
1 Parent(s): 333c083

Reduce thinking model max_new_tokens to fix slow inference

Browse files

Root cause: max_new_tokens=32768 caused ~27 min inference time,
appearing as a "hang". Reduced to 8192 for ~7 min thinking stage.

The device mismatch warning is expected behavior - transformers
handles routing internally for device_map="auto" models.

Also includes ruff auto-fixes for unused imports.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

app.py CHANGED
@@ -16,7 +16,7 @@ import logging
16
  logger = logging.getLogger(__name__)
17
 
18
  from models.loader import get_models
19
- from ui.state import SessionState, create_new_session, session_to_json, session_from_json
20
  from ui.storage import get_head_html
21
  from ui.tabs import room, images, observations, results
22
  from ui import samples
 
16
  logger = logging.getLogger(__name__)
17
 
18
  from models.loader import get_models
19
+ from ui.state import SessionState, create_new_session
20
  from ui.storage import get_head_html
21
  from ui.tabs import room, images, observations, results
22
  from ui import samples
config/inference.py CHANGED
@@ -15,7 +15,7 @@ class ThinkingInferenceConfig:
15
  Used for deep analysis with <think> chains.
16
  """
17
 
18
- max_new_tokens: int = 32768 # Extended for reasoning chains (model supports 40960)
19
  temperature: float = 0.6 # Per Qwen3-VL GitHub docs
20
  top_p: float = 0.95
21
  top_k: int = 20
 
15
  Used for deep analysis with <think> chains.
16
  """
17
 
18
+ max_new_tokens: int = 8192 # Balanced for reasoning + reasonable time (~7 min)
19
  temperature: float = 0.6 # Per Qwen3-VL GitHub docs
20
  top_p: float = 0.95
21
  top_k: int = 20
pipeline/calculations.py CHANGED
@@ -10,7 +10,7 @@ Implements deterministic calculations from FDAM v4.0.1:
10
  import logging
11
  import math
12
  from dataclasses import dataclass, field
13
- from typing import Literal, Optional
14
 
15
  from ui.state import SessionState
16
 
 
10
  import logging
11
  import math
12
  from dataclasses import dataclass, field
13
+ from typing import Literal
14
 
15
  from ui.state import SessionState
16
 
pipeline/main.py CHANGED
@@ -326,7 +326,7 @@ class FDAMPipeline:
326
  logger.info("=" * 60)
327
  logger.info("PIPELINE EXECUTION SUMMARY")
328
  logger.info("=" * 60)
329
- logger.info(f"Success: True")
330
  logger.info(f"Total execution time: {total_time:.2f}s")
331
  logger.info(f"Images analyzed: {len(vision_results)}")
332
  logger.info(f"Dispositions generated: {len(dispositions)}")
 
326
  logger.info("=" * 60)
327
  logger.info("PIPELINE EXECUTION SUMMARY")
328
  logger.info("=" * 60)
329
+ logger.info("Success: True")
330
  logger.info(f"Total execution time: {total_time:.2f}s")
331
  logger.info(f"Images analyzed: {len(vision_results)}")
332
  logger.info(f"Dispositions generated: {len(dispositions)}")
rag/index_builder.py CHANGED
@@ -9,7 +9,7 @@ Usage:
9
  import argparse
10
  from pathlib import Path
11
 
12
- from rag.chunker import SemanticChunker, Chunk
13
  from rag.vectorstore import ChromaVectorStore
14
 
15
 
@@ -160,7 +160,7 @@ def build_index(rebuild: bool = False) -> dict:
160
 
161
  # Print collection stats
162
  collection_stats = vectorstore.get_stats()
163
- print(f"\nCollection stats:")
164
  print(f" Total chunks in DB: {collection_stats['total_chunks']}")
165
  print(f" Categories: {collection_stats['categories']}")
166
  print(f" Priorities: {collection_stats['priorities']}")
 
9
  import argparse
10
  from pathlib import Path
11
 
12
+ from rag.chunker import SemanticChunker
13
  from rag.vectorstore import ChromaVectorStore
14
 
15
 
 
160
 
161
  # Print collection stats
162
  collection_stats = vectorstore.get_stats()
163
+ print("\nCollection stats:")
164
  print(f" Total chunks in DB: {collection_stats['total_chunks']}")
165
  print(f" Categories: {collection_stats['categories']}")
166
  print(f" Priorities: {collection_stats['priorities']}")
scripts/qwen3_vl/qwen3_vl_reranker.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
9
  import logging
10
 
11
  from PIL import Image
12
- from typing import List, Optional, Union, Dict, Any
13
  from qwen_vl_utils import process_vision_info
14
  from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
15
 
 
9
  import logging
10
 
11
  from PIL import Image
12
+ from typing import List, Dict, Any
13
  from qwen_vl_utils import process_vision_info
14
  from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
15
 
ui/components.py CHANGED
@@ -3,8 +3,7 @@
3
  Provides helper functions for common Gradio UI patterns.
4
  """
5
 
6
- import gradio as gr
7
- from typing import Callable, Optional
8
 
9
  from .state import SessionState, AssessmentHistory
10
 
 
3
  Provides helper functions for common Gradio UI patterns.
4
  """
5
 
6
+ from typing import Optional
 
7
 
8
  from .state import SessionState, AssessmentHistory
9
 
ui/tabs/results.py CHANGED
@@ -10,7 +10,6 @@ import tempfile
10
 
11
  from ui.state import SessionState
12
  from ui.components import create_stats_dict, create_progress_html, image_store
13
- from config.settings import settings
14
  from pipeline import FDAMPipeline, PipelineResult, PDFGenerator
15
 
16
 
 
10
 
11
  from ui.state import SessionState
12
  from ui.components import create_stats_dict, create_progress_html, image_store
 
13
  from pipeline import FDAMPipeline, PipelineResult, PDFGenerator
14
 
15