Spaces:
Paused
Paused
File size: 2,198 Bytes
f3ebc82 706520f f3ebc82 88bdcff 333c083 706520f 333c083 706520f 333c083 706520f 333c083 706520f f3ebc82 88bdcff f3ebc82 706520f f3ebc82 88bdcff 706520f f3ebc82 88bdcff f3ebc82 88bdcff 706520f f3ebc82 88bdcff f3ebc82 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
"""Model inference configuration parameters.

Configuration values aligned with official Qwen3-VL model recommendations
and FDAM Technical Spec requirements.

Pipeline uses:
- Vision: Qwen/Qwen3-VL-30B-A3B-Thinking-FP8 (single model, FP8 via vLLM)
- Embedding: Qwen/Qwen3-VL-Embedding-2B (2048-dim)
- Reranker: Qwen/Qwen3-VL-Reranker-2B
"""
from dataclasses import dataclass
@dataclass
class VisionInferenceConfig:
    """Sampling parameters for the 30B-A3B FP8 vision model.

    A single model produces both the analysis and the structured JSON
    output. It is served through vLLM with tensor parallelism across
    4 GPUs, which is why the token limit is named ``max_tokens`` rather
    than ``max_new_tokens``.
    """

    # Token limit; vLLM names this max_tokens, not max_new_tokens.
    max_tokens: int = 8192
    # Sampling temperature, per the Qwen3-VL GitHub docs.
    temperature: float = 0.6
    # Nucleus (top-p) sampling value.
    top_p: float = 0.95
    # Top-k sampling value.
    top_k: int = 20
    # Repetition penalty, per the Qwen3-VL docs.
    repetition_penalty: float = 1.0
@dataclass
class GenerationInferenceConfig:
    """Sampling parameters for document generation (SOW, sampling plans).

    Kept separate from the vision settings for longer generation, per
    FDAM Technical Spec Section 3.
    """

    # Upper bound on newly generated tokens.
    max_new_tokens: int = 8192
    # Sampling temperature.
    # NOTE(review): the previous comment called this "slightly higher for
    # more varied text", yet 0.2 is below VisionInferenceConfig's 0.6 --
    # confirm which baseline that remark referred to.
    temperature: float = 0.2
    # Nucleus (top-p) sampling value.
    top_p: float = 0.95
    # Sampling is enabled by default.
    do_sample: bool = True
    # Repetition penalty; 1.05 here versus 1.0 in the vision config.
    repetition_penalty: float = 1.05
@dataclass
class EmbeddingConfig:
    """Settings for the embedding model.

    The dimension follows Qwen3-VL-Embedding-2B's config.json, where
    ``text_config.hidden_size`` is 2048.
    """

    # Vector width; equals the Qwen3-VL-Embedding-2B hidden_size.
    embedding_dimension: int = 2048
    # Apply L2 normalization (per the official implementation).
    normalize: bool = True
@dataclass
class RerankerConfig:
    """Settings for the reranker model."""

    # NOTE(review): presumably the number of candidates kept after
    # reranking; RAGConfig.top_k_rerank has the same default of 5 --
    # confirm which of the two callers actually read.
    top_k: int = 5
@dataclass
class RAGConfig:
    """Settings for the RAG retrieval pipeline.

    Values follow FDAM Technical Spec Section 3.
    """

    # Number of candidates fetched by the initial retrieval.
    top_k_retrieval: int = 10
    # Number of results remaining after reranking.
    top_k_rerank: int = 5
    # Minimum similarity a result needs in order to be included.
    similarity_threshold: float = 0.7
# Module-level default instances: one per config class, each constructed
# with every field left at its declared default.
vision_config = VisionInferenceConfig()  # single 30B-A3B FP8 model
generation_config = GenerationInferenceConfig()  # document generation (SOW, sampling plans)
embedding_config = EmbeddingConfig()  # embedding model settings
reranker_config = RerankerConfig()  # reranker model settings
rag_config = RAGConfig()  # RAG retrieval pipeline settings
|