File size: 2,198 Bytes
f3ebc82
 
 
 
706520f
 
 
 
 
f3ebc82
88bdcff
 
 
 
333c083
706520f
 
333c083
706520f
 
333c083
 
706520f
333c083
 
 
706520f
f3ebc82
 
 
 
 
 
 
 
 
 
 
 
 
 
88bdcff
 
 
 
f3ebc82
 
706520f
f3ebc82
88bdcff
706520f
f3ebc82
88bdcff
 
 
 
 
 
 
 
 
f3ebc82
 
 
 
 
 
 
 
 
 
 
 
88bdcff
706520f
f3ebc82
88bdcff
 
f3ebc82
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""Model inference configuration parameters.

Configuration values aligned with official Qwen3-VL model recommendations
and FDAM Technical Spec requirements.

Pipeline uses:
- Vision: Qwen/Qwen3-VL-30B-A3B-Thinking-FP8 (single model, FP8 via vLLM)
- Embedding: Qwen/Qwen3-VL-Embedding-2B (2048-dim)
- Reranker: Qwen/Qwen3-VL-Reranker-2B
"""

from dataclasses import dataclass


@dataclass
class VisionInferenceConfig:
    """Sampling parameters for the 30B-A3B FP8 vision model.

    A single model covers both free-form analysis and structured JSON
    responses; it is served through vLLM with 4-way tensor parallelism.
    """

    max_tokens: int = 8192  # vLLM naming (HF equivalent: max_new_tokens)
    temperature: float = 0.6  # recommended value from the Qwen3-VL GitHub docs
    top_p: float = 0.95  # nucleus-sampling cutoff
    top_k: int = 20  # candidate pool size per decoding step
    repetition_penalty: float = 1.0  # effectively disabled, per Qwen3-VL docs


@dataclass
class GenerationInferenceConfig:
    """Configuration for document generation (SOW, sampling plans).

    Per FDAM Technical Spec Section 3 - separate config for longer generation.
    Uses HF-style parameter names (max_new_tokens, do_sample), unlike the
    vLLM-style VisionInferenceConfig.
    """

    max_new_tokens: int = 8192
    # Low temperature: generated documents should be consistent and
    # near-deterministic, not creative. (Previous comment claimed this was
    # "slightly higher", which contradicted the 0.6 used for vision.)
    temperature: float = 0.2
    top_p: float = 0.95
    do_sample: bool = True  # sampling still enabled; temperature keeps it tight
    repetition_penalty: float = 1.05  # mild penalty for long-form output


@dataclass
class EmbeddingConfig:
    """Settings for the Qwen3-VL-Embedding-2B model.

    The output dimension mirrors text_config.hidden_size = 2048 from the
    model's config.json.
    """

    embedding_dimension: int = 2048  # matches the 2B model's hidden_size
    normalize: bool = True  # apply L2 normalization, as the reference code does


@dataclass
class RerankerConfig:
    """Settings for the Qwen3-VL-Reranker-2B model."""

    top_k: int = 5  # number of candidates kept after reranking


@dataclass
class RAGConfig:
    """Retrieval-pipeline settings for RAG.

    Values follow FDAM Technical Spec Section 3: retrieve a wide candidate
    set, then narrow it with the reranker.
    """

    top_k_retrieval: int = 10  # candidates fetched in the first-stage search
    top_k_rerank: int = 5  # candidates surviving the rerank stage
    similarity_threshold: float = 0.7  # results below this score are dropped


# Default configurations — module-level singletons built from each dataclass's
# defaults. Importers share these instances; mutating one affects all users.
vision_config = VisionInferenceConfig()  # Single 30B-A3B FP8 model
generation_config = GenerationInferenceConfig()  # long-form document generation
embedding_config = EmbeddingConfig()  # 2048-dim, L2-normalized embeddings
reranker_config = RerankerConfig()  # keeps top 5 after reranking
rag_config = RAGConfig()  # retrieval widths + similarity cutoff