from dataclasses import dataclass import os from typing import Literal EMBEDDING_MODEL = "pgc/bge-m3-onnx" EMBEDDING_DIM = 1024 @dataclass(frozen=True, slots=True) class EmbeddingRuntimeConfig: model_name: str source: str model_file: str providers: list[str] dim: int = EMBEDDING_DIM def get_runtime_config(runtime: Literal["query", "ingest"]) -> EmbeddingRuntimeConfig: # Xenova/bge-m3 provides self-contained single-file quantized ONNX variants # (model_int8.onnx, ~568 MB) whereas BAAI/bge-m3 only has model.onnx + # model.onnx_data (external-data format, 2.3 GB), which fastembed cannot load # as a custom model without special handling. if runtime == "ingest": return EmbeddingRuntimeConfig( model_name=EMBEDDING_MODEL, source=os.getenv("PGC_BGE_M3_INGEST_SOURCE", "Xenova/bge-m3"), model_file=os.getenv("PGC_BGE_M3_INGEST_MODEL_FILE", "onnx/model_int8.onnx"), providers=["CUDAExecutionProvider", "CPUExecutionProvider"], ) return EmbeddingRuntimeConfig( model_name=EMBEDDING_MODEL, source=os.getenv("PGC_BGE_M3_QUERY_SOURCE", "Xenova/bge-m3"), model_file=os.getenv("PGC_BGE_M3_QUERY_MODEL_FILE", "onnx/model_int8.onnx"), providers=["CPUExecutionProvider"], )