import time from typing import Any import base64 import numpy as np import torch from fastapi import FastAPI, HTTPException from fastapi.responses import HTMLResponse from pydantic import BaseModel, ConfigDict from sentence_transformers import SentenceTransformer torch.set_grad_enabled(False) torch.set_num_threads(2) APP_TITLE = "ollama-code-embed" MODEL_ID = "jinaai/jina-code-embeddings-0.5b" MODEL_NAME = "code-embed" MODEL_ALIASES = [ MODEL_NAME, f"{MODEL_NAME}:latest", MODEL_ID, f"{MODEL_ID}:latest", ] MODEL_CREATED_AT = "2026-03-11T00:00:00Z" MODEL_DIMENSIONS = 896 SERVER_VERSION = "0.11.0" app = FastAPI(title=APP_TITLE, version="1.0.0") _model: SentenceTransformer | None = None _loaded_at_ns: int | None = None _load_duration_ns: int = 0 def model_card(name: str) -> dict[str, Any]: return { "name": name, "model": name, "modified_at": MODEL_CREATED_AT, "size": 0, "digest": MODEL_ID, "details": { "format": "sentence-transformers", "family": "jina", "families": ["jina", "embedding"], "parameter_size": "0.5B", "quantization_level": "F32", }, } class CompatibleRequest(BaseModel): model_config = ConfigDict(extra="allow") class EmbedRequest(CompatibleRequest): model: str = MODEL_NAME input: str | list[str] | None = None prompt: str | None = None truncate: bool = True dimensions: int | None = None options: dict[str, Any] | None = None keep_alive: str | int | None = None class OpenAIEmbeddingRequest(CompatibleRequest): model: str = MODEL_ID input: str | list[str] encoding_format: str = "float" dimensions: int | None = None user: str | None = None def get_model() -> SentenceTransformer: global _model, _loaded_at_ns, _load_duration_ns if _model is None: started = time.perf_counter_ns() _model = SentenceTransformer(MODEL_ID, trust_remote_code=True, device="cpu") _load_duration_ns = time.perf_counter_ns() - started _loaded_at_ns = time.time_ns() return _model @app.on_event("startup") def preload_model() -> None: get_model() def normalize_inputs(request: EmbedRequest) -> list[str]: if request.input is not None: return request.input if isinstance(request.input, list) else [request.input] if request.prompt is not None: return [request.prompt] raise HTTPException(status_code=400, detail="Request must include 'input' or 'prompt'") def normalize_openai_inputs(request: OpenAIEmbeddingRequest) -> list[str]: return request.input if isinstance(request.input, list) else [request.input] def maybe_truncate(vector: np.ndarray, dimensions: int | None) -> np.ndarray: if dimensions is None or dimensions <= 0 or dimensions >= vector.shape[0]: return vector truncated = vector[:dimensions] norm = np.linalg.norm(truncated) if norm > 0: truncated = truncated / norm return truncated def validate_model_name(model_name: str) -> None: if model_name not in MODEL_ALIASES: raise HTTPException(status_code=404, detail=f"Model '{model_name}' not found") def estimate_prompt_eval_count(texts: list[str], model: SentenceTransformer) -> int: tokenizer = getattr(model, "tokenizer", None) if tokenizer is None: return sum(max(1, len(text.split())) for text in texts) return sum(len(tokenizer.encode(text, add_special_tokens=True)) for text in texts) @app.get("/", response_class=HTMLResponse) def root() -> str: return f"""
Model: {MODEL_ID}
Served name: {MODEL_NAME}
GET /api/versionGET /api/tagsPOST /api/embedPOST /api/embeddingsPOST /embed