| """Nemotron-Mini-4B wrapper. Loads the model and generates diagnoses. |
| |
| Per AGENTS.md, this is the second stage of the dual-model pipeline. |
| Receives defect JSON from the vision model plus user metadata, returns |
| root cause diagnosis and physical remediation steps. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import logging |
| import os |
| from typing import Any |
|
|
| logger = logging.getLogger(__name__) |
|
|
| NEMOTRON_MODEL_ID = "nvidia/Nemotron-Mini-4B-Instruct" |
| MAX_NEW_TOKENS = int(os.getenv("HALIDE_NEMOTRON_MAX_TOKENS", "512")) |
|
|
|
|
| class NemotronReasoner: |
| """Lazy-loading wrapper around Nemotron-Mini-4B-Instruct.""" |
|
|
| def __init__(self, model_path: str | None = None) -> None: |
| self._model_path = model_path or NEMOTRON_MODEL_ID |
| self._tokenizer: Any = None |
| self._model: Any = None |
| self._device: str = "cpu" |
| self._dtype: Any = None |
|
|
| @property |
| def model_path(self) -> str: |
| return self._model_path |
|
|
| def load(self) -> None: |
| if self._model is not None: |
| return |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
| logger.info("Loading Nemotron-Mini-4B from %s", self._model_path) |
| self._tokenizer = AutoTokenizer.from_pretrained(self._model_path) |
| self._dtype = torch.bfloat16 |
| self._model = AutoModelForCausalLM.from_pretrained( |
| self._model_path, |
| torch_dtype=self._dtype, |
| device_map="auto", |
| ) |
| self._device = str(next(self._model.parameters()).device) |
| logger.info("Nemotron loaded on %s", self._device) |
|
|
| def generate(self, prompt: str, system: str | None = None) -> str: |
| if self._model is None: |
| self.load() |
|
|
| import torch |
|
|
| if system: |
| messages = [ |
| {"role": "system", "content": system}, |
| {"role": "user", "content": prompt}, |
| ] |
| else: |
| messages = [{"role": "user", "content": prompt}] |
|
|
| input_ids = self._tokenizer.apply_chat_template( |
| messages, add_generation_prompt=True, return_tensors="pt" |
| ).to(self._device) |
|
|
| with torch.inference_mode(): |
| output = self._model.generate( |
| input_ids, |
| max_new_tokens=MAX_NEW_TOKENS, |
| do_sample=False, |
| pad_token_id=self._tokenizer.eos_token_id, |
| ) |
|
|
| response_ids = output[0][input_ids.shape[-1]:] |
| return self._tokenizer.decode(response_ids, skip_special_tokens=True) |
|
|
| def close(self) -> None: |
| if self._model is not None: |
| del self._model |
| self._model = None |
| if self._tokenizer is not None: |
| del self._tokenizer |
| self._tokenizer = None |
|
|
|
|
| _default_reasoner: NemotronReasoner | None = None |
|
|
|
|
| def get_reasoner() -> NemotronReasoner: |
| global _default_reasoner |
| if _default_reasoner is None: |
| _default_reasoner = NemotronReasoner() |
| return _default_reasoner |
|
|