"""End-to-end orchestration for generation and validation.""" from __future__ import annotations import time from src.config import settings from src.generator import generate_response from src.hallucination import hallucination_check from src.model_loader import load_model_bundle from src.prompts import SYSTEM_PROMPT, build_user_prompt from src.rag import CodeRAG from src.relevancy import RelevancyScorer class CodingLLMPipeline: """Coordinates model, RAG, explainability, and quality checks.""" def __init__(self): self.model_bundle = None self.relevancy = RelevancyScorer() self.rag = CodeRAG() if settings.use_rag else None def _ensure_model_loaded(self): if self.model_bundle is None: self.model_bundle = load_model_bundle() def run(self, instruction: str, user_input: str) -> dict: started = time.perf_counter() self._ensure_model_loaded() query_text = f"{instruction}\n{user_input}".strip() retrieved_context = self.rag.retrieve(query_text) if self.rag else "" prompt = f"{SYSTEM_PROMPT}\n\n{build_user_prompt(instruction, user_input, retrieved_context)}" generation = generate_response(self.model_bundle, prompt) hallucination_result = hallucination_check(generation.code) relevancy_score = self.relevancy.score(query_text, generation.code) explanation = generation.explanation if hallucination_result.hallucination: explanation = f"{generation.explanation}\n\nHallucination check reason: {hallucination_result.reason}" latency_ms = int((time.perf_counter() - started) * 1000) return { "code": generation.code, "explanation": explanation, "confidence": round(generation.confidence, 4), "important_tokens": generation.important_tokens, "relevancy_score": round(relevancy_score, 4), "hallucination": hallucination_result.hallucination, "latency_ms": latency_ms, } @property def active_model_name(self) -> str: """Current model name, loading lazily if needed.""" self._ensure_model_loaded() return self.model_bundle.active_model_name