""" modules/llm_backbone.py ────────────────────────────────────────────────────────────────────────────── VoiceVerse Pro — LLM Script Generation Layer Model : meta-llama/Llama-3.1-8B-Instruct (default — widely supported 2026) Swap via LLMConfig.model_id for any HF-hosted chat model. Backend: huggingface_hub.InferenceClient with provider="hf-inference" Forces HF's own serverless inference endpoint — avoids third-party providers (e.g. Together) that independently deprecate models and return 410 Gone errors. Format : ChatCompletion messages API (system + user roles) WHY NOT HuggingFaceEndpoint? langchain-huggingface's HuggingFaceEndpoint internally calls InferenceClient.post(), which was REMOVED in huggingface_hub ≥ 0.26. Using InferenceClient.chat_completion() directly is the stable 2026 path. DESIGN RULES: - The LLM NEVER generates without retrieved context. - Context is injected verbatim into every prompt via the user message. - Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged dialogue (podcast), depending on output_mode. - Temperature, max_new_tokens are runtime-configurable. """ from __future__ import annotations import logging import os from dataclasses import dataclass from typing import Optional logger = logging.getLogger(__name__) # ────────────────────────────────────────────────────────────────────────────── # Supported model presets (shown in sidebar dropdown) # ────────────────────────────────────────────────────────────────────────────── SUPPORTED_MODELS = [ "mistralai/Mistral-7B-Instruct-v0.2", # compact, capable ] DEFAULT_MODEL = SUPPORTED_MODELS[0] # ────────────────────────────────────────────────────────────────────────────── # Configuration # ────────────────────────────────────────────────────────────────────────────── @dataclass class LLMConfig: """Runtime-tunable LLM parameters.""" model_id: str = DEFAULT_MODEL max_new_tokens: int = 1024 temperature: float = 0.65 hf_token: Optional[str] = None # Force HF's own serverless inference — avoids Together/other providers # that deprecate models independently of HF's model hub. provider: str = "auto" # ────────────────────────────────────────────────────────────────────────────── # Prompt templates # ────────────────────────────────────────────────────────────────────────────── SYSTEM_PROMPT = """\ You are VoiceVerse, a world-class scriptwriter for spoken-audio content. Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message. You MUST NOT introduce information not present in that context. Write in a warm, engaging, conversational spoken-English style. No markdown, no bullet points, no headers — pure spoken prose only. The script will be read aloud by a TTS engine.""" USER_TEMPLATE = """\ ───────────────────────────────────────────────────────────── RETRIEVED CONTEXT (your SOLE factual source): {context} ───────────────────────────────────────────────────────────── TASK: {task_description} FORMAT REQUIREMENTS: • Open with a compelling hook (1–2 sentences). • Develop the topic across 3–5 natural paragraphs drawn ONLY from the context. • Close with a memorable takeaway or question to the listener. • No markdown. No lists. No headers. Pure spoken prose. • Target length: {target_words} words.""" # ── Podcast (two-speaker) prompts ───────────────────────────────────────────── PODCAST_SYSTEM_PROMPT = """\ You are VoiceVerse, a world-class podcast scriptwriter. Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message. You MUST NOT introduce information not present in that context. Write a natural back-and-forth dialogue between two speakers: HOST — female, warm and inquisitive, guides the conversation GUEST — male, knowledgeable and enthusiastic, elaborates on topics Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text. No markdown, no stage directions, no descriptions — only spoken dialogue lines. The script will be read aloud by a TTS engine with two distinct voices.""" PODCAST_USER_TEMPLATE = """\ ───────────────────────────────────────────────────────────── RETRIEVED CONTEXT (your SOLE factual source): {context} ───────────────────────────────────────────────────────────── TASK: {task_description} FORMAT REQUIREMENTS (STRICTLY FOLLOW): • Every line must start with [HOST] or [GUEST] followed by their spoken words. • Alternate naturally between HOST and GUEST. Aim for 8–16 exchanges. • HOST opens and closes the episode. • Draw ALL facts ONLY from the context above. • No markdown. No stage directions. No headers. Only dialogue lines. • Target total length: {target_words} words of dialogue. Example format: [HOST] Welcome to VoiceVerse. Today we're diving into something fascinating. [GUEST] Thanks for having me. I've been looking forward to this conversation. [HOST] Let's start with the basics. What should our listeners know first? [GUEST] Great question. The most important thing to understand is...""" # ────────────────────────────────────────────────────────────────────────────── # LLM Backbone # ────────────────────────────────────────────────────────────────────────────── class LLMBackbone: """ Calls huggingface_hub.InferenceClient.chat_completion() to generate grounded spoken-style scripts. Uses provider="hf-inference" (HF's own serverless endpoint) to avoid third-party providers that independently deprecate models. Supports two output modes: - Transcript: plain spoken prose - Podcast: [HOST]/[GUEST] tagged dialogue for dual-voice TTS """ def __init__(self, config: Optional[LLMConfig] = None) -> None: self.config = config or LLMConfig() self._client = None logger.info( "LLMBackbone initialised | model=%s | provider=%s", self.config.model_id, self.config.provider, ) # ── Public API ───────────────────────────────────────────────────────────── def generate_script( self, context_text: str, task_description: str, target_words: int = 400, output_mode: str = "Audio Transcript", # matches OutputMode.value ) -> str: """ Generate a grounded script. Args: context_text: Retrieved context from RAGEngine (REQUIRED). task_description: High-level user instruction for the script. target_words: Approximate word count target. output_mode: "Audio Transcript" or "Podcast (2 Speakers)". Returns: Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes. """ if not context_text or not context_text.strip(): raise ValueError( "context_text must not be empty. " "The LLM requires retrieved context to generate." ) is_podcast = output_mode == "Podcast (2 Speakers)" messages = self._build_messages( context_text, task_description, target_words, is_podcast ) logger.info( "Calling chat_completion | model=%s | mode=%s | ~%d context chars", self.config.model_id, output_mode, len(context_text), ) try: response = self._get_client().chat_completion( messages=messages, max_tokens=self.config.max_new_tokens, temperature=self.config.temperature, ) raw_output: str = response.choices[0].message.content except Exception as exc: logger.error("InferenceClient call failed: %s", exc) raise RuntimeError(f"LLM generation failed: {exc}") from exc script = self._post_process(raw_output) logger.info("Script generated | %d words | podcast=%s", len(script.split()), is_podcast) return script # ── Message builder ──────────────────────────────────────────────────────── def _build_messages( self, context: str, task: str, target_words: int, is_podcast: bool = False, ) -> list[dict]: if is_podcast: system = PODCAST_SYSTEM_PROMPT user_content = PODCAST_USER_TEMPLATE.format( context=context, task_description=task, target_words=target_words, ) else: system = SYSTEM_PROMPT user_content = USER_TEMPLATE.format( context=context, task_description=task, target_words=target_words, ) return [ {"role": "system", "content": system}, {"role": "user", "content": user_content}, ] # ── Post-processing ──────────────────────────────────────────────────────── @staticmethod def _post_process(raw: str) -> str: for tag in ("[INST]", "[/INST]", "", "", "<>", "<>"): raw = raw.replace(tag, "") lines = [line.rstrip() for line in raw.splitlines()] cleaned: list[str] = [] blank_count = 0 for line in lines: if not line.strip(): blank_count += 1 if blank_count <= 2: cleaned.append("") else: blank_count = 0 cleaned.append(line) return "\n".join(cleaned).strip() # ── Lazy client init ─────────────────────────────────────────────────────── def _get_client(self): """ Lazy-load huggingface_hub.InferenceClient with provider="hf-inference". Uses HF's own serverless inference endpoint — avoids third-party providers (e.g. Together) that independently deprecate models and return 410 Gone. The client is bound to a specific model at init time. """ if self._client is None: from huggingface_hub import InferenceClient token = self.config.hf_token or os.getenv("HUGGINGFACEHUB_API_TOKEN") if not token: raise EnvironmentError( "Hugging Face API token not found. " "Set HUGGINGFACEHUB_API_TOKEN in your .env file " "or paste it in the sidebar." ) logger.info( "Initialising InferenceClient | model=%s | provider=%s", self.config.model_id, self.config.provider, ) self._client = InferenceClient( model=self.config.model_id, # bind model at client level token=token, provider=self.config.provider, ) logger.info("InferenceClient ready | provider=%s | model=%s", self.config.provider, self.config.model_id) return self._client