| | """ |
| | modules/llm_backbone.py |
| | ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| | VoiceVerse Pro β LLM Script Generation Layer |
| | |
Model  : mistralai/Mistral-7B-Instruct-v0.2 (default — see SUPPORTED_MODELS)
| | Swap via LLMConfig.model_id for any HF-hosted chat model. |
| | Backend: huggingface_hub.InferenceClient with provider="hf-inference" |
| | Forces HF's own serverless inference endpoint β avoids third-party |
| | providers (e.g. Together) that independently deprecate models and |
| | return 410 Gone errors. |
| | Format : ChatCompletion messages API (system + user roles) |
| | |
| | WHY NOT HuggingFaceEndpoint? |
| | langchain-huggingface's HuggingFaceEndpoint internally calls |
| | InferenceClient.post(), which was REMOVED in huggingface_hub β₯ 0.26. |
| | Using InferenceClient.chat_completion() directly is the stable 2026 path. |
| | |
| | DESIGN RULES: |
| | - The LLM NEVER generates without retrieved context. |
| | - Context is injected verbatim into every prompt via the user message. |
| | - Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged |
| | dialogue (podcast), depending on output_mode. |
| | - Temperature, max_new_tokens are runtime-configurable. |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | import logging |
| | import os |
| | from dataclasses import dataclass |
| | from typing import Optional |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
| | |
| | |
| | |
| |
|
# HF-hosted chat/instruct models this app is known to work with on the
# serverless inference endpoint. Add entries here to expose more choices.
SUPPORTED_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
]

# Default for LLMConfig.model_id — always the first supported entry.
DEFAULT_MODEL = SUPPORTED_MODELS[0]
| |
|
| |
|
| | |
| | |
| | |
| |
|
@dataclass
class LLMConfig:
    """Runtime-tunable LLM parameters.

    Attributes:
        model_id: HF model repo id used for chat completion.
        max_new_tokens: Upper bound on tokens generated per call.
        temperature: Sampling temperature (higher = more varied output).
        hf_token: Explicit HF API token; when None, environment variables
            are consulted at client-creation time.
        provider: Inference provider name passed to InferenceClient.
    """
    model_id: str = DEFAULT_MODEL
    max_new_tokens: int = 1024
    temperature: float = 0.65
    hf_token: Optional[str] = None

    # The module's design notes require HF's own serverless endpoint
    # ("hf-inference") to avoid third-party providers that independently
    # deprecate models and return 410 Gone. The previous default of "auto"
    # contradicted that design, so it is pinned here.
    provider: str = "hf-inference"
| |
|
| |
|
| | |
| | |
| | |
| |
|
# System role for single-voice transcript mode: constrains the model to the
# retrieved context and to TTS-friendly spoken prose (no markdown).
SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write in a warm, engaging, conversational spoken-English style.
No markdown, no bullet points, no headers β pure spoken prose only.
The script will be read aloud by a TTS engine."""

# User-message template for transcript mode. Placeholders filled by
# _build_messages(): {context}, {task_description}, {target_words}.
USER_TEMPLATE = """\
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
RETRIEVED CONTEXT (your SOLE factual source):
{context}
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ

TASK:
{task_description}

FORMAT REQUIREMENTS:
β’ Open with a compelling hook (1β2 sentences).
β’ Develop the topic across 3β5 natural paragraphs drawn ONLY from the context.
β’ Close with a memorable takeaway or question to the listener.
β’ No markdown. No lists. No headers. Pure spoken prose.
β’ Target length: {target_words} words."""
| |
|
| |
|
| | |
| |
|
# System role for two-speaker podcast mode: same grounding rules as
# SYSTEM_PROMPT, plus the [HOST]/[GUEST] line-tag contract that the
# dual-voice TTS stage depends on for speaker routing.
PODCAST_SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class podcast scriptwriter.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write a natural back-and-forth dialogue between two speakers:
HOST β female, warm and inquisitive, guides the conversation
GUEST β male, knowledgeable and enthusiastic, elaborates on topics
Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text.
No markdown, no stage directions, no descriptions β only spoken dialogue lines.
The script will be read aloud by a TTS engine with two distinct voices."""

# User-message template for podcast mode. Placeholders filled by
# _build_messages(): {context}, {task_description}, {target_words}.
# Includes a few-shot example of the exact tag format expected.
PODCAST_USER_TEMPLATE = """\
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
RETRIEVED CONTEXT (your SOLE factual source):
{context}
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ

TASK:
{task_description}

FORMAT REQUIREMENTS (STRICTLY FOLLOW):
β’ Every line must start with [HOST] or [GUEST] followed by their spoken words.
β’ Alternate naturally between HOST and GUEST. Aim for 8β16 exchanges.
β’ HOST opens and closes the episode.
β’ Draw ALL facts ONLY from the context above.
β’ No markdown. No stage directions. No headers. Only dialogue lines.
β’ Target total length: {target_words} words of dialogue.

Example format:
[HOST] Welcome to VoiceVerse. Today we're diving into something fascinating.
[GUEST] Thanks for having me. I've been looking forward to this conversation.
[HOST] Let's start with the basics. What should our listeners know first?
[GUEST] Great question. The most important thing to understand is..."""
| |
|
| |
|
| | |
| | |
| | |
| |
|
class LLMBackbone:
    """
    Grounded script generator built on huggingface_hub.InferenceClient.

    Calls InferenceClient.chat_completion() directly, which remains available
    on huggingface_hub >= 0.26 (where the old ``post()`` API was removed).

    Supports two output modes:
      - Transcript: plain spoken prose
      - Podcast:    [HOST]/[GUEST] tagged dialogue for dual-voice TTS
    """

    def __init__(self, config: Optional[LLMConfig] = None) -> None:
        """Store configuration; the HTTP client is created lazily on first use."""
        self.config = config or LLMConfig()
        self._client = None  # huggingface_hub.InferenceClient, built by _get_client()
        logger.info(
            "LLMBackbone initialised | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )

    def generate_script(
        self,
        context_text: str,
        task_description: str,
        target_words: int = 400,
        output_mode: str = "Audio Transcript",
    ) -> str:
        """
        Generate a grounded script.

        Args:
            context_text: Retrieved context from RAGEngine (REQUIRED).
            task_description: High-level user instruction for the script.
            target_words: Approximate word count target.
            output_mode: "Audio Transcript" or "Podcast (2 Speakers)".

        Returns:
            Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes.

        Raises:
            ValueError: if context_text is empty (grounding is mandatory).
            RuntimeError: if the inference call fails or returns no text.
        """
        if not context_text or not context_text.strip():
            raise ValueError(
                "context_text must not be empty. "
                "The LLM requires retrieved context to generate."
            )

        is_podcast = output_mode == "Podcast (2 Speakers)"
        messages = self._build_messages(
            context_text, task_description, target_words, is_podcast
        )

        logger.info(
            "Calling chat_completion | model=%s | mode=%s | ~%d context chars",
            self.config.model_id,
            output_mode,
            len(context_text),
        )

        try:
            response = self._get_client().chat_completion(
                messages=messages,
                max_tokens=self.config.max_new_tokens,
                temperature=self.config.temperature,
            )
        except Exception as exc:
            logger.error("InferenceClient call failed: %s", exc)
            raise RuntimeError(f"LLM generation failed: {exc}") from exc

        # In the OpenAI-style response schema, choices may be empty and
        # message.content may be None; fail loudly rather than crash later
        # in _post_process on a None value.
        raw_output = (
            response.choices[0].message.content if response.choices else None
        )
        if raw_output is None:
            raise RuntimeError("LLM generation failed: empty response from model")

        script = self._post_process(raw_output)
        logger.info(
            "Script generated | %d words | podcast=%s",
            len(script.split()),
            is_podcast,
        )
        return script

    def _build_messages(
        self,
        context: str,
        task: str,
        target_words: int,
        is_podcast: bool = False,
    ) -> list[dict]:
        """
        Assemble the ChatCompletion message list (system + user roles).

        Selects the transcript or podcast prompt pair based on *is_podcast*
        and injects the retrieved context verbatim into the user message.
        """
        if is_podcast:
            system = PODCAST_SYSTEM_PROMPT
            user_content = PODCAST_USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        else:
            system = SYSTEM_PROMPT
            user_content = USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        return [
            {"role": "system", "content": system},
            {"role": "user", "content": user_content},
        ]

    @staticmethod
    def _post_process(raw: str) -> str:
        """
        Clean model output for TTS consumption.

        Strips leftover chat-template control tokens, trims trailing
        whitespace on each line, and collapses runs of blank lines down to
        at most two, then strips leading/trailing whitespace overall.
        """
        for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
            raw = raw.replace(tag, "")
        lines = [line.rstrip() for line in raw.splitlines()]
        cleaned: list[str] = []
        blank_count = 0
        for line in lines:
            if not line.strip():
                blank_count += 1
                # Allow at most two consecutive blank lines.
                if blank_count <= 2:
                    cleaned.append("")
            else:
                blank_count = 0
                cleaned.append(line)
        return "\n".join(cleaned).strip()

    def _get_client(self):
        """
        Lazily construct huggingface_hub.InferenceClient.

        The client is bound to self.config.model_id and self.config.provider
        at construction time and cached for subsequent calls. The API token
        is resolved in order from: LLMConfig.hf_token, the
        HUGGINGFACEHUB_API_TOKEN env var, the HF_TOKEN env var (the current
        official name).

        Raises:
            EnvironmentError: if no Hugging Face API token can be found.
        """
        if self._client is None:
            from huggingface_hub import InferenceClient

            # Accept both the legacy and the current official env var names.
            token = (
                self.config.hf_token
                or os.getenv("HUGGINGFACEHUB_API_TOKEN")
                or os.getenv("HF_TOKEN")
            )
            if not token:
                raise EnvironmentError(
                    "Hugging Face API token not found. "
                    "Set HUGGINGFACEHUB_API_TOKEN (or HF_TOKEN) in your .env file "
                    "or paste it in the sidebar."
                )

            logger.info(
                "Initialising InferenceClient | model=%s | provider=%s",
                self.config.model_id,
                self.config.provider,
            )
            self._client = InferenceClient(
                model=self.config.model_id,
                token=token,
                provider=self.config.provider,
            )
            logger.info(
                "InferenceClient ready | provider=%s | model=%s",
                self.config.provider,
                self.config.model_id,
            )
        return self._client