ahanbose commited on
Commit
dc2a587
Β·
verified Β·
1 Parent(s): ea38730

Update src/modules/llm_backbone.py

Browse files
Files changed (1) hide show
  1. src/modules/llm_backbone.py +286 -239
src/modules/llm_backbone.py CHANGED
@@ -1,240 +1,287 @@
1
- """
2
- modules/llm_backbone.py
3
- ──────────────────────────────────────────────────────────────────────────────
4
- VoiceVerse Pro β€” LLM Script Generation Layer
5
-
6
- Model : meta-llama/Llama-3.1-8B-Instruct (default β€” widely supported 2026)
7
- Swap via LLMConfig.model_id for any HF-hosted chat model.
8
- Backend: huggingface_hub.InferenceClient with provider="auto"
9
- "auto" lets HF route to whichever provider currently serves the model,
10
- avoiding 410 Gone errors from deprecated provider/model combinations.
11
- Format : ChatCompletion messages API (system + user roles)
12
-
13
- WHY NOT HuggingFaceEndpoint?
14
- langchain-huggingface's HuggingFaceEndpoint internally calls
15
- InferenceClient.post(), which was REMOVED in huggingface_hub β‰₯ 0.26.
16
- Using InferenceClient.chat_completion() directly is the stable 2026 path.
17
-
18
- DESIGN RULES:
19
- - The LLM NEVER generates without retrieved context.
20
- - Context is injected verbatim into every prompt via the user message.
21
- - Output is a structured, spoken-style podcast/narration script.
22
- - Temperature, max_new_tokens are runtime-configurable.
23
- """
24
-
25
- from __future__ import annotations
26
-
27
- import logging
28
- import os
29
- from dataclasses import dataclass
30
- from typing import Optional
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
-
35
- # ──────────────────────────────────────────────────────────────────────────────
36
- # Supported model presets (shown in sidebar dropdown)
37
- # ──────────────────────────────────────────────────────────────────────────────
38
-
39
- SUPPORTED_MODELS = [
40
- "meta-llama/Llama-3.1-8B-Instruct", # default β€” fast, free tier
41
- "Qwen/Qwen3-Coder-Next-GGUF",
42
- "meta-llama/Llama-3.3-70B-Instruct", # higher quality
43
- "mistralai/Mistral-7B-Instruct-v0.3", # lightweight Mistral
44
- "mistralai/Mistral-7B-Instruct-v0.2", # older Mistral variant
45
- "Qwen/Qwen2.5-72B-Instruct", # strong alternative
46
- "microsoft/Phi-4-reasoning-plus", # compact, capable
47
- ]
48
-
49
- DEFAULT_MODEL = SUPPORTED_MODELS[0]
50
-
51
-
52
- # ──────────────────────────────────────────────────────────────────────────────
53
- # Configuration
54
- # ──────────────────────────────────────────────────────────────────────────────
55
-
56
- @dataclass
57
- class LLMConfig:
58
- """Runtime-tunable LLM parameters."""
59
- model_id: str = DEFAULT_MODEL
60
- max_new_tokens: int = 1024
61
- temperature: float = 0.65
62
- hf_token: Optional[str] = None
63
- # Force HF's own serverless inference β€” avoids Together/other providers
64
- # that deprecate models independently of HF's model hub.
65
- provider: str = "auto"
66
- task: str = "none"
67
-
68
- # ──────────────────────────────────────────────────────────────────────────────
69
- # Prompt templates
70
- # ──────────────────────────────────────────────────────────────────────────────
71
-
72
- SYSTEM_PROMPT = """\
73
- You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
74
- Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
75
- You MUST NOT introduce information not present in that context.
76
- Write in a warm, engaging, conversational spoken-English style.
77
- No markdown, no bullet points, no headers β€” pure spoken prose only.
78
- The script will be read aloud by a TTS engine."""
79
-
80
- USER_TEMPLATE = """\
81
- ─────────────────────────────────────────────────────────────
82
- RETRIEVED CONTEXT (your SOLE factual source):
83
- {context}
84
- ─────────────────────────────────────────────────────────────
85
-
86
- TASK:
87
- {task_description}
88
-
89
- FORMAT REQUIREMENTS:
90
- β€’ Open with a compelling hook (1–2 sentences).
91
- β€’ Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
92
- β€’ Close with a memorable takeaway or question to the listener.
93
- β€’ No markdown. No lists. No headers. Pure spoken prose.
94
- β€’ Target length: {target_words} words."""
95
-
96
-
97
- # ──────────────────────────────────────────────────────────────────────────────
98
- # LLM Backbone
99
- # ──────────────────────────────────────────────────────────────────────────────
100
-
101
- class LLMBackbone:
102
- """
103
- Calls huggingface_hub.InferenceClient.chat_completion() to generate
104
- grounded spoken-style scripts.
105
-
106
- provider="auto" instructs HF's inference router to automatically select
107
- the best available provider for the model β€” this prevents 410 Gone errors
108
- caused by a specific provider deprecating a model.
109
- """
110
-
111
- def __init__(self, config: Optional[LLMConfig] = None) -> None:
112
- self.config = config or LLMConfig()
113
- self._client = None
114
- logger.info(
115
- "LLMBackbone initialised | model=%s | provider=%s",
116
- self.config.model_id,
117
- self.config.provider,
118
- )
119
-
120
- # ── Public API ─────────────────────────────────────────────────────────────
121
-
122
- def generate_script(
123
- self,
124
- context_text: str,
125
- task_description: str,
126
- target_words: int = 400,
127
- ) -> str:
128
- """
129
- Generate a grounded spoken-style script.
130
-
131
- Args:
132
- context_text: Retrieved context from RAGEngine (REQUIRED).
133
- task_description: High-level user instruction for the script.
134
- target_words: Approximate word count target.
135
-
136
- Returns:
137
- Clean script text (no markdown artefacts).
138
-
139
- Raises:
140
- ValueError: If context_text is empty (anti-hallucination guard).
141
- RuntimeError: If the HF Inference API call fails.
142
- """
143
- if not context_text or not context_text.strip():
144
- raise ValueError(
145
- "context_text must not be empty. "
146
- "The LLM requires retrieved context to generate."
147
- )
148
-
149
- messages = self._build_messages(context_text, task_description, target_words)
150
-
151
- logger.info(
152
- "Calling chat_completion | model=%s | provider=%s | ~%d context chars",
153
- self.config.model_id,
154
- self.config.provider,
155
- len(context_text),
156
- )
157
-
158
- try:
159
- response = self._get_client().chat_completion(
160
- messages=messages,
161
- max_tokens=self.config.max_new_tokens,
162
- temperature=self.config.temperature,
163
- )
164
- raw_output: str = response.choices[0].message.content
165
- except Exception as exc:
166
- logger.error("InferenceClient call failed: %s", exc)
167
- raise RuntimeError(f"LLM generation failed: {exc}") from exc
168
-
169
- script = self._post_process(raw_output)
170
- logger.info("Script generated | %d words", len(script.split()))
171
- return script
172
-
173
- # ── Message builder ────────────────────────────────────────────────────────
174
-
175
- def _build_messages(
176
- self,
177
- context: str,
178
- task: str,
179
- target_words: int,
180
- ) -> list[dict]:
181
- user_content = USER_TEMPLATE.format(
182
- context=context,
183
- task_description=task,
184
- target_words=target_words,
185
- )
186
- return [
187
- {"role": "system", "content": SYSTEM_PROMPT},
188
- {"role": "user", "content": user_content},
189
- ]
190
-
191
- # ── Post-processing ────────────────────────────────────────────────────────
192
-
193
- @staticmethod
194
- def _post_process(raw: str) -> str:
195
- for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
196
- raw = raw.replace(tag, "")
197
- lines = [line.rstrip() for line in raw.splitlines()]
198
- cleaned: list[str] = []
199
- blank_count = 0
200
- for line in lines:
201
- if not line.strip():
202
- blank_count += 1
203
- if blank_count <= 2:
204
- cleaned.append("")
205
- else:
206
- blank_count = 0
207
- cleaned.append(line)
208
- return "\n".join(cleaned).strip()
209
-
210
- # ── Lazy client init ───────────────────────────────────────���───────────────
211
-
212
- def _get_client(self):
213
- """
214
- Lazy-load InferenceClient with provider="auto".
215
- "auto" = HF inference router picks the fastest available provider
216
- for the requested model, avoiding stale provider/model 410 errors.
217
- """
218
- if self._client is None:
219
- from huggingface_hub import InferenceClient
220
-
221
- token = self.config.hf_token or os.getenv("HUGGINGFACEHUB_API_TOKEN")
222
- if not token:
223
- raise EnvironmentError(
224
- "Hugging Face API token not found. "
225
- "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
226
- "or paste it in the sidebar."
227
- )
228
-
229
- logger.info(
230
- "Initialising InferenceClient | model=%s | provider=%s",
231
- self.config.model_id,
232
- self.config.provider,
233
- )
234
- self._client = InferenceClient(
235
- model=self.config.model_id, # bind model at client level
236
- token=token,
237
- provider=self.config.provider,
238
- )
239
- logger.info("InferenceClient ready | provider=%s | model=%s", self.config.provider, self.config.model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  return self._client
 
1
+ """
2
+ modules/llm_backbone.py
3
+ ──────────────────────────────────────────────────────────────────────────────
4
+ VoiceVerse Pro β€” LLM Script Generation Layer
5
+
6
+ Model : meta-llama/Llama-3.1-8B-Instruct (default β€” widely supported 2026)
7
+ Swap via LLMConfig.model_id for any HF-hosted chat model.
8
+ Backend: huggingface_hub.InferenceClient with provider="auto"
9
+ "auto" lets HF's inference router pick whichever provider currently
10
+ serves the model, avoiding 410 Gone errors from provider/model
11
+ combinations that a single provider has deprecated.
12
+ Format : ChatCompletion messages API (system + user roles)
13
+
14
+ WHY NOT HuggingFaceEndpoint?
15
+ langchain-huggingface's HuggingFaceEndpoint internally calls
16
+ InferenceClient.post(), which was REMOVED in huggingface_hub β‰₯ 0.26.
17
+ Using InferenceClient.chat_completion() directly is the stable 2026 path.
18
+
19
+ DESIGN RULES:
20
+ - The LLM NEVER generates without retrieved context.
21
+ - Context is injected verbatim into every prompt via the user message.
22
+ - Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged
23
+ dialogue (podcast), depending on output_mode.
24
+ - Temperature, max_new_tokens are runtime-configurable.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ import os
31
+ from dataclasses import dataclass
32
+ from typing import Optional
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ # ──────────────────────────────────────────────────────────────────────────────
38
+ # Supported model presets (shown in sidebar dropdown)
39
+ # ──────────────────────────────────────────────────────────────────────────────
40
+
41
+ SUPPORTED_MODELS = [
42
+ "mistralai/Mistral-7B-Instruct-v0.2", # lightweight Mistral — broadly served on HF inference
43
+ ]
44
+
45
+ DEFAULT_MODEL = SUPPORTED_MODELS[0]
46
+
47
+
48
+ # ──────────────────────────────────────────────────────────────────────────────
49
+ # Configuration
50
+ # ──────────────────────────────────────────────────────────────────────────────
51
+
52
@dataclass
class LLMConfig:
    """Runtime-tunable LLM parameters."""
    model_id: str = DEFAULT_MODEL        # any HF-hosted chat model id
    max_new_tokens: int = 1024           # passed to chat_completion as max_tokens
    temperature: float = 0.65            # sampling temperature
    hf_token: Optional[str] = None       # falls back to HUGGINGFACEHUB_API_TOKEN env var
    # "auto" lets HF's inference router pick whichever provider currently
    # serves the model, avoiding 410 Gone errors from provider/model
    # combinations that a single provider has deprecated.
    provider: str = "auto"
62
+
63
+
64
+ # ──────────────────────────────────────────────────────────────────────────────
65
+ # Prompt templates
66
+ # ──────────────────────────────────────────────────────────────────────────────
67
+
68
+ SYSTEM_PROMPT = """\
69
+ You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
70
+ Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
71
+ You MUST NOT introduce information not present in that context.
72
+ Write in a warm, engaging, conversational spoken-English style.
73
+ No markdown, no bullet points, no headers β€” pure spoken prose only.
74
+ The script will be read aloud by a TTS engine."""
75
+
76
+ USER_TEMPLATE = """\
77
+ ─────────────────────────────────────────────────────────────
78
+ RETRIEVED CONTEXT (your SOLE factual source):
79
+ {context}
80
+ ─────────────────────────────────────────────────────────────
81
+
82
+ TASK:
83
+ {task_description}
84
+
85
+ FORMAT REQUIREMENTS:
86
+ β€’ Open with a compelling hook (1–2 sentences).
87
+ β€’ Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
88
+ β€’ Close with a memorable takeaway or question to the listener.
89
+ β€’ No markdown. No lists. No headers. Pure spoken prose.
90
+ β€’ Target length: {target_words} words."""
91
+
92
+
93
+ # ── Podcast (two-speaker) prompts ─────────────────────────────────────────────
94
+
95
+ PODCAST_SYSTEM_PROMPT = """\
96
+ You are VoiceVerse, a world-class podcast scriptwriter.
97
+ Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
98
+ You MUST NOT introduce information not present in that context.
99
+ Write a natural back-and-forth dialogue between two speakers:
100
+ HOST β€” female, warm and inquisitive, guides the conversation
101
+ GUEST β€” male, knowledgeable and enthusiastic, elaborates on topics
102
+ Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text.
103
+ No markdown, no stage directions, no descriptions β€” only spoken dialogue lines.
104
+ The script will be read aloud by a TTS engine with two distinct voices."""
105
+
106
+ PODCAST_USER_TEMPLATE = """\
107
+ ─────────────────────────────────────────────────────────────
108
+ RETRIEVED CONTEXT (your SOLE factual source):
109
+ {context}
110
+ ─────────────────────────────────────────────────────────────
111
+
112
+ TASK:
113
+ {task_description}
114
+
115
+ FORMAT REQUIREMENTS (STRICTLY FOLLOW):
116
+ β€’ Every line must start with [HOST] or [GUEST] followed by their spoken words.
117
+ β€’ Alternate naturally between HOST and GUEST. Aim for 8–16 exchanges.
118
+ β€’ HOST opens and closes the episode.
119
+ β€’ Draw ALL facts ONLY from the context above.
120
+ β€’ No markdown. No stage directions. No headers. Only dialogue lines.
121
+ β€’ Target total length: {target_words} words of dialogue.
122
+
123
+ Example format:
124
+ [HOST] Welcome to VoiceVerse. Today we're diving into something fascinating.
125
+ [GUEST] Thanks for having me. I've been looking forward to this conversation.
126
+ [HOST] Let's start with the basics. What should our listeners know first?
127
+ [GUEST] Great question. The most important thing to understand is..."""
128
+
129
+
130
+ # ──────────────────────────────────────────────────────────────────────────────
131
+ # LLM Backbone
132
+ # ──────────────────────────────────────────────────────────────────────────────
133
+
134
class LLMBackbone:
    """
    Generates grounded spoken-style scripts via
    huggingface_hub.InferenceClient.chat_completion().

    provider="auto" (the LLMConfig default) lets HF's inference router pick
    whichever provider currently serves the requested model, which avoids
    410 Gone errors caused by a single provider deprecating a model.

    Two output modes are supported:
      - Transcript: plain spoken prose
      - Podcast: [HOST]/[GUEST] tagged dialogue for dual-voice TTS
    """

    def __init__(self, config: Optional[LLMConfig] = None) -> None:
        self.config = config or LLMConfig()
        self._client = None  # lazily created by _get_client()
        logger.info(
            "LLMBackbone initialised | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )

    # ── Public API ─────────────────────────────────────────────────────────────

    def generate_script(
        self,
        context_text: str,
        task_description: str,
        target_words: int = 400,
        output_mode: str = "Audio Transcript",  # matches OutputMode.value
    ) -> str:
        """
        Generate a grounded script.

        Args:
            context_text: Retrieved context from RAGEngine (REQUIRED).
            task_description: High-level user instruction for the script.
            target_words: Approximate word count target.
            output_mode: "Audio Transcript" or "Podcast (2 Speakers)".

        Returns:
            Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes.

        Raises:
            ValueError: If context_text is empty (anti-hallucination guard).
            RuntimeError: If the HF Inference API call fails.
        """
        if not context_text or not context_text.strip():
            raise ValueError(
                "context_text must not be empty. "
                "The LLM requires retrieved context to generate."
            )

        is_podcast = output_mode == "Podcast (2 Speakers)"
        messages = self._build_messages(
            context_text, task_description, target_words, is_podcast
        )

        logger.info(
            "Calling chat_completion | model=%s | mode=%s | ~%d context chars",
            self.config.model_id,
            output_mode,
            len(context_text),
        )

        try:
            response = self._get_client().chat_completion(
                messages=messages,
                max_tokens=self.config.max_new_tokens,
                temperature=self.config.temperature,
            )
            # message.content may be None on an empty completion — normalise
            # to "" so _post_process never receives None.
            raw_output: str = response.choices[0].message.content or ""
        except Exception as exc:
            logger.error("InferenceClient call failed: %s", exc)
            raise RuntimeError(f"LLM generation failed: {exc}") from exc

        script = self._post_process(raw_output)
        logger.info("Script generated | %d words | podcast=%s", len(script.split()), is_podcast)
        return script

    # ── Message builder ────────────────────────────────────────────────────────

    def _build_messages(
        self,
        context: str,
        task: str,
        target_words: int,
        is_podcast: bool = False,
    ) -> list[dict]:
        """Assemble the two-message (system + user) chat payload."""
        if is_podcast:
            system = PODCAST_SYSTEM_PROMPT
            user_content = PODCAST_USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        else:
            system = SYSTEM_PROMPT
            user_content = USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        return [
            {"role": "system", "content": system},
            {"role": "user", "content": user_content},
        ]

    # ── Post-processing ────────────────────────────────────────────────────────

    @staticmethod
    def _post_process(raw: str) -> str:
        """Strip chat-template artefacts and tidy whitespace.

        Removes known instruction/sentinel tags, right-strips every line,
        collapses runs of blank lines to at most two, and strips the result.
        """
        for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
            raw = raw.replace(tag, "")
        lines = [line.rstrip() for line in raw.splitlines()]
        cleaned: list[str] = []
        blank_count = 0
        for line in lines:
            if not line.strip():
                blank_count += 1
                if blank_count <= 2:
                    cleaned.append("")
            else:
                blank_count = 0
                cleaned.append(line)
        return "\n".join(cleaned).strip()

    # ── Lazy client init ───────────────────────────────────────────────────────

    def _get_client(self):
        """
        Lazy-load huggingface_hub.InferenceClient, bound to the configured model.

        The provider comes from LLMConfig (default "auto": HF's inference
        router selects an available provider for the model, avoiding stale
        provider/model 410 Gone errors).

        Raises:
            EnvironmentError: If no HF API token is configured or in the env.
        """
        if self._client is None:
            from huggingface_hub import InferenceClient

            token = self.config.hf_token or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            if not token:
                raise EnvironmentError(
                    "Hugging Face API token not found. "
                    "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
                    "or paste it in the sidebar."
                )

            logger.info(
                "Initialising InferenceClient | model=%s | provider=%s",
                self.config.model_id,
                self.config.provider,
            )
            self._client = InferenceClient(
                model=self.config.model_id,  # bind model at client level
                token=token,
                provider=self.config.provider,
            )
            logger.info(
                "InferenceClient ready | provider=%s | model=%s",
                self.config.provider,
                self.config.model_id,
            )
        return self._client