Spaces:

ahanbose
/

voiceAI

Sleeping

File size: 13,459 Bytes

"""
modules/llm_backbone.py
──────────────────────────────────────────────────────────────────────────────
VoiceVerse Pro — LLM Script Generation Layer

Model  : mistralai/Mistral-7B-Instruct-v0.2  (default — see DEFAULT_MODEL)
         Swap via LLMConfig.model_id for any HF-hosted chat model.
Backend: huggingface_hub.InferenceClient; the provider is taken from
         LLMConfig.provider (default "auto").
         Setting provider="hf-inference" forces HF's own serverless inference
         endpoint — avoids third-party providers (e.g. Together) that
         independently deprecate models and return 410 Gone errors.
Format : ChatCompletion messages API (system + user roles)

WHY NOT HuggingFaceEndpoint?
  langchain-huggingface's HuggingFaceEndpoint internally calls
  InferenceClient.post(), which was REMOVED in huggingface_hub ≥ 0.26.
  Using InferenceClient.chat_completion() directly is the stable 2026 path.

DESIGN RULES:
  - The LLM NEVER generates without retrieved context.
  - Context is injected verbatim into every prompt via the user message.
  - Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged
    dialogue (podcast), depending on output_mode.
  - Temperature, max_new_tokens are runtime-configurable.
"""

from __future__ import annotations

import logging
import os
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


# ──────────────────────────────────────────────────────────────────────────────
# Supported model presets (shown in sidebar dropdown)
# ──────────────────────────────────────────────────────────────────────────────

# Model ids offered in the sidebar dropdown; the first entry is the default.
SUPPORTED_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",  # compact, capable
]

DEFAULT_MODEL = SUPPORTED_MODELS[0]


# ──────────────────────────────────────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────────────────────────────────────

@dataclass
class LLMConfig:
    """
    Runtime-tunable LLM parameters.

    LLMBackbone reads ``max_new_tokens`` and ``temperature`` on every
    generation call; ``model_id``, ``hf_token`` and ``provider`` are read
    once, when the inference client is first created.
    """
    # Hugging Face model id used for chat completion.
    model_id: str = DEFAULT_MODEL
    # Upper bound on generated tokens (sent as ``max_tokens``).
    max_new_tokens: int = 1024
    # Sampling temperature passed through to the chat-completion call.
    temperature: float = 0.65
    # API token; when None, LLMBackbone falls back to the
    # HUGGINGFACEHUB_API_TOKEN environment variable.
    hf_token: Optional[str] = None
    # Inference provider handed to InferenceClient.
    # NOTE: the default "auto" does NOT pin a provider. Set this to
    # "hf-inference" to force HF's own serverless inference — avoids
    # Together/other providers that deprecate models independently of HF's
    # model hub.
    provider: str = "auto"


# ──────────────────────────────────────────────────────────────────────────────
# Prompt templates
# ──────────────────────────────────────────────────────────────────────────────

# System-role message used for the single-voice transcript mode.
SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write in a warm, engaging, conversational spoken-English style.
No markdown, no bullet points, no headers — pure spoken prose only.
The script will be read aloud by a TTS engine."""

# User-role message template for transcript mode. Filled with str.format();
# placeholders: {context}, {task_description}, {target_words}.
USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────

TASK:
{task_description}

FORMAT REQUIREMENTS:
• Open with a compelling hook (1–2 sentences).
• Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
• Close with a memorable takeaway or question to the listener.
• No markdown. No lists. No headers. Pure spoken prose.
• Target length: {target_words} words."""


# ── Podcast (two-speaker) prompts ─────────────────────────────────────────────

# System-role message used for the two-speaker podcast mode; it fixes the
# "[HOST]" / "[GUEST]" line-prefix convention the model is asked to follow.
PODCAST_SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class podcast scriptwriter.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write a natural back-and-forth dialogue between two speakers:
  HOST  — female, warm and inquisitive, guides the conversation
  GUEST — male, knowledgeable and enthusiastic, elaborates on topics
Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text.
No markdown, no stage directions, no descriptions — only spoken dialogue lines.
The script will be read aloud by a TTS engine with two distinct voices."""

# User-role message template for podcast mode. Filled with str.format();
# placeholders: {context}, {task_description}, {target_words}.
PODCAST_USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────

TASK:
{task_description}

FORMAT REQUIREMENTS (STRICTLY FOLLOW):
• Every line must start with [HOST] or [GUEST] followed by their spoken words.
• Alternate naturally between HOST and GUEST. Aim for 8–16 exchanges.
• HOST opens and closes the episode.
• Draw ALL facts ONLY from the context above.
• No markdown. No stage directions. No headers. Only dialogue lines.
• Target total length: {target_words} words of dialogue.

Example format:
[HOST] Welcome to VoiceVerse. Today we're diving into something fascinating.
[GUEST] Thanks for having me. I've been looking forward to this conversation.
[HOST] Let's start with the basics. What should our listeners know first?
[GUEST] Great question. The most important thing to understand is..."""


# ──────────────────────────────────────────────────────────────────────────────
# LLM Backbone
# ──────────────────────────────────────────────────────────────────────────────

class LLMBackbone:
    """
    Generate grounded, spoken-style scripts by calling
    huggingface_hub.InferenceClient.chat_completion().

    The inference provider is taken from ``LLMConfig.provider``; this class
    does not hard-code one. Set that field to "hf-inference" to force HF's
    own serverless endpoint and avoid third-party providers that
    independently deprecate models.

    Supports two output modes:
      - Transcript: plain spoken prose
      - Podcast:    [HOST]/[GUEST] tagged dialogue for dual-voice TTS
    """

    # output_mode value that selects the two-speaker prompts; every other
    # value falls back to the single-voice transcript prompts.
    _PODCAST_MODE = "Podcast (2 Speakers)"

    def __init__(self, config: Optional[LLMConfig] = None) -> None:
        """
        Store the configuration; the inference client is created lazily.

        Args:
            config: Runtime parameters. A default LLMConfig is used when
                    omitted.
        """
        self.config = config or LLMConfig()
        self._client = None  # created on first use by _get_client()
        logger.info(
            "LLMBackbone initialised | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )

    # ── Public API ─────────────────────────────────────────────────────────────

    def generate_script(
        self,
        context_text: str,
        task_description: str,
        target_words: int = 400,
        output_mode: str = "Audio Transcript",   # matches OutputMode.value
    ) -> str:
        """
        Generate a grounded script.

        Args:
            context_text:     Retrieved context from RAGEngine (REQUIRED).
            task_description: High-level user instruction for the script.
            target_words:     Approximate word count target.
            output_mode:      "Audio Transcript" or "Podcast (2 Speakers)".
                              Any other value is treated as transcript mode.

        Returns:
            Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes.

        Raises:
            ValueError:   ``context_text`` is empty or whitespace-only.
            RuntimeError: The inference call failed, or the model returned no
                          text content. Client-creation problems (missing
                          token, missing huggingface_hub) also surface here,
                          because the client is created inside the guarded
                          call.
        """
        if not context_text or not context_text.strip():
            raise ValueError(
                "context_text must not be empty. "
                "The LLM requires retrieved context to generate."
            )

        is_podcast = output_mode == self._PODCAST_MODE
        messages = self._build_messages(
            context_text, task_description, target_words, is_podcast
        )

        logger.info(
            "Calling chat_completion | model=%s | mode=%s | ~%d context chars",
            self.config.model_id,
            output_mode,
            len(context_text),
        )

        try:
            response = self._get_client().chat_completion(
                messages=messages,
                max_tokens=self.config.max_new_tokens,
                temperature=self.config.temperature,
            )
            raw_output: Optional[str] = response.choices[0].message.content
        except Exception as exc:
            logger.error("InferenceClient call failed: %s", exc)
            raise RuntimeError(f"LLM generation failed: {exc}") from exc

        if raw_output is None:
            # A completion can come back without text content; report it
            # through the same failure path instead of crashing with an
            # AttributeError during post-processing.
            logger.error("InferenceClient returned no message content")
            raise RuntimeError(
                "LLM generation failed: model returned no text content."
            )

        script = self._post_process(raw_output)
        logger.info(
            "Script generated | %d words | podcast=%s",
            len(script.split()),
            is_podcast,
        )
        return script

    # ── Message builder ────────────────────────────────────────────────────────

    def _build_messages(
        self,
        context: str,
        task: str,
        target_words: int,
        is_podcast: bool = False,
    ) -> list[dict]:
        """
        Build the chat messages (system + user) for one generation call.

        Args:
            context:      Retrieved context, inserted verbatim into the user
                          message.
            task:         Task description inserted into the user message.
            target_words: Word-count target inserted into the user message.
            is_podcast:   Use the podcast prompt pair instead of the
                          transcript pair.

        Returns:
            Two role/content dicts: the system message, then the user message.
        """
        if is_podcast:
            system, template = PODCAST_SYSTEM_PROMPT, PODCAST_USER_TEMPLATE
        else:
            system, template = SYSTEM_PROMPT, USER_TEMPLATE

        user_content = template.format(
            context=context,
            task_description=task,
            target_words=target_words,
        )
        return [
            {"role": "system", "content": system},
            {"role": "user",   "content": user_content},
        ]

    # ── Post-processing ────────────────────────────────────────────────────────

    @staticmethod
    def _post_process(raw: Optional[str]) -> str:
        """
        Clean raw model output for downstream use.

        Removes instruction-template markers, strips trailing whitespace from
        every line, caps runs of blank lines at two, and trims the result.

        Args:
            raw: Text returned by the model. ``None`` or an empty string
                 yields an empty string.

        Returns:
            The cleaned script text.
        """
        if not raw:
            return ""

        # Instruction-template markers (Llama-2 / Mistral style) that must
        # never reach the TTS engine.
        for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
            raw = raw.replace(tag, "")

        cleaned: list[str] = []
        blank_run = 0
        for line in raw.splitlines():
            line = line.rstrip()
            if line:
                blank_run = 0
                cleaned.append(line)
                continue
            # Whitespace-only line: keep at most two in a row.
            blank_run += 1
            if blank_run <= 2:
                cleaned.append("")
        return "\n".join(cleaned).strip()

    # ── Lazy client init ───────────────────────────────────────────────────────

    def _get_client(self):
        """
        Return the cached huggingface_hub.InferenceClient, creating it on
        first use.

        The client is built with the model, token and provider found in
        ``self.config`` at that moment and is then reused; later changes to
        those config fields are not picked up by this instance.

        The token comes from ``config.hf_token`` or, failing that, the
        HUGGINGFACEHUB_API_TOKEN environment variable.

        Raises:
            EnvironmentError: No token is available from either source.
        """
        if self._client is not None:
            return self._client

        # Imported lazily so the module can be imported without
        # huggingface_hub being loaded up front.
        from huggingface_hub import InferenceClient

        token = self.config.hf_token or os.getenv("HUGGINGFACEHUB_API_TOKEN")
        if not token:
            raise EnvironmentError(
                "Hugging Face API token not found. "
                "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
                "or paste it in the sidebar."
            )

        logger.info(
            "Initialising InferenceClient | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )
        self._client = InferenceClient(
            model=self.config.model_id,   # bind model at client level
            token=token,
            provider=self.config.provider,
        )
        logger.info(
            "InferenceClient ready | provider=%s | model=%s",
            self.config.provider,
            self.config.model_id,
        )
        return self._client