# src/modules/llm_backbone.py — VoiceVerse Pro LLM script-generation layer.
"""
modules/llm_backbone.py
──────────────────────────────────────────────────────────────────────────────
VoiceVerse Pro β€” LLM Script Generation Layer
Model : mistralai/Mistral-7B-Instruct-v0.2 (default — see SUPPORTED_MODELS)
Swap via LLMConfig.model_id for any HF-hosted chat model.
Backend: huggingface_hub.InferenceClient; the provider is set via
LLMConfig.provider (default "auto" lets huggingface_hub choose; set
provider="hf-inference" to pin HF's own serverless endpoint and avoid
third-party providers that deprecate models and return 410 Gone).
Format : ChatCompletion messages API (system + user roles)
WHY NOT HuggingFaceEndpoint?
langchain-huggingface's HuggingFaceEndpoint internally calls
InferenceClient.post(), which was REMOVED in huggingface_hub β‰₯ 0.26.
Using InferenceClient.chat_completion() directly is the stable 2026 path.
DESIGN RULES:
- The LLM NEVER generates without retrieved context.
- Context is injected verbatim into every prompt via the user message.
- Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged
dialogue (podcast), depending on output_mode.
- Temperature, max_new_tokens are runtime-configurable.
"""
from __future__ import annotations
import logging
import os
from dataclasses import dataclass
from typing import Optional
logger = logging.getLogger(__name__)
# ──────────────────────────────────────────────────────────────────────────────
# Supported model presets (shown in sidebar dropdown)
# ──────────────────────────────────────────────────────────────────────────────
SUPPORTED_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",  # compact, capable
]
# The first preset doubles as the default for LLMConfig.model_id.
DEFAULT_MODEL = SUPPORTED_MODELS[0]
# ──────────────────────────────────────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────────────────────────────────────
@dataclass
class LLMConfig:
    """Runtime-tunable LLM parameters."""
    model_id: str = DEFAULT_MODEL      # HF hub repo id of the chat model
    max_new_tokens: int = 1024         # passed to chat_completion as max_tokens
    temperature: float = 0.65          # sampling temperature
    hf_token: Optional[str] = None     # explicit token; env var used if None
    # Inference provider handed to InferenceClient. NOTE(review): despite the
    # module docstring, the default here is "auto" (huggingface_hub picks a
    # provider) — set "hf-inference" explicitly to pin HF's own serverless
    # endpoint and avoid third-party providers that return 410 Gone.
    provider: str = "auto"
# ──────────────────────────────────────────────────────────────────────────────
# Prompt templates
# ──────────────────────────────────────────────────────────────────────────────
SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write in a warm, engaging, conversational spoken-English style.
No markdown, no bullet points, no headers β€” pure spoken prose only.
The script will be read aloud by a TTS engine."""
USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────
TASK:
{task_description}
FORMAT REQUIREMENTS:
β€’ Open with a compelling hook (1–2 sentences).
β€’ Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
β€’ Close with a memorable takeaway or question to the listener.
β€’ No markdown. No lists. No headers. Pure spoken prose.
β€’ Target length: {target_words} words."""
# ── Podcast (two-speaker) prompts ─────────────────────────────────────────────
PODCAST_SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class podcast scriptwriter.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write a natural back-and-forth dialogue between two speakers:
HOST β€” female, warm and inquisitive, guides the conversation
GUEST β€” male, knowledgeable and enthusiastic, elaborates on topics
Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text.
No markdown, no stage directions, no descriptions β€” only spoken dialogue lines.
The script will be read aloud by a TTS engine with two distinct voices."""
PODCAST_USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────
TASK:
{task_description}
FORMAT REQUIREMENTS (STRICTLY FOLLOW):
β€’ Every line must start with [HOST] or [GUEST] followed by their spoken words.
β€’ Alternate naturally between HOST and GUEST. Aim for 8–16 exchanges.
β€’ HOST opens and closes the episode.
β€’ Draw ALL facts ONLY from the context above.
β€’ No markdown. No stage directions. No headers. Only dialogue lines.
β€’ Target total length: {target_words} words of dialogue.
Example format:
[HOST] Welcome to VoiceVerse. Today we're diving into something fascinating.
[GUEST] Thanks for having me. I've been looking forward to this conversation.
[HOST] Let's start with the basics. What should our listeners know first?
[GUEST] Great question. The most important thing to understand is..."""
# ──────────────────────────────────────────────────────────────────────────────
# LLM Backbone
# ──────────────────────────────────────────────────────────────────────────────
class LLMBackbone:
    """
    Calls huggingface_hub.InferenceClient.chat_completion() to generate
    grounded spoken-style scripts.

    The inference provider comes from LLMConfig.provider (default "auto",
    which lets huggingface_hub choose; pass "hf-inference" to pin HF's own
    serverless endpoint and avoid third-party providers that deprecate
    models independently).

    Supports two output modes:
      - Transcript: plain spoken prose
      - Podcast:    [HOST]/[GUEST] tagged dialogue for dual-voice TTS
    """

    def __init__(self, config: Optional[LLMConfig] = None) -> None:
        """Store configuration; no network work happens until first generation."""
        self.config = config or LLMConfig()
        self._client = None  # created lazily by _get_client()
        logger.info(
            "LLMBackbone initialised | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )

    # ── Public API ─────────────────────────────────────────────────────────────
    def generate_script(
        self,
        context_text: str,
        task_description: str,
        target_words: int = 400,
        output_mode: str = "Audio Transcript",  # matches OutputMode.value
    ) -> str:
        """
        Generate a grounded script.

        Args:
            context_text: Retrieved context from RAGEngine (REQUIRED).
            task_description: High-level user instruction for the script.
            target_words: Approximate word count target.
            output_mode: "Audio Transcript" or "Podcast (2 Speakers)".

        Returns:
            Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes.

        Raises:
            ValueError: If context_text is empty or whitespace-only — by
                design the LLM never generates without retrieved context.
            RuntimeError: If the inference call fails; the original
                exception is attached as __cause__.
        """
        if not context_text or not context_text.strip():
            raise ValueError(
                "context_text must not be empty. "
                "The LLM requires retrieved context to generate."
            )

        is_podcast = output_mode == "Podcast (2 Speakers)"
        messages = self._build_messages(
            context_text, task_description, target_words, is_podcast
        )
        logger.info(
            "Calling chat_completion | model=%s | mode=%s | ~%d context chars",
            self.config.model_id,
            output_mode,
            len(context_text),
        )
        try:
            response = self._get_client().chat_completion(
                messages=messages,
                max_tokens=self.config.max_new_tokens,
                temperature=self.config.temperature,
            )
            # message.content may be None on some providers/finish states;
            # coerce to "" so post-processing never receives a non-string.
            raw_output: str = response.choices[0].message.content or ""
        except Exception as exc:
            # logger.exception records the full traceback for debugging.
            logger.exception("InferenceClient call failed")
            raise RuntimeError(f"LLM generation failed: {exc}") from exc

        script = self._post_process(raw_output)
        logger.info(
            "Script generated | %d words | podcast=%s",
            len(script.split()),
            is_podcast,
        )
        return script

    # ── Message builder ────────────────────────────────────────────────────────
    def _build_messages(
        self,
        context: str,
        task: str,
        target_words: int,
        is_podcast: bool = False,
    ) -> list[dict]:
        """Assemble the system+user ChatCompletion messages for the chosen mode."""
        if is_podcast:
            system = PODCAST_SYSTEM_PROMPT
            template = PODCAST_USER_TEMPLATE
        else:
            system = SYSTEM_PROMPT
            template = USER_TEMPLATE
        user_content = template.format(
            context=context,
            task_description=task,
            target_words=target_words,
        )
        return [
            {"role": "system", "content": system},
            {"role": "user", "content": user_content},
        ]

    # ── Post-processing ────────────────────────────────────────────────────────
    @staticmethod
    def _post_process(raw: str) -> str:
        """Strip leaked chat-template tags, trim trailing whitespace per line,
        and collapse runs of 3+ blank lines down to 2."""
        for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
            raw = raw.replace(tag, "")
        lines = [line.rstrip() for line in raw.splitlines()]
        cleaned: list[str] = []
        blank_count = 0
        for line in lines:
            if not line.strip():
                blank_count += 1
                if blank_count <= 2:
                    cleaned.append("")
            else:
                blank_count = 0
                cleaned.append(line)
        return "\n".join(cleaned).strip()

    # ── Lazy client init ───────────────────────────────────────────────────────
    def _get_client(self):
        """
        Lazily create the huggingface_hub.InferenceClient.

        The client is bound to config.model_id and config.provider at init
        time. Token resolution order: explicit config.hf_token, then the
        HUGGINGFACEHUB_API_TOKEN env var, then HF_TOKEN (the name modern
        HF tooling sets).

        Raises:
            EnvironmentError: If no token can be found.
        """
        if self._client is None:
            from huggingface_hub import InferenceClient

            token = (
                self.config.hf_token
                or os.getenv("HUGGINGFACEHUB_API_TOKEN")
                or os.getenv("HF_TOKEN")
            )
            if not token:
                raise EnvironmentError(
                    "Hugging Face API token not found. "
                    "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
                    "or paste it in the sidebar."
                )
            logger.info(
                "Initialising InferenceClient | model=%s | provider=%s",
                self.config.model_id,
                self.config.provider,
            )
            self._client = InferenceClient(
                model=self.config.model_id,  # bind model at client level
                token=token,
                provider=self.config.provider,
            )
            logger.info(
                "InferenceClient ready | provider=%s | model=%s",
                self.config.provider,
                self.config.model_id,
            )
        return self._client