# src/modules/llm_backbone.py — VoiceVerse Pro LLM script-generation layer.
"""
modules/llm_backbone.py
──────────────────────────────────────────────────────────────────────────────
VoiceVerse Pro β€” LLM Script Generation Layer
Model : mistralai/Mistral-7B-Instruct-v0.2 (default — see SUPPORTED_MODELS)
Swap via LLMConfig.model_id for any HF-hosted chat model.
Backend: huggingface_hub.InferenceClient; the provider is set via
LLMConfig.provider (default "auto" lets huggingface_hub choose; set
provider="hf-inference" to pin HF's own serverless endpoint and avoid
third-party providers that deprecate models and return 410 Gone).
Format : ChatCompletion messages API (system + user roles)
WHY NOT HuggingFaceEndpoint?
langchain-huggingface's HuggingFaceEndpoint internally calls
InferenceClient.post(), which was REMOVED in huggingface_hub β‰₯ 0.26.
Using InferenceClient.chat_completion() directly is the stable 2026 path.
DESIGN RULES:
- The LLM NEVER generates without retrieved context.
- Context is injected verbatim into every prompt via the user message.
- Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged
dialogue (podcast), depending on output_mode.
- Temperature, max_new_tokens are runtime-configurable.
"""
from __future__ import annotations
import logging
import os
from dataclasses import dataclass
from typing import Optional
logger = logging.getLogger(__name__)
# ──────────────────────────────────────────────────────────────────────────────
# Supported model presets (shown in sidebar dropdown)
# ──────────────────────────────────────────────────────────────────────────────
SUPPORTED_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",  # compact, capable
]
# The first preset doubles as the default for LLMConfig.model_id.
DEFAULT_MODEL = SUPPORTED_MODELS[0]
# ──────────────────────────────────────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────────────────────────────────────
@dataclass
class LLMConfig:
    """Runtime-tunable LLM parameters."""
    model_id: str = DEFAULT_MODEL      # HF hub repo id of the chat model
    max_new_tokens: int = 1024         # passed to chat_completion as max_tokens
    temperature: float = 0.65          # sampling temperature
    hf_token: Optional[str] = None     # explicit token; env var used if None
    # Inference provider handed to InferenceClient. NOTE(review): despite the
    # module docstring, the default here is "auto" (huggingface_hub picks a
    # provider) — set "hf-inference" explicitly to pin HF's own serverless
    # endpoint and avoid third-party providers that return 410 Gone.
    provider: str = "auto"
# ──────────────────────────────────────────────────────────────────────────────
# Prompt templates
# ──────────────────────────────────────────────────────────────────────────────
SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write in a warm, engaging, conversational spoken-English style.
No markdown, no bullet points, no headers β€” pure spoken prose only.
The script will be read aloud by a TTS engine."""
USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────
TASK:
{task_description}
FORMAT REQUIREMENTS:
β€’ Open with a compelling hook (1–2 sentences).
β€’ Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
β€’ Close with a memorable takeaway or question to the listener.
β€’ No markdown. No lists. No headers. Pure spoken prose.
β€’ Target length: {target_words} words."""
# ── Podcast (two-speaker) prompts ─────────────────────────────────────────────
PODCAST_SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class podcast scriptwriter.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write a natural back-and-forth dialogue between two speakers:
HOST β€” female, warm and inquisitive, guides the conversation
GUEST β€” male, knowledgeable and enthusiastic, elaborates on topics
Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text.
No markdown, no stage directions, no descriptions β€” only spoken dialogue lines.
The script will be read aloud by a TTS engine with two distinct voices."""
PODCAST_USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────
TASK:
{task_description}
FORMAT REQUIREMENTS (STRICTLY FOLLOW):
β€’ Every line must start with [HOST] or [GUEST] followed by their spoken words.
β€’ Alternate naturally between HOST and GUEST. Aim for 8–16 exchanges.
β€’ HOST opens and closes the episode.
β€’ Draw ALL facts ONLY from the context above.
β€’ No markdown. No stage directions. No headers. Only dialogue lines.
β€’ Target total length: {target_words} words of dialogue.
Example format:
[HOST] Welcome to VoiceVerse. Today we're diving into something fascinating.
[GUEST] Thanks for having me. I've been looking forward to this conversation.
[HOST] Let's start with the basics. What should our listeners know first?
[GUEST] Great question. The most important thing to understand is..."""
# ──────────────────────────────────────────────────────────────────────────────
# LLM Backbone
# ──────────────────────────────────────────────────────────────────────────────
class LLMBackbone:
    """
    Calls huggingface_hub.InferenceClient.chat_completion() to generate
    grounded spoken-style scripts.

    The inference provider comes from LLMConfig.provider (default "auto",
    which lets huggingface_hub choose; pass "hf-inference" to pin HF's own
    serverless endpoint and avoid third-party providers that deprecate
    models independently).

    Supports two output modes:
      - Transcript: plain spoken prose
      - Podcast:    [HOST]/[GUEST] tagged dialogue for dual-voice TTS
    """

    def __init__(self, config: Optional[LLMConfig] = None) -> None:
        """Store configuration; no network work happens until first generation."""
        self.config = config or LLMConfig()
        self._client = None  # created lazily by _get_client()
        logger.info(
            "LLMBackbone initialised | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )

    # ── Public API ─────────────────────────────────────────────────────────────
    def generate_script(
        self,
        context_text: str,
        task_description: str,
        target_words: int = 400,
        output_mode: str = "Audio Transcript",  # matches OutputMode.value
    ) -> str:
        """
        Generate a grounded script.

        Args:
            context_text: Retrieved context from RAGEngine (REQUIRED).
            task_description: High-level user instruction for the script.
            target_words: Approximate word count target.
            output_mode: "Audio Transcript" or "Podcast (2 Speakers)".

        Returns:
            Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes.

        Raises:
            ValueError: If context_text is empty or whitespace-only — by
                design the LLM never generates without retrieved context.
            RuntimeError: If the inference call fails; the original
                exception is attached as __cause__.
        """
        if not context_text or not context_text.strip():
            raise ValueError(
                "context_text must not be empty. "
                "The LLM requires retrieved context to generate."
            )

        is_podcast = output_mode == "Podcast (2 Speakers)"
        messages = self._build_messages(
            context_text, task_description, target_words, is_podcast
        )
        logger.info(
            "Calling chat_completion | model=%s | mode=%s | ~%d context chars",
            self.config.model_id,
            output_mode,
            len(context_text),
        )
        try:
            response = self._get_client().chat_completion(
                messages=messages,
                max_tokens=self.config.max_new_tokens,
                temperature=self.config.temperature,
            )
            # message.content may be None on some providers/finish states;
            # coerce to "" so post-processing never receives a non-string.
            raw_output: str = response.choices[0].message.content or ""
        except Exception as exc:
            # logger.exception records the full traceback for debugging.
            logger.exception("InferenceClient call failed")
            raise RuntimeError(f"LLM generation failed: {exc}") from exc

        script = self._post_process(raw_output)
        logger.info(
            "Script generated | %d words | podcast=%s",
            len(script.split()),
            is_podcast,
        )
        return script

    # ── Message builder ────────────────────────────────────────────────────────
    def _build_messages(
        self,
        context: str,
        task: str,
        target_words: int,
        is_podcast: bool = False,
    ) -> list[dict]:
        """Assemble the system+user ChatCompletion messages for the chosen mode."""
        if is_podcast:
            system = PODCAST_SYSTEM_PROMPT
            template = PODCAST_USER_TEMPLATE
        else:
            system = SYSTEM_PROMPT
            template = USER_TEMPLATE
        user_content = template.format(
            context=context,
            task_description=task,
            target_words=target_words,
        )
        return [
            {"role": "system", "content": system},
            {"role": "user", "content": user_content},
        ]

    # ── Post-processing ────────────────────────────────────────────────────────
    @staticmethod
    def _post_process(raw: str) -> str:
        """Strip leaked chat-template tags, trim trailing whitespace per line,
        and collapse runs of 3+ blank lines down to 2."""
        for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
            raw = raw.replace(tag, "")
        lines = [line.rstrip() for line in raw.splitlines()]
        cleaned: list[str] = []
        blank_count = 0
        for line in lines:
            if not line.strip():
                blank_count += 1
                if blank_count <= 2:
                    cleaned.append("")
            else:
                blank_count = 0
                cleaned.append(line)
        return "\n".join(cleaned).strip()

    # ── Lazy client init ───────────────────────────────────────────────────────
    def _get_client(self):
        """
        Lazily create the huggingface_hub.InferenceClient.

        The client is bound to config.model_id and config.provider at init
        time. Token resolution order: explicit config.hf_token, then the
        HUGGINGFACEHUB_API_TOKEN env var, then HF_TOKEN (the name modern
        HF tooling sets).

        Raises:
            EnvironmentError: If no token can be found.
        """
        if self._client is None:
            from huggingface_hub import InferenceClient

            token = (
                self.config.hf_token
                or os.getenv("HUGGINGFACEHUB_API_TOKEN")
                or os.getenv("HF_TOKEN")
            )
            if not token:
                raise EnvironmentError(
                    "Hugging Face API token not found. "
                    "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
                    "or paste it in the sidebar."
                )
            logger.info(
                "Initialising InferenceClient | model=%s | provider=%s",
                self.config.model_id,
                self.config.provider,
            )
            self._client = InferenceClient(
                model=self.config.model_id,  # bind model at client level
                token=token,
                provider=self.config.provider,
            )
            logger.info(
                "InferenceClient ready | provider=%s | model=%s",
                self.config.provider,
                self.config.model_id,
            )
        return self._client