ahanbose commited on
Commit
dc2a587
Β·
verified Β·
1 Parent(s): ea38730

Update src/modules/llm_backbone.py

Browse files
Files changed (1) hide show
  1. src/modules/llm_backbone.py +286 -239
src/modules/llm_backbone.py CHANGED
@@ -1,240 +1,287 @@
1
- """
2
- modules/llm_backbone.py
3
- ──────────────────────────────────────────────────────────────────────────────
4
- VoiceVerse Pro β€” LLM Script Generation Layer
5
-
6
- Model : meta-llama/Llama-3.1-8B-Instruct (default β€” widely supported 2026)
7
- Swap via LLMConfig.model_id for any HF-hosted chat model.
8
- Backend: huggingface_hub.InferenceClient with provider="auto"
9
- "auto" lets HF route to whichever provider currently serves the model,
10
- avoiding 410 Gone errors from deprecated provider/model combinations.
11
- Format : ChatCompletion messages API (system + user roles)
12
-
13
- WHY NOT HuggingFaceEndpoint?
14
- langchain-huggingface's HuggingFaceEndpoint internally calls
15
- InferenceClient.post(), which was REMOVED in huggingface_hub β‰₯ 0.26.
16
- Using InferenceClient.chat_completion() directly is the stable 2026 path.
17
-
18
- DESIGN RULES:
19
- - The LLM NEVER generates without retrieved context.
20
- - Context is injected verbatim into every prompt via the user message.
21
- - Output is a structured, spoken-style podcast/narration script.
22
- - Temperature, max_new_tokens are runtime-configurable.
23
- """
24
-
25
- from __future__ import annotations
26
-
27
- import logging
28
- import os
29
- from dataclasses import dataclass
30
- from typing import Optional
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
-
35
- # ──────────────────────────────────────────────────────────────────────────────
36
- # Supported model presets (shown in sidebar dropdown)
37
- # ──────────────────────────────────────────────────────────────────────────────
38
-
39
- SUPPORTED_MODELS = [
40
- "meta-llama/Llama-3.1-8B-Instruct", # default β€” fast, free tier
41
- "Qwen/Qwen3-Coder-Next-GGUF",
42
- "meta-llama/Llama-3.3-70B-Instruct", # higher quality
43
- "mistralai/Mistral-7B-Instruct-v0.3", # lightweight Mistral
44
- "mistralai/Mistral-7B-Instruct-v0.2", # older Mistral variant
45
- "Qwen/Qwen2.5-72B-Instruct", # strong alternative
46
- "microsoft/Phi-4-reasoning-plus", # compact, capable
47
- ]
48
-
49
- DEFAULT_MODEL = SUPPORTED_MODELS[0]
50
-
51
-
52
- # ──────────────────────────────────────────────────────────────────────────────
53
- # Configuration
54
- # ──────────────────────────────────────────────────────────────────────────────
55
-
56
- @dataclass
57
- class LLMConfig:
58
- """Runtime-tunable LLM parameters."""
59
- model_id: str = DEFAULT_MODEL
60
- max_new_tokens: int = 1024
61
- temperature: float = 0.65
62
- hf_token: Optional[str] = None
63
- # Force HF's own serverless inference β€” avoids Together/other providers
64
- # that deprecate models independently of HF's model hub.
65
- provider: str = "auto"
66
- task: str = "none"
67
-
68
- # ──────────────────────────────────────────────────────────────────────────────
69
- # Prompt templates
70
- # ──────────────────────────────────────────────────────────────────────────────
71
-
72
- SYSTEM_PROMPT = """\
73
- You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
74
- Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
75
- You MUST NOT introduce information not present in that context.
76
- Write in a warm, engaging, conversational spoken-English style.
77
- No markdown, no bullet points, no headers β€” pure spoken prose only.
78
- The script will be read aloud by a TTS engine."""
79
-
80
- USER_TEMPLATE = """\
81
- ─────────────────────────────────────────────────────────────
82
- RETRIEVED CONTEXT (your SOLE factual source):
83
- {context}
84
- ─────────────────────────────────────────────────────────────
85
-
86
- TASK:
87
- {task_description}
88
-
89
- FORMAT REQUIREMENTS:
90
- β€’ Open with a compelling hook (1–2 sentences).
91
- β€’ Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
92
- β€’ Close with a memorable takeaway or question to the listener.
93
- β€’ No markdown. No lists. No headers. Pure spoken prose.
94
- β€’ Target length: {target_words} words."""
95
-
96
-
97
- # ──────────────────────────────────────────────────────────────────────────────
98
- # LLM Backbone
99
- # ──────────────────────────────────────────────────────────────────────────────
100
-
101
- class LLMBackbone:
102
- """
103
- Calls huggingface_hub.InferenceClient.chat_completion() to generate
104
- grounded spoken-style scripts.
105
-
106
- provider="auto" instructs HF's inference router to automatically select
107
- the best available provider for the model β€” this prevents 410 Gone errors
108
- caused by a specific provider deprecating a model.
109
- """
110
-
111
- def __init__(self, config: Optional[LLMConfig] = None) -> None:
112
- self.config = config or LLMConfig()
113
- self._client = None
114
- logger.info(
115
- "LLMBackbone initialised | model=%s | provider=%s",
116
- self.config.model_id,
117
- self.config.provider,
118
- )
119
-
120
- # ── Public API ─────────────────────────────────────────────────────────────
121
-
122
- def generate_script(
123
- self,
124
- context_text: str,
125
- task_description: str,
126
- target_words: int = 400,
127
- ) -> str:
128
- """
129
- Generate a grounded spoken-style script.
130
-
131
- Args:
132
- context_text: Retrieved context from RAGEngine (REQUIRED).
133
- task_description: High-level user instruction for the script.
134
- target_words: Approximate word count target.
135
-
136
- Returns:
137
- Clean script text (no markdown artefacts).
138
-
139
- Raises:
140
- ValueError: If context_text is empty (anti-hallucination guard).
141
- RuntimeError: If the HF Inference API call fails.
142
- """
143
- if not context_text or not context_text.strip():
144
- raise ValueError(
145
- "context_text must not be empty. "
146
- "The LLM requires retrieved context to generate."
147
- )
148
-
149
- messages = self._build_messages(context_text, task_description, target_words)
150
-
151
- logger.info(
152
- "Calling chat_completion | model=%s | provider=%s | ~%d context chars",
153
- self.config.model_id,
154
- self.config.provider,
155
- len(context_text),
156
- )
157
-
158
- try:
159
- response = self._get_client().chat_completion(
160
- messages=messages,
161
- max_tokens=self.config.max_new_tokens,
162
- temperature=self.config.temperature,
163
- )
164
- raw_output: str = response.choices[0].message.content
165
- except Exception as exc:
166
- logger.error("InferenceClient call failed: %s", exc)
167
- raise RuntimeError(f"LLM generation failed: {exc}") from exc
168
-
169
- script = self._post_process(raw_output)
170
- logger.info("Script generated | %d words", len(script.split()))
171
- return script
172
-
173
- # ── Message builder ────────────────────────────────────────────────────────
174
-
175
- def _build_messages(
176
- self,
177
- context: str,
178
- task: str,
179
- target_words: int,
180
- ) -> list[dict]:
181
- user_content = USER_TEMPLATE.format(
182
- context=context,
183
- task_description=task,
184
- target_words=target_words,
185
- )
186
- return [
187
- {"role": "system", "content": SYSTEM_PROMPT},
188
- {"role": "user", "content": user_content},
189
- ]
190
-
191
- # ── Post-processing ────────────────────────────────────────────────────────
192
-
193
- @staticmethod
194
- def _post_process(raw: str) -> str:
195
- for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
196
- raw = raw.replace(tag, "")
197
- lines = [line.rstrip() for line in raw.splitlines()]
198
- cleaned: list[str] = []
199
- blank_count = 0
200
- for line in lines:
201
- if not line.strip():
202
- blank_count += 1
203
- if blank_count <= 2:
204
- cleaned.append("")
205
- else:
206
- blank_count = 0
207
- cleaned.append(line)
208
- return "\n".join(cleaned).strip()
209
-
210
- # ── Lazy client init ───────────────────────────────────────���───────────────
211
-
212
- def _get_client(self):
213
- """
214
- Lazy-load InferenceClient with provider="auto".
215
- "auto" = HF inference router picks the fastest available provider
216
- for the requested model, avoiding stale provider/model 410 errors.
217
- """
218
- if self._client is None:
219
- from huggingface_hub import InferenceClient
220
-
221
- token = self.config.hf_token or os.getenv("HUGGINGFACEHUB_API_TOKEN")
222
- if not token:
223
- raise EnvironmentError(
224
- "Hugging Face API token not found. "
225
- "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
226
- "or paste it in the sidebar."
227
- )
228
-
229
- logger.info(
230
- "Initialising InferenceClient | model=%s | provider=%s",
231
- self.config.model_id,
232
- self.config.provider,
233
- )
234
- self._client = InferenceClient(
235
- model=self.config.model_id, # bind model at client level
236
- token=token,
237
- provider=self.config.provider,
238
- )
239
- logger.info("InferenceClient ready | provider=%s | model=%s", self.config.provider, self.config.model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  return self._client
 
1
+ """
2
+ modules/llm_backbone.py
3
+ ──────────────────────────────────────────────────────────────────────────────
4
+ VoiceVerse Pro β€” LLM Script Generation Layer
5
+
6
+ Model : meta-llama/Llama-3.1-8B-Instruct (default β€” widely supported 2026)
7
+ Swap via LLMConfig.model_id for any HF-hosted chat model.
8
+ Backend: huggingface_hub.InferenceClient with provider="auto"
9
+ "auto" lets HF's inference router pick whichever provider currently
10
+ serves the model, avoiding 410 Gone errors from provider/model
11
+ combinations that a single provider has deprecated.
12
+ Format : ChatCompletion messages API (system + user roles)
13
+
14
+ WHY NOT HuggingFaceEndpoint?
15
+ langchain-huggingface's HuggingFaceEndpoint internally calls
16
+ InferenceClient.post(), which was REMOVED in huggingface_hub β‰₯ 0.26.
17
+ Using InferenceClient.chat_completion() directly is the stable 2026 path.
18
+
19
+ DESIGN RULES:
20
+ - The LLM NEVER generates without retrieved context.
21
+ - Context is injected verbatim into every prompt via the user message.
22
+ - Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged
23
+ dialogue (podcast), depending on output_mode.
24
+ - Temperature, max_new_tokens are runtime-configurable.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ import os
31
+ from dataclasses import dataclass
32
+ from typing import Optional
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ # ──────────────────────────────────────────────────────────────────────────────
38
+ # Supported model presets (shown in sidebar dropdown)
39
+ # ──────────────────────────────────────────────────────────────────────────────
40
+
41
+ SUPPORTED_MODELS = [
42
+ "mistralai/Mistral-7B-Instruct-v0.2", # lightweight Mistral — broadly served on HF inference
43
+ ]
44
+
45
+ DEFAULT_MODEL = SUPPORTED_MODELS[0]
46
+
47
+
48
+ # ──────────────────────────────────────────────────────────────────────────────
49
+ # Configuration
50
+ # ──────────────────────────────────────────────────────────────────────────────
51
+
52
@dataclass
class LLMConfig:
    """Runtime-tunable LLM parameters."""
    model_id: str = DEFAULT_MODEL        # any HF-hosted chat model id
    max_new_tokens: int = 1024           # passed to chat_completion as max_tokens
    temperature: float = 0.65            # sampling temperature
    hf_token: Optional[str] = None       # falls back to HUGGINGFACEHUB_API_TOKEN env var
    # "auto" lets HF's inference router pick whichever provider currently
    # serves the model, avoiding 410 Gone errors from provider/model
    # combinations that a single provider has deprecated.
    provider: str = "auto"
62
+
63
+
64
+ # ──────────────────────────────────────────────────────────────────────────────
65
+ # Prompt templates
66
+ # ──────────────────────────────────────────────────────────────────────────────
67
+
68
+ SYSTEM_PROMPT = """\
69
+ You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
70
+ Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
71
+ You MUST NOT introduce information not present in that context.
72
+ Write in a warm, engaging, conversational spoken-English style.
73
+ No markdown, no bullet points, no headers β€” pure spoken prose only.
74
+ The script will be read aloud by a TTS engine."""
75
+
76
+ USER_TEMPLATE = """\
77
+ ─────────────────────────────────────────────────────────────
78
+ RETRIEVED CONTEXT (your SOLE factual source):
79
+ {context}
80
+ ─────────────────────────────────────────────────────────────
81
+
82
+ TASK:
83
+ {task_description}
84
+
85
+ FORMAT REQUIREMENTS:
86
+ β€’ Open with a compelling hook (1–2 sentences).
87
+ β€’ Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
88
+ β€’ Close with a memorable takeaway or question to the listener.
89
+ β€’ No markdown. No lists. No headers. Pure spoken prose.
90
+ β€’ Target length: {target_words} words."""
91
+
92
+
93
+ # ── Podcast (two-speaker) prompts ─────────────────────────────────────────────
94
+
95
+ PODCAST_SYSTEM_PROMPT = """\
96
+ You are VoiceVerse, a world-class podcast scriptwriter.
97
+ Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
98
+ You MUST NOT introduce information not present in that context.
99
+ Write a natural back-and-forth dialogue between two speakers:
100
+ HOST β€” female, warm and inquisitive, guides the conversation
101
+ GUEST β€” male, knowledgeable and enthusiastic, elaborates on topics
102
+ Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text.
103
+ No markdown, no stage directions, no descriptions β€” only spoken dialogue lines.
104
+ The script will be read aloud by a TTS engine with two distinct voices."""
105
+
106
+ PODCAST_USER_TEMPLATE = """\
107
+ ─────────────────────────────────────────────────────────────
108
+ RETRIEVED CONTEXT (your SOLE factual source):
109
+ {context}
110
+ ─────────────────────────────────────────────────────────────
111
+
112
+ TASK:
113
+ {task_description}
114
+
115
+ FORMAT REQUIREMENTS (STRICTLY FOLLOW):
116
+ β€’ Every line must start with [HOST] or [GUEST] followed by their spoken words.
117
+ β€’ Alternate naturally between HOST and GUEST. Aim for 8–16 exchanges.
118
+ β€’ HOST opens and closes the episode.
119
+ β€’ Draw ALL facts ONLY from the context above.
120
+ β€’ No markdown. No stage directions. No headers. Only dialogue lines.
121
+ β€’ Target total length: {target_words} words of dialogue.
122
+
123
+ Example format:
124
+ [HOST] Welcome to VoiceVerse. Today we're diving into something fascinating.
125
+ [GUEST] Thanks for having me. I've been looking forward to this conversation.
126
+ [HOST] Let's start with the basics. What should our listeners know first?
127
+ [GUEST] Great question. The most important thing to understand is..."""
128
+
129
+
130
+ # ──────────────────────────────────────────────────────────────────────────────
131
+ # LLM Backbone
132
+ # ──────────────────────────────────────────────────────────────────────────────
133
+
134
class LLMBackbone:
    """
    Generates grounded spoken-style scripts via
    huggingface_hub.InferenceClient.chat_completion().

    provider="auto" (the LLMConfig default) lets HF's inference router pick
    whichever provider currently serves the requested model, which avoids
    410 Gone errors caused by a single provider deprecating a model.

    Two output modes are supported:
      - Transcript: plain spoken prose
      - Podcast: [HOST]/[GUEST] tagged dialogue for dual-voice TTS
    """

    def __init__(self, config: Optional[LLMConfig] = None) -> None:
        self.config = config or LLMConfig()
        self._client = None  # lazily created by _get_client()
        logger.info(
            "LLMBackbone initialised | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )

    # ── Public API ─────────────────────────────────────────────────────────────

    def generate_script(
        self,
        context_text: str,
        task_description: str,
        target_words: int = 400,
        output_mode: str = "Audio Transcript",  # matches OutputMode.value
    ) -> str:
        """
        Generate a grounded script.

        Args:
            context_text: Retrieved context from RAGEngine (REQUIRED).
            task_description: High-level user instruction for the script.
            target_words: Approximate word count target.
            output_mode: "Audio Transcript" or "Podcast (2 Speakers)".

        Returns:
            Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes.

        Raises:
            ValueError: If context_text is empty (anti-hallucination guard).
            RuntimeError: If the HF Inference API call fails.
        """
        if not context_text or not context_text.strip():
            raise ValueError(
                "context_text must not be empty. "
                "The LLM requires retrieved context to generate."
            )

        is_podcast = output_mode == "Podcast (2 Speakers)"
        messages = self._build_messages(
            context_text, task_description, target_words, is_podcast
        )

        logger.info(
            "Calling chat_completion | model=%s | mode=%s | ~%d context chars",
            self.config.model_id,
            output_mode,
            len(context_text),
        )

        try:
            response = self._get_client().chat_completion(
                messages=messages,
                max_tokens=self.config.max_new_tokens,
                temperature=self.config.temperature,
            )
            # message.content may be None on an empty completion — normalise
            # to "" so _post_process never receives None.
            raw_output: str = response.choices[0].message.content or ""
        except Exception as exc:
            logger.error("InferenceClient call failed: %s", exc)
            raise RuntimeError(f"LLM generation failed: {exc}") from exc

        script = self._post_process(raw_output)
        logger.info("Script generated | %d words | podcast=%s", len(script.split()), is_podcast)
        return script

    # ── Message builder ────────────────────────────────────────────────────────

    def _build_messages(
        self,
        context: str,
        task: str,
        target_words: int,
        is_podcast: bool = False,
    ) -> list[dict]:
        """Assemble the two-message (system + user) chat payload."""
        if is_podcast:
            system = PODCAST_SYSTEM_PROMPT
            user_content = PODCAST_USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        else:
            system = SYSTEM_PROMPT
            user_content = USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        return [
            {"role": "system", "content": system},
            {"role": "user", "content": user_content},
        ]

    # ── Post-processing ────────────────────────────────────────────────────────

    @staticmethod
    def _post_process(raw: str) -> str:
        """Strip chat-template artefacts and tidy whitespace.

        Removes known instruction/sentinel tags, right-strips every line,
        collapses runs of blank lines to at most two, and strips the result.
        """
        for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
            raw = raw.replace(tag, "")
        lines = [line.rstrip() for line in raw.splitlines()]
        cleaned: list[str] = []
        blank_count = 0
        for line in lines:
            if not line.strip():
                blank_count += 1
                if blank_count <= 2:
                    cleaned.append("")
            else:
                blank_count = 0
                cleaned.append(line)
        return "\n".join(cleaned).strip()

    # ── Lazy client init ───────────────────────────────────────────────────────

    def _get_client(self):
        """
        Lazy-load huggingface_hub.InferenceClient, bound to the configured model.

        The provider comes from LLMConfig (default "auto": HF's inference
        router selects an available provider for the model, avoiding stale
        provider/model 410 Gone errors).

        Raises:
            EnvironmentError: If no HF API token is configured or in the env.
        """
        if self._client is None:
            from huggingface_hub import InferenceClient

            token = self.config.hf_token or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            if not token:
                raise EnvironmentError(
                    "Hugging Face API token not found. "
                    "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
                    "or paste it in the sidebar."
                )

            logger.info(
                "Initialising InferenceClient | model=%s | provider=%s",
                self.config.model_id,
                self.config.provider,
            )
            self._client = InferenceClient(
                model=self.config.model_id,  # bind model at client level
                token=token,
                provider=self.config.provider,
            )
            logger.info(
                "InferenceClient ready | provider=%s | model=%s",
                self.config.provider,
                self.config.model_id,
            )
        return self._client