ruslanmv committed
Commit e1bbc7c · 1 Parent(s): d980bec

Simplification Chat

app/core/inference/client.py CHANGED
@@ -1,6 +1,6 @@
 # app/core/inference/client.py
 import os, json, time, logging
-from typing import Dict, List, Optional, Iterator, Tuple
+from typing import Dict, List, Optional, Iterator, Tuple, Any
 
 import requests
 
@@ -27,7 +27,6 @@ def _mk_messages(system_prompt: Optional[str], user_text: str) -> List[Dict[str,
     return msgs
 
 def _timeout_tuple(connect: float = 10.0, read: float = 60.0) -> Tuple[float, float]:
-    # requests timeout is (connect, read)
     return (connect, read)
 
 class RouterRequestsClient:
@@ -51,11 +50,6 @@ class RouterRequestsClient:
         self.max_retries = max(0, int(max_retries))
         self.timeout = _timeout_tuple(connect_timeout, read_timeout)
 
-        # anti-repeat knobs (safe defaults; ignored if provider doesn't support them)
-        self.frequency_penalty = float(os.getenv("LLM_FREQUENCY_PENALTY", "0.6"))
-        self.presence_penalty = float(os.getenv("LLM_PRESENCE_PENALTY", "0.05"))
-        self.top_p = float(os.getenv("LLM_TOP_P", "0.95"))
-
     # -------- Non-stream (single text) --------
     def chat_nonstream(
         self,
@@ -63,17 +57,21 @@ class RouterRequestsClient:
         user_text: str,
         max_tokens: int,
         temperature: float,
+        stop: Optional[List[str]] = None,
+        extra: Optional[Dict[str, Any]] = None,
     ) -> str:
-        payload = {
+        payload: Dict[str, Any] = {
             "model": _model_with_provider(self.model, self.provider),
             "messages": _mk_messages(system_prompt, user_text),
             "temperature": float(temperature),
             "max_tokens": int(max_tokens),
-            "top_p": self.top_p,
-            "frequency_penalty": self.frequency_penalty,
-            "presence_penalty": self.presence_penalty,
             "stream": False,
         }
+        if stop:
+            payload["stop"] = stop
+        if extra:
+            payload.update(extra)
+
         text, ok = self._try_once(payload)
         if ok:
             return text
@@ -95,7 +93,6 @@ class RouterRequestsClient:
             if r.status_code >= 400:
                 logger.error("Router error %s: %s", r.status_code, r.text)
                 last_err = RuntimeError(f"{r.status_code}: {r.text}")
-                # do not hard-spin; brief pause
                 time.sleep(min(1.5 * (attempt + 1), 3.0))
                 continue
             data = r.json()
@@ -114,18 +111,22 @@ class RouterRequestsClient:
         system_prompt: Optional[str],
         user_text: str,
         max_tokens: int,
-        temperature: float
+        temperature: float,
+        stop: Optional[List[str]] = None,
+        extra: Optional[Dict[str, Any]] = None,
     ) -> Iterator[str]:
-        payload = {
+        payload: Dict[str, Any] = {
             "model": _model_with_provider(self.model, self.provider),
             "messages": _mk_messages(system_prompt, user_text),
             "temperature": float(temperature),
             "max_tokens": int(max_tokens),
-            "top_p": self.top_p,
-            "frequency_penalty": self.frequency_penalty,
-            "presence_penalty": self.presence_penalty,
             "stream": True,
         }
+        if stop:
+            payload["stop"] = stop
+        if extra:
+            payload.update(extra)
+
         # primary
         ok = False
         for token in self._stream_once(payload):
@@ -141,9 +142,7 @@ class RouterRequestsClient:
 
     def _stream_once(self, payload: dict) -> Iterator[str]:
         try:
-            with requests.post(
-                ROUTER_URL, headers=self.headers, json=payload, stream=True, timeout=self.timeout
-            ) as r:
+            with requests.post(ROUTER_URL, headers=self.headers, json=payload, stream=True, timeout=self.timeout) as r:
                 if r.status_code >= 400:
                     logger.error("Router stream error %s: %s", r.status_code, r.text)
                     return
@@ -157,7 +156,6 @@ class RouterRequestsClient:
                         return
                     try:
                         obj = json.loads(data)
-                        # OpenAI-style: delta tokens
                         delta = obj["choices"][0]["delta"].get("content", "")
                         if delta:
                             yield delta
@@ -169,12 +167,6 @@ class RouterRequestsClient:
             return
 
     # -------- Planning (non-stream) --------
-    def plan_nonstream(
-        self,
-        system_prompt: str,
-        user_text: str,
-        max_tokens: int,
-        temperature: float
-    ) -> str:
-        """Use same chat/completions but always non-stream for planning."""
+    def plan_nonstream(self, system_prompt: str, user_text: str,
+                       max_tokens: int, temperature: float) -> str:
        return self.chat_nonstream(system_prompt, user_text, max_tokens, temperature)
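
Usage note: the new stop/extra parameters replace the old env-driven top_p and penalty fields, so each call site can pass stop sequences and provider-specific sampling knobs per request; anything in extra is merged verbatim into the JSON payload. A minimal sketch of a call site, assuming constructor keywords as shown (only model= is confirmed by this diff; provider= and all sample values are illustrative assumptions, not values from this repo):

    # Hypothetical call site; only the chat_nonstream keywords mirror the new signature.
    client = RouterRequestsClient(
        model="some-org/some-model",   # assumed model id
        provider="some-provider",      # assumed provider tag
    )
    text = client.chat_nonstream(
        system_prompt="You are a concise assistant.",
        user_text="What is MatrixHub?",
        max_tokens=256,
        temperature=0.2,
        stop=["\nQuestion:", "\nAnswer:"],   # truncate before Q/A echoes
        extra={"frequency_penalty": 0.2},    # forwarded verbatim into the payload
    )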
app/services/chat_service.py CHANGED
@@ -21,9 +21,11 @@ except Exception: # pragma: no cover
     CrossEncoder = None  # type: ignore
 
 SYSTEM_PROMPT = (
-    "You are MATRIX-AI, a concise, helpful assistant for the Matrix EcoSystem.\n"
-    "Use the provided CONTEXT strictly. If the answer is not supported by context, say you don't know.\n"
-    "Reply in 2–4 short sentences. Do NOT repeat sentences or rephrase the same point multiple times.\n"
+    "You are MATRIX-AI, a concise assistant for the Matrix EcoSystem.\n"
+    "Answer the user's question directly in 2–4 short sentences.\n"
+    "Do NOT restate the question. Do NOT use labels like 'Question:' or 'Answer:'.\n"
+    "Use the provided CONTEXT if present; if the answer is not supported by it, say you don't know.\n"
+    "Do not ask follow-up questions unless the user explicitly asks you to."
 )
 
 # Thread-safe singleton retriever
@@ -55,18 +57,17 @@ def get_retriever(settings: Settings) -> Optional[Retriever]:
 
 
 # ----------------------------
-# Anti-repetition helpers
+# Anti-repetition + de-label helpers
 # ----------------------------
 _SENT_SPLIT = re.compile(r'(?<=[\.\!\?])\s+')
 _NORM = re.compile(r'[^a-z0-9\s]+')
-
+_QA_LINE_RE = re.compile(r'^\s*(question|q|user)\s*:\s*', re.I)
+_ANSWER_PREFIX_RE = re.compile(r'^\s*(answer|a)\s*:\s*', re.I)
 
 def _norm_sentence(s: str) -> str:
     s = s.lower().strip()
     s = _NORM.sub(' ', s)
-    s = re.sub(r'\s+', ' ', s)
-    return s
-
+    return re.sub(r'\s+', ' ', s)
 
 def _jaccard(a: str, b: str) -> float:
     ta = set(a.split())
@@ -75,12 +76,32 @@ def _jaccard(a: str, b: str) -> float:
         return 0.0
     return len(ta & tb) / max(1, len(ta | tb))
 
+def _strip_qa_meta(text: str) -> str:
+    """Drop lines like 'Question: ...' and leading 'Answer:' labels."""
+    lines = text.splitlines()
+    out: List[str] = []
+    for i, l in enumerate(lines):
+        if i == 0:
+            l = _ANSWER_PREFIX_RE.sub('', l).strip()
+        if _QA_LINE_RE.match(l):
+            continue
+        out.append(l)
+    return "\n".join(out).strip()
+
+def _remove_query_echo(text: str, query: str, sim_threshold: float = 0.9) -> str:
+    """Remove sentences that are near-duplicates of the original query."""
+    qn = _norm_sentence(query)
+    parts = _SENT_SPLIT.split(re.sub(r'\s+', ' ', text).strip()) or [text]
+    kept: List[str] = []
+    for s in parts:
+        sn = _norm_sentence(s)
+        if _jaccard(qn, sn) >= sim_threshold:
+            continue
+        kept.append(s.strip())
+    return ' '.join(kept).strip()
 
 def _squash_repetition(text: str, max_sentences: int = 4, sim_threshold: float = 0.88) -> str:
-    """
-    Remove near-duplicate sentences while keeping order.
-    Also collapses whitespace and caps total sentences.
-    """
+    """Remove near-duplicate sentences while keeping order and cap total sentences."""
     t = re.sub(r'\s+', ' ', text).strip()
     if not t:
         return t
@@ -91,37 +112,35 @@ def _squash_repetition(text: str, max_sentences: int = 4, sim_threshold: float =
         ns = _norm_sentence(s)
         if not ns:
             continue
-        is_dup = False
-        for prev in norms:
-            if _jaccard(prev, ns) >= sim_threshold:
-                is_dup = True
-                break
-        if not is_dup:
-            out.append(s.strip())
-            norms.append(ns)
+        if any(_jaccard(prev, ns) >= sim_threshold for prev in norms):
+            continue
+        out.append(s.strip())
+        norms.append(ns)
         if len(out) >= max_sentences:
             break
     return ' '.join(out).strip()
 
+def _clean_answer(text: str, query: str) -> str:
+    t = _strip_qa_meta(text)
+    t = _remove_query_echo(t, query)
+    t = _squash_repetition(t, max_sentences=4, sim_threshold=0.88)
+    return t
+
 
 # ----------------------------
-# RAG utilities (ranking & snippets)
+# RAG helpers (query expansion, ranking, snippets)
 # ----------------------------
 _ALIAS_TABLE: Dict[str, List[str]] = {
     "matrixhub": ["matrix hub", "hub api", "catalog", "registry", "cas"],
     "mcp": ["model context protocol", "manifest", "server manifest", "admin api"],
     "agent-matrix": ["matrix agents", "matrix ecosystem", "matrix toolkit"],
 }
-
 _WORD_RE = re.compile(r"[A-Za-z0-9_]+")
 
-
 def _normalize(text: str) -> List[str]:
     return [t.lower() for t in _WORD_RE.findall(text)]
 
-
 def _expand_query(q: str) -> str:
-    """Add domain aliases to help the embedding retrieve the right docs."""
     ql = q.lower()
     extras: List[str] = []
     for canon, variants in _ALIAS_TABLE.items():
@@ -131,7 +150,6 @@ def _expand_query(q: str) -> str:
         return q + " | " + " ".join(sorted(set(extras)))
     return q
 
-
 def _keyword_overlap_score(query: str, text: str) -> float:
     q_tokens = set(_normalize(query))
     d_tokens = set(_normalize(text))
@@ -141,7 +159,6 @@ def _keyword_overlap_score(query: str, text: str) -> float:
     union = len(q_tokens | d_tokens)
     return inter / max(1, union)
 
-
 def _domain_boost(text: str) -> float:
     t = text.lower()
     boost = 0.0
@@ -150,7 +167,6 @@ def _domain_boost(text: str) -> float:
         boost += 0.05
     return min(boost, 0.25)
 
-
 def _best_paragraphs(text: str, query: str, max_chars: int = 700) -> str:
     paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
     if not paras:
@@ -168,12 +184,8 @@ def _best_paragraphs(text: str, query: str, max_chars: int = 700) -> str:
             break
     return "\n".join(picked)
 
-
 def _cross_encoder_scores(
-    model: Optional["CrossEncoder"],
-    query: str,
-    docs: List[Dict],
-    max_pairs: int = 50,
+    model: Optional["CrossEncoder"], query: str, docs: List[Dict], max_pairs: int = 50
 ) -> Optional[List[float]]:
     if not model:
         return None
@@ -184,19 +196,14 @@ def _cross_encoder_scores(
         logger.warning("Cross-encoder scoring failed; continuing without it (%s)", e)
         return None
 
-
 def _rerank_docs(
-    docs: List[Dict],
-    query: str,
-    k_final: int,
-    reranker: Optional["CrossEncoder"] = None,
+    docs: List[Dict], query: str, k_final: int, reranker: Optional["CrossEncoder"] = None
 ) -> List[Dict]:
     if not docs:
         return []
     vec_scores = [float(d.get("score", 0.0)) for d in docs]
     if vec_scores:
-        vmin = min(vec_scores)
-        vmax = max(vec_scores)
+        vmin, vmax = min(vec_scores), max(vec_scores)
         rng = max(1e-6, (vmax - vmin))
         vec_norm = [(v - vmin) / rng for v in vec_scores]
     else:
@@ -219,11 +226,9 @@ def _rerank_docs(
         if ce_norm is not None:
             score = 0.80 * score + 0.20 * ce_norm[i]
         merged.append((score, d))
-
     merged.sort(key=lambda x: x[0], reverse=True)
     return [d for _s, d in merged[:k_final]]
 
-
 def _build_context_from_docs(docs: List[Dict], query: str, max_blocks: int = 4) -> Tuple[str, List[str]]:
     blocks: List[str] = []
     sources: List[str] = []
@@ -242,10 +247,7 @@ def _build_context_from_docs(docs: List[Dict], query: str, max_blocks: int = 4)
 # Service
 # ----------------------------
 class ChatService:
-    def __init__(
-        self,
-        settings: Settings,
-    ):
+    def __init__(self, settings: Settings):
         self.settings = settings
         self.client = RouterRequestsClient(
             model=settings.model.name,
@@ -266,6 +268,10 @@ class ChatService:
         except Exception as e:
             logger.warning("Reranker disabled: %s", e)
 
+        # default inference knobs to reduce repetition
+        self._stop = ["\nQuestion:", "\nUser:", "\nQ:", "\nAnswer:", "\nA:"]
+        self._extra = {"frequency_penalty": 0.2, "presence_penalty": 0.0}
+
     # ---------- RAG core ----------
     def _retrieve_best(self, query: str) -> Tuple[str, List[str]]:
         if not self.retriever:
@@ -286,16 +292,16 @@ class ChatService:
     def _augment(self, query: str) -> Tuple[str, List[str]]:
         ctx, sources = self._retrieve_best(query)
         if ctx:
-            # IMPORTANT: no trailing "Answer:" label; it often induces echo/repeat.
+            # No Q:/A: labels; just a clear directive + the raw question
             user_msg = (
                 f"{ctx}\n\n"
-                "Based only on the context above, answer the question succinctly in 2–4 sentences.\n"
-                f"Question: {query}"
+                "Using only the context above, respond concisely (2–4 sentences) to this query.\n"
+                f"{query}"
             )
         else:
             user_msg = (
-                "Answer succinctly in 2–4 sentences. Do not repeat yourself.\n"
-                f"Question: {query}"
+                "Respond concisely (2–4 sentences). Do not restate the question or add labels.\n"
+                f"{query}"
             )
         return user_msg, sources
 
@@ -307,16 +313,16 @@ class ChatService:
             user_msg,
             max_tokens=self.settings.model.max_new_tokens,
             temperature=self.settings.model.temperature,
+            stop=self._stop,
+            extra=self._extra,
         )
-        # Anti-stutter pass (server-side)
-        text = _squash_repetition(text, max_sentences=4, sim_threshold=0.88)
+        text = _clean_answer(text, query)
         return text, sources
 
     # ---------- Stream ----------
     def stream_answer(self, query: str):
         """
-        Wrap the raw stream with a cleaner that suppresses near-duplicate sentences.
-        We keep an internal buffer, clean it, and emit only new delta after cleanup.
+        Stream while cleaning: suppress Q/A labels and near-duplicate lines as they appear.
         """
         user_msg, _ = self._augment(query)
         raw = self.client.chat_stream(
@@ -324,17 +330,19 @@ class ChatService:
             user_msg,
             max_tokens=self.settings.model.max_new_tokens,
             temperature=self.settings.model.temperature,
+            stop=self._stop,
+            extra=self._extra,
         )
 
-        buf = ""      # what we've collected
-        emitted = ""  # what we've already yielded (cleaned)
+        buf = ""      # collected raw
+        emitted = ""  # cleaned text we have already sent
         for token in raw:
             if not token:
                 continue
             buf += token
-            cleaned = _squash_repetition(buf, max_sentences=4, sim_threshold=0.88)
+            cleaned = _clean_answer(buf, query)
             if len(cleaned) < len(emitted):
-                # shouldn't happen, but guard: reset to cleaned
+                # cleaner got stricter; resync
                 emitted = cleaned
                 continue
             delta = cleaned[len(emitted):]
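
The answer path now funnels through _clean_answer, which composes three passes: _strip_qa_meta drops "Question:"/"Answer:" labels, _remove_query_echo removes sentences that are near-duplicates of the user's query, and _squash_repetition dedups remaining sentences by Jaccard similarity over normalized tokens. A self-contained sketch of the de-label + dedup behavior (the regexes and the 0.88 threshold mirror this diff; the sample text is invented):

    import re

    _SENT_SPLIT = re.compile(r'(?<=[\.\!\?])\s+')
    _NORM = re.compile(r'[^a-z0-9\s]+')

    def norm(s: str) -> str:
        # lowercase, strip punctuation, collapse whitespace
        return re.sub(r'\s+', ' ', _NORM.sub(' ', s.lower().strip()))

    def jaccard(a: str, b: str) -> float:
        ta, tb = set(a.split()), set(b.split())
        if not ta or not tb:
            return 0.0
        return len(ta & tb) / max(1, len(ta | tb))

    raw = ("Answer: MatrixHub is a catalog for agents. "
           "MatrixHub is a catalog for agents! It stores manifests.")
    text = re.sub(r'^\s*(answer|a)\s*:\s*', '', raw, flags=re.I)  # de-label
    kept, norms = [], []
    for s in _SENT_SPLIT.split(text):
        ns = norm(s)
        if ns and not any(jaccard(p, ns) >= 0.88 for p in norms):
            kept.append(s.strip())
            norms.append(ns)
    print(' '.join(kept))
    # -> MatrixHub is a catalog for agents. It stores manifests.

In stream_answer the same cleaner runs on a growing buffer and only the suffix beyond what was already emitted is yielded, so suppressed labels and repeated sentences never reach the client.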