qunwang committed on
Commit
5d2e88d
·
1 Parent(s): 553339e

Fix podcast regex error and optimize embedding generation

Browse files

- Fix PatternError in build_podcast_script_from_summary: patterns without capture groups now use empty string replacement
- Optimize embedding generation: use batch API (100 chunks per batch) instead of individual calls
- This significantly speeds up file upload and RAG chunk building

Files changed (2) hide show
  1. rag_engine.py +47 -15
  2. tts_podcast.py +10 -2
rag_engine.py CHANGED
@@ -275,7 +275,10 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
275
  return []
276
 
277
  # 3) 对每个文本块做 embedding,并附上 metadata
278
- chunks: List[Dict] = []
 
 
 
279
  for idx, t in enumerate(texts):
280
  text = (t or "").strip()
281
  if not text:
@@ -285,20 +288,49 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
285
  text_chunks = _split_into_chunks(text) if len(text) > 1400 else [text]
286
 
287
  for j, chunk_text in enumerate(text_chunks):
288
- emb = get_embedding(chunk_text)
289
- if emb is None:
290
- continue
291
-
292
- section_label = f"{doc_type_val} – chunk {idx + 1}" + (f"#{j + 1}" if len(text_chunks) > 1 else "")
293
- chunks.append(
294
- {
295
- "text": chunk_text,
296
- "embedding": emb,
297
- "source_file": basename,
298
- "section": section_label,
299
- "doc_type": doc_type_val, # NEW: add doc_type for filtering
300
- }
301
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  print(
304
  f"[RAG] built {len(chunks)} chunks from file ({ext}, doc_type={doc_type_val}, path={basename})"
 
275
  return []
276
 
277
  # 3) 对每个文本块做 embedding,并附上 metadata
278
+ # First, collect all chunk texts for batch embedding generation
279
+ chunk_texts: List[str] = []
280
+ chunk_metadata: List[Tuple[int, int]] = [] # (idx, sub_chunk_idx)
281
+
282
  for idx, t in enumerate(texts):
283
  text = (t or "").strip()
284
  if not text:
 
288
  text_chunks = _split_into_chunks(text) if len(text) > 1400 else [text]
289
 
290
  for j, chunk_text in enumerate(text_chunks):
291
+ chunk_texts.append(chunk_text)
292
+ chunk_metadata.append((idx, j))
293
+
294
+ # Generate embeddings in batch (much faster than individual calls)
295
+ embeddings: List[Optional[List[float]]] = []
296
+ if chunk_texts:
297
+ try:
298
+ from config import client, EMBEDDING_MODEL
299
+ # Batch embeddings (OpenAI supports up to 2048, use 100 per batch for reliability)
300
+ batch_size = 100
301
+ for i in range(0, len(chunk_texts), batch_size):
302
+ batch = chunk_texts[i:i + batch_size]
303
+ resp = client.embeddings.create(
304
+ model=EMBEDDING_MODEL,
305
+ input=batch,
306
+ )
307
+ batch_embeddings = [item.embedding for item in resp.data]
308
+ embeddings.extend(batch_embeddings)
309
+ except Exception as e:
310
+ print(f"[RAG] batch embedding error: {repr(e)}, falling back to individual calls")
311
+ # Fallback to individual calls
312
+ embeddings = []
313
+ for chunk_text in chunk_texts:
314
+ emb = get_embedding(chunk_text)
315
+ embeddings.append(emb)
316
+
317
+ # Build chunks with embeddings
318
+ chunks: List[Dict] = []
319
+ for (chunk_text, (idx, j)), emb in zip(zip(chunk_texts, chunk_metadata), embeddings):
320
+ if emb is None:
321
+ continue
322
+
323
+ text_chunks_for_idx = _split_into_chunks(texts[idx]) if len(texts[idx]) > 1400 else [texts[idx]]
324
+ section_label = f"{doc_type_val} – chunk {idx + 1}" + (f"#{j + 1}" if len(text_chunks_for_idx) > 1 else "")
325
+ chunks.append(
326
+ {
327
+ "text": chunk_text,
328
+ "embedding": emb,
329
+ "source_file": basename,
330
+ "section": section_label,
331
+ "doc_type": doc_type_val,
332
+ }
333
+ )
334
 
335
  print(
336
  f"[RAG] built {len(chunks)} chunks from file ({ext}, doc_type={doc_type_val}, path={basename})"
tts_podcast.py CHANGED
@@ -95,9 +95,17 @@ def build_podcast_script_from_summary(summary_md: str, intro_title: str = "Clare
95
  """Build a short podcast script from an existing summary markdown."""
96
  if not summary_md or not summary_md.strip():
97
  return f"Welcome to {intro_title}. No summary available for this session."
 
98
  text = summary_md.strip()
99
- for pattern in (r"^#+\s*", r"\*\*([^*]+)\*\*", r"\*([^*]+)\*", r"\[([^\]]+)\]\([^)]+\)"):
100
- text = re.sub(pattern, r"\1", text)
 
 
 
 
 
 
 
101
  return f"Welcome to {intro_title}. {text} Thanks for listening."
102
 
103
 
 
95
  """Build a short podcast script from an existing summary markdown."""
96
  if not summary_md or not summary_md.strip():
97
  return f"Welcome to {intro_title}. No summary available for this session."
98
+ # Strip markdown for cleaner speech
99
  text = summary_md.strip()
100
+ # Fix: patterns without capture groups should use empty string replacement
101
+ # Remove markdown headers (no capture group, replace with empty)
102
+ text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
103
+ # Remove bold (**text** -> text)
104
+ text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
105
+ # Remove italic (*text* -> text)
106
+ text = re.sub(r"\*([^*]+)\*", r"\1", text)
107
+ # Remove links ([text](url) -> text)
108
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
109
  return f"Welcome to {intro_title}. {text} Thanks for listening."
110
 
111