qunwang committed on
Commit
5d2e88d
·
1 Parent(s): 553339e

Fix podcast regex error and optimize embedding generation

Browse files

- Fix PatternError in build_podcast_script_from_summary: patterns without capture groups now use empty string replacement
- Optimize embedding generation: use batch API (100 chunks per batch) instead of individual calls
- This significantly speeds up file upload and RAG chunk building

Files changed (2) hide show
  1. rag_engine.py +47 -15
  2. tts_podcast.py +10 -2
rag_engine.py CHANGED
@@ -275,7 +275,10 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
275
  return []
276
 
277
  # 3) 对每个文本块做 embedding,并附上 metadata
278
- chunks: List[Dict] = []
 
 
 
279
  for idx, t in enumerate(texts):
280
  text = (t or "").strip()
281
  if not text:
@@ -285,20 +288,49 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
285
  text_chunks = _split_into_chunks(text) if len(text) > 1400 else [text]
286
 
287
  for j, chunk_text in enumerate(text_chunks):
288
- emb = get_embedding(chunk_text)
289
- if emb is None:
290
- continue
291
-
292
- section_label = f"{doc_type_val} – chunk {idx + 1}" + (f"#{j + 1}" if len(text_chunks) > 1 else "")
293
- chunks.append(
294
- {
295
- "text": chunk_text,
296
- "embedding": emb,
297
- "source_file": basename,
298
- "section": section_label,
299
- "doc_type": doc_type_val, # NEW: add doc_type for filtering
300
- }
301
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  print(
304
  f"[RAG] built {len(chunks)} chunks from file ({ext}, doc_type={doc_type_val}, path={basename})"
 
275
  return []
276
 
277
  # 3) 对每个文本块做 embedding,并附上 metadata
278
+ # First, collect all chunk texts for batch embedding generation
279
+ chunk_texts: List[str] = []
280
+ chunk_metadata: List[Tuple[int, int]] = [] # (idx, sub_chunk_idx)
281
+
282
  for idx, t in enumerate(texts):
283
  text = (t or "").strip()
284
  if not text:
 
288
  text_chunks = _split_into_chunks(text) if len(text) > 1400 else [text]
289
 
290
  for j, chunk_text in enumerate(text_chunks):
291
+ chunk_texts.append(chunk_text)
292
+ chunk_metadata.append((idx, j))
293
+
294
+ # Generate embeddings in batch (much faster than individual calls)
295
+ embeddings: List[Optional[List[float]]] = []
296
+ if chunk_texts:
297
+ try:
298
+ from config import client, EMBEDDING_MODEL
299
+ # Batch embeddings (OpenAI supports up to 2048, use 100 per batch for reliability)
300
+ batch_size = 100
301
+ for i in range(0, len(chunk_texts), batch_size):
302
+ batch = chunk_texts[i:i + batch_size]
303
+ resp = client.embeddings.create(
304
+ model=EMBEDDING_MODEL,
305
+ input=batch,
306
+ )
307
+ batch_embeddings = [item.embedding for item in resp.data]
308
+ embeddings.extend(batch_embeddings)
309
+ except Exception as e:
310
+ print(f"[RAG] batch embedding error: {repr(e)}, falling back to individual calls")
311
+ # Fallback to individual calls
312
+ embeddings = []
313
+ for chunk_text in chunk_texts:
314
+ emb = get_embedding(chunk_text)
315
+ embeddings.append(emb)
316
+
317
+ # Build chunks with embeddings
318
+ chunks: List[Dict] = []
319
+ for (chunk_text, (idx, j)), emb in zip(zip(chunk_texts, chunk_metadata), embeddings):
320
+ if emb is None:
321
+ continue
322
+
323
+ text_chunks_for_idx = _split_into_chunks(texts[idx]) if len(texts[idx]) > 1400 else [texts[idx]]
324
+ section_label = f"{doc_type_val} – chunk {idx + 1}" + (f"#{j + 1}" if len(text_chunks_for_idx) > 1 else "")
325
+ chunks.append(
326
+ {
327
+ "text": chunk_text,
328
+ "embedding": emb,
329
+ "source_file": basename,
330
+ "section": section_label,
331
+ "doc_type": doc_type_val,
332
+ }
333
+ )
334
 
335
  print(
336
  f"[RAG] built {len(chunks)} chunks from file ({ext}, doc_type={doc_type_val}, path={basename})"
tts_podcast.py CHANGED
@@ -95,9 +95,17 @@ def build_podcast_script_from_summary(summary_md: str, intro_title: str = "Clare
95
  """Build a short podcast script from an existing summary markdown."""
96
  if not summary_md or not summary_md.strip():
97
  return f"Welcome to {intro_title}. No summary available for this session."
 
98
  text = summary_md.strip()
99
- for pattern in (r"^#+\s*", r"\*\*([^*]+)\*\*", r"\*([^*]+)\*", r"\[([^\]]+)\]\([^)]+\)"):
100
- text = re.sub(pattern, r"\1", text)
 
 
 
 
 
 
 
101
  return f"Welcome to {intro_title}. {text} Thanks for listening."
102
 
103
 
 
95
  """Build a short podcast script from an existing summary markdown."""
96
  if not summary_md or not summary_md.strip():
97
  return f"Welcome to {intro_title}. No summary available for this session."
98
+ # Strip markdown for cleaner speech
99
  text = summary_md.strip()
100
+ # Fix: patterns without capture groups should use empty string replacement
101
+ # Remove markdown headers (no capture group, replace with empty)
102
+ text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
103
+ # Remove bold (**text** -> text)
104
+ text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
105
+ # Remove italic (*text* -> text)
106
+ text = re.sub(r"\*([^*]+)\*", r"\1", text)
107
+ # Remove links ([text](url) -> text)
108
+ text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
109
  return f"Welcome to {intro_title}. {text} Thanks for listening."
110
 
111