qunwang commited on
Commit
266ce23
·
1 Parent(s): 8a3c12f

Add TTS and podcast generation

Browse files

Adds OpenAI TTS-backed /api/tts and /api/podcast endpoints plus a simple RightPanel audio player to listen to export/summary or generate a podcast.

.env.example ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy to .env and fill in your values. Never commit .env.
2
+
3
+ # Required for Clare
4
+ OPENAI_API_KEY=your-openai-api-key-here
5
+
6
+ # Optional: LangSmith (tracing / feedback)
7
+ # LANGSMITH_API_KEY=your-langsmith-key
8
+ # LANGSMITH_PROJECT=your-project-name
9
+ # CLARE_ENABLE_LANGSMITH_LOG=1
10
+ # CLARE_ENABLE_LANGSMITH_FEEDBACK=1
11
+
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment and secrets – never commit
2
+ .env
3
+ .env.local
4
+ .env.*.local
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ venv/
13
+ .venv/
14
+ env/
15
+
16
+ # Node / frontend
17
+ node_modules/
18
+ web/dist/
19
+ web/build/
20
+ web/out/
21
+
22
+ # IDE / OS
23
+ .idea/
24
+ .vscode/
25
+ .DS_Store
26
+ *.log
27
+
README.md CHANGED
@@ -20,14 +20,22 @@ This Space hosts **Clare**, an AI-powered personalized learning assistant for Ha
20
  - **Observability**: LangSmith
21
  - **Deployment**: Hugging Face Docker Space
22
 
 
 
 
 
 
 
23
 
24
 
25
  ```
26
  📦 project/
27
  ├── app.py
28
- ├── config.py
29
- ├── clare_core.py
30
- ├── rag_engine.py ← 负责 RAG
 
 
31
  └── requirements.txt
32
 
33
  ```
 
20
  - **Observability**: LangSmith
21
  - **Deployment**: Hugging Face Docker Space
22
 
23
+ ### Optional: Text-to-Speech & Podcast
24
+
25
+ - **TTS**: Uses the same **OpenAI API key** (no extra secrets). Convert export/summary text to speech.
26
+ - **Podcast**: Generates an MP3 from the session summary or full conversation.
27
+ - **Hugging Face**: Set `OPENAI_API_KEY` in the Space **Settings → Secrets**. No extra env vars needed.
28
+
29
 
30
 
31
  ```
32
  📦 project/
33
  ├── app.py
34
+ ├── api/
35
+ │   ├── server.py
36
+ │   ├── clare_core.py
37
+ │   ├── rag_engine.py ← RAG
38
+ │   └── tts_podcast.py ← TTS & podcast (OpenAI TTS)
39
  └── requirements.txt
40
 
41
  ```
api/server.py CHANGED
@@ -4,7 +4,7 @@ import time
4
  from typing import Dict, List, Optional
5
 
6
  from fastapi import FastAPI, UploadFile, File, Form, Request
7
- from fastapi.responses import FileResponse, JSONResponse
8
  from fastapi.staticfiles import StaticFiles
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from pydantic import BaseModel
@@ -21,6 +21,12 @@ from api.clare_core import (
21
  export_conversation,
22
  summarize_conversation,
23
  )
 
 
 
 
 
 
24
 
25
  # ✅ LangSmith (same idea as your Gradio app.py)
26
  try:
@@ -184,6 +190,18 @@ class SummaryReq(BaseModel):
184
  language_preference: str = "Auto"
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  class FeedbackReq(BaseModel):
188
  user_id: str
189
  rating: str # "helpful" | "not_helpful"
@@ -431,6 +449,61 @@ def api_summary(req: SummaryReq):
431
  return {"markdown": md}
432
 
433
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  @app.get("/api/memoryline")
435
  def memoryline(user_id: str):
436
  _ = _get_session((user_id or "").strip())
 
4
  from typing import Dict, List, Optional
5
 
6
  from fastapi import FastAPI, UploadFile, File, Form, Request
7
+ from fastapi.responses import FileResponse, JSONResponse, Response
8
  from fastapi.staticfiles import StaticFiles
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from pydantic import BaseModel
 
21
  export_conversation,
22
  summarize_conversation,
23
  )
24
+ from api.tts_podcast import (
25
+ text_to_speech,
26
+ build_podcast_script_from_history,
27
+ build_podcast_script_from_summary,
28
+ generate_podcast_audio,
29
+ )
30
 
31
  # ✅ LangSmith (same idea as your Gradio app.py)
32
  try:
 
190
  language_preference: str = "Auto"
191
 
192
 
193
class TtsReq(BaseModel):
    """Request body for POST /api/tts."""
    user_id: str  # session owner; endpoint returns 400 when blank
    text: str  # text to synthesize; endpoint enforces a 50 000-char cap
    voice: Optional[str] = "nova"  # alloy, echo, fable, onyx, nova, shimmer


class PodcastReq(BaseModel):
    """Request body for POST /api/podcast."""
    user_id: str  # session owner; endpoint returns 400 when blank
    source: str = "summary"  # "summary" | "conversation"
    voice: Optional[str] = "nova"  # same voice set as TtsReq
203
+
204
+
205
  class FeedbackReq(BaseModel):
206
  user_id: str
207
  rating: str # "helpful" | "not_helpful"
 
449
  return {"markdown": md}
450
 
451
 
452
# ----------------------------
# TTS & Podcast (OpenAI TTS API)
# ----------------------------
@app.post("/api/tts")
def api_tts(req: TtsReq):
    """Convert text to speech; returns MP3 audio."""
    # Validate inputs up front; each failure maps to a 400 with a JSON error.
    if not (req.user_id or "").strip():
        return JSONResponse({"error": "Missing user_id"}, status_code=400)
    speech_text = (req.text or "").strip()
    if not speech_text:
        return JSONResponse({"error": "Missing text"}, status_code=400)
    if len(speech_text) > 50_000:
        return JSONResponse({"error": "Text too long (max 50000 chars)"}, status_code=400)

    try:
        audio_bytes = text_to_speech(speech_text, voice=req.voice or "nova")
    except Exception as e:
        # Surface synthesis failures as a 500 instead of an unhandled error.
        print(f"[tts] error: {repr(e)}")
        return JSONResponse({"error": f"TTS failed: {repr(e)}"}, status_code=500)

    if not audio_bytes:
        return JSONResponse({"error": "No audio generated"}, status_code=500)
    return Response(content=audio_bytes, media_type="audio/mpeg")
474
+
475
+
476
@app.post("/api/podcast")
def api_podcast(req: PodcastReq):
    """Generate podcast audio from session summary or conversation. Returns MP3."""
    user_id = (req.user_id or "").strip()
    if not user_id:
        return JSONResponse({"error": "Missing user_id"}, status_code=400)
    sess = _get_session(user_id)
    # Any source value other than "conversation" falls through to the summary path.
    source = (req.source or "summary").lower()
    voice = req.voice or "nova"
    try:
        if source == "conversation":
            # Narrate the raw chat turns directly.
            script = build_podcast_script_from_history(sess["history"])
        else:
            # Summarize the session first, then narrate the summary.
            md = summarize_conversation(
                sess["history"],
                sess["course_outline"],
                sess["weaknesses"],
                sess["cognitive_state"],
                sess["model_name"],
                "Auto",  # language preference; matches SummaryReq's default
            )
            script = build_podcast_script_from_summary(md)
        audio_bytes = generate_podcast_audio(script, voice=voice)
    except Exception as e:
        # Log and convert any failure (summarize or TTS) into a JSON 500.
        print(f"[podcast] error: {repr(e)}")
        return JSONResponse({"error": f"Podcast failed: {repr(e)}"}, status_code=500)
    if not audio_bytes:
        return JSONResponse({"error": "No audio generated"}, status_code=500)
    return Response(content=audio_bytes, media_type="audio/mpeg")
505
+
506
+
507
  @app.get("/api/memoryline")
508
  def memoryline(user_id: str):
509
  _ = _get_session((user_id or "").strip())
api/tts_podcast.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # api/tts_podcast.py
2
+ """
3
+ Text-to-Speech and Podcast generation for Clare.
4
+ Uses OpenAI TTS API (same OPENAI_API_KEY as chat). Safe for Hugging Face deployment.
5
+ - Max 4096 characters per TTS request; long text is chunked.
6
+ """
7
+ import re
8
+ from typing import List, Tuple
9
+
10
+ from .config import client
11
+
12
+ # OpenAI TTS limits (see https://platform.openai.com/docs/guides/text-to-speech)
13
+ TTS_MAX_CHARS = 4096
14
+ TTS_MODEL = "tts-1" # or "tts-1-hd" for higher quality (slower)
15
+ TTS_VOICES = ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
16
+ DEFAULT_VOICE = "nova"
17
+
18
+
19
+ def _chunk_text_for_tts(text: str, max_chars: int = TTS_MAX_CHARS - 100) -> List[str]:
20
+ """Split text into chunks under max_chars, trying to break at sentence boundaries."""
21
+ text = (text or "").strip()
22
+ if not text:
23
+ return []
24
+ if len(text) <= max_chars:
25
+ return [text]
26
+
27
+ chunks: List[str] = []
28
+ pattern = re.compile(r"(?<=[.!?。!?\n])\s+")
29
+ parts = pattern.split(text)
30
+ current = ""
31
+ for p in parts:
32
+ if len(current) + len(p) + 1 <= max_chars:
33
+ current = (current + " " + p).strip() if current else p
34
+ else:
35
+ if current:
36
+ chunks.append(current)
37
+ if len(p) > max_chars:
38
+ for i in range(0, len(p), max_chars):
39
+ chunks.append(p[i : i + max_chars])
40
+ current = ""
41
+ else:
42
+ current = p
43
+ if current:
44
+ chunks.append(current)
45
+ return chunks
46
+
47
+
48
def text_to_speech(text: str, voice: str = DEFAULT_VOICE, model: str = TTS_MODEL) -> bytes:
    """Synthesize *text* to MP3 bytes via the OpenAI TTS endpoint.

    Text longer than the per-request limit is chunked and the resulting MP3
    segments are concatenated (MP3's frame structure tolerates binary
    concatenation). Returns b"" for empty input.
    """
    if not text or not text.strip():
        return b""

    # Unknown or empty voice names fall back to the default voice.
    requested = (voice or DEFAULT_VOICE).lower()
    if requested not in TTS_VOICES:
        requested = DEFAULT_VOICE

    pieces = _chunk_text_for_tts(text)
    if not pieces:
        return b""

    # One API call per chunk; skip whitespace-only pieces.
    segments = [
        client.audio.speech.create(model=model, voice=requested, input=piece).content
        for piece in pieces
        if piece.strip()
    ]
    return b"".join(segments)
71
+
72
+
73
def build_podcast_script_from_history(
    history: List[Tuple[str, str]],
    intro_title: str = "Clare Learning Summary",
    max_turns: int = 20,
) -> str:
    """Turn (user, assistant) chat turns into a narrated podcast script.

    Only the first *max_turns* turns are used; assistant replies are
    truncated to 1500 characters so spoken segments stay short. Paragraphs
    are separated by blank lines.
    """
    paragraphs: List[str] = [
        f"Welcome to {intro_title}. Here are the key points from your session with Clare."
    ]
    for user_msg, assistant_msg in (history or [])[:max_turns]:
        if user_msg and user_msg.strip():
            paragraphs.append(f"Question: {user_msg.strip()}")
        if assistant_msg and assistant_msg.strip():
            reply = assistant_msg.strip()
            if len(reply) > 1500:
                reply = reply[:1500] + " ..."
            paragraphs.append(f"Clare: {reply}")
    paragraphs.append("Thanks for listening. Keep learning with Clare.")
    return "\n\n".join(paragraphs)
92
+
93
+
94
def build_podcast_script_from_summary(summary_md: str, intro_title: str = "Clare Summary Podcast") -> str:
    """Build a short podcast script from an existing summary markdown.

    Strips common markdown decorations (headings, bold, italics, links) so
    the TTS engine does not read literal '#' or '*' characters aloud.

    Bug fix: the original applied a group-1 backreference as the
    replacement for the heading pattern, which has no capture groups —
    re.sub raises "invalid group reference" for ANY non-empty summary.
    Headings are now removed with an empty replacement, and re.MULTILINE
    makes the anchor match every heading line, not just the first.
    """
    if not summary_md or not summary_md.strip():
        return f"Welcome to {intro_title}. No summary available for this session."
    text = summary_md.strip()
    # Drop heading markers (no capture group, so replace with "").
    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
    # Unwrap bold, italics, and links, keeping only the visible text.
    for pattern in (r"\*\*([^*]+)\*\*", r"\*([^*]+)\*", r"\[([^\]]+)\]\([^)]+\)"):
        text = re.sub(pattern, r"\1", text)
    return f"Welcome to {intro_title}. {text} Thanks for listening."
102
+
103
+
104
def generate_podcast_audio(script: str, voice: str = DEFAULT_VOICE, model: str = TTS_MODEL) -> bytes:
    """Render a finished podcast script to MP3 bytes; thin alias for text_to_speech."""
    return text_to_speech(script, voice=voice, model=model)
106
+
requirements.txt CHANGED
@@ -13,3 +13,6 @@ python-pptx
13
 
14
  python-multipart>=0.0.9
15
 
 
 
 
 
13
 
14
  python-multipart>=0.0.9
15
 
16
+ numpy
17
+ python-dotenv>=1.0.0
18
+
web/src/components/RightPanel.tsx CHANGED
@@ -1,5 +1,5 @@
1
  // web/src/components/RightPanel.tsx
2
- import React, { useState } from 'react';
3
  import { Button } from './ui/button';
4
  import { Input } from './ui/input';
5
  import { Label } from './ui/label';
@@ -7,7 +7,7 @@ import { Card } from './ui/card';
7
  import { Separator } from './ui/separator';
8
  import { Textarea } from './ui/textarea';
9
  import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from './ui/select';
10
- import { LogIn, LogOut, FileText, MessageSquare, Download, ClipboardList, Sparkles } from 'lucide-react';
11
  import type { User } from '../App';
12
  import { toast } from 'sonner';
13
  import {
@@ -58,6 +58,70 @@ export function RightPanel({
58
  const [feedbackText, setFeedbackText] = useState('');
59
  const [feedbackCategory, setFeedbackCategory] = useState<'general' | 'bug' | 'feature'>('general');
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  const handleLoginClick = () => {
62
  if (!name.trim() || !emailOrId.trim()) {
63
  toast.error('Please fill in all fields');
@@ -225,6 +289,48 @@ export function RightPanel({
225
  Copy
226
  </Button>
227
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  <div className="text-sm whitespace-pre-wrap text-foreground">{exportResult}</div>
229
  </div>
230
  ) : (
 
1
  // web/src/components/RightPanel.tsx
2
+ import React, { useEffect, useRef, useState } from 'react';
3
  import { Button } from './ui/button';
4
  import { Input } from './ui/input';
5
  import { Label } from './ui/label';
 
7
  import { Separator } from './ui/separator';
8
  import { Textarea } from './ui/textarea';
9
  import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from './ui/select';
10
+ import { LogIn, LogOut, FileText, MessageSquare, Download, ClipboardList, Sparkles, Volume2, Podcast } from 'lucide-react';
11
  import type { User } from '../App';
12
  import { toast } from 'sonner';
13
  import {
 
58
  const [feedbackText, setFeedbackText] = useState('');
59
  const [feedbackCategory, setFeedbackCategory] = useState<'general' | 'bug' | 'feature'>('general');
60
 
61
// Audio playback state for the TTS / podcast player.
const [audioUrl, setAudioUrl] = useState<string | null>(null);
const [loadingTts, setLoadingTts] = useState(false);
const [loadingPodcast, setLoadingPodcast] = useState(false);
const audioRef = useRef<HTMLAudioElement>(null);

// Revoke the previous object URL whenever it is replaced (and on unmount)
// so blob memory is released.
useEffect(() => {
  return () => {
    if (audioUrl) URL.revokeObjectURL(audioUrl);
  };
}, [audioUrl]);
71
+
72
+ async function postAudio(path: string, payload: any): Promise<Blob> {
73
+ const res = await fetch(path, {
74
+ method: 'POST',
75
+ headers: { 'Content-Type': 'application/json' },
76
+ body: JSON.stringify(payload),
77
+ });
78
+ if (!res.ok) {
79
+ const txt = await res.text().catch(() => '');
80
+ throw new Error(`HTTP ${res.status}: ${txt || res.statusText}`);
81
+ }
82
+ return await res.blob();
83
+ }
84
+
85
// Convert the current export/summary text to speech and auto-play it.
const handleListenTts = async () => {
  // Requires a logged-in user and some export text to read.
  if (!isLoggedIn || !user?.user_id || !exportResult?.trim()) return;
  // Release any previous clip before generating a new one.
  if (audioUrl) URL.revokeObjectURL(audioUrl);
  setAudioUrl(null);
  try {
    setLoadingTts(true);
    toast.message('Generating speech…');
    const blob = await postAudio('/api/tts', { user_id: user.user_id, text: exportResult, voice: 'nova' });
    const url = URL.createObjectURL(blob);
    setAudioUrl(url);
    toast.success('Ready. Use the player below.');
    // Give React a tick to mount the <audio> element before playing.
    setTimeout(() => audioRef.current?.play(), 100);
  } catch (e) {
    console.error(e);
    toast.error(e instanceof Error ? e.message : 'TTS failed');
  } finally {
    setLoadingTts(false);
  }
};
104
+
105
// Request podcast audio built from the session summary or the full chat.
const handleGeneratePodcast = async (source: 'summary' | 'conversation') => {
  if (!isLoggedIn || !user?.user_id) return;
  // Release any previous clip before generating a new one.
  if (audioUrl) URL.revokeObjectURL(audioUrl);
  setAudioUrl(null);
  try {
    setLoadingPodcast(true);
    toast.message(source === 'summary' ? 'Generating podcast from summary…' : 'Generating podcast from conversation…');
    const blob = await postAudio('/api/podcast', { user_id: user.user_id, source, voice: 'nova' });
    const url = URL.createObjectURL(blob);
    setAudioUrl(url);
    toast.success('Podcast ready. Use the player below.');
    // Give React a tick to mount the <audio> element before playing.
    setTimeout(() => audioRef.current?.play(), 100);
  } catch (e) {
    console.error(e);
    toast.error(e instanceof Error ? e.message : 'Podcast failed');
  } finally {
    setLoadingPodcast(false);
  }
};
124
+
125
  const handleLoginClick = () => {
126
  if (!name.trim() || !emailOrId.trim()) {
127
  toast.error('Please fill in all fields');
 
289
  Copy
290
  </Button>
291
  </div>
292
+ <div className="space-y-2">
293
+ <div className="flex items-stretch gap-2">
294
+ <Button
295
+ variant="outline"
296
+ size="sm"
297
+ className="flex-1 gap-2"
298
+ onClick={handleListenTts}
299
+ disabled={!isLoggedIn || !exportResult?.trim() || loadingTts}
300
+ title="Text-to-Speech"
301
+ >
302
+ <Volume2 className="h-4 w-4" />
303
+ {loadingTts ? 'Generating…' : 'Listen (TTS)'}
304
+ </Button>
305
+ </div>
306
+ <div className="flex items-stretch gap-2">
307
+ <Button
308
+ variant="outline"
309
+ size="sm"
310
+ className="flex-1 gap-2"
311
+ onClick={() => handleGeneratePodcast('summary')}
312
+ disabled={!isLoggedIn || loadingPodcast}
313
+ title="Podcast from summary"
314
+ >
315
+ <Podcast className="h-4 w-4" />
316
+ {loadingPodcast ? '…' : 'Podcast (summary)'}
317
+ </Button>
318
+ <Button
319
+ variant="outline"
320
+ size="sm"
321
+ className="flex-1 gap-2"
322
+ onClick={() => handleGeneratePodcast('conversation')}
323
+ disabled={!isLoggedIn || loadingPodcast}
324
+ title="Podcast from conversation"
325
+ >
326
+ <Podcast className="h-4 w-4" />
327
+ {loadingPodcast ? '…' : 'Podcast (chat)'}
328
+ </Button>
329
+ </div>
330
+ {audioUrl && (
331
+ <audio ref={audioRef} src={audioUrl} controls className="w-full" />
332
+ )}
333
+ </div>
334
  <div className="text-sm whitespace-pre-wrap text-foreground">{exportResult}</div>
335
  </div>
336
  ) : (