qunwang commited on
Commit
266ce23
·
1 Parent(s): 8a3c12f

Add TTS and podcast generation

Browse files

Adds OpenAI TTS-backed /api/tts and /api/podcast endpoints plus a simple RightPanel audio player to listen to export/summary or generate a podcast.

.env.example ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy to .env and fill in your values. Never commit .env.
2
+
3
+ # Required for Clare
4
+ OPENAI_API_KEY=your-openai-api-key-here
5
+
6
+ # Optional: LangSmith (tracing / feedback)
7
+ # LANGSMITH_API_KEY=your-langsmith-key
8
+ # LANGSMITH_PROJECT=your-project-name
9
+ # CLARE_ENABLE_LANGSMITH_LOG=1
10
+ # CLARE_ENABLE_LANGSMITH_FEEDBACK=1
11
+
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment and secrets – never commit
2
+ .env
3
+ .env.local
4
+ .env.*.local
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ venv/
13
+ .venv/
14
+ env/
15
+
16
+ # Node / frontend
17
+ node_modules/
18
+ web/dist/
19
+ web/build/
20
+ web/out/
21
+
22
+ # IDE / OS
23
+ .idea/
24
+ .vscode/
25
+ .DS_Store
26
+ *.log
27
+
README.md CHANGED
@@ -20,14 +20,22 @@ This Space hosts **Clare**, an AI-powered personalized learning assistant for Ha
20
  - **Observability**: LangSmith
21
  - **Deployment**: Hugging Face Docker Space
22
 
 
 
 
 
 
 
23
 
24
 
25
  ```
26
  📦 project/
27
  ├── app.py
28
- ├── config.py
29
- ├── clare_core.py
30
- ├── rag_engine.py ← 负责 RAG
 
 
31
  └── requirements.txt
32
 
33
  ```
 
20
  - **Observability**: LangSmith
21
  - **Deployment**: Hugging Face Docker Space
22
 
23
+ ### Optional: Text-to-Speech & Podcast
24
+
25
+ - **TTS**: Uses the same **OpenAI API key** (no extra secrets). Convert export/summary text to speech.
26
+ - **Podcast**: Generates an MP3 from the session summary or full conversation.
27
+ - **Hugging Face**: Set `OPENAI_API_KEY` in the Space **Settings → Secrets**. No extra env vars needed.
28
+
29
 
30
 
31
  ```
32
  📦 project/
33
  ├── app.py
34
+ ├── api/
35
+ │   ├── server.py
36
+ │   ├── clare_core.py
37
+ │   ├── rag_engine.py ← RAG
38
+ │   └── tts_podcast.py ← TTS & podcast (OpenAI TTS)
39
  └── requirements.txt
40
 
41
  ```
api/server.py CHANGED
@@ -4,7 +4,7 @@ import time
4
  from typing import Dict, List, Optional
5
 
6
  from fastapi import FastAPI, UploadFile, File, Form, Request
7
- from fastapi.responses import FileResponse, JSONResponse
8
  from fastapi.staticfiles import StaticFiles
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from pydantic import BaseModel
@@ -21,6 +21,12 @@ from api.clare_core import (
21
  export_conversation,
22
  summarize_conversation,
23
  )
 
 
 
 
 
 
24
 
25
  # ✅ LangSmith (same idea as your Gradio app.py)
26
  try:
@@ -184,6 +190,18 @@ class SummaryReq(BaseModel):
184
  language_preference: str = "Auto"
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  class FeedbackReq(BaseModel):
188
  user_id: str
189
  rating: str # "helpful" | "not_helpful"
@@ -431,6 +449,61 @@ def api_summary(req: SummaryReq):
431
  return {"markdown": md}
432
 
433
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  @app.get("/api/memoryline")
435
  def memoryline(user_id: str):
436
  _ = _get_session((user_id or "").strip())
 
4
  from typing import Dict, List, Optional
5
 
6
  from fastapi import FastAPI, UploadFile, File, Form, Request
7
+ from fastapi.responses import FileResponse, JSONResponse, Response
8
  from fastapi.staticfiles import StaticFiles
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from pydantic import BaseModel
 
21
  export_conversation,
22
  summarize_conversation,
23
  )
24
+ from api.tts_podcast import (
25
+ text_to_speech,
26
+ build_podcast_script_from_history,
27
+ build_podcast_script_from_summary,
28
+ generate_podcast_audio,
29
+ )
30
 
31
  # ✅ LangSmith (same idea as your Gradio app.py)
32
  try:
 
190
  language_preference: str = "Auto"
191
 
192
 
193
class TtsReq(BaseModel):
    """Request body for POST /api/tts."""
    user_id: str  # session owner; endpoint returns 400 when blank
    text: str  # text to synthesize; endpoint enforces a 50 000-char cap
    voice: Optional[str] = "nova"  # alloy, echo, fable, onyx, nova, shimmer


class PodcastReq(BaseModel):
    """Request body for POST /api/podcast."""
    user_id: str  # session owner; endpoint returns 400 when blank
    source: str = "summary"  # "summary" | "conversation"
    voice: Optional[str] = "nova"  # same voice set as TtsReq
203
+
204
+
205
  class FeedbackReq(BaseModel):
206
  user_id: str
207
  rating: str # "helpful" | "not_helpful"
 
449
  return {"markdown": md}
450
 
451
 
452
# ----------------------------
# TTS & Podcast (OpenAI TTS API)
# ----------------------------
@app.post("/api/tts")
def api_tts(req: TtsReq):
    """Convert text to speech; returns MP3 audio."""
    # Validate inputs up front; each failure maps to a 400 with a JSON error.
    if not (req.user_id or "").strip():
        return JSONResponse({"error": "Missing user_id"}, status_code=400)
    speech_text = (req.text or "").strip()
    if not speech_text:
        return JSONResponse({"error": "Missing text"}, status_code=400)
    if len(speech_text) > 50_000:
        return JSONResponse({"error": "Text too long (max 50000 chars)"}, status_code=400)

    try:
        audio_bytes = text_to_speech(speech_text, voice=req.voice or "nova")
    except Exception as e:
        # Surface synthesis failures as a 500 instead of an unhandled error.
        print(f"[tts] error: {repr(e)}")
        return JSONResponse({"error": f"TTS failed: {repr(e)}"}, status_code=500)

    if not audio_bytes:
        return JSONResponse({"error": "No audio generated"}, status_code=500)
    return Response(content=audio_bytes, media_type="audio/mpeg")
474
+
475
+
476
@app.post("/api/podcast")
def api_podcast(req: PodcastReq):
    """Generate podcast audio from session summary or conversation. Returns MP3."""
    user_id = (req.user_id or "").strip()
    if not user_id:
        return JSONResponse({"error": "Missing user_id"}, status_code=400)
    sess = _get_session(user_id)
    # Any source value other than "conversation" falls through to the summary path.
    source = (req.source or "summary").lower()
    voice = req.voice or "nova"
    try:
        if source == "conversation":
            # Narrate the raw chat turns directly.
            script = build_podcast_script_from_history(sess["history"])
        else:
            # Summarize the session first, then narrate the summary.
            md = summarize_conversation(
                sess["history"],
                sess["course_outline"],
                sess["weaknesses"],
                sess["cognitive_state"],
                sess["model_name"],
                "Auto",  # language preference; matches SummaryReq's default
            )
            script = build_podcast_script_from_summary(md)
        audio_bytes = generate_podcast_audio(script, voice=voice)
    except Exception as e:
        # Log and convert any failure (summarize or TTS) into a JSON 500.
        print(f"[podcast] error: {repr(e)}")
        return JSONResponse({"error": f"Podcast failed: {repr(e)}"}, status_code=500)
    if not audio_bytes:
        return JSONResponse({"error": "No audio generated"}, status_code=500)
    return Response(content=audio_bytes, media_type="audio/mpeg")
505
+
506
+
507
  @app.get("/api/memoryline")
508
  def memoryline(user_id: str):
509
  _ = _get_session((user_id or "").strip())
api/tts_podcast.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # api/tts_podcast.py
2
+ """
3
+ Text-to-Speech and Podcast generation for Clare.
4
+ Uses OpenAI TTS API (same OPENAI_API_KEY as chat). Safe for Hugging Face deployment.
5
+ - Max 4096 characters per TTS request; long text is chunked.
6
+ """
7
+ import re
8
+ from typing import List, Tuple
9
+
10
+ from .config import client
11
+
12
+ # OpenAI TTS limits (see https://platform.openai.com/docs/guides/text-to-speech)
13
+ TTS_MAX_CHARS = 4096
14
+ TTS_MODEL = "tts-1" # or "tts-1-hd" for higher quality (slower)
15
+ TTS_VOICES = ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
16
+ DEFAULT_VOICE = "nova"
17
+
18
+
19
+ def _chunk_text_for_tts(text: str, max_chars: int = TTS_MAX_CHARS - 100) -> List[str]:
20
+ """Split text into chunks under max_chars, trying to break at sentence boundaries."""
21
+ text = (text or "").strip()
22
+ if not text:
23
+ return []
24
+ if len(text) <= max_chars:
25
+ return [text]
26
+
27
+ chunks: List[str] = []
28
+ pattern = re.compile(r"(?<=[.!?。!?\n])\s+")
29
+ parts = pattern.split(text)
30
+ current = ""
31
+ for p in parts:
32
+ if len(current) + len(p) + 1 <= max_chars:
33
+ current = (current + " " + p).strip() if current else p
34
+ else:
35
+ if current:
36
+ chunks.append(current)
37
+ if len(p) > max_chars:
38
+ for i in range(0, len(p), max_chars):
39
+ chunks.append(p[i : i + max_chars])
40
+ current = ""
41
+ else:
42
+ current = p
43
+ if current:
44
+ chunks.append(current)
45
+ return chunks
46
+
47
+
48
def text_to_speech(text: str, voice: str = DEFAULT_VOICE, model: str = TTS_MODEL) -> bytes:
    """Synthesize *text* to MP3 bytes via the OpenAI TTS endpoint.

    Text longer than the per-request limit is chunked and the resulting MP3
    segments are concatenated (MP3's frame structure tolerates binary
    concatenation). Returns b"" for empty input.
    """
    if not text or not text.strip():
        return b""

    # Unknown or empty voice names fall back to the default voice.
    requested = (voice or DEFAULT_VOICE).lower()
    if requested not in TTS_VOICES:
        requested = DEFAULT_VOICE

    pieces = _chunk_text_for_tts(text)
    if not pieces:
        return b""

    # One API call per chunk; skip whitespace-only pieces.
    segments = [
        client.audio.speech.create(model=model, voice=requested, input=piece).content
        for piece in pieces
        if piece.strip()
    ]
    return b"".join(segments)
71
+
72
+
73
def build_podcast_script_from_history(
    history: List[Tuple[str, str]],
    intro_title: str = "Clare Learning Summary",
    max_turns: int = 20,
) -> str:
    """Turn (user, assistant) chat turns into a narrated podcast script.

    Only the first *max_turns* turns are used; assistant replies are
    truncated to 1500 characters so spoken segments stay short. Paragraphs
    are separated by blank lines.
    """
    paragraphs: List[str] = [
        f"Welcome to {intro_title}. Here are the key points from your session with Clare."
    ]
    for user_msg, assistant_msg in (history or [])[:max_turns]:
        if user_msg and user_msg.strip():
            paragraphs.append(f"Question: {user_msg.strip()}")
        if assistant_msg and assistant_msg.strip():
            reply = assistant_msg.strip()
            if len(reply) > 1500:
                reply = reply[:1500] + " ..."
            paragraphs.append(f"Clare: {reply}")
    paragraphs.append("Thanks for listening. Keep learning with Clare.")
    return "\n\n".join(paragraphs)
92
+
93
+
94
def build_podcast_script_from_summary(summary_md: str, intro_title: str = "Clare Summary Podcast") -> str:
    """Build a short podcast script from an existing summary markdown.

    Strips common markdown decorations (headings, bold, italics, links) so
    the TTS engine does not read literal '#' or '*' characters aloud.

    Bug fix: the original applied a group-1 backreference as the
    replacement for the heading pattern, which has no capture groups —
    re.sub raises "invalid group reference" for ANY non-empty summary.
    Headings are now removed with an empty replacement, and re.MULTILINE
    makes the anchor match every heading line, not just the first.
    """
    if not summary_md or not summary_md.strip():
        return f"Welcome to {intro_title}. No summary available for this session."
    text = summary_md.strip()
    # Drop heading markers (no capture group, so replace with "").
    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
    # Unwrap bold, italics, and links, keeping only the visible text.
    for pattern in (r"\*\*([^*]+)\*\*", r"\*([^*]+)\*", r"\[([^\]]+)\]\([^)]+\)"):
        text = re.sub(pattern, r"\1", text)
    return f"Welcome to {intro_title}. {text} Thanks for listening."
102
+
103
+
104
def generate_podcast_audio(script: str, voice: str = DEFAULT_VOICE, model: str = TTS_MODEL) -> bytes:
    """Render a finished podcast script to MP3 bytes; thin alias for text_to_speech."""
    return text_to_speech(script, voice=voice, model=model)
106
+
requirements.txt CHANGED
@@ -13,3 +13,6 @@ python-pptx
13
 
14
  python-multipart>=0.0.9
15
 
 
 
 
 
13
 
14
  python-multipart>=0.0.9
15
 
16
+ numpy
17
+ python-dotenv>=1.0.0
18
+
web/src/components/RightPanel.tsx CHANGED
@@ -1,5 +1,5 @@
1
  // web/src/components/RightPanel.tsx
2
- import React, { useState } from 'react';
3
  import { Button } from './ui/button';
4
  import { Input } from './ui/input';
5
  import { Label } from './ui/label';
@@ -7,7 +7,7 @@ import { Card } from './ui/card';
7
  import { Separator } from './ui/separator';
8
  import { Textarea } from './ui/textarea';
9
  import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from './ui/select';
10
- import { LogIn, LogOut, FileText, MessageSquare, Download, ClipboardList, Sparkles } from 'lucide-react';
11
  import type { User } from '../App';
12
  import { toast } from 'sonner';
13
  import {
@@ -58,6 +58,70 @@ export function RightPanel({
58
  const [feedbackText, setFeedbackText] = useState('');
59
  const [feedbackCategory, setFeedbackCategory] = useState<'general' | 'bug' | 'feature'>('general');
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  const handleLoginClick = () => {
62
  if (!name.trim() || !emailOrId.trim()) {
63
  toast.error('Please fill in all fields');
@@ -225,6 +289,48 @@ export function RightPanel({
225
  Copy
226
  </Button>
227
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  <div className="text-sm whitespace-pre-wrap text-foreground">{exportResult}</div>
229
  </div>
230
  ) : (
 
1
  // web/src/components/RightPanel.tsx
2
+ import React, { useEffect, useRef, useState } from 'react';
3
  import { Button } from './ui/button';
4
  import { Input } from './ui/input';
5
  import { Label } from './ui/label';
 
7
  import { Separator } from './ui/separator';
8
  import { Textarea } from './ui/textarea';
9
  import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from './ui/select';
10
+ import { LogIn, LogOut, FileText, MessageSquare, Download, ClipboardList, Sparkles, Volume2, Podcast } from 'lucide-react';
11
  import type { User } from '../App';
12
  import { toast } from 'sonner';
13
  import {
 
58
  const [feedbackText, setFeedbackText] = useState('');
59
  const [feedbackCategory, setFeedbackCategory] = useState<'general' | 'bug' | 'feature'>('general');
60
 
61
// Audio playback state for the TTS / podcast player.
const [audioUrl, setAudioUrl] = useState<string | null>(null);
const [loadingTts, setLoadingTts] = useState(false);
const [loadingPodcast, setLoadingPodcast] = useState(false);
const audioRef = useRef<HTMLAudioElement>(null);

// Revoke the previous object URL whenever it is replaced (and on unmount)
// so blob memory is released.
useEffect(() => {
  return () => {
    if (audioUrl) URL.revokeObjectURL(audioUrl);
  };
}, [audioUrl]);
71
+
72
+ async function postAudio(path: string, payload: any): Promise<Blob> {
73
+ const res = await fetch(path, {
74
+ method: 'POST',
75
+ headers: { 'Content-Type': 'application/json' },
76
+ body: JSON.stringify(payload),
77
+ });
78
+ if (!res.ok) {
79
+ const txt = await res.text().catch(() => '');
80
+ throw new Error(`HTTP ${res.status}: ${txt || res.statusText}`);
81
+ }
82
+ return await res.blob();
83
+ }
84
+
85
// Convert the current export/summary text to speech and auto-play it.
const handleListenTts = async () => {
  // Requires a logged-in user and some export text to read.
  if (!isLoggedIn || !user?.user_id || !exportResult?.trim()) return;
  // Release any previous clip before generating a new one.
  if (audioUrl) URL.revokeObjectURL(audioUrl);
  setAudioUrl(null);
  try {
    setLoadingTts(true);
    toast.message('Generating speech…');
    const blob = await postAudio('/api/tts', { user_id: user.user_id, text: exportResult, voice: 'nova' });
    const url = URL.createObjectURL(blob);
    setAudioUrl(url);
    toast.success('Ready. Use the player below.');
    // Give React a tick to mount the <audio> element before playing.
    setTimeout(() => audioRef.current?.play(), 100);
  } catch (e) {
    console.error(e);
    toast.error(e instanceof Error ? e.message : 'TTS failed');
  } finally {
    setLoadingTts(false);
  }
};
104
+
105
// Request podcast audio built from the session summary or the full chat.
const handleGeneratePodcast = async (source: 'summary' | 'conversation') => {
  if (!isLoggedIn || !user?.user_id) return;
  // Release any previous clip before generating a new one.
  if (audioUrl) URL.revokeObjectURL(audioUrl);
  setAudioUrl(null);
  try {
    setLoadingPodcast(true);
    toast.message(source === 'summary' ? 'Generating podcast from summary…' : 'Generating podcast from conversation…');
    const blob = await postAudio('/api/podcast', { user_id: user.user_id, source, voice: 'nova' });
    const url = URL.createObjectURL(blob);
    setAudioUrl(url);
    toast.success('Podcast ready. Use the player below.');
    // Give React a tick to mount the <audio> element before playing.
    setTimeout(() => audioRef.current?.play(), 100);
  } catch (e) {
    console.error(e);
    toast.error(e instanceof Error ? e.message : 'Podcast failed');
  } finally {
    setLoadingPodcast(false);
  }
};
124
+
125
  const handleLoginClick = () => {
126
  if (!name.trim() || !emailOrId.trim()) {
127
  toast.error('Please fill in all fields');
 
289
  Copy
290
  </Button>
291
  </div>
292
+ <div className="space-y-2">
293
+ <div className="flex items-stretch gap-2">
294
+ <Button
295
+ variant="outline"
296
+ size="sm"
297
+ className="flex-1 gap-2"
298
+ onClick={handleListenTts}
299
+ disabled={!isLoggedIn || !exportResult?.trim() || loadingTts}
300
+ title="Text-to-Speech"
301
+ >
302
+ <Volume2 className="h-4 w-4" />
303
+ {loadingTts ? 'Generating…' : 'Listen (TTS)'}
304
+ </Button>
305
+ </div>
306
+ <div className="flex items-stretch gap-2">
307
+ <Button
308
+ variant="outline"
309
+ size="sm"
310
+ className="flex-1 gap-2"
311
+ onClick={() => handleGeneratePodcast('summary')}
312
+ disabled={!isLoggedIn || loadingPodcast}
313
+ title="Podcast from summary"
314
+ >
315
+ <Podcast className="h-4 w-4" />
316
+ {loadingPodcast ? '…' : 'Podcast (summary)'}
317
+ </Button>
318
+ <Button
319
+ variant="outline"
320
+ size="sm"
321
+ className="flex-1 gap-2"
322
+ onClick={() => handleGeneratePodcast('conversation')}
323
+ disabled={!isLoggedIn || loadingPodcast}
324
+ title="Podcast from conversation"
325
+ >
326
+ <Podcast className="h-4 w-4" />
327
+ {loadingPodcast ? '…' : 'Podcast (chat)'}
328
+ </Button>
329
+ </div>
330
+ {audioUrl && (
331
+ <audio ref={audioRef} src={audioUrl} controls className="w-full" />
332
+ )}
333
+ </div>
334
  <div className="text-sm whitespace-pre-wrap text-foreground">{exportResult}</div>
335
  </div>
336
  ) : (