GitHub Actions committed on
Commit
815b978
·
1 Parent(s): 9563e4a

Deploy a45bfc7

Browse files
app/api/transcribe.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Annotated
2
+
3
+ from fastapi import APIRouter, Depends, File, Form, HTTPException, Request, UploadFile, status
4
+
5
+ from app.models.speech import TranscribeResponse
6
+ from app.security.jwt_auth import verify_jwt
7
+
8
+ router = APIRouter()
9
+
10
+ _ALLOWED_AUDIO_TYPES: frozenset[str] = frozenset(
11
+ {
12
+ "audio/webm",
13
+ "audio/wav",
14
+ "audio/x-wav",
15
+ "audio/mpeg",
16
+ "audio/mp3",
17
+ "audio/mp4",
18
+ "audio/ogg",
19
+ "audio/flac",
20
+ }
21
+ )
22
+
23
+
24
@router.post("")
async def transcribe_endpoint(
    request: Request,
    audio: Annotated[UploadFile, File(...)],
    _: Annotated[dict, Depends(verify_jwt)],
    language: Annotated[str | None, Form()] = None,
) -> TranscribeResponse:
    """Transcribe an uploaded audio clip to text.

    Requires a valid JWT. Validates the content type, enforces the size
    limit, and normalizes the optional language code before delegating to
    the app-level transcriber.

    Raises:
        HTTPException: 503 when the transcriber is unconfigured, 415 for an
            unsupported content type, 400 for an empty upload, 413 when the
            upload exceeds the size limit, 422 for an invalid language code.
    """
    settings = request.app.state.settings
    transcriber = request.app.state.transcriber

    if not transcriber.is_configured:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Transcription service is not configured.",
        )

    content_type = (audio.content_type or "").strip().lower()
    if content_type not in _ALLOWED_AUDIO_TYPES:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail="Unsupported audio format.",
        )

    # Read at most one byte past the limit: an oversized upload is detected
    # without buffering the whole file in memory.
    audio_bytes = await audio.read(settings.TRANSCRIBE_MAX_UPLOAD_BYTES + 1)
    if not audio_bytes:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Audio file is empty.",
        )

    if len(audio_bytes) > settings.TRANSCRIBE_MAX_UPLOAD_BYTES:
        raise HTTPException(
            status_code=status.HTTP_413_CONTENT_TOO_LARGE,
            detail="Audio file exceeds maximum allowed size.",
        )

    # Blank/whitespace language becomes None; reject implausibly long codes.
    language_code = language.strip().lower() if language and language.strip() else None
    if language_code and len(language_code) > 10:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="Invalid language code.",
        )

    transcript = await transcriber.transcribe(
        filename=audio.filename or "audio.webm",
        content_type=content_type,
        audio_bytes=audio_bytes,
        language=language_code,
    )

    return TranscribeResponse(transcript=transcript)
app/api/tts.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Annotated
2
+
3
+ from fastapi import APIRouter, Depends, HTTPException, Request, status
4
+ from fastapi.responses import Response
5
+
6
+ from app.models.speech import SynthesizeRequest
7
+ from app.security.jwt_auth import verify_jwt
8
+
9
+ router = APIRouter()
10
+
11
+
12
@router.post("")
async def synthesize_endpoint(
    request: Request,
    payload: SynthesizeRequest,
    _: Annotated[dict, Depends(verify_jwt)],
) -> Response:
    """Synthesize speech for the given text and return WAV audio bytes.

    Requires a valid JWT.

    Raises:
        HTTPException: 503 when the TTS backend is unconfigured, 422 when
            the text is blank after trimming.
    """
    tts_client = request.app.state.tts_client
    if not tts_client.is_configured:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="TTS service is not configured.",
        )

    # Field(min_length=1) still admits whitespace-only input; reject it here
    # instead of sending an empty string to the upstream synthesizer.
    text = payload.text.strip()
    if not text:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="Text must not be empty.",
        )

    audio_bytes = await tts_client.synthesize(text)
    return Response(content=audio_bytes, media_type="audio/wav")
app/core/config.py CHANGED
@@ -12,6 +12,7 @@ class Settings(BaseSettings):
12
  OLLAMA_MODEL: Optional[str] = None
13
  GROQ_MODEL_DEFAULT: str = "llama-3.1-8b-instant"
14
  GROQ_MODEL_LARGE: str = "llama-3.3-70b-versatile"
 
15
 
16
  # Vector
17
  QDRANT_URL: str
@@ -67,6 +68,11 @@ class Settings(BaseSettings):
67
  # In prod, the API Space calls the HF embedder/reranker Spaces via HTTP.
68
  EMBEDDER_URL: str = "http://localhost:7860"
69
  RERANKER_URL: str = "http://localhost:7861"
 
 
 
 
 
70
 
71
  model_config = SettingsConfigDict(env_file=".env", extra="ignore")
72
 
 
12
  OLLAMA_MODEL: Optional[str] = None
13
  GROQ_MODEL_DEFAULT: str = "llama-3.1-8b-instant"
14
  GROQ_MODEL_LARGE: str = "llama-3.3-70b-versatile"
15
+ GROQ_TRANSCRIBE_MODEL: str = "whisper-large-v3-turbo"
16
 
17
  # Vector
18
  QDRANT_URL: str
 
68
  # In prod, the API Space calls the HF embedder/reranker Spaces via HTTP.
69
  EMBEDDER_URL: str = "http://localhost:7860"
70
  RERANKER_URL: str = "http://localhost:7861"
71
+ TTS_SPACE_URL: str = "http://localhost:7862"
72
+
73
+ # Speech-to-text upload constraints
74
+ TRANSCRIBE_MAX_UPLOAD_BYTES: int = 2 * 1024 * 1024
75
+ TRANSCRIBE_TIMEOUT_SECONDS: float = 25.0
76
 
77
  model_config = SettingsConfigDict(env_file=".env", extra="ignore")
78
 
app/main.py CHANGED
@@ -14,6 +14,8 @@ from app.api.admin import router as admin_router
14
  from app.api.chat import router as chat_router
15
  from app.api.feedback import router as feedback_router
16
  from app.api.health import router as health_router
 
 
17
  from app.core.config import get_settings
18
  from app.core.exceptions import AppError
19
  from app.core.logging import get_logger
@@ -25,6 +27,8 @@ from app.services.github_log import GithubLog
25
  from app.services.llm_client import get_llm_client, TpmBucket
26
  from app.services.reranker import Reranker
27
  from app.services.semantic_cache import SemanticCache
 
 
28
  from app.services.conversation_store import ConversationStore
29
  from qdrant_client import QdrantClient
30
 
@@ -156,6 +160,15 @@ async def lifespan(app: FastAPI):
156
  context_path=settings.GEMINI_CONTEXT_PATH,
157
  )
158
  app.state.gemini_client = gemini_client
 
 
 
 
 
 
 
 
 
159
 
160
  from app.services.vector_store import VectorStore
161
  from app.security.guard_classifier import GuardClassifier
@@ -286,6 +299,8 @@ def create_app() -> FastAPI:
286
 
287
  app.include_router(health_router, tags=["Health"])
288
  app.include_router(chat_router, prefix="/chat", tags=["Chat"])
 
 
289
  app.include_router(feedback_router, prefix="/chat", tags=["Feedback"])
290
  app.include_router(admin_router, prefix="/admin", tags=["Admin"])
291
 
 
14
  from app.api.chat import router as chat_router
15
  from app.api.feedback import router as feedback_router
16
  from app.api.health import router as health_router
17
+ from app.api.tts import router as tts_router
18
+ from app.api.transcribe import router as transcribe_router
19
  from app.core.config import get_settings
20
  from app.core.exceptions import AppError
21
  from app.core.logging import get_logger
 
27
  from app.services.llm_client import get_llm_client, TpmBucket
28
  from app.services.reranker import Reranker
29
  from app.services.semantic_cache import SemanticCache
30
+ from app.services.transcriber import GroqTranscriber
31
+ from app.services.tts_client import TTSClient
32
  from app.services.conversation_store import ConversationStore
33
  from qdrant_client import QdrantClient
34
 
 
160
  context_path=settings.GEMINI_CONTEXT_PATH,
161
  )
162
  app.state.gemini_client = gemini_client
163
+ app.state.transcriber = GroqTranscriber(
164
+ api_key=settings.GROQ_API_KEY or "",
165
+ model=settings.GROQ_TRANSCRIBE_MODEL,
166
+ timeout_seconds=settings.TRANSCRIBE_TIMEOUT_SECONDS,
167
+ )
168
+ app.state.tts_client = TTSClient(
169
+ tts_space_url=settings.TTS_SPACE_URL,
170
+ timeout_seconds=settings.TRANSCRIBE_TIMEOUT_SECONDS,
171
+ )
172
 
173
  from app.services.vector_store import VectorStore
174
  from app.security.guard_classifier import GuardClassifier
 
299
 
300
  app.include_router(health_router, tags=["Health"])
301
  app.include_router(chat_router, prefix="/chat", tags=["Chat"])
302
+ app.include_router(transcribe_router, prefix="/transcribe", tags=["Transcribe"])
303
+ app.include_router(tts_router, prefix="/tts", tags=["TTS"])
304
  app.include_router(feedback_router, prefix="/chat", tags=["Feedback"])
305
  app.include_router(admin_router, prefix="/admin", tags=["Admin"])
306
 
app/models/speech.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from pydantic import BaseModel, Field


class TranscribeResponse(BaseModel):
    """Response body for POST /transcribe."""

    # Transcribed text; non-empty, capped at 5000 characters.
    transcript: str = Field(min_length=1, max_length=5000)


class SynthesizeRequest(BaseModel):
    """Request body for POST /tts."""

    # Text to synthesize; non-empty, capped at 300 characters.
    text: str = Field(min_length=1, max_length=300)
app/services/transcriber.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio

import httpx
from groq import AsyncGroq
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed

from app.core.exceptions import GenerationError


class GroqTranscriber:
    """Speech-to-text client backed by Groq's audio transcription API."""

    def __init__(
        self,
        api_key: str,
        model: str,
        timeout_seconds: float,
    ) -> None:
        # An empty API key leaves the client unconfigured instead of
        # failing at startup; callers check is_configured.
        self._client = AsyncGroq(api_key=api_key) if api_key else None
        self._model = model
        self._timeout_seconds = timeout_seconds

    @property
    def is_configured(self) -> bool:
        """True when an API key was supplied at construction."""
        return self._client is not None

    async def transcribe(
        self,
        filename: str,
        content_type: str,
        audio_bytes: bytes,
        language: str | None = None,
    ) -> str:
        """Transcribe audio bytes to text.

        Raises:
            GenerationError: when unconfigured, on timeout, on upstream
                failure, or when the response contains no text.
        """
        if not self._client:
            raise GenerationError("Transcriber is not configured with GROQ_API_KEY")

        try:
            return await self._attempt(filename, content_type, audio_bytes, language)
        except TimeoutError as exc:
            raise GenerationError("Transcription timed out") from exc
        except GenerationError:
            raise
        except Exception as exc:
            raise GenerationError("Transcription failed", context={"error": str(exc)}) from exc

    # BUG FIX: the retry previously wrapped transcribe(), whose generic
    # `except Exception` converted transport errors to GenerationError before
    # tenacity could match them — so the retry never fired. Retrying here,
    # beneath the error-conversion layer, makes it effective.
    @retry(
        stop=stop_after_attempt(2),
        wait=wait_fixed(0.8),
        retry=retry_if_exception_type((httpx.RequestError, httpx.TimeoutException)),
    )
    async def _attempt(
        self,
        filename: str,
        content_type: str,
        audio_bytes: bytes,
        language: str | None,
    ) -> str:
        """One bounded transcription attempt; transient httpx errors are retried."""
        # Bound each attempt by the configured timeout; TimeoutError is not
        # in the retry predicate, so it escapes to transcribe() directly.
        response = await asyncio.wait_for(
            self._client.audio.transcriptions.create(
                file=(filename, audio_bytes, content_type),
                model=self._model,
                temperature=0,
                language=language,
            ),
            timeout=self._timeout_seconds,
        )
        text = getattr(response, "text", None)
        if isinstance(text, str) and text.strip():
            return text.strip()
        # Some SDK paths may return a plain dict — presumably mirrors the
        # attribute shape; verified defensively here.
        if isinstance(response, dict):
            value = response.get("text")
            if isinstance(value, str) and value.strip():
                return value.strip()
        raise GenerationError("Transcription response did not contain text")
app/services/tts_client.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio

import httpx

from app.core.exceptions import GenerationError


class TTSClient:
    """Thin HTTP client for the external TTS Space's /synthesize endpoint."""

    def __init__(self, tts_space_url: str, timeout_seconds: float) -> None:
        # Normalize the base URL so the path join below never double-slashes.
        self._tts_space_url = tts_space_url.rstrip("/")
        self._timeout_seconds = timeout_seconds

    @property
    def is_configured(self) -> bool:
        """True when a non-empty base URL was provided."""
        return bool(self._tts_space_url)

    async def synthesize(self, text: str) -> bytes:
        """POST *text* to the TTS Space and return the audio payload bytes.

        Raises:
            GenerationError: when unconfigured, on timeout, on an upstream
                error status, or when the response body is empty.
        """
        if not self.is_configured:
            raise GenerationError("TTS client is not configured")

        async def _request() -> bytes:
            endpoint = f"{self._tts_space_url}/synthesize"
            async with httpx.AsyncClient(timeout=self._timeout_seconds) as http:
                reply = await http.post(
                    endpoint,
                    json={"text": text},
                    headers={"Content-Type": "application/json"},
                )
            reply.raise_for_status()
            payload = reply.content
            if not payload:
                raise GenerationError("TTS response was empty")
            return payload

        try:
            return await asyncio.wait_for(_request(), timeout=self._timeout_seconds)
        except TimeoutError as exc:
            raise GenerationError("TTS request timed out") from exc
        except httpx.HTTPStatusError as exc:
            raise GenerationError(
                "TTS upstream returned an error",
                context={"status_code": exc.response.status_code},
            ) from exc
        except GenerationError:
            raise
        except Exception as exc:
            raise GenerationError("TTS synthesis failed", context={"error": str(exc)}) from exc
requirements.txt CHANGED
@@ -9,6 +9,7 @@
9
  fastapi>=0.115.0
10
  uvicorn[standard]>=0.29.0
11
  uvloop>=0.19.0
 
12
  pydantic-settings>=2.2.1
13
  langgraph>=0.2.0
14
  qdrant-client==1.9.1
 
9
  fastapi>=0.115.0
10
  uvicorn[standard]>=0.29.0
11
  uvloop>=0.19.0
12
+ python-multipart>=0.0.9
13
  pydantic-settings>=2.2.1
14
  langgraph>=0.2.0
15
  qdrant-client==1.9.1
tests/test_speech_endpoints.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+
4
def test_transcribe_requires_auth(app_client):
    """A request without an Authorization header is rejected with 401."""
    resp = app_client.post(
        "/transcribe",
        files={"audio": ("sample.webm", b"abc", "audio/webm")},
    )

    assert resp.status_code == 401
10
+
11
+
12
def test_transcribe_success(app_client, valid_token, monkeypatch):
    """A stubbed transcriber result is wrapped in the response model."""

    async def fake_transcribe(filename, content_type, audio_bytes, language=None):
        await asyncio.sleep(0)
        return "hello from voice"

    # monkeypatch restores the real method after the test, so the stub
    # cannot leak into other tests sharing the app fixture.
    monkeypatch.setattr(app_client.app.state.transcriber, "transcribe", fake_transcribe)

    response = app_client.post(
        "/transcribe",
        files={"audio": ("sample.webm", b"abc", "audio/webm")},
        headers={"Authorization": f"Bearer {valid_token}"},
    )

    assert response.status_code == 200
    assert response.json()["transcript"] == "hello from voice"
27
+
28
+
29
def test_transcribe_rejects_oversized_audio(app_client, valid_token, monkeypatch):
    """Uploads larger than TRANSCRIBE_MAX_UPLOAD_BYTES get a 413."""
    # monkeypatch restores the original setting afterwards; the previous
    # direct assignment permanently shrank the limit for later tests.
    monkeypatch.setattr(app_client.app.state.settings, "TRANSCRIBE_MAX_UPLOAD_BYTES", 2)

    response = app_client.post(
        "/transcribe",
        files={"audio": ("sample.webm", b"abcdef", "audio/webm")},
        headers={"Authorization": f"Bearer {valid_token}"},
    )

    assert response.status_code == 413
39
+
40
+
41
def test_tts_requires_auth(app_client):
    """A synthesis request without credentials is rejected with 401."""
    resp = app_client.post("/tts", json={"text": "Hello world"})

    assert resp.status_code == 401
44
+
45
+
46
def test_tts_success(app_client, valid_token, monkeypatch):
    """Stubbed TTS bytes are returned verbatim with an audio/wav content type."""

    async def fake_synthesize(text):
        await asyncio.sleep(0)
        return b"RIFF....fake"

    # monkeypatch undoes the stub after the test, keeping the shared
    # app fixture clean for other tests.
    monkeypatch.setattr(app_client.app.state.tts_client, "synthesize", fake_synthesize)

    response = app_client.post(
        "/tts",
        json={"text": "Hello world"},
        headers={"Authorization": f"Bearer {valid_token}"},
    )

    assert response.status_code == 200
    assert response.headers.get("content-type", "").startswith("audio/wav")
    assert response.content == b"RIFF....fake"