File size: 4,631 Bytes
24d1041
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import logging
import os
import uuid
from typing import Optional

import httpx
from fastapi import HTTPException, UploadFile

# Module-level logger named after this module; the service below uses it to
# record ElevenLabs API failures and unexpected request errors.
logger = logging.getLogger(__name__)


class ElevenLabsService:
    """Client wrapper for the ElevenLabs voice-cloning and text-to-speech API.

    Configuration is read from environment variables when the instance is
    created:

    * ``ELEVENLABS_API_KEY`` - key sent in the ``xi-api-key`` header (no
      default; requests fail with HTTP 500 while it is unset).
    * ``ELEVENLABS_BASE_URL`` - defaults to ``https://api.elevenlabs.io/v1``.
    * ``ELEVENLABS_MODEL_ID`` - defaults to ``eleven_multilingual_v2``.
    * ``ELEVENLABS_TIMEOUT_SECONDS`` - httpx timeout in seconds, defaults
      to ``40``.

    Every failure is surfaced as a ``fastapi.HTTPException`` so route
    handlers can let it propagate unchanged.
    """

    def __init__(self) -> None:
        """Load API key, base URL, model id and timeout from the environment."""
        self.api_key = os.getenv("ELEVENLABS_API_KEY")
        self.base_url = os.getenv("ELEVENLABS_BASE_URL", "https://api.elevenlabs.io/v1")
        self.model_id = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
        self.timeout = float(os.getenv("ELEVENLABS_TIMEOUT_SECONDS", "40"))

    def _headers(self) -> dict:
        """Return the authentication headers for an ElevenLabs request.

        Raises:
            HTTPException: 500 when no API key is configured.
        """
        if not self.api_key:
            raise HTTPException(
                status_code=500,
                detail="ELEVENLABS_API_KEY is not configured",
            )
        return {"xi-api-key": self.api_key}

    async def clone_voice(self, audio_file: UploadFile) -> str:
        """Clone a voice in ElevenLabs and return the generated voice_id.

        The uploaded sample is read fully into memory and posted as multipart
        form data to ``{base_url}/voices/add`` under a randomly suffixed
        temporary voice name.

        Args:
            audio_file: Uploaded audio sample to clone from.

        Returns:
            The ``voice_id`` reported by ElevenLabs.

        Raises:
            HTTPException: 400 when the upload is missing or empty; 500 when
                the API key is not configured or an unexpected error occurs;
                502 when ElevenLabs answers with an error status, a body
                that is not a JSON object, or no ``voice_id``; 504 when the
                request times out.
        """
        if audio_file is None:
            raise HTTPException(status_code=400, detail="speaker_wav is required")

        file_bytes = await audio_file.read()
        if not file_bytes:
            raise HTTPException(status_code=400, detail="speaker_wav is empty")

        voice_name = f"voiceapi-temp-{uuid.uuid4().hex[:10]}"

        files = {
            "files": (
                audio_file.filename or "sample.wav",
                file_bytes,
                audio_file.content_type or "audio/wav",
            )
        }
        data = {
            "name": voice_name,
            "description": "Temporary cloned voice from VoiceAPI session",
        }

        try:
            # Use the async client: a blocking httpx.Client inside a coroutine
            # would stall the event loop for the whole upload.
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.base_url}/voices/add",
                    headers=self._headers(),
                    data=data,
                    files=files,
                )
            if response.status_code >= 400:
                logger.error("ElevenLabs clone failed: %s", response.text)
                raise HTTPException(
                    status_code=502,
                    detail=f"Voice cloning failed: {response.text[:300]}",
                )
            try:
                payload = response.json()
            except ValueError as exc:
                # A success status with an unparsable body is an upstream
                # fault, so report it as a bad gateway rather than a 500.
                logger.error(
                    "ElevenLabs clone returned a non-JSON body: %s", response.text
                )
                raise HTTPException(
                    status_code=502, detail="Invalid JSON in clone response"
                ) from exc
            voice_id = payload.get("voice_id") if isinstance(payload, dict) else None
            if not voice_id:
                raise HTTPException(
                    status_code=502, detail="voice_id missing in clone response"
                )
            return voice_id
        except httpx.TimeoutException as exc:
            raise HTTPException(
                status_code=504, detail="Voice cloning timed out"
            ) from exc
        except HTTPException:
            # Already in the shape callers expect; do not re-wrap as a 500.
            raise
        except Exception as exc:
            logger.exception("Unexpected clone error")
            raise HTTPException(
                status_code=500, detail=f"Clone request failed: {exc}"
            ) from exc

    def generate_speech(
        self,
        text: str,
        voice_id: str,
        language: Optional[str] = None,
        output_format: str = "mp3_44100_128",
    ) -> bytes:
        """Generate speech bytes using ElevenLabs text-to-speech API.

        This method is synchronous and blocks until ElevenLabs responds or
        the configured timeout elapses.

        Args:
            text: Text to synthesize; must contain non-whitespace characters.
            voice_id: Identifier of the voice to speak with.
            language: Optional value sent as ``language_code`` in the request
                body; omitted from the body when falsy.
            output_format: Value sent as the ``output_format`` query
                parameter.

        Returns:
            The raw response body returned by the text-to-speech endpoint.

        Raises:
            HTTPException: 400 when ``text`` or ``voice_id`` is missing; 500
                when the API key is not configured or an unexpected error
                occurs; 502 when ElevenLabs answers with an error status;
                504 when the request times out.
        """
        # Guard against None as well as blank strings so a missing text is
        # reported as a client error instead of an AttributeError.
        if not text or not text.strip():
            raise HTTPException(status_code=400, detail="text is required")
        if not voice_id:
            raise HTTPException(status_code=400, detail="voice_id is required")

        body = {
            "text": text,
            "model_id": self.model_id,
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75,
            },
        }
        if language:
            body["language_code"] = language

        try:
            with httpx.Client(timeout=self.timeout) as client:
                response = client.post(
                    f"{self.base_url}/text-to-speech/{voice_id}",
                    params={"output_format": output_format},
                    headers={**self._headers(), "Accept": "audio/mpeg"},
                    json=body,
                )
            if response.status_code >= 400:
                logger.error("ElevenLabs TTS failed: %s", response.text)
                raise HTTPException(
                    status_code=502,
                    detail=f"Speech generation failed: {response.text[:300]}",
                )
            return response.content
        except httpx.TimeoutException as exc:
            raise HTTPException(
                status_code=504, detail="Speech generation timed out"
            ) from exc
        except HTTPException:
            # Already in the shape callers expect; do not re-wrap as a 500.
            raise
        except Exception as exc:
            logger.exception("Unexpected speech generation error")
            raise HTTPException(
                status_code=500, detail=f"Speech generation request failed: {exc}"
            ) from exc