File size: 5,117 Bytes
3b32b80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import os
import sys
import uuid
from typing import Optional
import numpy as np
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.responses import FileResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import soundfile as sf
import io

# Make the neutts-air checkout importable. The entry is a relative path, so it
# is resolved against the current working directory of the process.
sys.path.append("neutts-air")

try:
    from neuttsair.neutts import NeuTTSAir
except ImportError as e:
    # Fail fast at import time with an actionable message; chain the original
    # ImportError explicitly so the traceback shows it as the direct cause.
    raise RuntimeError(f"Failed to import NeuTTS Air: {e}. Make sure neutts-air submodule is initialized.") from e

# ASGI application object. Every endpoint below registers itself on this
# instance; title/description/version are the metadata passed to FastAPI.
app = FastAPI(
    title="NeuTTS Air Production API",
    description="Production-ready Text-to-Speech with Voice Cloning",
    version="1.0.0"
)

# CORS middleware: accepts cross-origin requests from any origin, with any
# HTTP method and any request header.
# NOTE(review): this is fully open; restrict allow_origins if the API should
# only be called from known front-ends.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global model instance, constructed once when this module is imported and
# shared by every endpoint below. Both the backbone and the codec are placed
# on the CPU.
tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air",
    backbone_device="cpu",  # Changed to CPU for Hugging Face Spaces
    codec_repo="neuphonic/neucodec", 
    codec_device="cpu"     # Changed to CPU for Hugging Face Spaces
)

# Make sure the working directories exist: "uploads" receives the reference
# audio files, "outputs" receives the generated WAV files. Paths are relative
# to the current working directory.
for _dir_name in ("uploads", "outputs"):
    os.makedirs(_dir_name, exist_ok=True)

@app.get("/")
async def root():
    """Return a small static payload identifying the service as online."""
    banner = dict(status="online", service="NeuTTS Air API")
    return banner

@app.get("/health")
async def health_check():
    """Report service health and whether the global model object is set."""
    model_loaded = tts is not None
    return dict(status="healthy", model_loaded=model_loaded)

@app.post("/api/v1/synthesize")
async def synthesize_speech(
    ref_text: str = Form(..., description="Reference audio transcript"),
    gen_text: str = Form(..., description="Text to synthesize"),
    ref_audio: UploadFile = File(..., description="Reference audio file (WAV)")
):
    """
    Synthesize speech using voice cloning.

    Form fields:
        ref_text: Transcript of the reference audio.
        gen_text: Text to synthesize.
        ref_audio: Reference audio upload; its filename must end in ".wav".

    Returns:
        A FileResponse streaming the generated WAV file (written at 24000 Hz
        under "outputs/") with the download name "synthesized_speech.wav".

    Raises:
        HTTPException(400): the upload has no filename or is not a ".wav" file.
        HTTPException(500): saving, encoding, inference or writing failed.
    """
    # Keep only the last path component of the client-supplied name so a
    # crafted filename (e.g. "../../x.wav") cannot escape "uploads/".
    # filename may be missing, hence the `or ""`.
    safe_name = os.path.basename((ref_audio.filename or "").replace("\\", "/"))

    # Validate outside the try block: raised inside it, this 400 would be
    # caught by the generic handler below and reported as a 500.
    if not safe_name.lower().endswith('.wav'):
        raise HTTPException(400, "Only WAV files are supported as reference audio")

    upload_path = f"uploads/{uuid.uuid4()}_{safe_name}"
    try:
        # Save uploaded file
        with open(upload_path, "wb") as f:
            f.write(await ref_audio.read())

        # Encode the reference voice, then generate the requested text with it
        ref_codes = tts.encode_reference(upload_path)
        wav = tts.infer(gen_text, ref_codes, ref_text)

        # Save output
        output_path = f"outputs/{uuid.uuid4()}.wav"
        sf.write(output_path, wav, 24000)
    except HTTPException:
        # Never downgrade an explicit HTTP error into a generic 500.
        raise
    except Exception as e:
        raise HTTPException(500, f"Synthesis failed: {str(e)}") from e
    finally:
        # The reference upload is only needed while this request is being
        # processed; remove it so "uploads/" does not grow without bound.
        if os.path.exists(upload_path):
            os.remove(upload_path)

    # NOTE(review): the generated file is left in "outputs/" after the
    # response is sent; add a cleanup policy if disk usage matters.
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename="synthesized_speech.wav"
    )

@app.post("/api/v1/synthesize/b64")
async def synthesize_speech_base64(
    ref_text: str = Form(...),
    gen_text: str = Form(...),
    ref_audio: UploadFile = File(...)
):
    """
    Synthesize speech and return as base64 encoded audio.

    Form fields:
        ref_text: Transcript of the reference audio.
        gen_text: Text to synthesize.
        ref_audio: Reference audio upload.

    Returns:
        A JSONResponse with "audio_data" (base64 text of a WAV file),
        "sample_rate" (24000) and "format" ("wav").

    Raises:
        HTTPException(500): saving, encoding, inference or encoding to WAV failed.
    """
    import base64

    # Keep only the last path component of the client-supplied name so a
    # crafted filename cannot escape "uploads/"; filename may be missing.
    safe_name = os.path.basename((ref_audio.filename or "").replace("\\", "/"))
    upload_path = f"uploads/{uuid.uuid4()}_{safe_name}"

    try:
        # Save uploaded file
        with open(upload_path, "wb") as f:
            f.write(await ref_audio.read())

        # Perform inference
        ref_codes = tts.encode_reference(upload_path)
        wav = tts.infer(gen_text, ref_codes, ref_text)

        # Render the waveform as an in-memory WAV file and base64-encode it
        buffer = io.BytesIO()
        sf.write(buffer, wav, 24000, format='WAV')
        audio_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    except Exception as e:
        raise HTTPException(500, f"Synthesis failed: {str(e)}") from e
    finally:
        # The reference upload is only needed while this request is being
        # processed; remove it so "uploads/" does not grow without bound.
        if os.path.exists(upload_path):
            os.remove(upload_path)

    return JSONResponse({
        "audio_data": audio_b64,
        "sample_rate": 24000,
        "format": "wav"
    })

# Batch processing endpoint
@app.post("/api/v1/batch-synthesize")
async def batch_synthesize(
    ref_text: str = Form(...),
    ref_audio: UploadFile = File(...),
    texts: str = Form(..., description="JSON array of texts to synthesize")
):
    """
    Synthesize multiple texts with the same voice.

    Form fields:
        ref_text: Transcript of the reference audio.
        ref_audio: Reference audio upload; encoded once and reused for
            every text.
        texts: JSON array of strings, one entry per utterance to generate.

    Returns:
        {"generated_files": [...]} listing the server-side paths of the
        generated WAV files (written at 24000 Hz under "outputs/"), in the
        same order as the input texts.

    Raises:
        HTTPException(400): `texts` is not valid JSON or not an array of strings.
        HTTPException(500): saving, encoding, inference or writing failed.
    """
    import json

    # Reject malformed input as a client error. Without the type check a JSON
    # string would be iterated character by character and a JSON number would
    # surface as an opaque 500.
    try:
        text_list = json.loads(texts)
    except ValueError as e:
        raise HTTPException(400, f"'texts' must be a valid JSON array: {e}") from e
    if not isinstance(text_list, list) or not all(isinstance(t, str) for t in text_list):
        raise HTTPException(400, "'texts' must be a JSON array of strings")

    # Keep only the last path component of the client-supplied name so a
    # crafted filename cannot escape "uploads/"; filename may be missing.
    safe_name = os.path.basename((ref_audio.filename or "").replace("\\", "/"))
    upload_path = f"uploads/{uuid.uuid4()}_{safe_name}"

    try:
        # Save reference audio
        with open(upload_path, "wb") as f:
            f.write(await ref_audio.read())

        # Encode reference once
        ref_codes = tts.encode_reference(upload_path)

        results = []
        for text in text_list:
            wav = tts.infer(text, ref_codes, ref_text)
            output_path = f"outputs/{uuid.uuid4()}.wav"
            sf.write(output_path, wav, 24000)
            results.append(output_path)
    except Exception as e:
        raise HTTPException(500, f"Batch synthesis failed: {str(e)}") from e
    finally:
        # The reference upload is only needed while this request is being
        # processed; remove it so "uploads/" does not grow without bound.
        if os.path.exists(upload_path):
            os.remove(upload_path)

    return {"generated_files": results}

if __name__ == "__main__":
    # Run the app directly with uvicorn when this file is executed as a
    # script, listening on all interfaces on port 7860.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)