File size: 2,239 Bytes
edc5445
9fe7102
814634e
0300579
5bcf511
9fe7102
8cbcb5c
21c3757
64f8c24
3f674bb
f6905a0
f6d0d12
9fe7102
 
 
 
 
814634e
 
 
 
 
 
 
c0b470e
9fe7102
edc5445
3353131
3f674bb
f6d0d12
9fe7102
3f674bb
3353131
9fe7102
3f674bb
9fe7102
1537a46
 
9fe7102
639afb1
 
3353131
9fe7102
 
3353131
a5201f5
9fe7102
1537a46
c51fa1f
 
 
9fe7102
2db5666
9fe7102
 
2db5666
1537a46
 
9fe7102
2db5666
a5201f5
3353131
2db5666
a5201f5
3353131
 
3f674bb
3353131
afd6df1
3f674bb
579d95d
9fe7102
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from logic import synthesize_voice, plot_data, plot_waveforms
import base64
import sys
import numpy as np
from io import BytesIO
from hifigan.inference_e2e import hifi_gan_inference

# FastAPI application instance; routes and CORS middleware are attached below.
app = FastAPI()

@app.get("/")
def read_root():
    """Health-check endpoint: confirms the voice-cloning service is up."""
    return JSONResponse(content={"Voice": "Cloning", "Status": "Success"})

# Allow cross-origin requests from any origin so a separately hosted frontend
# can call this API. NOTE(review): allow_origins=["*"] combined with
# allow_credentials=True is wide open — consider restricting origins in
# production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# NOTE(review): this URL is not referenced anywhere in the visible code —
# presumably leftover from a Hugging Face Spaces deployment; confirm before
# removing.
hugging_face_api_url = "https://huggingface.co/spaces/lord-reso/host/synthesize"

@app.post("/synthesize")
async def synthesize(request: Request):
    """Synthesize speech for the text in the request body.

    Expects a JSON body with keys:
      - ``input_text``: text to convert to speech.
      - ``font_select``: voice selection from the frontend (currently read
        but not used downstream -- TODO confirm whether it should pick the
        Tacotron2 checkpoint).

    Returns a JSON response with three base64-encoded fields:
    ``mel_spectrogram`` (plot image), ``audio_data`` (raw audio bytes) and
    ``waveform`` (time-domain plot image).

    Raises KeyError (-> 500) if a required key is missing from the body.
    """
    print("call successful")

    # Renamed from `json` to avoid shadowing the well-known stdlib module name.
    payload = await request.json()
    print(payload)

    font_type = payload['font_select']  # read for request validation; unused below
    input_text = payload['input_text']

    print("generating mel-spectrogram")
    # Generate mel-spectrogram using Tacotron2 with the fine-tuned checkpoint.
    mel_output_data, mel_output_postnet_data, alignments_data = synthesize_voice(
        input_text, "kaggle_12000.pt"
    )
    print("mel generation successful")

    # Render the mel/postnet/alignment plots to a base64 image for the frontend.
    mel_output_base64 = plot_data(
        [mel_output_data, mel_output_postnet_data, alignments_data]
    )

    # Audio synthesis: serialize the mel array to an in-memory .npy blob,
    # which is the input format hifi_gan_inference expects.
    print("Starting audio synthesis")
    buffer = BytesIO()
    np.save(buffer, mel_output_data)
    input_mel = buffer.getvalue()

    hifigan_checkpoint = "generator_v1"

    # Generate audio from the mel-spectrogram using the HiFi-GAN vocoder.
    audio_data = hifi_gan_inference(input_mel, hifigan_checkpoint)

    print("Creating time-domain waveform")
    # Plot the waveform for display in the frontend.
    wave_base64 = plot_waveforms(audio_data)

    # Base64-encode the raw audio bytes for JSON transport.
    audio_base64 = base64.b64encode(audio_data).decode('utf-8')

    response_data = {
        'mel_spectrogram': mel_output_base64,
        'audio_data': audio_base64,
        'waveform': wave_base64,
    }

    return JSONResponse(content=response_data)