Sanchayt committed on
Commit
d5c6c87
·
1 Parent(s): 9a0f2a4
Files changed (2) hide show
  1. app.py +217 -0
  2. requirement.txt +117 -0
app.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import mimetypes
3
+ import os
4
+ import shutil
5
+ import tempfile
6
+ import time
7
+
8
+ import assemblyai as aai
9
+ import gradio as gr
10
+ import numpy as np
11
+ import requests
12
+ import sounddevice as sd
13
+ from elevenlabs import clone, generate, play, set_api_key, stream
14
+ from scipy.io.wavfile import write
15
+
16
# NOTE(review): API credentials should not be committed to source control.
# The ELEVEN_API_KEY environment variable now takes precedence; the literal is
# kept only as a backward-compatible fallback and should be rotated and removed.
set_api_key(os.environ.get("ELEVEN_API_KEY", "cedcbf1991539f9c825a9346e1b7b708"))
17
+ import mimetypes
18
+
19
+ from gradio.components import Audio, Radio, Textbox
20
+ from gradio.components import Audio as AudioInput
21
+ from gradio.components import Audio as AudioOutput
22
+ from gradio.components import Textbox as TextboxOutput
23
+
24
# NOTE(review): credentials should not be committed to source control. Each
# value below can now be overridden through the environment; the literals are
# kept only as backward-compatible fallbacks and should be rotated and removed.

# Dolby application credentials, used for HTTP basic auth when requesting a token.
APP_KEY = os.environ.get("DOLBY_APP_KEY", "6lWL15cmmm5y5hLYU8-MvQ==")
APP_SECRET = os.environ.get(
    "DOLBY_APP_SECRET", "xoXvx_qwuD5HczjnEYOC9OJj6HGCZDFZBHKHEegigHA="
)

# AssemblyAI key used by the transcriber.
aai.settings.api_key = os.environ.get(
    "ASSEMBLYAI_API_KEY", "6c7f4d60028e4df9b889b93acb8ed698"
)
28
+
29
+
30
def transcribe_audio(file_path):
    """Transcribe the audio at *file_path* with AssemblyAI and return its text.

    Args:
        file_path: Location of the audio to transcribe, passed straight to
            ``aai.Transcriber.transcribe``.

    Returns:
        The ``text`` attribute of the resulting transcript.
    """
    return aai.Transcriber().transcribe(file_path).text
34
+
35
+
36
def clone_and_stream_voice(name, description, labels, text, model):
    """Clone a voice from ``output.wav`` and stream *text* spoken in that voice.

    Args:
        name: Name given to the cloned voice.
        description: Description attached to the cloned voice.
        labels: Labels forwarded unchanged to ``clone``.
        text: Text to synthesise.
        model: Model identifier forwarded to ``generate``.
    """
    # The sample used for cloning is always the file named "output.wav".
    cloned = clone(
        name=name,
        description=description,
        labels=labels,
        files=["output.wav"],
    )

    # Generate in streaming mode and hand the result directly to the player.
    stream(
        generate(
            text=text,
            voice=cloned,
            model=model,
            stream=True,
            stream_chunk_size=2048,
            latency=1,
        )
    )
51
+
52
+
53
def get_access_token():
    """Request a bearer token from the Dolby auth endpoint.

    Authenticates with HTTP basic auth using ``APP_KEY`` / ``APP_SECRET`` and
    asks for a client-credentials token with ``expires_in`` set to 1800.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.HTTPError: If the auth endpoint answers with an error status.
    """
    payload = {"grant_type": "client_credentials", "expires_in": 1800}
    response = requests.post(
        "https://api.dolby.io/v1/auth/token",
        data=payload,
        auth=requests.auth.HTTPBasicAuth(APP_KEY, APP_SECRET),
    )
    # Fail loudly on rejected credentials instead of hitting a KeyError below;
    # this matches how the other Dolby helpers treat error responses.
    response.raise_for_status()
    return response.json()["access_token"]
61
+
62
+
63
def upload_media(file_path, headers):
    """Upload a local file to Dolby temporary storage.

    First asks the media-input endpoint for a pre-signed URL bound to
    ``dlb://in/<basename of file_path>``, then PUTs the file contents there.

    Args:
        file_path: Path of the local file to upload.
        headers: HTTP headers carrying the bearer token.

    Raises:
        requests.HTTPError: If either the URL request or the upload itself
            returns an error status.
    """
    upload_url = "https://api.dolby.com/media/input"
    upload_body = {"url": f"dlb://in/{os.path.basename(file_path)}"}
    response = requests.post(upload_url, json=upload_body, headers=headers)
    response.raise_for_status()
    presigned_url = response.json()["url"]

    with open(file_path, "rb") as input_file:
        put_response = requests.put(presigned_url, data=input_file)
    # A rejected upload must not pass silently, otherwise the enhancement job
    # would later be started against a file that never arrived.
    put_response.raise_for_status()
72
+
73
+
74
def create_enhancement_job(file_path, output_path, headers, audio_type):
    """Start a Dolby enhancement job and return its identifier.

    Args:
        file_path: Local input path; only its basename is used, under ``dlb://in/``.
        output_path: Local output path; only its basename is used, under ``dlb://out/``.
        headers: HTTP headers carrying the bearer token.
        audio_type: Value sent as the ``content.type`` field of the request.

    Returns:
        The ``job_id`` value from the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    source_name = os.path.basename(file_path)
    target_name = os.path.basename(output_path)
    job_request = {
        "input": f"dlb://in/{source_name}",
        "output": f"dlb://out/{target_name}",
        "content": {"type": audio_type},
    }
    reply = requests.post(
        "https://api.dolby.com/media/enhance", json=job_request, headers=headers
    )
    reply.raise_for_status()
    return reply.json()["job_id"]
84
+
85
+
86
def check_job_status(job_id, headers):
    """Poll the Dolby enhance endpoint every 5 seconds until the job succeeds.

    Args:
        job_id: Identifier of the enhancement job to watch.
        headers: HTTP headers carrying the bearer token.

    Raises:
        requests.HTTPError: If a status request returns an error status.
        RuntimeError: If the job reports a terminal failure state.
    """
    status_url = "https://api.dolby.com/media/enhance"
    params = {"job_id": job_id}
    # Terminal states that will never turn into "Success"; without this check
    # a failed job would be polled forever.
    failed_states = {"Failed", "Cancelled", "InternalError"}
    while True:
        response = requests.get(status_url, params=params, headers=headers)
        response.raise_for_status()
        body = response.json()  # parse once and reuse for status and progress
        status = body["status"]
        if status == "Success":
            return
        if status in failed_states:
            raise RuntimeError(
                f"Enhancement job {job_id} ended with status {status}: "
                f"{body.get('error')}"
            )
        # "progress" is read defensively so a missing field cannot abort polling.
        print(f"Job status: {status}, progress: {body.get('progress', 0)}%")
        time.sleep(5)
97
+
98
+
99
def download_enhanced_file(output_path, headers):
    """Download the enhanced file from Dolby storage to *output_path*.

    The remote object is ``dlb://out/<basename of output_path>``; the response
    body is streamed to disk rather than loaded into memory.

    Args:
        output_path: Local destination path.
        headers: HTTP headers carrying the bearer token.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    remote_name = os.path.basename(output_path)
    query = {"url": f"dlb://out/{remote_name}"}
    with requests.get(
        "https://api.dolby.com/media/output",
        params=query,
        headers=headers,
        stream=True,
    ) as resp:
        resp.raise_for_status()
        # Ask the raw stream to decode any content encoding while it is copied.
        resp.raw.decode_content = True
        print(f"Downloading from {resp.url} into {output_path}")
        with open(output_path, "wb") as sink:
            shutil.copyfileobj(resp.raw, sink)
110
+
111
+
112
def dolby_process(input_file, output_file, audio_type):
    """Run the full Dolby enhancement round trip for one file.

    Steps, in order: obtain a token, upload *input_file*, create the
    enhancement job, wait for it to finish, download the result to
    *output_file*.

    Args:
        input_file: Path of the local file to enhance.
        output_file: Path where the enhanced result is written.
        audio_type: Content type forwarded to the enhancement job.
    """
    auth_headers = {"Authorization": f"Bearer {get_access_token()}"}
    upload_media(input_file, auth_headers)
    check_job_status(
        create_enhancement_job(input_file, output_file, auth_headers, audio_type),
        auth_headers,
    )
    download_enhanced_file(output_file, auth_headers)
119
+
120
+
121
def enhance_audio(recording, upload, audio_type):
    """Write the supplied audio to disk and enhance it through Dolby.

    A recording takes precedence over an upload when both are given.

    Args:
        recording: ``(rate, data)`` pair from the microphone input, or ``None``.
        upload: ``(rate, data)`` pair from the upload input, or ``None``.
        audio_type: Display label that must be a key of ``audio_type_mapping``.

    Returns:
        ``(input_path, output_path, message)``. On invalid input both paths are
        ``None`` and the message explains what was wrong.
    """
    # Validate the label instead of indexing the mapping directly: a missing or
    # unknown selection used to raise KeyError before any input was checked.
    dolby_audio_type = audio_type_mapping.get(audio_type)
    if dolby_audio_type is None:
        return None, None, "Invalid audio type. Please select an audio type."

    if recording is not None:
        rate, data = recording
        temp_input_file = "input.wav"
    elif upload is not None:
        rate, data = upload
        if rate not in [44100, 48000] or data.dtype not in [np.int16, np.int32]:
            return None, None, "Invalid file type. Please upload an MP3 file."
        # NOTE(review): scipy's write() emits WAV data, so this file is WAV
        # content under an .mp3 name — confirm whether the extension matters.
        temp_input_file = "input.mp3"
    else:
        return (
            None,
            None,
            "Invalid input. Please record some audio or upload an audio file.",
        )

    write(temp_input_file, rate, data)

    temp_output_file = "output.wav"
    dolby_process(temp_input_file, temp_output_file, dolby_audio_type)

    return temp_input_file, temp_output_file, "Processing complete!"
146
+
147
+
148
def clone_voice(temp_output_file):
    """Placeholder for voice cloning; *temp_output_file* is currently unused.

    Returns:
        A fixed ``(path, message)`` pair.
    """
    # Real cloning logic has not been implemented here yet.
    return "cloned_voice.wav", "Voice cloning complete!"
152
+
153
+
154
# Maps each label shown in the UI to the content-type value sent to Dolby:
# the label lower-cased with spaces replaced by underscores.
audio_type_mapping = {
    label: label.lower().replace(" ", "_")
    for label in (
        "Conference",
        "Interview",
        "Lecture",
        "Meeting",
        "Mobile Phone",
        "Music",
        "Podcast",
        "Studio",
        "Voice Over",
    )
}
165
+
166
+ from gradio import Checkbox
167
+
168
+
169
def combined_function(
    recording, upload, audio_type, proceed_to_clone, name, description, labels, model
):
    """Enhance the audio, transcribe it, and optionally clone and stream the voice.

    Args:
        recording: ``(rate, data)`` pair from the microphone input, or ``None``.
        upload: ``(rate, data)`` pair from the upload input, or ``None``.
        audio_type: Display label selecting the enhancement content type.
        proceed_to_clone: When truthy, clone the voice and stream the transcript.
        name: Name for the cloned voice.
        description: Description for the cloned voice.
        labels: Labels for the cloned voice.
        model: Speech model identifier used when streaming.

    Returns:
        ``(input_file, output_file, enhancement_status, transcript, cloning_status)``.
    """
    input_file, output_file, status1 = enhance_audio(recording, upload, audio_type)
    if output_file is None:
        # Enhancement rejected the input. Report its message as-is rather than
        # claiming success and trying to transcribe a file that does not exist.
        return input_file, output_file, status1, "", "Voice cloning not performed."

    status1 = "Enhancement complete!"
    transcript = transcribe_audio(output_file)
    if proceed_to_clone:
        clone_and_stream_voice(name, description, labels, transcript, model)
        status2 = "Cloning complete!"
    else:
        status2 = "Voice cloning not performed."
    return input_file, output_file, status1, transcript, status2
181
+
182
+
183
def main():
    """Build the Gradio interface around ``combined_function`` and launch it.

    The server is launched with ``server_name="0.0.0.0"``, port 7860 and
    ``share=True``.
    """
    # Order matters: it must match combined_function's positional parameters.
    input_components = [
        Audio(source="microphone", label="Recorded Audio"),
        Audio(source="upload", label="Uploaded Audio"),
        Radio(choices=list(audio_type_mapping), label="Audio Type"),
        Checkbox(label="Proceed to Clone Voice"),
        Textbox(label="Name"),
        Textbox(label="Description"),
        Textbox(label="Labels"),
        Radio(
            choices=["eleven_monolingual_v1", "eleven_multilingual_v1"],
            label="Model",
        ),
    ]
    # Order matters: it must match combined_function's returned tuple.
    output_components = [
        Audio(type="filepath", label="Original Audio"),
        Audio(type="filepath", label="Processed Audio"),
        Textbox(label="Enhancement Status"),
        Textbox(label="Transcript"),
        Textbox(label="Cloning Status"),
    ]

    app = gr.Interface(
        fn=combined_function,
        inputs=input_components,
        outputs=output_components,
        title="Audio Enhancer, Transcriber and Voice Cloner",
        description="Enhance your audio, transcribe it and clone voices using the Dolby API",
        allow_flagging="never",
    )
    app.launch(server_name="0.0.0.0", server_port=7860, share=True)
213
+
214
+
215
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":

    main()
requirement.txt ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.5
3
+ aiosignal==1.3.1
4
+ altair==5.0.1
5
+ annotated-types==0.5.0
6
+ anyio==3.7.1
7
+ appnope==0.1.3
8
+ assemblyai==0.15.1
9
+ asttokens==2.2.1
10
+ async-timeout==4.0.2
11
+ attrs==23.1.0
12
+ backcall==0.2.0
13
+ black==23.7.0
14
+ certifi==2023.7.22
15
+ cffi==1.15.1
16
+ charset-normalizer==3.2.0
17
+ click==8.1.6
18
+ contourpy==1.1.0
19
+ cycler==0.11.0
20
+ decorator==5.1.1
21
+ elevenlabs==0.2.21
22
+ executing==1.2.0
23
+ fastapi==0.100.1
24
+ ffmpeg-python==0.2.0
25
+ ffmpy==0.3.1
26
+ filelock==3.12.2
27
+ fonttools==4.41.1
28
+ frozenlist==1.4.0
29
+ fsspec==2023.6.0
30
+ future==0.18.3
31
+ gradio==3.39.0
32
+ gradio_client==0.3.0
33
+ h11==0.14.0
34
+ httpcore==0.17.3
35
+ httpx==0.24.1
36
+ huggingface-hub==0.16.4
37
+ idna==3.4
38
+ importlib-metadata==6.8.0
39
+ ipython==8.14.0
40
+ jedi==0.19.0
41
+ Jinja2==3.1.2
42
+ jsonschema==4.18.4
43
+ jsonschema-specifications==2023.7.1
44
+ kiwisolver==1.4.4
45
+ linkify-it-py==2.0.2
46
+ llvmlite==0.40.1
47
+ markdown-it-py==2.2.0
48
+ MarkupSafe==2.1.3
49
+ matplotlib==3.7.2
50
+ matplotlib-inline==0.1.6
51
+ mdit-py-plugins==0.3.3
52
+ mdurl==0.1.2
53
+ more-itertools==10.0.0
54
+ mpmath==1.3.0
55
+ multidict==6.0.4
56
+ mypy-extensions==1.0.0
57
+ networkx==3.1
58
+ numba==0.57.1
59
+ numpy==1.24.4
60
+ openai-whisper==20230314
61
+ orjson==3.9.2
62
+ packaging==23.1
63
+ pandas==2.0.3
64
+ parso==0.8.3
65
+ pathspec==0.11.2
66
+ pexpect==4.8.0
67
+ pickleshare==0.7.5
68
+ Pillow==10.0.0
69
+ platformdirs==3.10.0
70
+ prompt-toolkit==3.0.39
71
+ ptyprocess==0.7.0
72
+ pure-eval==0.2.2
73
+ PyAudio==0.2.13
74
+ pycparser==2.21
75
+ pydantic==1.10.12
76
+ pydantic_core==2.4.0
77
+ pydub==0.25.1
78
+ Pygments==2.15.1
79
+ pyparsing==3.0.9
80
+ python-dateutil==2.8.2
81
+ python-multipart==0.0.6
82
+ pytz==2023.3
83
+ PyYAML==6.0.1
84
+ referencing==0.30.0
85
+ regex==2023.6.3
86
+ requests==2.31.0
87
+ rich==13.5.0
88
+ rpds-py==0.9.2
89
+ safetensors==0.3.1
90
+ scipy==1.11.1
91
+ semantic-version==2.10.0
92
+ setuptools-rust==1.6.0
93
+ six==1.16.0
94
+ sniffio==1.3.0
95
+ sounddevice==0.4.6
96
+ SpeechRecognition==3.10.0
97
+ stack-data==0.6.2
98
+ starlette==0.27.0
99
+ sympy==1.12
100
+ tdqm==0.0.1
101
+ tiktoken==0.3.1
102
+ tokenizers==0.13.3
103
+ toolz==0.12.0
104
+ torch==2.0.1
105
+ tqdm==4.65.0
106
+ traitlets==5.9.0
107
+ transformers==4.31.0
108
+ typing_extensions==4.7.1
109
+ tzdata==2023.3
110
+ uc-micro-py==1.0.2
111
+ urllib3==2.0.4
112
+ uvicorn==0.23.1
113
+ wcwidth==0.2.6
114
+ websockets==11.0.3
115
+ whisper-mic==1.1.1
116
+ yarl==1.9.2
117
+ zipp==3.16.2