Spaces:

dinhhan
/

audio

Sleeping

App Files Files Community

sheikhed committed on Oct 11, 2024

Commit

d52bea3

verified ·

1 Parent(s): 84501e2

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -120

app.py CHANGED Viewed

@@ -18,11 +18,6 @@ B_KEY = os.getenv("B_KEY")
 API_URL = os.getenv("API_URL")
 UPLOAD_URL = os.getenv("UPLOAD_URL")
-# Create temp directory if it doesn't exist
-TEMP_DIR = "temp"
-if not os.path.exists(TEMP_DIR):
-    os.makedirs(TEMP_DIR)
 def get_voices():
     url = "https://api.elevenlabs.io/v1/voices"
     headers = {
@@ -60,38 +55,32 @@ def text_to_speech(voice_id, text, session_id):
     if response.status_code != 200:
         return None
-    audio_file_path = os.path.join(TEMP_DIR, f'temp_voice_{session_id}.mp3')
     with open(audio_file_path, 'wb') as audio_file:
         audio_file.write(response.content)
     return audio_file_path
-def process_uploaded_audio(audio_path, session_id):
-    """Process and validate uploaded audio file"""
-    if not audio_path:
         return None
-    # Get the file extension
-    ext = os.path.splitext(audio_path)[1].lower()
-    if ext not in ['.mp3', '.wav', '.m4a', '.aac']:
-        return None
-    # Create output path
-    output_path = os.path.join(TEMP_DIR, f'temp_voice_{session_id}.mp3')
-    # Convert to mp3 if not already mp3
-    if ext != '.mp3':
-        cmd = [
-            'ffmpeg', '-i', audio_path,
-            '-codec:a', 'libmp3lame', '-qscale:a', '2',
-            '-y', output_path
-        ]
-        subprocess.run(cmd, check=True)
-        return output_path
-    else:
-        # If it's already MP3, just copy it to temp directory
-        with open(audio_path, 'rb') as src, open(output_path, 'wb') as dst:
-            dst.write(src.read())
-        return output_path
 def upload_file(file_path):
     with open(file_path, 'rb') as file:
@@ -124,7 +113,7 @@ def lipsync_api_call(video_url, audio_url):
 def check_job_status(job_id):
     headers = {"x-api-key": B_KEY}
-    max_attempts = 30
     for _ in range(max_attempts):
         response = requests.get(f"{API_URL}/{job_id}", headers=headers)
@@ -167,74 +156,67 @@ def combine_audio_video(video_path, audio_path, output_path):
     subprocess.run(cmd, check=True)
-def process_video(voice, model, text, audio_file, progress=gr.Progress()):
     session_id = str(uuid.uuid4())
     try:
-        # Handle audio input (either text-to-speech or uploaded file)
-        if audio_file is not None:
-            progress(0.1, desc="Processing uploaded audio...")
-            audio_path = process_uploaded_audio(audio_file, session_id)
-            if not audio_path:
-                return None, "Failed to process uploaded audio file."
-        elif text:
-            progress(0.1, desc="Generating speech...")
-            audio_path = text_to_speech(voice, text, session_id)
-            if not audio_path:
-                return None, "Failed to generate speech audio."
-        else:
-            return None, "Please either enter text or upload an audio file."
-        progress(0.2, desc="Processing video...")
-        video_path = os.path.join("models", model)
-        try:
-            progress(0.3, desc="Uploading files...")
-            video_url = upload_file(video_path)
-            audio_url = upload_file(audio_path)
-            if not video_url or not audio_url:
-                raise Exception("Failed to upload files")
-            progress(0.4, desc="Initiating lipsync...")
-            job_data = lipsync_api_call(video_url, audio_url)
-            if "error" in job_data or "message" in job_data:
-                raise Exception(job_data.get("error", job_data.get("message", "Unknown error")))
-            job_id = job_data["id"]
-            progress(0.5, desc="Processing lipsync...")
-            result_url = check_job_status(job_id)
-            if result_url:
-                progress(0.9, desc="Downloading result...")
-                response = requests.get(result_url)
-                output_path = os.path.join(TEMP_DIR, f"output_{session_id}.mp4")
-                with open(output_path, "wb") as f:
-                    f.write(response.content)
-                progress(1.0, desc="Complete!")
-                return output_path, "Lipsync completed successfully!"
-            else:
-                raise Exception("Lipsync processing failed or timed out")
-        except Exception as e:
-            progress(0.8, desc="Falling back to simple combination...")
-            try:
-                output_path = os.path.join(TEMP_DIR, f"output_{session_id}.mp4")
-                combine_audio_video(video_path, audio_path, output_path)
-                progress(1.0, desc="Complete!")
-                return output_path, f"Used fallback method. Original error: {str(e)}"
-            except Exception as fallback_error:
-                return None, f"All methods failed. Error: {str(fallback_error)}"
     finally:
-        # Cleanup temp files
-        for temp_file in os.listdir(TEMP_DIR):
-            if session_id in temp_file:
-                try:
-                    os.remove(os.path.join(TEMP_DIR, temp_file))
-                except:
-                    pass
 def create_interface():
     voices = get_voices()
@@ -242,29 +224,26 @@ def create_interface():
     with gr.Blocks() as app:
         gr.Markdown("# JSON Train")
         with gr.Row():
             with gr.Column():
                 input_type = gr.Radio(
-                    choices=["Text to Speech", "Upload Audio"],
                     label="Input Type",
-                    value="Text to Speech"
                 )
-                with gr.Group() as tts_group:
                     voice_dropdown = gr.Dropdown(
-                        choices=[v[0] for v in voices],
-                        label="Select Voice",
                         value=voices[0][0] if voices else None
                     )
                     text_input = gr.Textbox(label="Enter text", lines=3)
-                with gr.Group() as audio_group:
-                    audio_input = gr.Audio(
-                        label="Upload Audio",
-                        type="filepath",
-                        format="mp3"
-                    )
                 model_dropdown = gr.Dropdown(
                     choices=models,
@@ -277,32 +256,27 @@ def create_interface():
                 video_output = gr.Video(label="Generated Video")
                 status_output = gr.Textbox(label="Status", interactive=False)
-        def toggle_input_groups(choice):
-            if choice == "Text to Speech":
-                return gr.Group.update(visible=True), gr.Group.update(visible=False)
-            else:
-                return gr.Group.update(visible=False), gr.Group.update(visible=True)
         input_type.change(
-            toggle_input_groups,
             inputs=[input_type],
-            outputs=[tts_group, audio_group]
         )
-        def on_generate(input_choice, voice_name, model_name, text, audio_file):
             voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
-            if input_choice == "Text to Speech":
-                if not text:
-                    return None, "Please enter some text."
-                return process_video(voice_id, model_name, text, None)
-            else:
-                if not audio_file:
-                    return None, "Please upload an audio file."
-                return process_video(voice_id, model_name, None, audio_file)
         generate_btn.click(
             fn=on_generate,
-            inputs=[input_type, voice_dropdown, model_dropdown, text_input, audio_input],
             outputs=[video_output, status_output]
         )

 API_URL = os.getenv("API_URL")
 UPLOAD_URL = os.getenv("UPLOAD_URL")
 def get_voices():
     url = "https://api.elevenlabs.io/v1/voices"
     headers = {
     if response.status_code != 200:
         return None
+    # Save temporary audio file with session ID
+    audio_file_path = f'temp_voice_{session_id}.mp3'
     with open(audio_file_path, 'wb') as audio_file:
         audio_file.write(response.content)
     return audio_file_path
+def save_uploaded_audio(audio_file, session_id):
+    if audio_file is None:
         return None
+    # Get the file extension from the original filename
+    _, ext = os.path.splitext(audio_file.name)
+    if not ext:
+        ext = '.mp3'  # Default extension if none is found
+    # Save the uploaded audio file with session ID
+    audio_file_path = f'temp_voice_{session_id}{ext}'
+    with open(audio_file_path, 'wb') as f:
+        if isinstance(audio_file, str):  # If it's a file path
+            with open(audio_file, 'rb') as source:
+                f.write(source.read())
+        else:  # If it's a file object
+            audio_file.seek(0)
+            f.write(audio_file.read())
+    return audio_file_path
 def upload_file(file_path):
     with open(file_path, 'rb') as file:
 def check_job_status(job_id):
     headers = {"x-api-key": B_KEY}
+    max_attempts = 30  # Limit the number of attempts
     for _ in range(max_attempts):
         response = requests.get(f"{API_URL}/{job_id}", headers=headers)
     subprocess.run(cmd, check=True)
+def process_video(voice, model, text, audio_file, input_type, progress=gr.Progress()):
     # Produce a video for the selected model using either synthesized speech
     # or an uploaded audio file.
     #
     # Flow: obtain an audio file -> upload video and audio -> start a lipsync
     # job -> wait for its result URL -> download the result. If anything in
     # that chain raises, fall back to combine_audio_video() on the local files.
     #
     # Returns a (output_path_or_None, status_message) tuple in every case.
     session_id = str(uuid.uuid4())
+    # Pick the audio source: synthesize speech when input_type is "text", otherwise use the upload
+    if input_type == "text":
+        progress(0, desc="Generating speech...")
+        audio_path = text_to_speech(voice, text, session_id)
+        if not audio_path:
+            return None, "Failed to generate speech audio."
+    else:  # any value other than "text" is treated as an audio upload
+        progress(0, desc="Processing uploaded audio...")
     # NOTE(review): this call sits outside the try block below, so an
     # exception raised here is not handled by the fallback path.
+        audio_path = save_uploaded_audio(audio_file, session_id)
+        if not audio_path:
+            return None, "Failed to process uploaded audio."
+    progress(0.2, desc="Processing video...")
+    video_path = os.path.join("models", model)
     try:
+        progress(0.3, desc="Uploading files...")
+        video_url = upload_file(video_path)
+        audio_url = upload_file(audio_path)
+        if not video_url or not audio_url:
+            raise Exception("Failed to upload files")
+        progress(0.4, desc="Initiating lipsync...")
+        job_data = lipsync_api_call(video_url, audio_url)
         # A response containing either an "error" or a "message" key is treated as a failure.
+        if "error" in job_data or "message" in job_data:
+            raise Exception(job_data.get("error", job_data.get("message", "Unknown error")))
+        job_id = job_data["id"]
+        progress(0.5, desc="Processing lipsync...")
+        result_url = check_job_status(job_id)
+        if result_url:
+            progress(0.9, desc="Downloading result...")
             # NOTE(review): the download's HTTP status is not checked before
             # the body is written to disk - confirm this is intended.
+            response = requests.get(result_url)
+            output_path = f"output_{session_id}.mp4"
+            with open(output_path, "wb") as f:
+                f.write(response.content)
+            progress(1.0, desc="Complete!")
+            return output_path, "Lipsync completed successfully!"
+        else:
+            raise Exception("Lipsync processing failed or timed out")
+    except Exception as e:
         # Any failure above (upload, API error, missing result) lands here.
+        progress(0.8, desc="Falling back to simple combination...")
+        try:
+            output_path = f"output_{session_id}.mp4"
+            combine_audio_video(video_path, audio_path, output_path)
+            progress(1.0, desc="Complete!")
+            return output_path, f"Used fallback method. Original error: {str(e)}"
+        except Exception as fallback_error:
+            return None, f"All methods failed. Error: {str(fallback_error)}"
     finally:
+        # Cleanup: delete only the temporary audio file; the output video is left on disk
+        # (presumably so the caller can serve it - TODO confirm who removes it).
+        if os.path.exists(audio_path):
+            os.remove(audio_path)
 def create_interface():
     voices = get_voices()
     with gr.Blocks() as app:
         gr.Markdown("# JSON Train")
         with gr.Row():
             with gr.Column():
                 input_type = gr.Radio(
+                    choices=["text", "audio"],
                     label="Input Type",
+                    value="text"
                 )
+                # Text-to-speech inputs
+                with gr.Group() as text_inputs:
                     voice_dropdown = gr.Dropdown(
+                        choices=[v[0] for v in voices],
+                        label="Select Voice",
                         value=voices[0][0] if voices else None
                     )
                     text_input = gr.Textbox(label="Enter text", lines=3)
+                # Audio upload input
+                with gr.Group() as audio_inputs:
+                    audio_upload = gr.Audio(label="Upload Audio", type="filepath")
                 model_dropdown = gr.Dropdown(
                     choices=models,
                 video_output = gr.Video(label="Generated Video")
                 status_output = gr.Textbox(label="Status", interactive=False)
+        def toggle_inputs(input_type):
+            return (
+                gr.Group.update(visible=(input_type == "text")),
+                gr.Group.update(visible=(input_type == "audio"))
+            )
         input_type.change(
+            fn=toggle_inputs,
             inputs=[input_type],
+            outputs=[text_inputs, audio_inputs]
         )
+        def on_generate(voice_name, model_name, text, audio_file, input_type):
             voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
+            if input_type == "text" and not voice_id:
+                return None, "Invalid voice selected."
+            return process_video(voice_id, model_name, text, audio_file, input_type)
         generate_btn.click(
             fn=on_generate,
+            inputs=[voice_dropdown, model_dropdown, text_input, audio_upload, input_type],
             outputs=[video_output, status_output]
         )