Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import librosa
|
| 5 |
+
from google import genai
|
| 6 |
+
from google.genai import types
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
from elevenlabs.client import ElevenLabs
|
| 9 |
+
|
| 10 |
+
# --- 1. Load API Keys from Environment Variables ---
# Standard replacement for Colab's 'userdata':
#  - local testing: export these variables in your terminal
#  - Hugging Face Spaces: set them under "Repository secrets"
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')

# --- 2. Initialize API Clients (Do this once, globally) ---
# Pre-bind every client to None BEFORE the try block: if client construction
# raises (e.g. a bad key), the original code left these names undefined, so
# generate_sfx later died with NameError instead of its friendly gr.Error.
genai_client = None
elevenlabs_client = None
llama_client = None

try:
    genai_client = genai.Client(api_key=GOOGLE_API_KEY)
    elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

    # Llama 3.1 405B is served through NVIDIA's OpenAI-compatible endpoint;
    # only build the client when the key looks like an NVIDIA key.
    if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
        llama_client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=LLAMA_405B_KEY,
        )

    if not all([genai_client, elevenlabs_client, llama_client]):
        print("WARNING: One or more API keys are missing. The app will fail if run.")

except Exception as e:
    # Startup must not crash the Space; generate_sfx re-checks the clients.
    print(f"Error initializing clients (this is OK at startup, but check keys): {e}")
|
| 35 |
+
|
| 36 |
+
# This is the long prompt from your script
# System prompt for Step 1 (Gemini): requests an exhaustive, strictly visual,
# chronological scene description — no audio, no summarizing, no inference.
# NOTE(review): the middle of the prompt is an elided placeholder line
# ("... (Your full Gemini prompt) ...") — paste the full prompt text back in
# before deploying, or Gemini will receive the literal placeholder.
prompt1 = """Role:
You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
... (Your full Gemini prompt) ...
Final Output Rule:
Produce a single, continuous, structured description following all the above rules.
Do not summarize, infer meaning, or include audio elements.
The output must be factual, visual, chronological, and exhaustive."""
|
| 44 |
+
|
| 45 |
+
# --- 3. The Main Workflow Function for Gradio ---
def generate_sfx(video_path):
    """Run the full video -> sound-effect pipeline on one uploaded video.

    Gradio calls this with the filesystem path of the user's upload.

    Args:
        video_path: Path to the uploaded video. Assumed to be mp4 — the
            Gemini request hard-codes mime_type='video/mp4' (TODO confirm
            other containers work or restrict the uploader).

    Returns:
        A 3-tuple matching gr.Interface's outputs, in order:
        (gemini_transcript, prompts_display_text, list_of_mp3_paths).

    Raises:
        gr.Error: if API keys are missing, the video cannot be read, or the
            Gemini / Llama API call fails. Per-clip ElevenLabs failures are
            logged and skipped instead of aborting the run.
    """
    if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
        raise gr.Error("API keys are not configured. The app admin needs to set the secrets.")

    # --- Step 0: Read the uploaded video file ---
    try:
        # Context manager closes the handle deterministically; the original
        # open(...).read() leaked the descriptor until garbage collection.
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()
    except Exception as e:
        raise gr.Error(f"Failed to read video file: {e}")

    # --- Step 1: Gemini Video Transcript ---
    try:
        response = genai_client.models.generate_content(
            model='models/gemini-2.5-pro',
            contents=types.Content(
                parts=[
                    types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
                    types.Part(text=prompt1)
                ]
            )
        )
        transcript = response.text
    except Exception as e:
        raise gr.Error(f"Gemini API Error: {e}")

    # --- Step 2: Llama Prompt Generation ---
    try:
        # NOTE(review): like prompt1, the middle of this prompt is an elided
        # placeholder — restore the full text before deploying.
        your_prompt = f"""Identify the suitable audio effects based on the given video transcript...
... (Your full Llama prompt) ...
Transcript: {transcript}"""

        completion = llama_client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=[
                {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
                {"role": "user", "content": your_prompt}
            ],
            temperature=0.5,
            top_p=1,
            max_tokens=2048,
            timeout=300.0
        )
        response_text = completion.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Llama (NVIDIA API) Error: {e}")

    # Clean Llama responses: keep only non-empty lines that contain the
    # "<prompt>;<duration>" separator expected by Step 3.
    prompts = response_text.splitlines()
    fixed_prompts = [p for p in prompts if p and ";" in p]

    if not fixed_prompts:
        return transcript, "Llama did not return any valid prompts.", []

    # --- Step 3: ElevenLabs Audio Generation ---
    output_audio_files = []   # mp3 paths handed back to the gallery
    sfx_prompts_text = []     # human-readable prompt list for the UI

    for i, line in enumerate(fixed_prompts):
        try:
            parts = line.split(";")
            if len(parts) < 2:
                continue  # Skip malformed lines

            # Strip stray whitespace so "boom ; 4.0"-style lines still
            # parse cleanly and display without padding artifacts.
            sfx_prompt = parts[0].strip()
            duration = float(parts[1].strip())
            sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")

            generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"

            audio = elevenlabs_client.text_to_sound_effects.convert(
                text=generation_prompt,
                duration_seconds=duration,
                prompt_influence=0.6
            )

            # Save the streamed audio chunks to a unique file per prompt.
            output_filename = f"generated_sfx_{i}.mp3"
            with open(output_filename, "wb") as f:
                for chunk in audio:
                    f.write(chunk)

            output_audio_files.append(output_filename)

        except Exception as e:
            # Don't crash the whole app, just skip this one audio clip
            # (a bad duration also lands here via the float() above).
            print(f"ElevenLabs error on prompt {i}: {e}")
            continue

    # --- 4. Return all outputs to the Gradio Interface ---
    # Order must match the 'outputs' list in gr.Interface.
    final_prompts_display = "\n".join(sfx_prompts_text)
    return transcript, final_prompts_display, output_audio_files
|
| 143 |
+
|
| 144 |
+
# --- 5. Create the Gradio Interface ---
demo = gr.Interface(
    fn=generate_sfx,  # The main function to call

    # Input component: A video uploader
    inputs=gr.Video(label="Upload Your Video Clip"),

    # Output components: A list matching the function's return values
    # (transcript, prompt display text, list of mp3 paths) in that order.
    outputs=[
        gr.Textbox(label="1. Gemini Visual Transcript"),
        gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
        # NOTE(review): gr.Gallery is intended for image/video grids — verify
        # it renders the returned .mp3 paths; gr.Files or per-clip gr.Audio
        # components may be a better fit for audio output.
        gr.Gallery(label="3. ElevenLabs Generated Sound Effects", columns=1)
    ],

    title="🎬 AI Video-to-Sound-Effect Workflow",
    description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
    # NOTE(review): allow_flagging was deprecated/renamed (flagging_mode) in
    # Gradio 4.x — confirm against the Gradio version pinned for this Space.
    allow_flagging="never",
    # We can use your 'deadpool 3.mp4' as an example if it's in the same folder
    # NOTE(review): assumes 'deadpool 3.mp4' sits next to app.py — Gradio
    # complains at startup if an example file is missing.
    examples=[["deadpool 3.mp4"]]
)
|
| 164 |
+
|
| 165 |
+
# --- 6. Launch the App ---
if __name__ == "__main__":
    # Use share=True to get a temporary public link
    # NOTE(review): share=True only matters for local runs — Hugging Face
    # Spaces serves the app itself and ignores/warns on share links.
    # debug=True surfaces tracebacks in the logs while developing.
    demo.launch(share=True, debug=True)
|