# Hugging Face Spaces app: video -> Gemini visual transcript -> Llama SFX prompts -> ElevenLabs audio.
import gradio as gr
import os
import time
import librosa
from google import genai
from google.genai import types
from openai import OpenAI
from elevenlabs.client import ElevenLabs
# --- 1. Load API Keys from Environment Variables ---
# This is the standard way, replacing Colab's 'userdata'.
# For local testing, set these in your terminal; on Hugging Face, use "Repository secrets".
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')

# --- 2. Initialize API Clients (do this once, globally) ---
# Pre-bind all clients to None so that a failure inside the try-block below
# never leaves these names undefined (later references in generate_sfx and
# the gr.Interface setup would otherwise raise NameError instead of the
# intended friendly warning).
genai_client = None
elevenlabs_client = None
llama_client = None
try:
    genai_client = genai.Client(api_key=GOOGLE_API_KEY)
    elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    # NVIDIA-hosted Llama endpoint, reached through the OpenAI-compatible API.
    # Only created when the key looks like an NVIDIA key ("nvapi-" prefix).
    if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
        llama_client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=LLAMA_405B_KEY
        )
    if not all([genai_client, elevenlabs_client, llama_client]):
        print("WARNING: One or more API keys are missing. The app will fail if run.")
except Exception as e:
    # Startup must not crash on bad/missing keys; generate_sfx() re-checks them.
    print(f"Error initializing clients (this is OK at startup, but check keys): {e}")
# prompt1: the fixed instruction sent to Gemini in Step 1 of generate_sfx().
# It asks for a purely visual, chronological scene description of the video
# (no audio/dialogue), which Step 2 then turns into sound-effect prompts.
prompt1 = """Role:
You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
Primary Objective:
Analyze the provided video and generate a detailed, chronological description of everything visually occurring in the footage. Focus entirely on what can be seen, not heard.
Core Instructions:
Follow these instructions exactly:
Visual-Only Focus
Describe only what is visible on-screen.
Ignore all sounds, dialogue, narration, or music.
Include on-screen text only if it appears as a visible object (e.g., sign, label, subtitle).
Chronological Detailing
Describe events strictly in the order they appear.
Use clear temporal markers such as “At the beginning…”, “Next…”, “Then…”, “After that…”, “Finally…”
Comprehensive Visual Content
Describe people, objects, settings, environments, lighting, colors, positions, and movements.
Include camera actions (pans, tilts, zooms, cuts, transitions).
Capture facial expressions, gestures, and body posture changes if visible.
Objectivity and Precision
Avoid interpretation, emotion, or speculation.
Describe only observable facts (e.g., say “The person raises their right arm,” not “The person waves hello”).
Level of Detail
Provide enough visual information for someone to recreate or storyboard the entire scene.
Include every key visual or motion change.
Output Formatting:
Use the following structured format:
[Timestamp or Sequence Indicator]
Detailed description of what is visually happening.
Example:
0:00–0:04 — A man in a dark blue jacket walks across a street. A red car passes behind him.
0:05–0:09 — The camera tilts upward to show a tall building with glass windows. The sky is cloudy.
0:10–0:13 — The man stops, looks up, and adjusts the strap of a black backpack.
If timestamps are unavailable, use sequence-based ordering (e.g., “Scene 1,” “Scene 2,” etc.).
Final Output Rule:
Produce a single, continuous, structured description following all the above rules.
Do not summarize, infer meaning, or include audio elements.
The output must be factual, visual, chronological, and exhaustive."""
# --- 3. The Main Workflow Function for Gradio ---
def generate_sfx(video_path):
    """Run the full video -> sound-effects pipeline for one uploaded clip.

    Pipeline: (1) Gemini produces a visual transcript of the video,
    (2) Llama turns the transcript into "prompt;duration" SFX lines,
    (3) ElevenLabs renders each line to an MP3 file on disk.

    Args:
        video_path: Filesystem path to the uploaded video (gr.Video passes a path).

    Returns:
        tuple: (transcript, prompts_display_text, list_of_generated_mp3_paths).

    Raises:
        gr.Error: If API keys are missing, the file cannot be read, or the
            Gemini / Llama calls fail. ElevenLabs failures are per-clip
            best-effort and are skipped rather than raised.
    """
    if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
        raise gr.Error("API keys are not configured...")
    try:
        # Context manager instead of a bare open(...).read() so the file
        # handle is always closed (the original leaked it).
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()
    except Exception as e:
        raise gr.Error(f"Failed to read video file: {e}")

    # --- Step 1: Gemini Video Transcript ---
    try:
        response = genai_client.models.generate_content(
            model='models/gemini-2.5-flash',
            contents=types.Content(
                parts=[
                    # NOTE(review): mime_type is hard-coded; assumes the upload
                    # is MP4 — confirm gr.Video's output format if other
                    # containers should be supported.
                    types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
                    types.Part(text=prompt1)
                ]
            )
        )
        transcript = response.text
    except Exception as e:
        raise gr.Error(f"Gemini API Error: {e}")

    # --- Step 2: Llama Prompt Generation ---
    try:
        # Instructs Llama to emit one "prompt;duration" pair per line,
        # which the parsing loop below depends on.
        your_prompt = f"""Identify the suitable audio effects based on the given video transcript and
generate a suitable and detailed prompt for each audio effects for another audio generating AI
model to generate the audio effects. Note that the duration of each audio should be within 2-10
seconds. Only include the prompts for generating the sound effects
and do not include any other text, such as timestamps. Separate the prompt and the duration for
each audio effects with a new line. Output in the following format for each prompt and duration:
[prompt1];[duration1] (new line) [prompt2];[duration2] etc. only include the number of the duration
in [duration] No other text should be included in
the output. Do make the prompts with details, such as the intensity, feeling etc according to the
video transcript so that the high quality and suitable sound can be generated. Transcript: {transcript}"""
        completion = llama_client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=[
                {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
                {"role": "user", "content": your_prompt}
            ],
            temperature=0.5,
            top_p=1,
            max_tokens=2048,
            timeout=300.0
        )
        response_text = completion.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Llama (NVIDIA API) Error: {e}")

    # Keep only non-empty lines that contain the "prompt;duration" separator.
    fixed_prompts = [p for p in response_text.splitlines() if p and ";" in p]
    if not fixed_prompts:
        return transcript, "Llama did not return any valid prompts", []

    # --- Step 3: ElevenLabs Audio Generation ---
    output_audio_files = []
    sfx_prompts_text = []  # Human-readable prompt list for the UI
    for i, line in enumerate(fixed_prompts):
        try:
            parts = line.split(";")
            if len(parts) < 2:
                continue  # Skip malformed lines (defensive; filter above guarantees a ";")
            # Strip stray whitespace Llama often leaves around the separator.
            sfx_prompt = parts[0].strip()
            duration = float(parts[1])  # ValueError lands in the except below
            # NOTE(review): a prompt is listed even if generation then fails,
            # so the numbered list can contain more entries than audio files.
            sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")
            generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"
            audio = elevenlabs_client.text_to_sound_effects.convert(
                text=generation_prompt,
                duration_seconds=duration,
                prompt_influence=0.6
            )
            # Stream the returned audio chunks to a unique file per prompt.
            output_filename = f"generated_sfx_{i}.mp3"
            with open(output_filename, "wb") as f:
                for chunk in audio:
                    f.write(chunk)
            output_audio_files.append(output_filename)
            time.sleep(5)  # Crude rate-limit between ElevenLabs calls
        except Exception as e:
            # Best-effort: log and skip this clip instead of failing the whole run.
            print(f"ElevenLabs error on prompt {i}: {e}")
            continue

    # --- 4. Return all outputs to the Gradio Interface ---
    # Order must match the 'outputs' list in gr.Interface.
    final_prompts_display = "\n".join(sfx_prompts_text)
    return transcript, final_prompts_display, output_audio_files
# --- 5. Create the Gradio Interface ---
# Wires generate_sfx's three return values to the three output components below.
demo = gr.Interface(
    fn=generate_sfx,  # The main function to call
    # Input component: a video uploader (generate_sfx receives its filepath).
    inputs=gr.Video(label="Upload Your Video Clip"),
    # Output components: order must match generate_sfx's return tuple.
    outputs=[
        gr.Textbox(label="1. Gemini Visual Transcript"),
        gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
        gr.File(label="3. ElevenLabs Generated Sound Effects")
    ],
    title="🎬 AI Video-to-Sound-Effect Workflow",
    description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
    # NOTE(review): allow_flagging was deprecated/renamed (flagging_mode) in
    # newer Gradio releases — confirm against the pinned gradio version.
    allow_flagging="never",
)
# --- 6. Launch the App ---
if __name__ == "__main__":
    # share=True creates a temporary public link; debug=True surfaces
    # tracebacks in the console. (Removed a stray trailing '|' token that
    # made this line a syntax error.)
    demo.launch(share=True, debug=True)