File size: 9,045 Bytes
cf11d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9074cec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf11d21
9074cec
cf11d21
 
 
9074cec
cf11d21
bf75de6
cf11d21
 
8c98fcd
cf11d21
 
8c98fcd
cf11d21
 
8c98fcd
 
bf75de6
cf11d21
 
 
 
 
 
1e063d6
cf11d21
 
 
 
 
 
 
 
 
 
 
 
 
9074cec
 
 
 
 
 
 
 
 
 
cf11d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebc4817
 
cf11d21
ebc4817
cf11d21
 
 
 
 
 
 
ebc4817
cf11d21
 
 
 
ebc4817
4e5bba9
cf11d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e77b207
cf11d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf75de6
cf11d21
 
 
 
 
9590cfb
cf11d21
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import gradio as gr
import os
import time
import librosa
from google import genai
from google.genai import types
from openai import OpenAI
from elevenlabs.client import ElevenLabs

# --- 1. Load API Keys from Environment Variables ---
# This is the standard way, replacing Colab's 'userdata'
# For local testing, you'll set these in your terminal.
# For Hugging Face, you'll set these in "Repository secrets".
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')

# --- 2. Initialize API Clients (Do this once, globally) ---
# Pre-declare all clients so the names always exist at module level.
# Previously they were first bound inside the `try`, so a failure while
# constructing genai.Client/ElevenLabs left them undefined and every later
# reference (the all([...]) check, generate_sfx) raised NameError instead
# of the intended friendly warning.
genai_client = None
elevenlabs_client = None
llama_client = None

try:
    genai_client = genai.Client(api_key=GOOGLE_API_KEY)
    elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

    # Llama 3.1 405B is reached through NVIDIA's OpenAI-compatible
    # endpoint; only build the client when the key looks like an NVIDIA key.
    if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
        llama_client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=LLAMA_405B_KEY,
        )

    if not all([genai_client, elevenlabs_client, llama_client]):
        print("WARNING: One or more API keys are missing. The app will fail if run.")

except Exception as e:
    # Startup must not crash on bad/missing keys; generate_sfx() reports a
    # clear gr.Error to the user at request time instead.
    print(f"Error initializing clients (this is OK at startup, but check keys): {e}")

# Prompt for pipeline Step 1 (Gemini): requests an exhaustive, strictly
# visual, chronological description of the uploaded video. The resulting
# transcript is later handed to Llama inside generate_sfx() to derive
# sound-effect prompts. NOTE: this string is sent verbatim to the model —
# do not reformat or re-wrap it.
prompt1 = """Role:
 You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
Primary Objective:
Analyze the provided video and generate a detailed, chronological description of everything visually occurring in the footage. Focus entirely on what can be seen, not heard.
Core Instructions:
Follow these instructions exactly:
Visual-Only Focus

Describe only what is visible on-screen.

Ignore all sounds, dialogue, narration, or music.

Include on-screen text only if it appears as a visible object (e.g., sign, label, subtitle).

Chronological Detailing

Describe events strictly in the order they appear.

Use clear temporal markers such as “At the beginning…”, “Next…”, “Then…”, “After that…”, “Finally…”

Comprehensive Visual Content

Describe people, objects, settings, environments, lighting, colors, positions, and movements.

Include camera actions (pans, tilts, zooms, cuts, transitions).

Capture facial expressions, gestures, and body posture changes if visible.

Objectivity and Precision

Avoid interpretation, emotion, or speculation.

Describe only observable facts (e.g., say “The person raises their right arm,” not “The person waves hello”).

Level of Detail

Provide enough visual information for someone to recreate or storyboard the entire scene.

Include every key visual or motion change.
Output Formatting:
Use the following structured format:
[Timestamp or Sequence Indicator]
Detailed description of what is visually happening.

Example:
0:00–0:04 — A man in a dark blue jacket walks across a street. A red car passes behind him.
0:05–0:09 — The camera tilts upward to show a tall building with glass windows. The sky is cloudy.
0:10–0:13 — The man stops, looks up, and adjusts the strap of a black backpack.

If timestamps are unavailable, use sequence-based ordering (e.g., “Scene 1,” “Scene 2,” etc.).
Final Output Rule:
Produce a single, continuous, structured description following all the above rules.
 Do not summarize, infer meaning, or include audio elements.
 The output must be factual, visual, chronological, and exhaustive."""


# --- 3. The Main Workflow Function for Gradio ---
def generate_sfx(video_path):
    """
    Run the complete video -> sound-effects workflow for one upload.

    Pipeline:
      1. Gemini describes the video's visuals (driven by `prompt1`).
      2. Llama converts that transcript into "<prompt>;<duration>" lines.
      3. ElevenLabs renders each prompt to an .mp3 file on disk.

    Args:
        video_path: Filesystem path of the uploaded clip (supplied by the
            gr.Video input component).

    Returns:
        A 3-tuple matching the `outputs` list of gr.Interface:
        (gemini_transcript, sfx_prompts_display_text, list_of_mp3_paths).

    Raises:
        gr.Error: Surfaced in the UI when API keys are missing, the file
            cannot be read, or the Gemini/Llama calls fail. Per-clip
            ElevenLabs failures are logged and skipped instead.
    """
    if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
        raise gr.Error("API keys are not configured...")

    try:
        # Context manager guarantees the handle is closed; the previous
        # open(video_path, 'rb').read() leaked it if read() raised.
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()
    except Exception as e:
        raise gr.Error(f"Failed to read video file: {e}")

    # --- Step 1: Gemini Video Transcript ---
    try:
        response = genai_client.models.generate_content(
            model='models/gemini-2.5-flash',
            contents=types.Content(
                parts=[
                    # NOTE(review): mime_type is hard-coded to video/mp4;
                    # confirm Gradio always delivers mp4 (e.g. not .webm).
                    types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
                    types.Part(text=prompt1)
                ]
            )
        )
        transcript = response.text
    except Exception as e:
        raise gr.Error(f"Gemini API Error: {e}")

    # --- Step 2: Llama Prompt Generation ---
    try:
        your_prompt = f"""Identify the suitable audio effects based on the given video transcript and
    generate a suitable and detailed prompt for each audio effects for another audio generating AI
    model to generate the audio effects. Note that the duration of each audio should be within 2-10
    seconds. Only include the prompts for generating the sound effects
    and do not include any other text, such as timestamps. Separate the prompt and the duration for
    each audio effects with a new line. Output in the following format for each prompt and duration:
    [prompt1];[duration1] (new line) [prompt2];[duration2] etc. only include the number of the duration
    in [duration] No other text should be included in
    the output. Do make the prompts with details, such as the intensity, feeling etc according to the
    video transcript so that the high quality and suitable sound can be generated. Transcript: {transcript}"""

        completion = llama_client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=[
                {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
                {"role": "user", "content": your_prompt}
            ],
            temperature=0.5,
            top_p=1,
            max_tokens=2048,
            timeout=300.0
        )
        response_text = completion.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Llama (NVIDIA API) Error: {e}")

    # Keep only non-empty lines that contain the "prompt;duration" separator.
    fixed_prompts = [line for line in response_text.splitlines() if line and ";" in line]

    if not fixed_prompts:
        return transcript, "Llama did not return any valid prompts", []

    # --- Step 3: ElevenLabs Audio Generation ---
    output_audio_files = []
    sfx_prompts_text = []  # Human-readable prompt list shown in the UI

    # Timestamp the filenames so concurrent or back-to-back requests in the
    # same working directory don't overwrite each other's generated audio
    # (the old fixed names generated_sfx_{i}.mp3 collided across requests).
    run_id = int(time.time())

    for i, line in enumerate(fixed_prompts):
        try:
            parts = line.split(";")
            if len(parts) < 2:
                continue  # Skip malformed lines

            sfx_prompt = parts[0].strip()
            # strip() tolerates whitespace Llama may emit around the number;
            # the bare float(parts[1]) previously choked on e.g. " 5".
            duration = float(parts[1].strip())

            sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")

            generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"

            audio = elevenlabs_client.text_to_sound_effects.convert(
                text=generation_prompt,
                duration_seconds=duration,
                prompt_influence=0.6
            )

            # Stream the returned audio chunks to a unique file on disk.
            output_filename = f"generated_sfx_{run_id}_{i}.mp3"
            with open(output_filename, "wb") as f:
                for chunk in audio:
                    f.write(chunk)

            output_audio_files.append(output_filename)
            time.sleep(5)  # Crude pacing between ElevenLabs calls

        except Exception as e:
            # Don't crash the whole app; just skip this one audio clip.
            print(f"ElevenLabs error on prompt {i}: {e}")
            continue

    # --- 4. Return all outputs to the Gradio Interface ---
    # Order must match the 'outputs' list in gr.Interface.
    final_prompts_display = "\n".join(sfx_prompts_text)
    return transcript, final_prompts_display, output_audio_files

# --- 5. Create the Gradio Interface ---
demo = gr.Interface(
    fn=generate_sfx,  # The main function to call
    
    # Input component: a video uploader; Gradio hands generate_sfx a file path
    inputs=gr.Video(label="Upload Your Video Clip"),
    
    # Output components: must match generate_sfx's 3-tuple return, in order
    outputs=[
        gr.Textbox(label="1. Gemini Visual Transcript"),
        gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
        gr.File(label="3. ElevenLabs Generated Sound Effects")  # NOTE(review): receives a list of .mp3 paths — confirm gr.File (vs gr.Files) renders multiple files in this Gradio version
    ],
    
    title="🎬 AI Video-to-Sound-Effect Workflow",
    description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
    allow_flagging="never",  # hide the default "Flag" button
)

# --- 6. Launch the App ---
if __name__ == "__main__":
    # share=True requests a temporary public gradio.live link;
    # debug=True keeps the process attached and prints worker tracebacks.
    demo.launch(share=True, debug=True)