Anson818 committed on
Commit
cf11d21
·
verified ·
1 Parent(s): 83d0b5c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +168 -0
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import time
4
+ import librosa
5
+ from google import genai
6
+ from google.genai import types
7
+ from openai import OpenAI
8
+ from elevenlabs.client import ElevenLabs
9
+
10
+ # --- 1. Load API Keys from Environment Variables ---
11
+ # This is the standard way, replacing Colab's 'userdata'
12
+ # For local testing, you'll set these in your terminal.
13
+ # For Hugging Face, you'll set these in "Repository secrets".
14
+ GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
15
+ ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
16
+ LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')
17
+
18
+ # --- 2. Initialize API Clients (Do this once, globally) ---
19
+ try:
20
+ genai_client = genai.Client(api_key=GOOGLE_API_KEY)
21
+ elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
22
+
23
+ llama_client = None
24
+ if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
25
+ base_url = "https://integrate.api.nvidia.com/v1"
26
+ llama_client = OpenAI(
27
+ base_url=base_url,
28
+ api_key=LLAMA_405B_KEY
29
+ )
30
+ if not all([genai_client, elevenlabs_client, llama_client]):
31
+ print("WARNING: One or more API keys are missing. The app will fail if run.")
32
+
33
+ except Exception as e:
34
+ print(f"Error initializing clients (this is OK at startup, but check keys): {e}")
35
+
36
+ # This is the long prompt from your script
37
+ prompt1 = """Role:
38
+ You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
39
+ ... (Your full Gemini prompt) ...
40
+ Final Output Rule:
41
+ Produce a single, continuous, structured description following all the above rules.
42
+ Do not summarize, infer meaning, or include audio elements.
43
+ The output must be factual, visual, chronological, and exhaustive."""
44
+
45
+ # --- 3. The Main Workflow Function for Gradio ---
46
+ def generate_sfx(video_path):
47
+ """
48
+ This single function runs your entire workflow.
49
+ Gradio will pass the user's uploaded video path to this function.
50
+ """
51
+ if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
52
+ raise gr.Error("API keys are not configured. The app admin needs to set the secrets.")
53
+
54
+ try:
55
+ # --- Step 0: Read the uploaded video file ---
56
+ video_bytes = open(video_path, 'rb').read()
57
+ except Exception as e:
58
+ raise gr.Error(f"Failed to read video file: {e}")
59
+
60
+ # --- Step 1: Gemini Video Transcript ---
61
+ try:
62
+ response = genai_client.models.generate_content(
63
+ model='models/gemini-2.5-pro',
64
+ contents=types.Content(
65
+ parts=[
66
+ types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
67
+ types.Part(text=prompt1)
68
+ ]
69
+ )
70
+ )
71
+ transcript = response.text
72
+ except Exception as e:
73
+ raise gr.Error(f"Gemini API Error: {e}")
74
+
75
+ # --- Step 2: Llama Prompt Generation ---
76
+ try:
77
+ your_prompt = f"""Identify the suitable audio effects based on the given video transcript...
78
+ ... (Your full Llama prompt) ...
79
+ Transcript: {transcript}"""
80
+
81
+ completion = llama_client.chat.completions.create(
82
+ model="meta/llama-3.1-405b-instruct",
83
+ messages=[
84
+ {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
85
+ {"role": "user", "content": your_prompt}
86
+ ],
87
+ temperature=0.5,
88
+ top_p=1,
89
+ max_tokens=2048,
90
+ timeout=300.0
91
+ )
92
+ response_text = completion.choices[0].message.content
93
+ except Exception as e:
94
+ raise gr.Error(f"Llama (NVIDIA API) Error: {e}")
95
+
96
+ # Clean Llama responses (your logic, made more robust)
97
+ prompts = response_text.splitlines()
98
+ fixed_prompts = [p for p in prompts if p and ";" in p] # Keep only valid, non-empty lines with a separator
99
+
100
+ if not fixed_prompts:
101
+ return transcript, "Llama did not return any valid prompts.", []
102
+
103
+ # --- Step 3: ElevenLabs Audio Generation ---
104
+ output_audio_files = []
105
+ sfx_prompts_text = [] # To display the prompts in the UI
106
+
107
+ for i, line in enumerate(fixed_prompts):
108
+ try:
109
+ parts = line.split(";")
110
+ if len(parts) < 2:
111
+ continue # Skip malformed lines
112
+
113
+ sfx_prompt = parts[0]
114
+ duration = float(parts[1])
115
+ sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")
116
+
117
+ generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"
118
+
119
+ audio = elevenlabs_client.text_to_sound_effects.convert(
120
+ text=generation_prompt,
121
+ duration_seconds=duration,
122
+ prompt_influence=0.6
123
+ )
124
+
125
+ # Save the audio to a unique file
126
+ output_filename = f"generated_sfx_{i}.mp3"
127
+ with open(output_filename, "wb") as f:
128
+ for chunk in audio:
129
+ f.write(chunk)
130
+
131
+ # Add the file path to our list
132
+ output_audio_files.append(output_filename)
133
+
134
+ except Exception as e:
135
+ print(f"ElevenLabs error on prompt {i}: {e}")
136
+ # Don't crash the whole app, just skip this one audio
137
+ continue
138
+
139
+ # --- 4. Return all outputs to the Gradio Interface ---
140
+ # This must match the 'outputs' list in gr.Interface
141
+ final_prompts_display = "\n".join(sfx_prompts_text)
142
+ return transcript, final_prompts_display, output_audio_files
143
+
144
+ # --- 5. Create the Gradio Interface ---
145
+ demo = gr.Interface(
146
+ fn=generate_sfx, # The main function to call
147
+
148
+ # Input component: A video uploader
149
+ inputs=gr.Video(label="Upload Your Video Clip"),
150
+
151
+ # Output components: A list matching the function's return values
152
+ outputs=[
153
+ gr.Textbox(label="1. Gemini Visual Transcript"),
154
+ gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
155
+ gr.Gallery(label="3. ElevenLabs Generated Sound Effects", columns=1)
156
+ ],
157
+
158
+ title="🎬 AI Video-to-Sound-Effect Workflow",
159
+ description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
160
+ allow_flagging="never",
161
+ # We can use your 'deadpool 3.mp4' as an example if it's in the same folder
162
+ examples=[["deadpool 3.mp4"]]
163
+ )
164
+
165
+ # --- 6. Launch the App ---
166
+ if __name__ == "__main__":
167
+ # Use share=True to get a temporary public link
168
+ demo.launch(share=True, debug=True)