# CCAI_TEST / app.py
# Anson818's picture
# Update app.py
# e77b207 verified
import gradio as gr
import os
import time
import librosa
from google import genai
from google.genai import types
from openai import OpenAI
from elevenlabs.client import ElevenLabs
# --- 1. Load API Keys from Environment Variables ---
# This is the standard way, replacing Colab's 'userdata'
# For local testing, you'll set these in your terminal.
# For Hugging Face, you'll set these in "Repository secrets".
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')

# --- 2. Initialize API Clients (Do this once, globally) ---
# Pre-declare the clients so that a failure partway through the try
# block cannot leave any of these names undefined. Previously, if
# genai.Client(...) raised, `llama_client` was never bound and the
# later `all([...])` check (and generate_sfx's guard) crashed with a
# NameError instead of reporting a missing-key error.
genai_client = None
elevenlabs_client = None
llama_client = None
try:
    genai_client = genai.Client(api_key=GOOGLE_API_KEY)
    elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    # The NVIDIA-hosted Llama endpoint uses an OpenAI-compatible API;
    # only build the client when the key looks like an NVIDIA key.
    if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
        llama_client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=LLAMA_405B_KEY
        )
    if not all([genai_client, elevenlabs_client, llama_client]):
        print("WARNING: One or more API keys are missing. The app will fail if run.")
except Exception as e:
    print(f"Error initializing clients (this is OK at startup, but check keys): {e}")
# This is the long prompt from your script
# Step-1 system prompt sent to Gemini: instructs the model to produce a
# purely visual, chronological, timestamped scene description of the
# uploaded video (audio is deliberately excluded — the SFX prompts are
# derived later from this visual transcript).
prompt1: str = """Role:
You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
Primary Objective:
Analyze the provided video and generate a detailed, chronological description of everything visually occurring in the footage. Focus entirely on what can be seen, not heard.
Core Instructions:
Follow these instructions exactly:
Visual-Only Focus
Describe only what is visible on-screen.
Ignore all sounds, dialogue, narration, or music.
Include on-screen text only if it appears as a visible object (e.g., sign, label, subtitle).
Chronological Detailing
Describe events strictly in the order they appear.
Use clear temporal markers such as “At the beginning…”, “Next…”, “Then…”, “After that…”, “Finally…”
Comprehensive Visual Content
Describe people, objects, settings, environments, lighting, colors, positions, and movements.
Include camera actions (pans, tilts, zooms, cuts, transitions).
Capture facial expressions, gestures, and body posture changes if visible.
Objectivity and Precision
Avoid interpretation, emotion, or speculation.
Describe only observable facts (e.g., say “The person raises their right arm,” not “The person waves hello”).
Level of Detail
Provide enough visual information for someone to recreate or storyboard the entire scene.
Include every key visual or motion change.
Output Formatting:
Use the following structured format:
[Timestamp or Sequence Indicator]
Detailed description of what is visually happening.
Example:
0:00–0:04 — A man in a dark blue jacket walks across a street. A red car passes behind him.
0:05–0:09 — The camera tilts upward to show a tall building with glass windows. The sky is cloudy.
0:10–0:13 — The man stops, looks up, and adjusts the strap of a black backpack.
If timestamps are unavailable, use sequence-based ordering (e.g., “Scene 1,” “Scene 2,” etc.).
Final Output Rule:
Produce a single, continuous, structured description following all the above rules.
Do not summarize, infer meaning, or include audio elements.
The output must be factual, visual, chronological, and exhaustive."""
# --- 3. The Main Workflow Function for Gradio ---
def _read_video_bytes(video_path):
    """Read the uploaded video into memory; raise gr.Error on failure."""
    try:
        # Context manager guarantees the handle is closed (the original
        # used a bare open(...).read(), which leaked the file handle).
        with open(video_path, 'rb') as f:
            return f.read()
    except Exception as e:
        raise gr.Error(f"Failed to read video file: {e}")


def _gemini_visual_transcript(video_bytes):
    """Step 1: ask Gemini for a purely visual transcript of the clip."""
    try:
        response = genai_client.models.generate_content(
            model='models/gemini-2.5-flash',
            contents=types.Content(
                parts=[
                    # The video is sent inline as raw bytes; mime type is
                    # assumed mp4 — NOTE(review): other containers may be
                    # uploaded by users, confirm Gemini tolerates them.
                    types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
                    types.Part(text=prompt1)
                ]
            )
        )
        return response.text
    except Exception as e:
        raise gr.Error(f"Gemini API Error: {e}")


def _llama_sfx_prompts(transcript):
    """Step 2: turn the transcript into 'prompt;duration' SFX lines."""
    try:
        your_prompt = f"""Identify the suitable audio effects based on the given video transcript and
generate a suitable and detailed prompt for each audio effects for another audio generating AI
model to generate the audio effects. Note that the duration of each audio should be within 2-10
seconds. Only include the prompts for generating the sound effects
and do not include any other text, such as timestamps. Separate the prompt and the duration for
each audio effects with a new line. Output in the following format for each prompt and duration:
[prompt1];[duration1] (new line) [prompt2];[duration2] etc. only include the number of the duration
in [duration] No other text should be included in
the output. Do make the prompts with details, such as the intensity, feeling etc according to the
video transcript so that the high quality and suitable sound can be generated. Transcript: {transcript}"""
        completion = llama_client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=[
                {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
                {"role": "user", "content": your_prompt}
            ],
            temperature=0.5,
            top_p=1,
            max_tokens=2048,
            timeout=300.0
        )
        return completion.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Llama (NVIDIA API) Error: {e}")


def _render_sound_effects(fixed_prompts):
    """
    Step 3: render each 'prompt;duration' line to an mp3 via ElevenLabs.

    Returns (display_lines, audio_file_paths). A failure on one prompt is
    logged and skipped so a single bad line cannot sink the whole run.
    """
    output_audio_files = []
    sfx_prompts_text = []  # human-readable prompt list for the UI
    for i, line in enumerate(fixed_prompts):
        try:
            parts = line.split(";")
            if len(parts) < 2:
                continue  # skip malformed lines
            sfx_prompt = parts[0].strip()
            # A non-numeric duration raises ValueError, caught below.
            duration = float(parts[1])
            sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")
            generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"
            audio = elevenlabs_client.text_to_sound_effects.convert(
                text=generation_prompt,
                duration_seconds=duration,
                prompt_influence=0.6
            )
            # The SDK returns an iterator of byte chunks; stream to disk.
            output_filename = f"generated_sfx_{i}.mp3"
            with open(output_filename, "wb") as f:
                for chunk in audio:
                    f.write(chunk)
            output_audio_files.append(output_filename)
            # Pause between calls — presumably to avoid rate limits.
            time.sleep(5)
        except Exception as e:
            print(f"ElevenLabs error on prompt {i}: {e}")
            # Don't crash the whole app, just skip this one audio
            continue
    return sfx_prompts_text, output_audio_files


def generate_sfx(video_path):
    """
    Run the full video -> sound-effects pipeline for one uploaded clip.

    Gradio passes the uploaded video as a filesystem path. Stages:
      1. Gemini produces a visual-only transcript of the video.
      2. Llama converts the transcript into 'prompt;duration' SFX lines.
      3. ElevenLabs renders each line to a local .mp3 file.

    Returns:
        (transcript, prompts_display, audio_file_paths) — one value per
        output component declared in gr.Interface.

    Raises:
        gr.Error: if API keys are missing, the file is unreadable, or an
        upstream API call fails.
    """
    if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
        raise gr.Error("API keys are not configured...")

    video_bytes = _read_video_bytes(video_path)

    # --- Step 1: Gemini Video Transcript ---
    transcript = _gemini_visual_transcript(video_bytes)

    # --- Step 2: Llama Prompt Generation ---
    response_text = _llama_sfx_prompts(transcript)

    # Keep only non-empty lines that contain the ';' separator.
    prompts = response_text.splitlines()
    fixed_prompts = [p for p in prompts if p and ";" in p]
    if not fixed_prompts:
        return transcript, "Llama did not return any valid prompts", []

    # --- Step 3: ElevenLabs Audio Generation ---
    sfx_prompts_text, output_audio_files = _render_sound_effects(fixed_prompts)

    # --- 4. Return all outputs to the Gradio Interface ---
    # This must match the 'outputs' list in gr.Interface
    final_prompts_display = "\n".join(sfx_prompts_text)
    return transcript, final_prompts_display, output_audio_files
# --- 5. Create the Gradio Interface ---
# One input (video upload) mapped through generate_sfx to three outputs;
# the outputs list must stay in the same order as the function's return
# tuple (transcript, prompts text, list of audio file paths).
demo = gr.Interface(
    fn=generate_sfx,  # The main function to call
    # Input component: A video uploader
    inputs=gr.Video(label="Upload Your Video Clip"),
    # Output components: A list matching the function's return values
    outputs=[
        gr.Textbox(label="1. Gemini Visual Transcript"),
        gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
        # gr.File accepts the list of generated .mp3 paths.
        gr.File(label="3. ElevenLabs Generated Sound Effects")
    ],
    title="🎬 AI Video-to-Sound-Effect Workflow",
    description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
    # NOTE(review): allow_flagging was deprecated in Gradio 4.x in favor
    # of flagging_mode — confirm which Gradio version this Space pins.
    allow_flagging="never",
)
# --- 6. Launch the App ---
if __name__ == "__main__":
    # Use share=True to get a temporary public link
    # NOTE(review): on Hugging Face Spaces the share flag is presumably
    # ignored (the Space provides its own URL) — confirm before relying
    # on it; debug=True surfaces tracebacks in the UI/console.
    demo.launch(share=True, debug=True)