Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import librosa
|
| 5 |
+
from google import genai
|
| 6 |
+
from google.genai import types
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
from elevenlabs.client import ElevenLabs
|
| 9 |
+
|
| 10 |
+
# --- 1. Load API Keys from Environment Variables ---
# Standard replacement for Colab's 'userdata':
#  - local testing: export these variables in your terminal
#  - Hugging Face Spaces: set them under "Repository secrets"
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')

# --- 2. Initialize API Clients (Do this once, globally) ---
# Pre-bind every client to None BEFORE the try block: if client construction
# raises (e.g. a bad key), the original code left these names undefined, so
# generate_sfx later died with NameError instead of its friendly gr.Error.
genai_client = None
elevenlabs_client = None
llama_client = None

try:
    genai_client = genai.Client(api_key=GOOGLE_API_KEY)
    elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

    # Llama 3.1 405B is served through NVIDIA's OpenAI-compatible endpoint;
    # only build the client when the key looks like an NVIDIA key.
    if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
        llama_client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=LLAMA_405B_KEY,
        )

    if not all([genai_client, elevenlabs_client, llama_client]):
        print("WARNING: One or more API keys are missing. The app will fail if run.")

except Exception as e:
    # Startup must not crash the Space; generate_sfx re-checks the clients.
    print(f"Error initializing clients (this is OK at startup, but check keys): {e}")
|
| 35 |
+
|
| 36 |
+
# This is the long prompt from your script
# System prompt for Step 1 (Gemini): requests an exhaustive, strictly visual,
# chronological scene description — no audio, no summarizing, no inference.
# NOTE(review): the middle of the prompt is an elided placeholder line
# ("... (Your full Gemini prompt) ...") — paste the full prompt text back in
# before deploying, or Gemini will receive the literal placeholder.
prompt1 = """Role:
You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
... (Your full Gemini prompt) ...
Final Output Rule:
Produce a single, continuous, structured description following all the above rules.
Do not summarize, infer meaning, or include audio elements.
The output must be factual, visual, chronological, and exhaustive."""
|
| 44 |
+
|
| 45 |
+
# --- 3. The Main Workflow Function for Gradio ---
def generate_sfx(video_path):
    """Run the full video -> sound-effect pipeline on one uploaded video.

    Gradio calls this with the filesystem path of the user's upload.

    Args:
        video_path: Path to the uploaded video. Assumed to be mp4 — the
            Gemini request hard-codes mime_type='video/mp4' (TODO confirm
            other containers work or restrict the uploader).

    Returns:
        A 3-tuple matching gr.Interface's outputs, in order:
        (gemini_transcript, prompts_display_text, list_of_mp3_paths).

    Raises:
        gr.Error: if API keys are missing, the video cannot be read, or the
            Gemini / Llama API call fails. Per-clip ElevenLabs failures are
            logged and skipped instead of aborting the run.
    """
    if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
        raise gr.Error("API keys are not configured. The app admin needs to set the secrets.")

    # --- Step 0: Read the uploaded video file ---
    try:
        # Context manager closes the handle deterministically; the original
        # open(...).read() leaked the descriptor until garbage collection.
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()
    except Exception as e:
        raise gr.Error(f"Failed to read video file: {e}")

    # --- Step 1: Gemini Video Transcript ---
    try:
        response = genai_client.models.generate_content(
            model='models/gemini-2.5-pro',
            contents=types.Content(
                parts=[
                    types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
                    types.Part(text=prompt1)
                ]
            )
        )
        transcript = response.text
    except Exception as e:
        raise gr.Error(f"Gemini API Error: {e}")

    # --- Step 2: Llama Prompt Generation ---
    try:
        # NOTE(review): like prompt1, the middle of this prompt is an elided
        # placeholder — restore the full text before deploying.
        your_prompt = f"""Identify the suitable audio effects based on the given video transcript...
... (Your full Llama prompt) ...
Transcript: {transcript}"""

        completion = llama_client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=[
                {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
                {"role": "user", "content": your_prompt}
            ],
            temperature=0.5,
            top_p=1,
            max_tokens=2048,
            timeout=300.0
        )
        response_text = completion.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Llama (NVIDIA API) Error: {e}")

    # Clean Llama responses: keep only non-empty lines that contain the
    # "<prompt>;<duration>" separator expected by Step 3.
    prompts = response_text.splitlines()
    fixed_prompts = [p for p in prompts if p and ";" in p]

    if not fixed_prompts:
        return transcript, "Llama did not return any valid prompts.", []

    # --- Step 3: ElevenLabs Audio Generation ---
    output_audio_files = []   # mp3 paths handed back to the gallery
    sfx_prompts_text = []     # human-readable prompt list for the UI

    for i, line in enumerate(fixed_prompts):
        try:
            parts = line.split(";")
            if len(parts) < 2:
                continue  # Skip malformed lines

            # Strip stray whitespace so "boom ; 4.0"-style lines still
            # parse cleanly and display without padding artifacts.
            sfx_prompt = parts[0].strip()
            duration = float(parts[1].strip())
            sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")

            generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"

            audio = elevenlabs_client.text_to_sound_effects.convert(
                text=generation_prompt,
                duration_seconds=duration,
                prompt_influence=0.6
            )

            # Save the streamed audio chunks to a unique file per prompt.
            output_filename = f"generated_sfx_{i}.mp3"
            with open(output_filename, "wb") as f:
                for chunk in audio:
                    f.write(chunk)

            output_audio_files.append(output_filename)

        except Exception as e:
            # Don't crash the whole app, just skip this one audio clip
            # (a bad duration also lands here via the float() above).
            print(f"ElevenLabs error on prompt {i}: {e}")
            continue

    # --- 4. Return all outputs to the Gradio Interface ---
    # Order must match the 'outputs' list in gr.Interface.
    final_prompts_display = "\n".join(sfx_prompts_text)
    return transcript, final_prompts_display, output_audio_files
|
| 143 |
+
|
| 144 |
+
# --- 5. Create the Gradio Interface ---
demo = gr.Interface(
    fn=generate_sfx,  # The main function to call

    # Input component: A video uploader
    inputs=gr.Video(label="Upload Your Video Clip"),

    # Output components: A list matching the function's return values
    # (transcript, prompt display text, list of mp3 paths) in that order.
    outputs=[
        gr.Textbox(label="1. Gemini Visual Transcript"),
        gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
        # NOTE(review): gr.Gallery is intended for image/video grids — verify
        # it renders the returned .mp3 paths; gr.Files or per-clip gr.Audio
        # components may be a better fit for audio output.
        gr.Gallery(label="3. ElevenLabs Generated Sound Effects", columns=1)
    ],

    title="🎬 AI Video-to-Sound-Effect Workflow",
    description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
    # NOTE(review): allow_flagging was deprecated/renamed (flagging_mode) in
    # Gradio 4.x — confirm against the Gradio version pinned for this Space.
    allow_flagging="never",
    # We can use your 'deadpool 3.mp4' as an example if it's in the same folder
    # NOTE(review): assumes 'deadpool 3.mp4' sits next to app.py — Gradio
    # complains at startup if an example file is missing.
    examples=[["deadpool 3.mp4"]]
)
|
| 164 |
+
|
| 165 |
+
# --- 6. Launch the App ---
if __name__ == "__main__":
    # Use share=True to get a temporary public link
    # NOTE(review): share=True only matters for local runs — Hugging Face
    # Spaces serves the app itself and ignores/warns on share links.
    # debug=True surfaces tracebacks in the logs while developing.
    demo.launch(share=True, debug=True)
|