# Hugging Face Space: AI video-to-sound-effect workflow (Gradio app)
import os
import tempfile
import time

import gradio as gr
import librosa
from elevenlabs.client import ElevenLabs
from google import genai
from google.genai import types
from openai import OpenAI
# --- 1. Load API Keys from Environment Variables ---
# This is the standard way, replacing Colab's 'userdata'.
# For local testing, set these in your terminal.
# For Hugging Face, set these in "Repository secrets".
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')

# --- 2. Initialize API Clients (Do this once, globally) ---
# Pre-bind every client to None so that if construction fails part-way
# through the try block, later references (the all([...]) check below and
# the key check inside generate_sfx) see None instead of raising NameError.
genai_client = None
elevenlabs_client = None
llama_client = None
try:
    genai_client = genai.Client(api_key=GOOGLE_API_KEY)
    elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    # NVIDIA-hosted Llama endpoint; keys issued by NVIDIA start with "nvapi-".
    if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
        llama_client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=LLAMA_405B_KEY,
        )
    if not all([genai_client, elevenlabs_client, llama_client]):
        print("WARNING: One or more API keys are missing. The app will fail if run.")
except Exception as e:
    print(f"Error initializing clients (this is OK at startup, but check keys): {e}")
# System prompt for Gemini (Step 1): asks for a purely visual, chronological
# transcript of the uploaded video. The transcript is later fed to Llama to
# derive the sound-effect prompts, so it must be exhaustive and audio-free.
prompt1 = """Role:
You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
Primary Objective:
Analyze the provided video and generate a detailed, chronological description of everything visually occurring in the footage. Focus entirely on what can be seen, not heard.
Core Instructions:
Follow these instructions exactly:
Visual-Only Focus
Describe only what is visible on-screen.
Ignore all sounds, dialogue, narration, or music.
Include on-screen text only if it appears as a visible object (e.g., sign, label, subtitle).
Chronological Detailing
Describe events strictly in the order they appear.
Use clear temporal markers such as “At the beginning…”, “Next…”, “Then…”, “After that…”, “Finally…”
Comprehensive Visual Content
Describe people, objects, settings, environments, lighting, colors, positions, and movements.
Include camera actions (pans, tilts, zooms, cuts, transitions).
Capture facial expressions, gestures, and body posture changes if visible.
Objectivity and Precision
Avoid interpretation, emotion, or speculation.
Describe only observable facts (e.g., say “The person raises their right arm,” not “The person waves hello”).
Level of Detail
Provide enough visual information for someone to recreate or storyboard the entire scene.
Include every key visual or motion change.
Output Formatting:
Use the following structured format:
[Timestamp or Sequence Indicator]
Detailed description of what is visually happening.
Example:
0:00–0:04 — A man in a dark blue jacket walks across a street. A red car passes behind him.
0:05–0:09 — The camera tilts upward to show a tall building with glass windows. The sky is cloudy.
0:10–0:13 — The man stops, looks up, and adjusts the strap of a black backpack.
If timestamps are unavailable, use sequence-based ordering (e.g., “Scene 1,” “Scene 2,” etc.).
Final Output Rule:
Produce a single, continuous, structured description following all the above rules.
Do not summarize, infer meaning, or include audio elements.
The output must be factual, visual, chronological, and exhaustive."""
| # --- 3. The Main Workflow Function for Gradio --- | |
def generate_sfx(video_path):
    """
    Run the entire video -> sound-effects workflow for one uploaded video.

    Pipeline:
      1. Gemini 2.5 Flash produces a visual-only transcript of the video.
      2. Llama 3.1 405B (NVIDIA endpoint) turns the transcript into
         "[prompt];[duration]" lines, one sound effect per line.
      3. ElevenLabs renders each prompt into an MP3 file on disk.

    Args:
        video_path: Filesystem path to the uploaded video (from gr.Video).

    Returns:
        tuple: (transcript, prompts_display_text, audio_file_paths), matching
        the three output components declared in gr.Interface.

    Raises:
        gr.Error: if API keys are missing, the video cannot be read, or the
        Gemini/Llama calls fail. Per-clip ElevenLabs failures are logged and
        skipped so one bad prompt does not abort the whole run.
    """
    if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
        raise gr.Error("API keys are not configured...")

    try:
        # Read directly from the path Gradio hands us. The context manager
        # guarantees the handle is closed (the previous open(...).read()
        # leaked the file descriptor).
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()
    except Exception as e:
        raise gr.Error(f"Failed to read video file: {e}")

    # --- Step 1: Gemini Video Transcript ---
    try:
        response = genai_client.models.generate_content(
            model='models/gemini-2.5-flash',
            contents=types.Content(
                parts=[
                    # Video is sent inline as raw bytes; mime type assumed mp4.
                    types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
                    types.Part(text=prompt1)
                ]
            )
        )
        transcript = response.text
    except Exception as e:
        raise gr.Error(f"Gemini API Error: {e}")

    # --- Step 2: Llama Prompt Generation ---
    try:
        your_prompt = f"""Identify the suitable audio effects based on the given video transcript and
generate a suitable and detailed prompt for each audio effects for another audio generating AI
model to generate the audio effects. Note that the duration of each audio should be within 2-10
seconds. Only include the prompts for generating the sound effects
and do not include any other text, such as timestamps. Separate the prompt and the duration for
each audio effects with a new line. Output in the following format for each prompt and duration:
[prompt1];[duration1] (new line) [prompt2];[duration2] etc. only include the number of the duration
in [duration] No other text should be included in
the output. Do make the prompts with details, such as the intensity, feeling etc according to the
video transcript so that the high quality and suitable sound can be generated. Transcript: {transcript}"""
        completion = llama_client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=[
                {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
                {"role": "user", "content": your_prompt}
            ],
            temperature=0.5,
            top_p=1,
            max_tokens=2048,
            timeout=300.0
        )
        response_text = completion.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Llama (NVIDIA API) Error: {e}")

    # Keep only non-empty lines containing the "prompt;duration" separator.
    fixed_prompts = [p for p in response_text.splitlines() if p and ";" in p]
    if not fixed_prompts:
        return transcript, "Llama did not return any valid prompts", []

    # --- Step 3: ElevenLabs Audio Generation ---
    output_audio_files = []
    sfx_prompts_text = []  # human-readable prompt list shown in the UI
    for i, line in enumerate(fixed_prompts):
        try:
            parts = line.split(";")
            if len(parts) < 2:
                continue  # skip malformed lines
            # strip() tolerates "prompt ; 4" style spacing from the LLM; a
            # non-numeric duration raises ValueError and the clip is skipped.
            sfx_prompt = parts[0].strip()
            duration = float(parts[1].strip())
            sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")
            generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"
            audio = elevenlabs_client.text_to_sound_effects.convert(
                text=generation_prompt,
                duration_seconds=duration,
                prompt_influence=0.6
            )
            # Unique temp file per clip: the old fixed names
            # ("generated_sfx_0.mp3" in the CWD) collide when two users run
            # the app concurrently. delete=False keeps the file alive for
            # Gradio to serve after the handle closes.
            with tempfile.NamedTemporaryFile(
                suffix=".mp3", prefix=f"generated_sfx_{i}_", delete=False
            ) as f:
                for chunk in audio:
                    f.write(chunk)
                output_audio_files.append(f.name)
            time.sleep(5)  # crude rate limiting between ElevenLabs calls
        except Exception as e:
            # Don't crash the whole app; just skip this one audio clip.
            print(f"ElevenLabs error on prompt {i}: {e}")
            continue

    # --- 4. Return all outputs to the Gradio Interface ---
    # Order must match the 'outputs' list in gr.Interface.
    return transcript, "\n".join(sfx_prompts_text), output_audio_files
# --- 5. Create the Gradio Interface ---
# Three output components, in the exact order generate_sfx returns them:
# transcript text, prompt list text, and the generated audio files.
_sfx_outputs = [
    gr.Textbox(label="1. Gemini Visual Transcript"),
    gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
    gr.File(label="3. ElevenLabs Generated Sound Effects"),
]

demo = gr.Interface(
    fn=generate_sfx,  # the main workflow function
    inputs=gr.Video(label="Upload Your Video Clip"),
    outputs=_sfx_outputs,
    title="🎬 AI Video-to-Sound-Effect Workflow",
    description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
    allow_flagging="never",
)
# --- 6. Launch the App ---
if __name__ == "__main__":
    # share=True exposes a temporary public URL; debug=True surfaces errors.
    demo.launch(debug=True, share=True)