# CCAI_TEST / app.py
# Anson818's picture
# Update app.py
# e77b207 verified
import gradio as gr
import os
import time
import librosa
from google import genai
from google.genai import types
from openai import OpenAI
from elevenlabs.client import ElevenLabs
# --- 1. Load API Keys from Environment Variables ---
# This is the standard way, replacing Colab's 'userdata'
# For local testing, you'll set these in your terminal.
# For Hugging Face, you'll set these in "Repository secrets".
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
LLAMA_405B_KEY = os.environ.get('LLAMA_405B_KEY')

# --- 2. Initialize API Clients (Do this once, globally) ---
# Pre-declare the clients so that a failure partway through the try
# block cannot leave any of these names undefined. Previously, if
# genai.Client(...) raised, `llama_client` was never bound and the
# later `all([...])` check (and generate_sfx's guard) crashed with a
# NameError instead of reporting a missing-key error.
genai_client = None
elevenlabs_client = None
llama_client = None
try:
    genai_client = genai.Client(api_key=GOOGLE_API_KEY)
    elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    # The NVIDIA-hosted Llama endpoint uses an OpenAI-compatible API;
    # only build the client when the key looks like an NVIDIA key.
    if LLAMA_405B_KEY and LLAMA_405B_KEY.startswith("nvapi-"):
        llama_client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=LLAMA_405B_KEY
        )
    if not all([genai_client, elevenlabs_client, llama_client]):
        print("WARNING: One or more API keys are missing. The app will fail if run.")
except Exception as e:
    print(f"Error initializing clients (this is OK at startup, but check keys): {e}")
# This is the long prompt from your script
# Step-1 system prompt sent to Gemini: instructs the model to produce a
# purely visual, chronological, timestamped scene description of the
# uploaded video (audio is deliberately excluded — the SFX prompts are
# derived later from this visual transcript).
prompt1: str = """Role:
You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
Primary Objective:
Analyze the provided video and generate a detailed, chronological description of everything visually occurring in the footage. Focus entirely on what can be seen, not heard.
Core Instructions:
Follow these instructions exactly:
Visual-Only Focus
Describe only what is visible on-screen.
Ignore all sounds, dialogue, narration, or music.
Include on-screen text only if it appears as a visible object (e.g., sign, label, subtitle).
Chronological Detailing
Describe events strictly in the order they appear.
Use clear temporal markers such as “At the beginning…”, “Next…”, “Then…”, “After that…”, “Finally…”
Comprehensive Visual Content
Describe people, objects, settings, environments, lighting, colors, positions, and movements.
Include camera actions (pans, tilts, zooms, cuts, transitions).
Capture facial expressions, gestures, and body posture changes if visible.
Objectivity and Precision
Avoid interpretation, emotion, or speculation.
Describe only observable facts (e.g., say “The person raises their right arm,” not “The person waves hello”).
Level of Detail
Provide enough visual information for someone to recreate or storyboard the entire scene.
Include every key visual or motion change.
Output Formatting:
Use the following structured format:
[Timestamp or Sequence Indicator]
Detailed description of what is visually happening.
Example:
0:00–0:04 — A man in a dark blue jacket walks across a street. A red car passes behind him.
0:05–0:09 — The camera tilts upward to show a tall building with glass windows. The sky is cloudy.
0:10–0:13 — The man stops, looks up, and adjusts the strap of a black backpack.
If timestamps are unavailable, use sequence-based ordering (e.g., “Scene 1,” “Scene 2,” etc.).
Final Output Rule:
Produce a single, continuous, structured description following all the above rules.
Do not summarize, infer meaning, or include audio elements.
The output must be factual, visual, chronological, and exhaustive."""
# --- 3. The Main Workflow Function for Gradio ---
def _read_video_bytes(video_path):
    """Read the uploaded video into memory; raise gr.Error on failure."""
    try:
        # Context manager guarantees the handle is closed (the original
        # used a bare open(...).read(), which leaked the file handle).
        with open(video_path, 'rb') as f:
            return f.read()
    except Exception as e:
        raise gr.Error(f"Failed to read video file: {e}")


def _gemini_visual_transcript(video_bytes):
    """Step 1: ask Gemini for a purely visual transcript of the clip."""
    try:
        response = genai_client.models.generate_content(
            model='models/gemini-2.5-flash',
            contents=types.Content(
                parts=[
                    # The video is sent inline as raw bytes; mime type is
                    # assumed mp4 — NOTE(review): other containers may be
                    # uploaded by users, confirm Gemini tolerates them.
                    types.Part(inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')),
                    types.Part(text=prompt1)
                ]
            )
        )
        return response.text
    except Exception as e:
        raise gr.Error(f"Gemini API Error: {e}")


def _llama_sfx_prompts(transcript):
    """Step 2: turn the transcript into 'prompt;duration' SFX lines."""
    try:
        your_prompt = f"""Identify the suitable audio effects based on the given video transcript and
generate a suitable and detailed prompt for each audio effects for another audio generating AI
model to generate the audio effects. Note that the duration of each audio should be within 2-10
seconds. Only include the prompts for generating the sound effects
and do not include any other text, such as timestamps. Separate the prompt and the duration for
each audio effects with a new line. Output in the following format for each prompt and duration:
[prompt1];[duration1] (new line) [prompt2];[duration2] etc. only include the number of the duration
in [duration] No other text should be included in
the output. Do make the prompts with details, such as the intensity, feeling etc according to the
video transcript so that the high quality and suitable sound can be generated. Transcript: {transcript}"""
        completion = llama_client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=[
                {"role": "system", "content": "You are a prompt engineer that generates audio effect prompts based on a video transcript."},
                {"role": "user", "content": your_prompt}
            ],
            temperature=0.5,
            top_p=1,
            max_tokens=2048,
            timeout=300.0
        )
        return completion.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Llama (NVIDIA API) Error: {e}")


def _render_sound_effects(fixed_prompts):
    """
    Step 3: render each 'prompt;duration' line to an mp3 via ElevenLabs.

    Returns (display_lines, audio_file_paths). A failure on one prompt is
    logged and skipped so a single bad line cannot sink the whole run.
    """
    output_audio_files = []
    sfx_prompts_text = []  # human-readable prompt list for the UI
    for i, line in enumerate(fixed_prompts):
        try:
            parts = line.split(";")
            if len(parts) < 2:
                continue  # skip malformed lines
            sfx_prompt = parts[0].strip()
            # A non-numeric duration raises ValueError, caught below.
            duration = float(parts[1])
            sfx_prompts_text.append(f"{i+1}. {sfx_prompt} ({duration}s)")
            generation_prompt = f"Generate sound effects for a film clip based on the prompts below: {sfx_prompt}"
            audio = elevenlabs_client.text_to_sound_effects.convert(
                text=generation_prompt,
                duration_seconds=duration,
                prompt_influence=0.6
            )
            # The SDK returns an iterator of byte chunks; stream to disk.
            output_filename = f"generated_sfx_{i}.mp3"
            with open(output_filename, "wb") as f:
                for chunk in audio:
                    f.write(chunk)
            output_audio_files.append(output_filename)
            # Pause between calls — presumably to avoid rate limits.
            time.sleep(5)
        except Exception as e:
            print(f"ElevenLabs error on prompt {i}: {e}")
            # Don't crash the whole app, just skip this one audio
            continue
    return sfx_prompts_text, output_audio_files


def generate_sfx(video_path):
    """
    Run the full video -> sound-effects pipeline for one uploaded clip.

    Gradio passes the uploaded video as a filesystem path. Stages:
      1. Gemini produces a visual-only transcript of the video.
      2. Llama converts the transcript into 'prompt;duration' SFX lines.
      3. ElevenLabs renders each line to a local .mp3 file.

    Returns:
        (transcript, prompts_display, audio_file_paths) — one value per
        output component declared in gr.Interface.

    Raises:
        gr.Error: if API keys are missing, the file is unreadable, or an
        upstream API call fails.
    """
    if not all([GOOGLE_API_KEY, ELEVENLABS_API_KEY, LLAMA_405B_KEY, llama_client]):
        raise gr.Error("API keys are not configured...")

    video_bytes = _read_video_bytes(video_path)

    # --- Step 1: Gemini Video Transcript ---
    transcript = _gemini_visual_transcript(video_bytes)

    # --- Step 2: Llama Prompt Generation ---
    response_text = _llama_sfx_prompts(transcript)

    # Keep only non-empty lines that contain the ';' separator.
    prompts = response_text.splitlines()
    fixed_prompts = [p for p in prompts if p and ";" in p]
    if not fixed_prompts:
        return transcript, "Llama did not return any valid prompts", []

    # --- Step 3: ElevenLabs Audio Generation ---
    sfx_prompts_text, output_audio_files = _render_sound_effects(fixed_prompts)

    # --- 4. Return all outputs to the Gradio Interface ---
    # This must match the 'outputs' list in gr.Interface
    final_prompts_display = "\n".join(sfx_prompts_text)
    return transcript, final_prompts_display, output_audio_files
# --- 5. Create the Gradio Interface ---
# One input (video upload) mapped through generate_sfx to three outputs;
# the outputs list must stay in the same order as the function's return
# tuple (transcript, prompts text, list of audio file paths).
demo = gr.Interface(
    fn=generate_sfx,  # The main function to call
    # Input component: A video uploader
    inputs=gr.Video(label="Upload Your Video Clip"),
    # Output components: A list matching the function's return values
    outputs=[
        gr.Textbox(label="1. Gemini Visual Transcript"),
        gr.Textbox(label="2. Llama-Generated SFX Prompts", lines=8),
        # gr.File accepts the list of generated .mp3 paths.
        gr.File(label="3. ElevenLabs Generated Sound Effects")
    ],
    title="🎬 AI Video-to-Sound-Effect Workflow",
    description="Upload a video. The app will: 1. Transcribe visuals (Gemini). 2. Create SFX prompts (Llama). 3. Generate audio (ElevenLabs).",
    # NOTE(review): allow_flagging was deprecated in Gradio 4.x in favor
    # of flagging_mode — confirm which Gradio version this Space pins.
    allow_flagging="never",
)
# --- 6. Launch the App ---
if __name__ == "__main__":
    # Use share=True to get a temporary public link
    # NOTE(review): on Hugging Face Spaces the share flag is presumably
    # ignored (the Space provides its own URL) — confirm before relying
    # on it; debug=True surfaces tracebacks in the UI/console.
    demo.launch(share=True, debug=True)