Spaces:
Paused
Paused
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoProcessor | |
# Hugging Face repository id of the checkpoint to load. Use the base model
# repository, since it ships all of the config files.
model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"

# Module-level handles for the model and its processor. Both stay None until
# load_model() assigns them; process_video_question() checks for None before
# running inference.
model = processor = None
def load_model():
    """Load the VideoLLaMA3 model and processor into the module globals.

    The model is first requested with the ``flash_attention_2`` attention
    implementation. If that attempt raises for any reason, the model is
    loaded a second time without an explicit attention implementation.
    The processor is loaded afterwards from the same repository.

    Returns:
        bool: True when both the model and the processor were loaded,
        False when any step raised (the error and its traceback are
        printed instead of being propagated).
    """
    global model, processor

    try:
        print("Loading VideoLLaMA3 model...")
        print("This may take several minutes on first load...")

        # Arguments shared by both load attempts; only the attention
        # implementation differs between them.
        shared_kwargs = dict(
            trust_remote_code=True,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )

        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                attn_implementation="flash_attention_2",
                **shared_kwargs,
            )
            print("Loaded with flash attention")
        except Exception as flash_error:
            print(f"Flash attention failed: {flash_error}")
            print("Falling back to standard attention...")
            model = AutoModelForCausalLM.from_pretrained(
                model_name, **shared_kwargs
            )
            print("Loaded with standard attention")

        # A processor (not a bare tokenizer) is loaded for this model.
        processor = AutoProcessor.from_pretrained(
            model_name, trust_remote_code=True
        )
    except Exception as load_error:
        print(f"Error loading model: {load_error}")
        import traceback

        traceback.print_exc()
        return False

    print("Model and processor loaded successfully!")
    return True
def _extract_assistant_reply(decoded_text):
    """Return the assistant's answer from a decoded generation.

    If the decoded text contains a chat-template role header (the word
    ``assistant`` alone at the start of a line), everything after the last
    such header is returned. Otherwise the whole text is returned. The result
    is stripped of surrounding whitespace in both cases.

    Matching only a header on its own line avoids truncating answers that
    merely mention the word "assistant".

    NOTE(review): assumes the model's chat template renders the role header
    as ``assistant`` followed by a newline once special tokens are skipped --
    confirm against the checkpoint's chat template.
    """
    role_header = "assistant\n"
    position = decoded_text.rfind(role_header)
    while position != -1:
        # Accept the header only at the very start or right after a newline.
        if position == 0 or decoded_text[position - 1] == "\n":
            return decoded_text[position + len(role_header):].strip()
        position = decoded_text.rfind(role_header, 0, position)
    return decoded_text.strip()


def process_video_question(video_file, question):
    """Answer a question about an uploaded video using VideoLLaMA3.

    Args:
        video_file: Path of the uploaded video, or None when nothing was
            uploaded.
        question: The user's question. None, empty, or whitespace-only
            values are rejected with a prompt to enter a question.

    Returns:
        str: The model's answer, or a human-readable message when the model
        is not loaded, an input is missing, or processing raised an error
        (the error and its traceback are printed as well).
    """
    if model is None or processor is None:
        return "Model is not loaded. Please wait for the model to initialize or check the logs for errors."

    if video_file is None:
        return "Please upload a video file first."

    # Guard against None as well as blank text so .strip() cannot raise.
    if not question or not question.strip():
        return "Please enter a question about the video."

    try:
        print(f"Processing video: {video_file}")
        print(f"Question: {question}")

        # Conversation in the message format passed to the processor.
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": {
                            "video_path": video_file,
                            "fps": 1,
                            "max_frames": 128,
                        },
                    },
                    {"type": "text", "text": question},
                ],
            },
        ]

        inputs = processor(conversation=conversation, return_tensors="pt")

        # Send tensors to wherever the model actually lives instead of
        # calling .cuda() unconditionally, which fails on CPU-only hosts and
        # ignores the placement chosen by device_map="auto".
        target_device = getattr(model, "device", None)
        if target_device is None:
            target_device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = {
            key: value.to(target_device) if isinstance(value, torch.Tensor) else value
            for key, value in inputs.items()
        }

        # The model is loaded in bfloat16, so pixel values must match.
        if "pixel_values" in inputs:
            inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

        print("Generating response...")
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                use_cache=True,
            )

        decoded = processor.decode(output_ids[0], skip_special_tokens=True)

        # Drop any echoed conversation history, keeping only the answer.
        response = _extract_assistant_reply(decoded)

        print(f"Generated response: {response}")
        return response

    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        print(error_msg)
        import traceback

        traceback.print_exc()
        return error_msg
# Load the model once, at import time, when the Space starts. The result is
# kept in model_loaded so the UI below can show whether loading succeeded.
print(f"Initializing {model_name}...")
model_loaded = load_model()
if not model_loaded:
    # The leading emoji was mojibake in the original message; restored here.
    print("❌ Failed to load model. Check the logs above for details.")
# Build the Gradio interface: a video upload and question box on the left,
# the model's answer on the right, and a list of example questions below.
# Emoji in the labels were mojibake in the original and are restored here.
with gr.Blocks(title="VideoLLaMA3 Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎥 VideoRefer-VideoLLaMA3 Interactive Demo")
    gr.Markdown(f"**Model:** `{model_name}`")

    # The status banner reflects the outcome of the start-up load_model() call.
    if model_loaded:
        gr.Markdown("✅ **Model Status:** Loaded and ready!")
    else:
        gr.Markdown("❌ **Model Status:** Failed to load. Check logs for details.")

    gr.Markdown("Upload a video and ask questions about its content!")

    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.Video(
                label="📹 Upload Video",
                height=300,
            )
            question_input = gr.Textbox(
                label="❓ Ask a question about the video",
                placeholder="What is happening in this video?",
                lines=3,
            )
            submit_btn = gr.Button("🔍 Analyze Video", variant="primary", size="lg")

        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="🤖 AI Response",
                lines=12,
                placeholder="The AI response will appear here...",
                show_copy_button=True,
            )

    # Examples section
    with gr.Row():
        gr.Markdown("""
### 💡 Example Questions:
- "What objects can you see in this video?"
- "Describe the main action happening in detail"
- "What is the setting or location of this video?"
- "How many people are in the video and what are they doing?"
- "What emotions or mood does this video convey?"
- "Describe the sequence of events in chronological order"
""")

    # The button click and pressing Enter in the question box both run the
    # same handler with the same inputs and output.
    handler_kwargs = dict(
        fn=process_video_question,
        inputs=[video_input, question_input],
        outputs=output_text,
    )
    submit_btn.click(**handler_kwargs)
    question_input.submit(**handler_kwargs)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()