"""Gradio demo: video action recognition with an MMAction2 TSN model.

Loads a TSN recognizer (Kinetics-400 checkpoint) once at startup and
serves a Gradio interface that returns the top-5 predicted action
classes for an uploaded video. Intended for Hugging Face Spaces.
"""

import os
import sys
from operator import itemgetter

import torch  # noqa: F401 -- NOTE(review): unused here; kept in case MMAction2 relies on it being imported first. Confirm before removing.
from mmaction.apis import init_recognizer, inference_recognizer
import gradio as gr

# Paths for the Hugging Face Spaces deployment.
config_file = 'demo/demo_configs/tsn_r50_1x1x8_video_infer.py'
checkpoint_file = 'checkpoints/tsn_r50_8xb32-1x1x8-100e_kinetics400-rgb_20220818-2692d16c.pth'
# Kinetics-400 class-name map; predictions fall back to "Class N" if absent.
label_file = 'tools/data/kinetics/label_map_k400.txt'


def download_checkpoint():
    """Verify the model checkpoint is present on disk.

    Despite the name, this does NOT download anything: it only checks for
    the file and, when missing, creates the ``checkpoints/`` directory and
    prints instructions for obtaining the weights.

    Returns:
        bool: True if the checkpoint file exists, False otherwise.
    """
    if os.path.exists(checkpoint_file):
        return True
    os.makedirs('checkpoints', exist_ok=True)
    print("Model checkpoint not found. Please run 'python download_model.py' to download it.")
    print("Or place the checkpoint file manually at:", checkpoint_file)
    return False


# --- Startup: load the recognizer once, at import time -----------------------
print("Initializing model...")
if not download_checkpoint():
    print("❌ Cannot initialize model without checkpoint. Exiting...")
    # sys.exit instead of exit(): exit() is a site-module convenience and is
    # not guaranteed outside interactive sessions.
    sys.exit(1)

try:
    model = init_recognizer(config_file, checkpoint_file, device='cpu')
    print("✅ Model loaded successfully!")
except Exception as e:  # broad by design: any load failure must abort with a message
    print(f"❌ Error loading model: {e}")
    print("Please check that the config file and checkpoint are correct.")
    sys.exit(1)


def _format_top5(pred_scores):
    """Return the top-5 (label, formatted-score) pairs for a score vector.

    Args:
        pred_scores (list[float]): Per-class prediction scores.

    Returns:
        list[tuple[str, str]]: (class label, score to 4 decimal places),
        highest score first. Uses generic "Class N" labels when the
        Kinetics-400 label map is not on disk.
    """
    score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))
    top5 = sorted(score_tuples, key=itemgetter(1), reverse=True)[:5]
    if os.path.exists(label_file):
        with open(label_file, 'r') as f:
            labels = [x.strip() for x in f.readlines()]
        return [(labels[k[0]], f"{k[1]:.4f}") for k in top5]
    return [(f"Class {k[0]}", f"{k[1]:.4f}") for k in top5]


def analyze_video(video):
    """Analyze a video for action recognition.

    Args:
        video: Filepath of the uploaded video (as passed by ``gr.Video``),
            or None when nothing was uploaded.

    Returns:
        str: Human-readable top-5 predictions, the raw result when the
        model output has no ``pred_score`` attribute, or an error message.
    """
    try:
        if video is None:
            return "Please upload a video file."

        print(f"Processing video: {video}")
        results = inference_recognizer(model, video)

        # Recent MMAction2 returns a data sample exposing .pred_score;
        # anything else is shown raw rather than guessed at.
        if not hasattr(results, 'pred_score'):
            return f"Analysis complete. Raw result: {results}"

        result_text = "Top 5 Predictions:\n"
        for i, (label, score) in enumerate(_format_top5(results.pred_score.tolist()), 1):
            result_text += f"{i}. {label}: {score}\n"
        return result_text
    except Exception as e:  # surface the failure in the UI instead of crashing the worker
        return f"Error processing video: {str(e)}"


# BUGFIX: the original `examples=[[...] if exists else None]` yielded `[None]`
# when the sample clip was missing, which Gradio rejects. Omit examples
# entirely (None) in that case instead.
_examples = [["demo/demo.mp4"]] if os.path.exists("demo/demo.mp4") else None

# Gradio UI wiring.
demo = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload Video", height=300),
    outputs=gr.Textbox(label="Analysis Results", lines=10),
    title="🎬 GenVidBench - Video Action Recognition",
    description="""
    Upload a video to analyze its content using state-of-the-art action recognition models.
    This demo uses TSN (Temporal Segment Networks) trained on Kinetics-400 dataset.

    **Supported formats:** MP4, AVI, MOV, etc.
    **Max duration:** Recommended under 30 seconds for faster processing.
    """,
    examples=_examples,
    cache_examples=False,
    theme=gr.themes.Soft(),
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()