# Spaces: Chaitanya-aitf / dev_caio (Paused)
#
# File size: 9,756 Bytes

"""
ShortSmith v2 - Gradio Application

Hugging Face Space interface for video highlight extraction.
Features:
- Multi-modal analysis (visual + audio + motion)
- Domain-optimized presets
- Person-specific filtering (optional)
- Scene-aware clip cutting
"""

import os
import sys
import tempfile
import shutil
from pathlib import Path
import time
import traceback

import gradio as gr

# Add project root to path so the sibling `utils` and `pipeline` packages
# imported by this module can be resolved regardless of the working directory.
sys.path.insert(0, str(Path(__file__).parent))

# Initialize logging
# Prefer the project's logger; if it cannot be imported or configured, fall
# back to the standard library so the app still starts.
try:
    from utils.logger import setup_logging, get_logger
    setup_logging(log_level="INFO", log_to_console=True)
    logger = get_logger("app")
except Exception as exc:
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("app")
    # Record why the fallback was taken instead of swallowing the error silently.
    logger.warning("Project logger unavailable (%s); using basic logging", exc)


def process_video(
    video_file,
    domain,
    num_clips,
    clip_duration,
    reference_image,
    custom_prompt,
    progress=gr.Progress()
):
    """
    Run the highlight-extraction pipeline on an uploaded video.

    Args:
        video_file: Uploaded video file path (None when nothing was uploaded)
        domain: Content domain label from the UI; labels not present in the
            internal mapping fall back to "general"
        num_clips: Number of clips to extract
        clip_duration: Duration of each clip in seconds
        reference_image: Optional reference image for person filtering; an
            image that fails validation is ignored with a logged warning
        custom_prompt: Optional custom instructions
        progress: Gradio progress tracker

    Returns:
        Tuple of (status_message, clip1, clip2, clip3, log_text). Clip slots
        without a clip are None. Failures are reported through
        status_message instead of being raised.
    """
    if video_file is None:
        return "Please upload a video first.", None, None, None, ""

    log_messages = []

    def log(msg):
        # Mirror every message into the UI log and the application logger.
        log_messages.append(f"[{time.strftime('%H:%M:%S')}] {msg}")
        logger.info(msg)

    # Tracked outside the try block so the finally clause can release them
    # no matter where processing stops.
    pipeline = None
    output_dir = None
    keep_output = False

    try:
        video_path = Path(video_file)
        log(f"Processing video: {video_path.name}")
        progress(0.05, desc="Validating video...")

        # Import pipeline components lazily so the UI can start even if the
        # heavy dependencies are slow or fail to import.
        from utils.helpers import validate_video_file, validate_image_file, format_duration
        from pipeline.orchestrator import PipelineOrchestrator

        # Validate video
        validation = validate_video_file(video_file)
        if not validation.is_valid:
            return f"Error: {validation.error_message}", None, None, None, "\n".join(log_messages)

        log(f"Video size: {validation.file_size / (1024*1024):.1f} MB")

        # Validate reference image if provided
        ref_path = None
        if reference_image is not None:
            ref_validation = validate_image_file(reference_image)
            if ref_validation.is_valid:
                ref_path = reference_image
                log(f"Reference image: {Path(reference_image).name}")
            else:
                log(f"Warning: Invalid reference image - {ref_validation.error_message}")

        # Map domain string to internal value
        domain_map = {
            "Sports": "sports",
            "Vlogs": "vlogs",
            "Music Videos": "music",
            "Podcasts": "podcasts",
            "Gaming": "gaming",
            "General": "general",
        }
        domain_value = domain_map.get(domain, "general")
        log(f"Domain: {domain_value}")

        # Create output directory
        output_dir = Path(tempfile.mkdtemp(prefix="shortsmith_output_"))
        log(f"Output directory: {output_dir}")

        # Initialize pipeline
        progress(0.1, desc="Initializing AI models...")
        log("Initializing pipeline...")
        pipeline = PipelineOrchestrator()

        # Process video
        progress(0.2, desc="Analyzing video...")
        log(f"Processing: {int(num_clips)} clips @ {int(clip_duration)}s each")

        result = pipeline.process(
            video_path=video_path,
            num_clips=int(num_clips),
            clip_duration=float(clip_duration),
            domain=domain_value,
            reference_image=ref_path,
            custom_prompt=custom_prompt.strip() if custom_prompt else None,
        )

        progress(0.9, desc="Extracting clips...")

        if not result.success:
            log(f"Processing failed: {result.error_message}")
            return f"Error: {result.error_message}", None, None, None, "\n".join(log_messages)

        log(f"Processing complete in {result.processing_time:.1f}s")

        # Copy clips out of the pipeline's working area before it is cleaned
        # up, so the returned paths stay valid for the UI.
        clip_paths = []
        for i, clip in enumerate(result.clips):
            if not clip.clip_path.exists():
                log(f"Warning: clip {i+1} file not found, skipping")
                continue
            output_path = output_dir / f"highlight_{i+1}.mp4"
            shutil.copy2(clip.clip_path, output_path)
            clip_paths.append(str(output_path))
            log(f"Clip {i+1}: {format_duration(clip.start_time)} - {format_duration(clip.end_time)} (score: {clip.hype_score:.2f})")

        # Only keep the output directory when it actually holds clips that
        # the caller will be handed paths to.
        keep_output = bool(clip_paths)

        status = f"Successfully extracted {len(clip_paths)} highlight clips!\nProcessing time: {result.processing_time:.1f}s"
        progress(1.0, desc="Done!")

        # Return up to 3 clips; missing slots are None.
        clip1, clip2, clip3 = (clip_paths + [None, None, None])[:3]

        return status, clip1, clip2, clip3, "\n".join(log_messages)

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        log(error_msg)
        log(traceback.format_exc())
        logger.exception("Pipeline error")
        return error_msg, None, None, None, "\n".join(log_messages)

    finally:
        # Release pipeline resources on every exit path, including exceptions
        # raised while processing or copying clips.
        if pipeline is not None:
            try:
                pipeline.cleanup()
            except Exception:
                # A cleanup failure must not discard an otherwise valid result.
                logger.exception("Pipeline cleanup failed")
        # Do not leave an unused temporary directory behind on failure.
        if output_dir is not None and not keep_output:
            shutil.rmtree(output_dir, ignore_errors=True)


# Build Gradio interface
# Layout: a header, a two-column row (inputs on the left, results on the
# right), a footer, and a single click handler wired to process_video.
with gr.Blocks(
    title="ShortSmith v2",
    theme=gr.themes.Soft(),
    css="""
    .container { max-width: 1200px; margin: auto; }
    .output-video { min-height: 200px; }
    """,
) as demo:

    gr.Markdown("""
    # 🎬 ShortSmith v2
    ### AI-Powered Video Highlight Extractor

    Upload a video and automatically extract the most engaging highlight clips using AI analysis.
    """)

    with gr.Row():
        # ---- Left column: everything the user provides ----
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Input")

            video_input = gr.Video(label="Upload Video", sources=["upload"])

            with gr.Accordion("⚙️ Settings", open=True):
                # Labels shown here are the ones process_video maps to
                # internal domain values.
                domain_dropdown = gr.Dropdown(
                    choices=["Sports", "Vlogs", "Music Videos", "Podcasts", "Gaming", "General"],
                    value="General",
                    label="Content Domain",
                    info="Select the type of content for optimized scoring",
                )

                with gr.Row():
                    # Capped at 3 because the output column has three players.
                    num_clips_slider = gr.Slider(
                        minimum=1, maximum=3, value=3, step=1,
                        label="Number of Clips",
                        info="How many highlight clips to extract",
                    )
                    duration_slider = gr.Slider(
                        minimum=5, maximum=30, value=15, step=1,
                        label="Clip Duration (seconds)",
                        info="Target duration for each clip",
                    )

            with gr.Accordion("👤 Person Filtering (Optional)", open=False):
                reference_image = gr.Image(label="Reference Image", type="filepath", sources=["upload"])
                gr.Markdown("*Upload a photo of a person to prioritize clips featuring them.*")

            with gr.Accordion("📝 Custom Instructions (Optional)", open=False):
                custom_prompt = gr.Textbox(
                    label="Additional Instructions",
                    placeholder="E.g., 'Focus on crowd reactions' or 'Prioritize action scenes'",
                    lines=2,
                )

            process_btn = gr.Button("🚀 Extract Highlights", variant="primary", size="lg")

        # ---- Right column: status, clips, and the processing log ----
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Output")

            status_output = gr.Textbox(label="Status", lines=2, interactive=False)

            gr.Markdown("#### Extracted Clips")
            # Three identically styled players, created in display order.
            clip1_output, clip2_output, clip3_output = [
                gr.Video(label=f"Clip {slot}", elem_classes=["output-video"])
                for slot in (1, 2, 3)
            ]

            with gr.Accordion("📋 Processing Log", open=False):
                log_output = gr.Textbox(label="Log", lines=10, interactive=False, show_copy_button=True)

    gr.Markdown("""
    ---
    **ShortSmith v2** | Powered by Qwen2-VL, InsightFace, and Librosa |
    [GitHub](https://github.com) | Built with Gradio
    """)

    # Connect the button to the processing function.
    # Order of inputs must match process_video's positional parameters, and
    # order of outputs must match the tuple it returns.
    _handler_inputs = [
        video_input,
        domain_dropdown,
        num_clips_slider,
        duration_slider,
        reference_image,
        custom_prompt,
    ]
    _handler_outputs = [
        status_output,
        clip1_output,
        clip2_output,
        clip3_output,
        log_output,
    ]
    process_btn.click(
        fn=process_video,
        inputs=_handler_inputs,
        outputs=_handler_outputs,
        show_progress="full",
    )

# Launch the app
# The request queue is enabled in both modes; only the launch arguments differ.
demo.queue()
if __name__ == "__main__":
    # Direct execution: bind to all interfaces on port 7860 and surface errors in the UI.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
else:
    # For HuggingFace Spaces
    # NOTE(review): this also launches whenever the module is imported — confirm that is intended.
    demo.launch()