Spaces:

VincentGOURBIN
/

MeetingNotes-Voxtral-Analysis

Running on Zero

File size: 20,079 Bytes

"""
Application Gradio pour l'analyse intelligente de réunions avec Voxtral - Version HF Spaces.

Version adaptée pour Hugging Face Spaces avec :
- Uniquement mode Transformers (MLX et API supprimés)
- Modèles 8-bit uniquement
- Support MCP natif
- Zero GPU decorators
"""

import os
import gradio as gr
from dotenv import load_dotenv

from ..ai.voxtral_spaces_analyzer import VoxtralSpacesAnalyzer
from ..utils.zero_gpu_manager import ZeroGPUManager, gpu_inference

# Import labels locally
from .labels import UILabels

# Load environment variables from the .env file
load_dotenv()

# Global instances for MCP functions.
# Both start as None and are created on demand by initialize_components().
analyzer = None
gpu_manager = None

def initialize_components():
    """Create the shared analyzer and GPU manager the first time they are needed.

    Each global is checked and created independently. This keeps repeated
    calls cheap (nothing happens once both exist) and means that if building
    the GPU manager failed on an earlier call, it is retried on the next one
    instead of leaving ``gpu_manager`` as ``None`` for the rest of the
    process, which would silently disable the GPU cleanup done by callers.
    """
    global analyzer, gpu_manager
    if analyzer is None:
        analyzer = VoxtralSpacesAnalyzer()
    if gpu_manager is None:
        gpu_manager = ZeroGPUManager()

# MCP Tools - exposed automatically by Gradio
@gpu_inference(duration=300)
def analyze_meeting_audio(
    audio_file: str,
    sections: list = None,
    model_name: str = "Voxtral-Mini-3B-2507"
) -> dict:
    """
    Analyze meeting audio and generate structured summaries using Voxtral AI.

    This function processes audio files to extract insights and generate
    structured meeting summaries with configurable sections.

    Args:
        audio_file: Path to the audio file to analyze (MP3, WAV, M4A, OGG)
        sections: List of analysis sections to include (executive_summary, action_plan, etc.)
        model_name: Voxtral model to use for analysis (Mini-3B or Small-24B)

    Returns:
        Dictionary containing analysis results, processing time, and metadata.
        On failure the dictionary has ``status`` set to ``"failed"`` and an
        ``error`` message instead of the analysis.
    """
    import time

    # Validate the path before loading anything: a missing/None path must
    # produce the structured "failed" answer rather than a TypeError from
    # os.path.exists(None), and there is no point initialising the model
    # for a request that cannot be served.
    if not audio_file or not os.path.exists(audio_file):
        return {"error": "Audio file not found", "status": "failed"}

    initialize_components()

    # Started before the try block so the error path can always report it.
    start_time = time.time()
    try:
        # Set default sections if none provided
        if sections is None:
            sections = ["resume_executif", "discussions_principales", "plan_action"]

        # Switch model if different
        if analyzer.current_model_key != model_name:
            analyzer.switch_model(model_name)

        # Analyze audio (MCP function without progress bar)
        results = analyzer.analyze_audio_chunks(
            wav_path=audio_file,
            language="auto",
            selected_sections=sections
        )

        processing_time = time.time() - start_time

        return {
            "status": "completed",
            "analysis": results.get("transcription", "No analysis available"),
            "processing_time_seconds": processing_time,
            "model_used": model_name,
            "sections_analyzed": sections
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller as data.
        return {
            "status": "failed",
            "error": str(e),
            "processing_time_seconds": time.time() - start_time
        }
    finally:
        # Runs on both the success and the error path.
        if gpu_manager:
            gpu_manager.cleanup_gpu()

def get_available_sections() -> dict:
    """Return the analysis sections that can be requested for a meeting summary.

    The result holds a ``status`` flag, the value of
    ``VoxtralPrompts.AVAILABLE_SECTIONS`` under ``sections`` and its length
    under ``total_sections``.
    """
    from ..ai.prompts_config import VoxtralPrompts

    payload = {"status": "success"}
    payload["sections"] = VoxtralPrompts.AVAILABLE_SECTIONS
    payload["total_sections"] = len(VoxtralPrompts.AVAILABLE_SECTIONS)
    return payload

def get_meeting_templates() -> dict:
    """Return the built-in meeting analysis templates.

    Two templates are provided, keyed ``action_meeting`` and ``info_meeting``.
    Each one has a display ``name``, a ``description`` and the list of
    ``recommended_sections`` keys. The result also carries a ``status`` flag
    and the number of templates.
    """
    action_meeting = dict(
        name="Action-Oriented Meeting",
        description="For meetings focused on decisions and action items",
        recommended_sections=[
            "resume_executif",
            "discussions_principales",
            "plan_action",
            "decisions_prises",
            "prochaines_etapes",
        ],
    )
    info_meeting = dict(
        name="Information Meeting",
        description="For presentations and informational sessions",
        recommended_sections=[
            "resume_executif",
            "sujets_principaux",
            "points_importants",
            "questions_discussions",
            "elements_suivi",
        ],
    )
    catalogue = {"action_meeting": action_meeting, "info_meeting": info_meeting}
    return {
        "status": "success",
        "templates": catalogue,
        "total_templates": len(catalogue),
    }

# Handlers adapted for HF Spaces
def handle_input_mode_change(input_mode):
    """Handle a change of input mode.

    Returns two visibility updates: the first is visible only when the audio
    mode is selected, the second only when any other mode is selected.
    """
    audio_selected = bool(input_mode == UILabels.INPUT_MODE_AUDIO)
    first_update = gr.update(visible=audio_selected)
    second_update = gr.update(visible=not audio_selected)
    return first_update, second_update

def extract_audio_from_video(video_file, language):
    """Extract audio from a video (placeholder for HF Spaces).

    No extraction is performed here: the given file (or ``None`` when nothing
    was provided) is passed through as-is, together with a "visible" update,
    a "hidden" update, the audio input-mode label and the unchanged language.
    """
    # On HF Spaces we assume the video processing is done client-side, or
    # that audio files are provided directly, so the input is forwarded
    # untouched. A missing file simply forwards None.
    show = gr.update(visible=True)
    hide = gr.update(visible=False)
    return video_file, show, hide, UILabels.INPUT_MODE_AUDIO, language


@gpu_inference(duration=300)
def handle_direct_transcription(
    audio_file, hf_token, language, transcription_mode, model_key,
    selected_sections, start_trim, end_trim, progress=gr.Progress()
):
    """Run the direct analysis of an audio file, adapted for HF Spaces.

    Returns a pair whose first element is always an empty string and whose
    second element is either the analysis text or a user-facing error
    message. ``hf_token``, ``language`` and ``model_key`` are accepted but
    not used by this function; the analyzer is always called with
    ``language="auto"``.
    """
    initialize_components()

    if audio_file is None:
        return "", "❌ Veuillez d'abord télécharger un fichier audio."

    def forward_progress(progress_ratio, message):
        # Relay the analyzer's progress reports to the Gradio progress tracker.
        progress(progress_ratio, desc=message)

    try:
        # Derive the model name from the transcription mode label.
        wants_mini = "Mini" in transcription_mode
        model_name = "Voxtral-Mini-3B-2507" if wants_mini else "Voxtral-Small-24B-2507"

        # Only switch when the analyzer is not already on the requested model.
        if analyzer.current_model_key != model_name:
            analyzer.switch_model(model_name)

        # Start the analysis (chunk duration is chosen automatically per model).
        analysis = analyzer.analyze_audio_chunks(
            wav_path=audio_file,
            language="auto",
            selected_sections=selected_sections,
            start_trim=start_trim,
            end_trim=end_trim,
            progress_callback=forward_progress
        )
        summary = analysis.get("transcription", "Aucune analyse disponible")
        return "", summary

    except Exception as exc:
        return "", f"❌ Erreur lors de l'analyse: {exc!s}"
    finally:
        if gpu_manager:
            gpu_manager.cleanup_gpu()

def create_spaces_interface():
    """
    Main entry point for the HF Spaces interface.

    Same interface as the original project, but simplified:
    - Transformers mode only (no MLX/API)
    - Pre-quantized models only
    - Native MCP support

    Builds the whole page inside a single ``gr.Blocks`` context (header,
    model selection, audio/video input, trim options, section checkboxes,
    results area, footer), wires the event handlers and returns the Blocks
    object without launching it.

    Returns:
        The configured ``gr.Blocks`` application.
    """
    # Initialize components
    initialize_components()

    # Read the Hugging Face token from the environment variables
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    if hf_token is None:
        print("⚠️ Warning: HF_TOKEN environment variable not found")

    # Custom Glass theme configuration (same as the original project)
    custom_glass_theme = gr.themes.Glass(
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.gray,
        text_size=gr.themes.sizes.text_md,
        spacing_size=gr.themes.sizes.spacing_md,
        radius_size=gr.themes.sizes.radius_md
    )

    # Custom CSS for the application
    custom_css = """
        .gradio-container {
            max-width: 1200px !important;
            margin: 0 auto !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 30px;
            padding: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            border-radius: 15px;
            color: white;
            box-shadow: 0 8px 32px rgba(31, 38, 135, 0.37);
        }
        .processing-section {
            background: rgba(255, 255, 255, 0.1);
            border-radius: 10px;
            padding: 20px;
            margin: 15px 0;
            border: 1px solid rgba(255, 255, 255, 0.2);
            backdrop-filter: blur(10px);
        }
        .results-section {
            margin-top: 25px;
        }
        """

    with gr.Blocks(
        title="MeetingNotes - AI Analysis with Voxtral",
        theme=custom_glass_theme,
        css=custom_css
    ) as demo:
        # Main header with style (same as the original project)
        with gr.Column(elem_classes="main-header"):
            gr.Markdown(
                f"""
                # {UILabels.MAIN_TITLE}
                {UILabels.MAIN_SUBTITLE}
                {UILabels.MAIN_DESCRIPTION}
                """,
                elem_classes="header-content"
            )

        # Processing mode section (SIMPLIFIED - Transformers only)
        with gr.Column(elem_classes="processing-section"):
            gr.Markdown("## 🔧 Model Configuration")

            # Model selection (pre-quantized models)
            local_model_choice = gr.Radio(
                choices=[UILabels.MODEL_MINI, UILabels.MODEL_SMALL],
                value=UILabels.MODEL_MINI,
                label="Voxtral Model Selection"
            )

            # Information about the models
            gr.Markdown("""
            **📋 About this HF Spaces version:**
            - Uses standard Mistral Voxtral models optimized for Zero GPU
            - **Mini Model**: [Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507) - Faster processing, lower memory usage
            - **Small Model**: [Voxtral-Small-24B-2507](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) - Higher quality analysis, more detailed summaries
            - Chunk duration automatically optimized: 15min for Mini, 10min for Small
            
            **🔗 Complete version available:**
            For local processing (MLX/Transformers), API modes, and **speaker diarization**, check the full version on [GitHub](https://github.com/VincentGourbin/meetingnotes)
            """)

        # Input mode selection (same as the original project)
        with gr.Column(elem_classes="processing-section"):
            gr.Markdown(UILabels.INPUT_MODE_TITLE)

            input_mode = gr.Radio(
                choices=[UILabels.INPUT_MODE_AUDIO, UILabels.INPUT_MODE_VIDEO],
                value=UILabels.INPUT_MODE_AUDIO,
                label=UILabels.INPUT_MODE_LABEL
            )

        # Audio section (default mode) - same as the original project
        with gr.Column(elem_classes="processing-section") as audio_section:
            gr.Markdown(UILabels.AUDIO_MODE_TITLE)

            audio_input = gr.Audio(
                label=UILabels.AUDIO_INPUT_LABEL,
                type="filepath",
                show_label=True,
                interactive=True
            )

        # Video section (hidden by default) - same as the original project
        with gr.Column(elem_classes="processing-section", visible=False) as video_section:
            gr.Markdown(UILabels.VIDEO_MODE_TITLE)

            video_input = gr.File(
                label=UILabels.VIDEO_INPUT_LABEL,
                file_types=["video"]
            )

            btn_extract_audio = gr.Button(
                UILabels.EXTRACT_AUDIO_BUTTON,
                variant="secondary",
                size="lg"
            )

        # Trim options section (same as the original project)
        with gr.Column(elem_classes="processing-section"):
            with gr.Accordion(UILabels.TRIM_OPTIONS_TITLE, open=False):
                with gr.Row():
                    start_trim_input = gr.Number(
                        label=UILabels.START_TRIM_LABEL,
                        value=0,
                        minimum=0,
                        maximum=3600
                    )
                    end_trim_input = gr.Number(
                        label=UILabels.END_TRIM_LABEL,
                        value=0,
                        minimum=0,
                        maximum=3600
                    )


        # Main analysis section (same as the original project)
        with gr.Column(elem_classes="processing-section"):
            gr.Markdown(UILabels.MAIN_ANALYSIS_TITLE)
            gr.Markdown(UILabels.MAIN_ANALYSIS_DESCRIPTION)
            gr.Markdown("*Chunk duration is automatically optimized: 15min for Mini, 10min for Small (Zero GPU optimization)*")

            # Summary sections configuration
            gr.Markdown(UILabels.SUMMARY_SECTIONS_TITLE)
            gr.Markdown(UILabels.SUMMARY_SECTIONS_DESCRIPTION)

            # Quick preset buttons
            with gr.Row():
                btn_preset_action = gr.Button(UILabels.PRESET_ACTION_BUTTON, variant="secondary", size="sm")
                btn_preset_info = gr.Button(UILabels.PRESET_INFO_BUTTON, variant="secondary", size="sm")
                btn_preset_complet = gr.Button(UILabels.PRESET_COMPLETE_BUTTON, variant="secondary", size="sm")

            with gr.Row():
                # Left column: the five action-oriented sections, ticked by default.
                with gr.Column():
                    gr.Markdown(UILabels.ACTION_SECTIONS_TITLE)
                    section_resume_executif = gr.Checkbox(label=UILabels.SECTION_EXECUTIVE_SUMMARY, value=True)
                    section_discussions = gr.Checkbox(label=UILabels.SECTION_MAIN_DISCUSSIONS, value=True)
                    section_plan_action = gr.Checkbox(label=UILabels.SECTION_ACTION_PLAN, value=True)
                    section_decisions = gr.Checkbox(label=UILabels.SECTION_DECISIONS, value=True)
                    section_prochaines_etapes = gr.Checkbox(label=UILabels.SECTION_NEXT_STEPS, value=True)

                # Right column: the four informational sections, unticked by default.
                with gr.Column():
                    gr.Markdown(UILabels.INFO_SECTIONS_TITLE)
                    section_sujets_principaux = gr.Checkbox(label=UILabels.SECTION_MAIN_TOPICS, value=False)
                    section_points_importants = gr.Checkbox(label=UILabels.SECTION_KEY_POINTS, value=False)
                    section_questions = gr.Checkbox(label=UILabels.SECTION_QUESTIONS, value=False)
                    section_elements_suivi = gr.Checkbox(label=UILabels.SECTION_FOLLOW_UP, value=False)

            btn_direct_transcribe = gr.Button(
                UILabels.ANALYZE_BUTTON,
                variant="primary",
                size="lg"
            )

        # Results section (same as the original project)
        with gr.Column(elem_classes="results-section"):
            gr.Markdown(UILabels.RESULTS_TITLE)

            final_summary_output = gr.Markdown(
                value=UILabels.RESULTS_PLACEHOLDER,
                label=UILabels.RESULTS_LABEL,
                height=500
            )

        # Event handlers (adapted for HF Spaces)

        # Handle input mode changes
        input_mode.change(
            fn=handle_input_mode_change,
            inputs=[input_mode],
            outputs=[audio_section, video_section]
        )

        # Audio extraction from video
        btn_extract_audio.click(
            fn=extract_audio_from_video,
            inputs=[video_input, gr.State("french")],
            outputs=[audio_input, audio_section, video_section, input_mode, gr.State("french")]
        )

        # Section preset functions (same as the original project).
        # Each returns nine booleans, in the same order as the checkbox
        # lists used as outputs below (five action sections, then four
        # informational sections).
        def preset_action():
            """Tick the five action sections and untick the four informational ones."""
            return (True, True, True, True, True, False, False, False, False)

        def preset_info():
            """Tick the executive summary and the four informational sections only."""
            return (True, False, False, False, False, True, True, True, True)

        def preset_complet():
            """Tick all nine sections."""
            return (True, True, True, True, True, True, True, True, True)

        # Direct analysis handler (adapted for Transformers only)
        def handle_analysis_direct(
            audio_file, hf_token, language, local_model, start_trim, end_trim,
            s_resume, s_discussions, s_plan_action, s_decisions, s_prochaines_etapes,
            s_sujets_principaux, s_points_importants, s_questions, s_elements_suivi
        ):
            """Turn the checkbox states into section keys and run the analysis.

            The nine ``s_*`` flags are mapped to their section keys, the
            selected keys are passed to ``handle_direct_transcription``
            along with the other inputs, and only the summary (second
            element of its result) is returned.
            """
            # Transformers mode only (pre-quantized 8-bit)
            transcription_mode = f"Transformers ({local_model} 8-bit)"
            model_key = local_model

            # Build the list of selected sections
            sections_checkboxes = [
                (s_resume, "resume_executif"),
                (s_discussions, "discussions_principales"),
                (s_plan_action, "plan_action"),
                (s_decisions, "decisions_prises"),
                (s_prochaines_etapes, "prochaines_etapes"),
                (s_sujets_principaux, "sujets_principaux"),
                (s_points_importants, "points_importants"),
                (s_questions, "questions_discussions"),
                (s_elements_suivi, "elements_suivi")
            ]

            selected_sections = [section_key for is_selected, section_key in sections_checkboxes if is_selected]

            # Call the direct analysis function (automatic chunk duration)
            _, summary = handle_direct_transcription(
                audio_file, hf_token, language, transcription_mode,
                model_key, selected_sections, start_trim, end_trim
            )
            return summary

        # Preset events (same as the original project)
        btn_preset_action.click(
            fn=preset_action,
            outputs=[
                section_resume_executif, section_discussions, section_plan_action,
                section_decisions, section_prochaines_etapes, section_sujets_principaux,
                section_points_importants, section_questions, section_elements_suivi
            ]
        )

        btn_preset_info.click(
            fn=preset_info,
            outputs=[
                section_resume_executif, section_discussions, section_plan_action,
                section_decisions, section_prochaines_etapes, section_sujets_principaux,
                section_points_importants, section_questions, section_elements_suivi
            ]
        )

        btn_preset_complet.click(
            fn=preset_complet,
            outputs=[
                section_resume_executif, section_discussions, section_plan_action,
                section_decisions, section_prochaines_etapes, section_sujets_principaux,
                section_points_importants, section_questions, section_elements_suivi
            ]
        )

        # Main analysis (adapted for HF Spaces).
        # The input order must match the parameter order of
        # handle_analysis_direct.
        btn_direct_transcribe.click(
            fn=handle_analysis_direct,
            inputs=[
                audio_input,
                gr.State(value=hf_token),
                gr.State("french"),
                local_model_choice,
                start_trim_input,
                end_trim_input,
                section_resume_executif,
                section_discussions,
                section_plan_action,
                section_decisions,
                section_prochaines_etapes,
                section_sujets_principaux,
                section_points_importants,
                section_questions,
                section_elements_suivi
            ],
            outputs=[final_summary_output]
        )


        # Footer (same as the original project)
        with gr.Row():
            gr.Markdown(
                """
                ---
                **MeetingNotes** | Powered by [Voxtral](https://mistral.ai/) |
                🚀 Intelligent meeting analysis | 💾 HF Spaces with Zero GPU
                """,
                elem_classes="footer-info"
            )

    # Return demo (theme and CSS were already passed to gr.Blocks above)
    return demo