# MeetingNotes-Voxtral-Analysis / src/ui/spaces_interface.py
# Uploaded by VincentGOURBIN via huggingface_hub (commit 79a3d1b, verified)
"""
Application Gradio pour l'analyse intelligente de réunions avec Voxtral - Version HF Spaces.
Version adaptée pour Hugging Face Spaces avec :
- Uniquement mode Transformers (MLX et API supprimés)
- Modèles 8-bit uniquement
- Support MCP natif
- Zero GPU decorators
"""
import os
import gradio as gr
from dotenv import load_dotenv
from ..ai.voxtral_spaces_analyzer import VoxtralSpacesAnalyzer
from ..utils.zero_gpu_manager import ZeroGPUManager, gpu_inference
# Import labels locally
from .labels import UILabels
# Charger les variables d'environnement depuis le fichier .env
load_dotenv()
# Global instances for MCP functions
analyzer = None
gpu_manager = None
def initialize_components():
    """Ensure the shared analyzer and GPU manager exist (used by MCP tools)."""
    global analyzer, gpu_manager
    # Already initialized — nothing to do.
    if analyzer is not None:
        return
    analyzer = VoxtralSpacesAnalyzer()
    gpu_manager = ZeroGPUManager()
# MCP Tools - exposed automatically by Gradio
@gpu_inference(duration=300)
def analyze_meeting_audio(
    audio_file: str,
    sections: list = None,
    model_name: str = "Voxtral-Mini-3B-2507"
) -> dict:
    """
    Analyze meeting audio and generate structured summaries using Voxtral AI.

    This function processes audio files to extract insights and generate
    structured meeting summaries with configurable sections.

    Args:
        audio_file: Path to the audio file to analyze (MP3, WAV, M4A, OGG).
        sections: List of analysis section keys to include
            (e.g. "resume_executif", "plan_action"); defaults to a
            three-section summary when None.
        model_name: Voxtral model to use for analysis (Mini-3B or Small-24B).

    Returns:
        Dictionary containing analysis results, processing time, and metadata;
        on failure, a dict with "status": "failed" and an "error" message.
    """
    # Imported at function top (not inside try) so time.time() is always
    # bound in the except handler below.
    import time

    initialize_components()

    # Guard falsy input explicitly: os.path.exists(None) raises TypeError.
    if not audio_file or not os.path.exists(audio_file):
        return {"error": "Audio file not found", "status": "failed"}

    # Started before the try block so the except handler can always compute
    # an elapsed time (replaces the old "'start_time' in locals()" hack).
    start_time = time.time()
    try:
        # Default to a compact three-section summary.
        if sections is None:
            sections = ["resume_executif", "discussions_principales", "plan_action"]

        # Switch model only when the requested one is not already loaded.
        if analyzer.current_model_key != model_name:
            analyzer.switch_model(model_name)

        # Analyze audio (MCP entry point: no Gradio progress bar available).
        results = analyzer.analyze_audio_chunks(
            wav_path=audio_file,
            language="auto",
            selected_sections=sections
        )

        return {
            "status": "completed",
            "analysis": results.get("transcription", "No analysis available"),
            "processing_time_seconds": time.time() - start_time,
            "model_used": model_name,
            "sections_analyzed": sections
        }
    except Exception as e:
        return {
            "status": "failed",
            "error": str(e),
            "processing_time_seconds": time.time() - start_time
        }
    finally:
        # Always release GPU resources, success or failure.
        if gpu_manager:
            gpu_manager.cleanup_gpu()
def get_available_sections() -> dict:
    """Return the catalogue of analysis sections available for meeting summaries."""
    from ..ai.prompts_config import VoxtralPrompts

    catalogue = VoxtralPrompts.AVAILABLE_SECTIONS
    return {
        "status": "success",
        "sections": catalogue,
        "total_sections": len(catalogue),
    }
def get_meeting_templates() -> dict:
    """Return pre-configured meeting analysis templates keyed by template id."""
    action_sections = [
        "resume_executif", "discussions_principales", "plan_action",
        "decisions_prises", "prochaines_etapes",
    ]
    info_sections = [
        "resume_executif", "sujets_principaux", "points_importants",
        "questions_discussions", "elements_suivi",
    ]
    templates = {
        "action_meeting": {
            "name": "Action-Oriented Meeting",
            "description": "For meetings focused on decisions and action items",
            "recommended_sections": action_sections,
        },
        "info_meeting": {
            "name": "Information Meeting",
            "description": "For presentations and informational sessions",
            "recommended_sections": info_sections,
        },
    }
    return {"status": "success", "templates": templates, "total_templates": len(templates)}
# Handlers adaptés pour HF Spaces
def handle_input_mode_change(input_mode):
    """Toggle visibility of the audio/video sections based on the selected input mode."""
    audio_selected = input_mode == UILabels.INPUT_MODE_AUDIO
    # First update targets the audio section, second the video section.
    return gr.update(visible=audio_selected), gr.update(visible=not audio_selected)
def extract_audio_from_video(video_file, language):
    """Forward the uploaded video as audio and switch the UI back to audio mode.

    On HF Spaces no server-side extraction is performed: video processing is
    assumed to happen client-side, or the file is already audio. The file (or
    None when nothing was uploaded) is passed through unchanged.
    """
    return (
        video_file,
        gr.update(visible=True),
        gr.update(visible=False),
        UILabels.INPUT_MODE_AUDIO,
        language,
    )
@gpu_inference(duration=300)
def handle_direct_transcription(
    audio_file, hf_token, language, transcription_mode, model_key,
    selected_sections, start_trim, end_trim, progress=gr.Progress()
):
    """Run the direct analysis pipeline, adapted for HF Spaces.

    Returns a (status, summary) pair of strings for the Gradio outputs.
    """
    initialize_components()

    if audio_file is None:
        return "", "❌ Veuillez d'abord télécharger un fichier audio."

    try:
        # Derive the model checkpoint name from the transcription-mode label.
        model_name = (
            "Voxtral-Mini-3B-2507" if "Mini" in transcription_mode
            else "Voxtral-Small-24B-2507"
        )

        # Load the requested model only when it differs from the current one.
        if analyzer.current_model_key != model_name:
            analyzer.switch_model(model_name)

        def progress_callback(progress_ratio, message):
            # Relay analyzer progress to the Gradio progress bar.
            progress(progress_ratio, desc=message)

        # Run the analysis (chunk duration is chosen automatically per model).
        results = analyzer.analyze_audio_chunks(
            wav_path=audio_file,
            language="auto",
            selected_sections=selected_sections,
            start_trim=start_trim,
            end_trim=end_trim,
            progress_callback=progress_callback
        )
        return "", results.get("transcription", "Aucune analyse disponible")
    except Exception as e:
        return "", f"❌ Erreur lors de l'analyse: {str(e)}"
    finally:
        # Always release GPU resources.
        if gpu_manager:
            gpu_manager.cleanup_gpu()
def create_spaces_interface():
    """
    Main entry point for the HF Spaces interface.

    Builds and returns the Gradio Blocks demo. Same layout as the original
    project but simplified:
    - Transformers backend only (no MLX/API modes)
    - Pre-quantized models only
    - Native MCP support

    Returns:
        gr.Blocks: the fully wired Gradio application.
    """
    # Create the shared analyzer/GPU-manager singletons up front.
    initialize_components()

    # Read the Hugging Face token from the environment variables.
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    if hf_token is None:
        print("⚠️ Warning: HF_TOKEN environment variable not found")

    # Custom Glass theme configuration (identical to the original project).
    custom_glass_theme = gr.themes.Glass(
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.gray,
        text_size=gr.themes.sizes.text_md,
        spacing_size=gr.themes.sizes.spacing_md,
        radius_size=gr.themes.sizes.radius_md
    )

    # Custom CSS for the application.
    custom_css = """
    .gradio-container {
        max-width: 1200px !important;
        margin: 0 auto !important;
    }
    .main-header {
        text-align: center;
        margin-bottom: 30px;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 15px;
        color: white;
        box-shadow: 0 8px 32px rgba(31, 38, 135, 0.37);
    }
    .processing-section {
        background: rgba(255, 255, 255, 0.1);
        border-radius: 10px;
        padding: 20px;
        margin: 15px 0;
        border: 1px solid rgba(255, 255, 255, 0.2);
        backdrop-filter: blur(10px);
    }
    .results-section {
        margin-top: 25px;
    }
    """

    with gr.Blocks(
        title="MeetingNotes - AI Analysis with Voxtral",
        theme=custom_glass_theme,
        css=custom_css
    ) as demo:
        # Styled main header (identical to the original).
        with gr.Column(elem_classes="main-header"):
            gr.Markdown(
                f"""
                # {UILabels.MAIN_TITLE}
                {UILabels.MAIN_SUBTITLE}
                {UILabels.MAIN_DESCRIPTION}
                """,
                elem_classes="header-content"
            )

        # Processing mode section (SIMPLIFIED — Transformers only).
        with gr.Column(elem_classes="processing-section"):
            gr.Markdown("## 🔧 Model Configuration")
            # Model selection (pre-quantized models).
            local_model_choice = gr.Radio(
                choices=[UILabels.MODEL_MINI, UILabels.MODEL_SMALL],
                value=UILabels.MODEL_MINI,
                label="Voxtral Model Selection"
            )
            # Information about the models.
            gr.Markdown("""
            **📋 About this HF Spaces version:**
            - Uses standard Mistral Voxtral models optimized for Zero GPU
            - **Mini Model**: [Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507) - Faster processing, lower memory usage
            - **Small Model**: [Voxtral-Small-24B-2507](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) - Higher quality analysis, more detailed summaries
            - Chunk duration automatically optimized: 15min for Mini, 10min for Small
            **🔗 Complete version available:**
            For local processing (MLX/Transformers), API modes, and **speaker diarization**, check the full version on [GitHub](https://github.com/VincentGourbin/meetingnotes)
            """)

        # Input mode selection (identical to the original).
        with gr.Column(elem_classes="processing-section"):
            gr.Markdown(UILabels.INPUT_MODE_TITLE)
            input_mode = gr.Radio(
                choices=[UILabels.INPUT_MODE_AUDIO, UILabels.INPUT_MODE_VIDEO],
                value=UILabels.INPUT_MODE_AUDIO,
                label=UILabels.INPUT_MODE_LABEL
            )

        # Audio section (default mode) — identical to the original.
        with gr.Column(elem_classes="processing-section") as audio_section:
            gr.Markdown(UILabels.AUDIO_MODE_TITLE)
            audio_input = gr.Audio(
                label=UILabels.AUDIO_INPUT_LABEL,
                type="filepath",
                show_label=True,
                interactive=True
            )

        # Video section (hidden by default) — identical to the original.
        with gr.Column(elem_classes="processing-section", visible=False) as video_section:
            gr.Markdown(UILabels.VIDEO_MODE_TITLE)
            video_input = gr.File(
                label=UILabels.VIDEO_INPUT_LABEL,
                file_types=["video"]
            )
            btn_extract_audio = gr.Button(
                UILabels.EXTRACT_AUDIO_BUTTON,
                variant="secondary",
                size="lg"
            )

        # Trim options section (identical to the original).
        with gr.Column(elem_classes="processing-section"):
            with gr.Accordion(UILabels.TRIM_OPTIONS_TITLE, open=False):
                with gr.Row():
                    start_trim_input = gr.Number(
                        label=UILabels.START_TRIM_LABEL,
                        value=0,
                        minimum=0,
                        maximum=3600
                    )
                    end_trim_input = gr.Number(
                        label=UILabels.END_TRIM_LABEL,
                        value=0,
                        minimum=0,
                        maximum=3600
                    )

        # Main analysis section (identical to the original).
        with gr.Column(elem_classes="processing-section"):
            gr.Markdown(UILabels.MAIN_ANALYSIS_TITLE)
            gr.Markdown(UILabels.MAIN_ANALYSIS_DESCRIPTION)
            gr.Markdown("*Chunk duration is automatically optimized: 15min for Mini, 10min for Small (Zero GPU optimization)*")

            # Summary-section configuration.
            gr.Markdown(UILabels.SUMMARY_SECTIONS_TITLE)
            gr.Markdown(UILabels.SUMMARY_SECTIONS_DESCRIPTION)

            # Quick preset buttons.
            with gr.Row():
                btn_preset_action = gr.Button(UILabels.PRESET_ACTION_BUTTON, variant="secondary", size="sm")
                btn_preset_info = gr.Button(UILabels.PRESET_INFO_BUTTON, variant="secondary", size="sm")
                btn_preset_complet = gr.Button(UILabels.PRESET_COMPLETE_BUTTON, variant="secondary", size="sm")
            with gr.Row():
                with gr.Column():
                    gr.Markdown(UILabels.ACTION_SECTIONS_TITLE)
                    section_resume_executif = gr.Checkbox(label=UILabels.SECTION_EXECUTIVE_SUMMARY, value=True)
                    section_discussions = gr.Checkbox(label=UILabels.SECTION_MAIN_DISCUSSIONS, value=True)
                    section_plan_action = gr.Checkbox(label=UILabels.SECTION_ACTION_PLAN, value=True)
                    section_decisions = gr.Checkbox(label=UILabels.SECTION_DECISIONS, value=True)
                    section_prochaines_etapes = gr.Checkbox(label=UILabels.SECTION_NEXT_STEPS, value=True)
                with gr.Column():
                    gr.Markdown(UILabels.INFO_SECTIONS_TITLE)
                    section_sujets_principaux = gr.Checkbox(label=UILabels.SECTION_MAIN_TOPICS, value=False)
                    section_points_importants = gr.Checkbox(label=UILabels.SECTION_KEY_POINTS, value=False)
                    section_questions = gr.Checkbox(label=UILabels.SECTION_QUESTIONS, value=False)
                    section_elements_suivi = gr.Checkbox(label=UILabels.SECTION_FOLLOW_UP, value=False)
            btn_direct_transcribe = gr.Button(
                UILabels.ANALYZE_BUTTON,
                variant="primary",
                size="lg"
            )

        # Results section (identical to the original).
        with gr.Column(elem_classes="results-section"):
            gr.Markdown(UILabels.RESULTS_TITLE)
            final_summary_output = gr.Markdown(
                value=UILabels.RESULTS_PLACEHOLDER,
                label=UILabels.RESULTS_LABEL,
                height=500
            )

        # Event handlers (adapted for HF Spaces).

        # Input-mode switching.
        input_mode.change(
            fn=handle_input_mode_change,
            inputs=[input_mode],
            outputs=[audio_section, video_section]
        )

        # Audio extraction from video.
        # NOTE(review): gr.State instances are created inline here; the State
        # in `outputs` is not referenced anywhere else, so the returned
        # `language` value appears to be discarded — confirm this is intended.
        btn_extract_audio.click(
            fn=extract_audio_from_video,
            inputs=[video_input, gr.State("french")],
            outputs=[audio_input, audio_section, video_section, input_mode, gr.State("french")]
        )

        # Section preset helpers (identical to the original). Each returns one
        # boolean per checkbox, in the same order as the preset-click outputs.
        def preset_action():
            # Action-oriented sections on, informational sections off.
            return (True, True, True, True, True, False, False, False, False)

        def preset_info():
            # Executive summary plus informational sections only.
            return (True, False, False, False, False, True, True, True, True)

        def preset_complet():
            # Every section enabled.
            return (True, True, True, True, True, True, True, True, True)

        # Direct-analysis handler (adapted for Transformers only).
        def handle_analysis_direct(
            audio_file, hf_token, language, local_model, start_trim, end_trim,
            s_resume, s_discussions, s_plan_action, s_decisions, s_prochaines_etapes,
            s_sujets_principaux, s_points_importants, s_questions, s_elements_suivi
        ):
            # Transformers mode only (pre-quantized 8-bit).
            transcription_mode = f"Transformers ({local_model} 8-bit)"
            model_key = local_model
            # Build the list of selected section keys from the checkboxes.
            sections_checkboxes = [
                (s_resume, "resume_executif"),
                (s_discussions, "discussions_principales"),
                (s_plan_action, "plan_action"),
                (s_decisions, "decisions_prises"),
                (s_prochaines_etapes, "prochaines_etapes"),
                (s_sujets_principaux, "sujets_principaux"),
                (s_points_importants, "points_importants"),
                (s_questions, "questions_discussions"),
                (s_elements_suivi, "elements_suivi")
            ]
            selected_sections = [section_key for is_selected, section_key in sections_checkboxes if is_selected]
            # Delegate to the direct-analysis function (chunk duration automatic).
            _, summary = handle_direct_transcription(
                audio_file, hf_token, language, transcription_mode,
                model_key, selected_sections, start_trim, end_trim
            )
            return summary

        # Preset events (identical to the original).
        btn_preset_action.click(
            fn=preset_action,
            outputs=[
                section_resume_executif, section_discussions, section_plan_action,
                section_decisions, section_prochaines_etapes, section_sujets_principaux,
                section_points_importants, section_questions, section_elements_suivi
            ]
        )
        btn_preset_info.click(
            fn=preset_info,
            outputs=[
                section_resume_executif, section_discussions, section_plan_action,
                section_decisions, section_prochaines_etapes, section_sujets_principaux,
                section_points_importants, section_questions, section_elements_suivi
            ]
        )
        btn_preset_complet.click(
            fn=preset_complet,
            outputs=[
                section_resume_executif, section_discussions, section_plan_action,
                section_decisions, section_prochaines_etapes, section_sujets_principaux,
                section_points_importants, section_questions, section_elements_suivi
            ]
        )

        # Main analysis event (adapted for HF Spaces).
        btn_direct_transcribe.click(
            fn=handle_analysis_direct,
            inputs=[
                audio_input,
                gr.State(value=hf_token),
                gr.State("french"),
                local_model_choice,
                start_trim_input,
                end_trim_input,
                section_resume_executif,
                section_discussions,
                section_plan_action,
                section_decisions,
                section_prochaines_etapes,
                section_sujets_principaux,
                section_points_importants,
                section_questions,
                section_elements_suivi
            ],
            outputs=[final_summary_output]
        )

        # Footer (identical to the original).
        with gr.Row():
            gr.Markdown(
                """
                ---
                **MeetingNotes** | Powered by [Voxtral](https://mistral.ai/) |
                🚀 Intelligent meeting analysis | 💾 HF Spaces with Zero GPU
                """,
                elem_classes="footer-info"
            )

    # Return the demo (theme and CSS already configured in gr.Blocks for Gradio 6).
    return demo