Spaces: Running on Zero

Upload folder using huggingface_hub

- src/ai/voxtral_spaces_analyzer.py  +33 -46
- src/ui/spaces_interface.py  +6 -16
src/ai/voxtral_spaces_analyzer.py  CHANGED

@@ -41,15 +41,20 @@ class VoxtralSpacesAnalyzer:
         Args:
             model_name (str): Name of the Voxtral model to use (pre-quantized)
         """
-        # …
+        # Use original Mistral models for HF Spaces
         model_mapping = {
-            "Voxtral-Mini-3B-2507": "…
-            "Voxtral-Small-24B-2507": "…
+            "Voxtral-Mini-3B-2507": "mistralai/Voxtral-Mini-3B-2507",
+            "Voxtral-Small-24B-2507": "mistralai/Voxtral-Small-24B-2507"
         }

-        self.model_name = model_mapping.get(model_name, "…
+        self.model_name = model_mapping.get(model_name, "mistralai/Voxtral-Mini-3B-2507")
         self.current_model_key = model_name
-
+
+        # Optimized chunk durations for Zero GPU (different per model)
+        self.chunk_durations = {
+            "Voxtral-Mini-3B-2507": 15,   # 15 minutes for Mini
+            "Voxtral-Small-24B-2507": 10  # 10 minutes for Small (larger model)
+        }
         self.gpu_manager = ZeroGPUManager()
         self.token_tracker = TokenTracker("Transformers-HF-Spaces")
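The dict.get fallbacks above mean any unrecognized model key silently resolves to the Mini checkpoint with 15-minute chunks, while the Small model gets shorter 10-minute chunks, presumably to keep the larger model within a Zero GPU window. A minimal standalone sketch of that resolution behavior (mappings copied from the diff; the demo inputs are hypothetical):

MODEL_MAPPING = {
    "Voxtral-Mini-3B-2507": "mistralai/Voxtral-Mini-3B-2507",
    "Voxtral-Small-24B-2507": "mistralai/Voxtral-Small-24B-2507",
}
CHUNK_DURATIONS = {"Voxtral-Mini-3B-2507": 15, "Voxtral-Small-24B-2507": 10}

def resolve(model_key):
    """Map a UI model key to (HF repo id, chunk minutes); unknown keys fall back to Mini/15."""
    repo_id = MODEL_MAPPING.get(model_key, "mistralai/Voxtral-Mini-3B-2507")
    return repo_id, CHUNK_DURATIONS.get(model_key, 15)

print(resolve("Voxtral-Small-24B-2507"))  # ('mistralai/Voxtral-Small-24B-2507', 10)
print(resolve("typo-key"))                # ('mistralai/Voxtral-Mini-3B-2507', 15)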
@@ -62,11 +67,11 @@ class VoxtralSpacesAnalyzer:
     def switch_model(self, model_name: str):
         """Switch to a different model (will reload if different)."""
         model_mapping = {
-            "Voxtral-Mini-3B-2507": "…
-            "Voxtral-Small-24B-2507": "…
+            "Voxtral-Mini-3B-2507": "mistralai/Voxtral-Mini-3B-2507",
+            "Voxtral-Small-24B-2507": "mistralai/Voxtral-Small-24B-2507"
         }

-        new_model_path = model_mapping.get(model_name, "…
+        new_model_path = model_mapping.get(model_name, "mistralai/Voxtral-Mini-3B-2507")

         if self.model_name != new_model_path:
             print(f"🔄 Switching to {model_name}")
@@ -88,37 +93,17 @@ class VoxtralSpacesAnalyzer:
         dtype = self.gpu_manager.dtype
         print(f"🔄 Loading {self.current_model_key} on {device} with {dtype}...")

-        # Load processor
-        …
-        …
-            device_str = "cuda" if device == "cuda" else "mps" if device == "mps" else "cpu"
-            print(f"📦 Loading {self.current_model_key} pre-quantized model")
-            …
-            self.…
-                …
-                device_map=device_str
-            )
-        except Exception as e:
-            if "quantization config" in str(e).lower():
-                print(f"⚠️ Quantization config issue, trying alternative loading...")
-                # Alternative: load config first and modify it
-                config = AutoConfig.from_pretrained(self.model_name)
-                if hasattr(config, 'quantization_config'):
-                    config.quantization_config = None
-                self.model = VoxtralForConditionalGeneration.from_pretrained(
-                    self.model_name,
-                    config=config,
-                    torch_dtype=dtype,
-                    low_cpu_mem_usage=True,
-                    device_map=device_str
-                )
-            else:
-                raise e
+        # Load processor and model following HuggingFace reference implementation
+        print(f"📦 Loading {self.current_model_key} (original Mistral model)")
+
+        self.processor = AutoProcessor.from_pretrained(self.model_name)
+
+        # Use reference implementation from HuggingFace docs
+        self.model = VoxtralForConditionalGeneration.from_pretrained(
+            self.model_name,
+            torch_dtype=dtype,
+            device_map=device
+        )

         print(f"✅ {self.current_model_key} loaded successfully on {device}")
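The new load path drops the old quantization fallback entirely and matches the reference pattern from the transformers documentation. For orientation, here is how a loaded Voxtral processor/model pair is driven in that documented pattern (a sketch based on the transformers docs, not this repo's prompt-building code; the audio path and question are placeholders):

import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

repo_id = "mistralai/Voxtral-Mini-3B-2507"
device = "cuda"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map=device
)

# Audio is passed through the chat template alongside a text instruction.
conversation = [{
    "role": "user",
    "content": [
        {"type": "audio", "path": "meeting.wav"},           # placeholder
        {"type": "text", "text": "Summarize this audio."},  # placeholder
    ],
}]
inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
# Decode only the tokens generated after the prompt.
print(processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
                             skip_special_tokens=True)[0])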
@@ -138,13 +123,15 @@ class VoxtralSpacesAnalyzer:
         audio = AudioSegment.from_file(wav_path)
         return len(audio) / (1000 * 60)

-    def _create_time_chunks(self, wav_path: str…
-        """Create time-based chunks for processing."""
+    def _create_time_chunks(self, wav_path: str) -> List[Tuple[float, float]]:
+        """Create time-based chunks for processing with model-optimized durations."""
         total_duration = self._get_audio_duration(wav_path) * 60  # seconds
-        # Use …
-        chunk_minutes = …
+        # Use model-specific optimized chunk duration for Zero GPU
+        chunk_minutes = self.chunk_durations.get(self.current_model_key, 15)
         max_chunk_seconds = chunk_minutes * 60

+        print(f"🎯 Using {chunk_minutes}min chunks optimized for {self.current_model_key} on Zero GPU")
+
         if total_duration <= max_chunk_seconds:
             return [(0, total_duration)]
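Only the head of _create_time_chunks is visible in this hunk; the tail that actually builds the chunk list is unchanged and not shown. The underlying technique is a plain fixed-window split over the audio duration, which would look roughly like this (a self-contained sketch under that assumption, without any overlap handling the real method may add):

from typing import List, Tuple

def time_chunks(total_seconds: float, chunk_minutes: int) -> List[Tuple[float, float]]:
    """Split [0, total_seconds] into consecutive windows of at most chunk_minutes each."""
    max_chunk = chunk_minutes * 60
    if total_seconds <= max_chunk:
        return [(0.0, total_seconds)]
    chunks, start = [], 0.0
    while start < total_seconds:
        end = min(start + max_chunk, total_seconds)
        chunks.append((start, end))
        start = end
    return chunks

# 35 minutes of audio with 15-minute chunks -> three windows
print(time_chunks(35 * 60, 15))  # [(0.0, 900.0), (900.0, 1800.0), (1800.0, 2100.0)]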
@@ -179,17 +166,16 @@ class VoxtralSpacesAnalyzer:
         wav_path: str,
         language: str = "french",
         selected_sections: list = None,
-        chunk_duration_minutes: int = 15,
         reference_speakers_data: str = None
     ) -> Dict[str, str]:
         """
         Analyze audio by chunks using Voxtral with Zero GPU.
+        Uses model-optimized chunk durations (15min for Mini, 10min for Small).

         Args:
             wav_path (str): Path to audio file
             language (str): Expected language
             selected_sections (list): Analysis sections to include
-            chunk_duration_minutes (int): Chunk duration in minutes
             reference_speakers_data (str): Speaker diarization data

         Returns:
@@ -203,9 +189,10 @@ class VoxtralSpacesAnalyzer:
         duration = self._get_audio_duration(wav_path)
         print(f"🎵 Audio duration: {duration:.1f} minutes")

-        # Create chunks with …
-        chunks = self._create_time_chunks(wav_path…
-        …
+        # Create chunks with model-optimized duration
+        chunks = self._create_time_chunks(wav_path)
+        chunk_minutes = self.chunk_durations.get(self.current_model_key, 15)
+        print(f"📦 Splitting into {len(chunks)} chunks of {chunk_minutes}min")

         chunk_summaries = []
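Net effect on the analyzer's public API: analyze_audio_chunks no longer accepts chunk_duration_minutes, so chunk size is determined entirely by which model is active. A hypothetical call sequence after this change (the path and section keys are placeholders):

# Hypothetical usage; argument values are placeholders.
analyzer = VoxtralSpacesAnalyzer("Voxtral-Small-24B-2507")

results = analyzer.analyze_audio_chunks(
    wav_path="meeting.wav",
    language="auto",
    selected_sections=["resume_executif", "decisions"],
    reference_speakers_data=None,
)  # splits into 10-minute chunks because the Small model is active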
src/ui/spaces_interface.py  CHANGED

@@ -220,7 +220,7 @@ def handle_speaker_rename(new_name):
 @gpu_inference(duration=300)
 def handle_direct_transcription(
     audio_file, hf_token, language, transcription_mode, model_key,
-    selected_sections, diarization_data, start_trim, end_trim…
+    selected_sections, diarization_data, start_trim, end_trim
 ):
     """Gestion de l'analyse directe adaptée pour HF Spaces."""
     initialize_components()
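gpu_inference(duration=300) reserves a 5-minute GPU window for each call of this handler. On ZeroGPU Spaces such a decorator is typically a thin wrapper around spaces.GPU from the spaces package; a sketch of what that wrapper might look like (an assumption; the repo's actual helper is defined elsewhere and not shown in this diff):

import spaces  # HF Spaces ZeroGPU helper package

def gpu_inference(duration: int = 60):
    """Decorator factory: run the wrapped function with a GPU attached for up to `duration` seconds."""
    def decorator(fn):
        # spaces.GPU attaches a GPU only while fn executes and releases it afterwards.
        return spaces.GPU(duration=duration)(fn)
    return decorator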
@@ -239,12 +239,11 @@ def handle_direct_transcription(
     if analyzer.current_model_key != model_name:
         analyzer.switch_model(model_name)

-    # Lancer l'analyse
+    # Lancer l'analyse (chunk duration automatique selon le modèle)
     results = analyzer.analyze_audio_chunks(
         wav_path=audio_file,
         language="auto",
         selected_sections=selected_sections,
-        chunk_duration_minutes=int(chunk_duration),
         reference_speakers_data=diarization_data
     )
@@ -463,15 +462,7 @@ def create_spaces_interface():
     with gr.Column(elem_classes="processing-section"):
         gr.Markdown(UILabels.MAIN_ANALYSIS_TITLE)
         gr.Markdown(UILabels.MAIN_ANALYSIS_DESCRIPTION)
-
-        # Contrôle taille des chunks
-        chunk_duration_slider = gr.Slider(
-            minimum=5,
-            maximum=25,
-            value=15,
-            step=5,
-            label=UILabels.CHUNK_DURATION_LABEL
-        )
+        gr.Markdown("*Chunk duration is automatically optimized: 15min for Mini, 10min for Small (Zero GPU optimization)*")

     # Configuration des sections de résumé
     gr.Markdown(UILabels.SUMMARY_SECTIONS_TITLE)
@@ -543,7 +534,7 @@ def create_spaces_interface():

     # Gestion de l'analyse directe (adaptée pour Transformers uniquement)
     def handle_analysis_direct(
-        audio_file, hf_token, language, local_model, start_trim, end_trim,…
+        audio_file, hf_token, language, local_model, start_trim, end_trim,
         s_resume, s_discussions, s_plan_action, s_decisions, s_prochaines_etapes,
         s_sujets_principaux, s_points_importants, s_questions, s_elements_suivi
     ):
@@ -566,10 +557,10 @@ def create_spaces_interface():

     selected_sections = [section_key for is_selected, section_key in sections_checkboxes if is_selected]

-    # Appeler la fonction d'analyse directe
+    # Appeler la fonction d'analyse directe (chunk duration automatique)
     _, summary = handle_direct_transcription(
         audio_file, hf_token, language, transcription_mode,
-        model_key, selected_sections, current_diarization_context, start_trim, end_trim…
+        model_key, selected_sections, current_diarization_context, start_trim, end_trim
     )
     return summary
@@ -611,7 +602,6 @@ def create_spaces_interface():
             local_model_choice,
             start_trim_input,
             end_trim_input,
-            chunk_duration_slider,
             section_resume_executif,
             section_discussions,
             section_plan_action,