VincentGOURBIN committed on
Commit
0d65c9e
·
verified ·
1 Parent(s): f006ab1

Upload folder using huggingface_hub

Browse files
src/ai/voxtral_spaces_analyzer.py CHANGED
@@ -41,15 +41,20 @@ class VoxtralSpacesAnalyzer:
41
  Args:
42
  model_name (str): Name of the Voxtral model to use (pre-quantized)
43
  """
44
- # Only pre-quantized models are supported in Spaces version
45
  model_mapping = {
46
- "Voxtral-Mini-3B-2507": "mzbac/voxtral-mini-3b-8bit",
47
- "Voxtral-Small-24B-2507": "VincentGOURBIN/voxtral-small-8bit"
48
  }
49
 
50
- self.model_name = model_mapping.get(model_name, "mzbac/voxtral-mini-3b-8bit")
51
  self.current_model_key = model_name
52
- self.max_duration_minutes = 20 # Reduced for Spaces environment
 
 
 
 
 
53
  self.gpu_manager = ZeroGPUManager()
54
  self.token_tracker = TokenTracker("Transformers-HF-Spaces")
55
 
@@ -62,11 +67,11 @@ class VoxtralSpacesAnalyzer:
62
  def switch_model(self, model_name: str):
63
  """Switch to a different model (will reload if different)."""
64
  model_mapping = {
65
- "Voxtral-Mini-3B-2507": "mzbac/voxtral-mini-3b-8bit",
66
- "Voxtral-Small-24B-2507": "VincentGOURBIN/voxtral-small-8bit"
67
  }
68
 
69
- new_model_path = model_mapping.get(model_name, "mzbac/voxtral-mini-3b-8bit")
70
 
71
  if self.model_name != new_model_path:
72
  print(f"🔄 Switching to {model_name}")
@@ -88,37 +93,17 @@ class VoxtralSpacesAnalyzer:
88
  dtype = self.gpu_manager.dtype
89
  print(f"🔄 Loading {self.current_model_key} on {device} with {dtype}...")
90
 
91
- # Load processor
92
- self.processor = AutoProcessor.from_pretrained(self.model_name)
93
 
94
- # Simplified model loading strategy - force no quantization config
95
- device_str = "cuda" if device == "cuda" else "mps" if device == "mps" else "cpu"
96
- print(f"📦 Loading {self.current_model_key} pre-quantized model")
97
 
98
- try:
99
- # First try: load without any special config
100
- self.model = VoxtralForConditionalGeneration.from_pretrained(
101
- self.model_name,
102
- torch_dtype=dtype,
103
- low_cpu_mem_usage=True,
104
- device_map=device_str
105
- )
106
- except Exception as e:
107
- if "quantization config" in str(e).lower():
108
- print(f"⚠️ Quantization config issue, trying alternative loading...")
109
- # Alternative: load config first and modify it
110
- config = AutoConfig.from_pretrained(self.model_name)
111
- if hasattr(config, 'quantization_config'):
112
- config.quantization_config = None
113
- self.model = VoxtralForConditionalGeneration.from_pretrained(
114
- self.model_name,
115
- config=config,
116
- torch_dtype=dtype,
117
- low_cpu_mem_usage=True,
118
- device_map=device_str
119
- )
120
- else:
121
- raise e
122
 
123
  print(f"✅ {self.current_model_key} loaded successfully on {device}")
124
 
@@ -138,13 +123,15 @@ class VoxtralSpacesAnalyzer:
138
  audio = AudioSegment.from_file(wav_path)
139
  return len(audio) / (1000 * 60)
140
 
141
- def _create_time_chunks(self, wav_path: str, chunk_duration_minutes: int = None) -> List[Tuple[float, float]]:
142
- """Create time-based chunks for processing."""
143
  total_duration = self._get_audio_duration(wav_path) * 60 # seconds
144
- # Use provided chunk duration or fall back to default
145
- chunk_minutes = chunk_duration_minutes if chunk_duration_minutes else self.max_duration_minutes
146
  max_chunk_seconds = chunk_minutes * 60
147
 
 
 
148
  if total_duration <= max_chunk_seconds:
149
  return [(0, total_duration)]
150
 
@@ -179,17 +166,16 @@ class VoxtralSpacesAnalyzer:
179
  wav_path: str,
180
  language: str = "french",
181
  selected_sections: list = None,
182
- chunk_duration_minutes: int = 15,
183
  reference_speakers_data: str = None
184
  ) -> Dict[str, str]:
185
  """
186
  Analyze audio by chunks using Voxtral with Zero GPU.
 
187
 
188
  Args:
189
  wav_path (str): Path to audio file
190
  language (str): Expected language
191
  selected_sections (list): Analysis sections to include
192
- chunk_duration_minutes (int): Chunk duration in minutes
193
  reference_speakers_data (str): Speaker diarization data
194
 
195
  Returns:
@@ -203,9 +189,10 @@ class VoxtralSpacesAnalyzer:
203
  duration = self._get_audio_duration(wav_path)
204
  print(f"🎵 Audio duration: {duration:.1f} minutes")
205
 
206
- # Create chunks with specified duration
207
- chunks = self._create_time_chunks(wav_path, chunk_duration_minutes)
208
- print(f"📦 Splitting into {len(chunks)} chunks of {chunk_duration_minutes}min")
 
209
 
210
  chunk_summaries = []
211
 
 
41
  Args:
42
  model_name (str): Name of the Voxtral model to use (pre-quantized)
43
  """
44
+ # Use original Mistral models for HF Spaces
45
  model_mapping = {
46
+ "Voxtral-Mini-3B-2507": "mistralai/Voxtral-Mini-3B-2507",
47
+ "Voxtral-Small-24B-2507": "mistralai/Voxtral-Small-24B-2507"
48
  }
49
 
50
+ self.model_name = model_mapping.get(model_name, "mistralai/Voxtral-Mini-3B-2507")
51
  self.current_model_key = model_name
52
+
53
+ # Optimized chunk durations for Zero GPU (different per model)
54
+ self.chunk_durations = {
55
+ "Voxtral-Mini-3B-2507": 15, # 15 minutes for Mini
56
+ "Voxtral-Small-24B-2507": 10 # 10 minutes for Small (larger model)
57
+ }
58
  self.gpu_manager = ZeroGPUManager()
59
  self.token_tracker = TokenTracker("Transformers-HF-Spaces")
60
 
 
67
  def switch_model(self, model_name: str):
68
  """Switch to a different model (will reload if different)."""
69
  model_mapping = {
70
+ "Voxtral-Mini-3B-2507": "mistralai/Voxtral-Mini-3B-2507",
71
+ "Voxtral-Small-24B-2507": "mistralai/Voxtral-Small-24B-2507"
72
  }
73
 
74
+ new_model_path = model_mapping.get(model_name, "mistralai/Voxtral-Mini-3B-2507")
75
 
76
  if self.model_name != new_model_path:
77
  print(f"🔄 Switching to {model_name}")
 
93
  dtype = self.gpu_manager.dtype
94
  print(f"🔄 Loading {self.current_model_key} on {device} with {dtype}...")
95
 
96
+ # Load processor and model following HuggingFace reference implementation
97
+ print(f"📦 Loading {self.current_model_key} (original Mistral model)")
98
 
99
+ self.processor = AutoProcessor.from_pretrained(self.model_name)
 
 
100
 
101
+ # Use reference implementation from HuggingFace docs
102
+ self.model = VoxtralForConditionalGeneration.from_pretrained(
103
+ self.model_name,
104
+ torch_dtype=dtype,
105
+ device_map=device
106
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  print(f"✅ {self.current_model_key} loaded successfully on {device}")
109
 
 
123
  audio = AudioSegment.from_file(wav_path)
124
  return len(audio) / (1000 * 60)
125
 
126
+ def _create_time_chunks(self, wav_path: str) -> List[Tuple[float, float]]:
127
+ """Create time-based chunks for processing with model-optimized durations."""
128
  total_duration = self._get_audio_duration(wav_path) * 60 # seconds
129
+ # Use model-specific optimized chunk duration for Zero GPU
130
+ chunk_minutes = self.chunk_durations.get(self.current_model_key, 15)
131
  max_chunk_seconds = chunk_minutes * 60
132
 
133
+ print(f"🎯 Using {chunk_minutes}min chunks optimized for {self.current_model_key} on Zero GPU")
134
+
135
  if total_duration <= max_chunk_seconds:
136
  return [(0, total_duration)]
137
 
 
166
  wav_path: str,
167
  language: str = "french",
168
  selected_sections: list = None,
 
169
  reference_speakers_data: str = None
170
  ) -> Dict[str, str]:
171
  """
172
  Analyze audio by chunks using Voxtral with Zero GPU.
173
+ Uses model-optimized chunk durations (15min for Mini, 10min for Small).
174
 
175
  Args:
176
  wav_path (str): Path to audio file
177
  language (str): Expected language
178
  selected_sections (list): Analysis sections to include
 
179
  reference_speakers_data (str): Speaker diarization data
180
 
181
  Returns:
 
189
  duration = self._get_audio_duration(wav_path)
190
  print(f"🎵 Audio duration: {duration:.1f} minutes")
191
 
192
+ # Create chunks with model-optimized duration
193
+ chunks = self._create_time_chunks(wav_path)
194
+ chunk_minutes = self.chunk_durations.get(self.current_model_key, 15)
195
+ print(f"📦 Splitting into {len(chunks)} chunks of {chunk_minutes}min")
196
 
197
  chunk_summaries = []
198
 
src/ui/spaces_interface.py CHANGED
@@ -220,7 +220,7 @@ def handle_speaker_rename(new_name):
220
  @gpu_inference(duration=300)
221
  def handle_direct_transcription(
222
  audio_file, hf_token, language, transcription_mode, model_key,
223
- selected_sections, diarization_data, start_trim, end_trim, chunk_duration
224
  ):
225
  """Gestion de l'analyse directe adaptée pour HF Spaces."""
226
  initialize_components()
@@ -239,12 +239,11 @@ def handle_direct_transcription(
239
  if analyzer.current_model_key != model_name:
240
  analyzer.switch_model(model_name)
241
 
242
- # Lancer l'analyse
243
  results = analyzer.analyze_audio_chunks(
244
  wav_path=audio_file,
245
  language="auto",
246
  selected_sections=selected_sections,
247
- chunk_duration_minutes=int(chunk_duration),
248
  reference_speakers_data=diarization_data
249
  )
250
 
@@ -463,15 +462,7 @@ def create_spaces_interface():
463
  with gr.Column(elem_classes="processing-section"):
464
  gr.Markdown(UILabels.MAIN_ANALYSIS_TITLE)
465
  gr.Markdown(UILabels.MAIN_ANALYSIS_DESCRIPTION)
466
-
467
- # Contrôle taille des chunks
468
- chunk_duration_slider = gr.Slider(
469
- minimum=5,
470
- maximum=25,
471
- value=15,
472
- step=5,
473
- label=UILabels.CHUNK_DURATION_LABEL
474
- )
475
 
476
  # Configuration des sections de résumé
477
  gr.Markdown(UILabels.SUMMARY_SECTIONS_TITLE)
@@ -543,7 +534,7 @@ def create_spaces_interface():
543
 
544
  # Gestion de l'analyse directe (adaptée pour Transformers uniquement)
545
  def handle_analysis_direct(
546
- audio_file, hf_token, language, local_model, start_trim, end_trim, chunk_duration,
547
  s_resume, s_discussions, s_plan_action, s_decisions, s_prochaines_etapes,
548
  s_sujets_principaux, s_points_importants, s_questions, s_elements_suivi
549
  ):
@@ -566,10 +557,10 @@ def create_spaces_interface():
566
 
567
  selected_sections = [section_key for is_selected, section_key in sections_checkboxes if is_selected]
568
 
569
- # Appeler la fonction d'analyse directe
570
  _, summary = handle_direct_transcription(
571
  audio_file, hf_token, language, transcription_mode,
572
- model_key, selected_sections, current_diarization_context, start_trim, end_trim, chunk_duration
573
  )
574
  return summary
575
 
@@ -611,7 +602,6 @@ def create_spaces_interface():
611
  local_model_choice,
612
  start_trim_input,
613
  end_trim_input,
614
- chunk_duration_slider,
615
  section_resume_executif,
616
  section_discussions,
617
  section_plan_action,
 
220
  @gpu_inference(duration=300)
221
  def handle_direct_transcription(
222
  audio_file, hf_token, language, transcription_mode, model_key,
223
+ selected_sections, diarization_data, start_trim, end_trim
224
  ):
225
  """Gestion de l'analyse directe adaptée pour HF Spaces."""
226
  initialize_components()
 
239
  if analyzer.current_model_key != model_name:
240
  analyzer.switch_model(model_name)
241
 
242
+ # Lancer l'analyse (chunk duration automatique selon le modèle)
243
  results = analyzer.analyze_audio_chunks(
244
  wav_path=audio_file,
245
  language="auto",
246
  selected_sections=selected_sections,
 
247
  reference_speakers_data=diarization_data
248
  )
249
 
 
462
  with gr.Column(elem_classes="processing-section"):
463
  gr.Markdown(UILabels.MAIN_ANALYSIS_TITLE)
464
  gr.Markdown(UILabels.MAIN_ANALYSIS_DESCRIPTION)
465
+ gr.Markdown("*Chunk duration is automatically optimized: 15min for Mini, 10min for Small (Zero GPU optimization)*")
 
 
 
 
 
 
 
 
466
 
467
  # Configuration des sections de résumé
468
  gr.Markdown(UILabels.SUMMARY_SECTIONS_TITLE)
 
534
 
535
  # Gestion de l'analyse directe (adaptée pour Transformers uniquement)
536
  def handle_analysis_direct(
537
+ audio_file, hf_token, language, local_model, start_trim, end_trim,
538
  s_resume, s_discussions, s_plan_action, s_decisions, s_prochaines_etapes,
539
  s_sujets_principaux, s_points_importants, s_questions, s_elements_suivi
540
  ):
 
557
 
558
  selected_sections = [section_key for is_selected, section_key in sections_checkboxes if is_selected]
559
 
560
+ # Appeler la fonction d'analyse directe (chunk duration automatique)
561
  _, summary = handle_direct_transcription(
562
  audio_file, hf_token, language, transcription_mode,
563
+ model_key, selected_sections, current_diarization_context, start_trim, end_trim
564
  )
565
  return summary
566
 
 
602
  local_model_choice,
603
  start_trim_input,
604
  end_trim_input,
 
605
  section_resume_executif,
606
  section_discussions,
607
  section_plan_action,