Rajor78 committed
Commit 69b32e0 · verified · Parent: 7c8d9d7

Create app.py

Files changed (1): app.py (+676, -0)
app.py ADDED
import os
import subprocess
import time
import json
import argparse

import torch
import matplotlib.pyplot as plt

from docx import Document
from docx.shared import RGBColor, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from langdetect import detect

# Hugging Face components
from transformers import pipeline
from pyannote.audio import Pipeline
from datasets import Dataset

# spaCy model names per ISO 639-1 language code
SPACY_MODELS = {
    'es': 'es_core_news_sm',  # Spanish
    'en': 'en_core_web_sm',   # English
    'fr': 'fr_core_news_sm',  # French
    'it': 'it_core_news_sm',  # Italian
    'de': 'de_core_news_sm',  # German
    'pt': 'pt_core_news_sm',  # Portuguese
    'nl': 'nl_core_news_sm',  # Dutch
    'ca': 'ca_core_news_sm',  # Catalan
}

# Load the spaCy model for a language, downloading it on demand
def load_spacy_model(language):
    import spacy
    from spacy.cli import download as spacy_download

    model_name = SPACY_MODELS.get(language, 'es_core_news_sm')

    try:
        print(f"Attempting to load spaCy model for language: {language} ({model_name})...")
        return spacy.load(model_name)
    except OSError:
        print(f"Model {model_name} not found. Installing...")
        spacy_download(model_name)
        return spacy.load(model_name)
    except Exception as e:
        print(f"Could not load spaCy model for language {language}: {e}")
        print("Trying to load the default English model...")
        try:
            spacy_download('en_core_web_sm')
            return spacy.load('en_core_web_sm')
        except Exception as e2:
            print(f"Could not load the English model either: {e2}")
            print("Falling back to a blank pipeline...")
            return spacy.blank('en')

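# Illustrative usage of the helper above (hypothetical text; note this helper
# is defined but not called from main() below):
#   nlp = load_spacy_model('en')
#   doc = nlp("This is a sentence.")
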
# Extract a mono 16 kHz WAV track from a video file using ffmpeg
def extract_audio(video_path, audio_path):
    try:
        # List-form arguments avoid shell quoting/injection issues with odd filenames
        command = [
            "ffmpeg", "-i", video_path,
            "-ar", "16000",       # 16 kHz sample rate
            "-ac", "1",           # mono
            "-c:a", "pcm_s16le",  # 16-bit PCM WAV
            audio_path, "-y",
        ]
        subprocess.run(command, check=True)
        print(f"Audio extracted and saved to: {audio_path}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error extracting audio: {e}")
        return False

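# Example with hypothetical paths: extract_audio("interview.mp4", "interview.wav")
# yields a mono 16 kHz WAV, the format both Whisper and pyannote work well with.
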
# Detect the language of the transcribed text
def detect_language(transcribed_text):
    try:
        language = detect(transcribed_text)
        print(f"Detected language: {language}")
        return language
    except Exception as e:
        print(f"Error detecting language: {e}")
        return "es"  # Spanish by default

# Speaker diarization with pyannote.audio
def diarize_speakers(audio_path, huggingface_token=None):
    try:
        print("Initializing speaker diarization...")

        # The pyannote model is gated: a Hugging Face token from an account
        # that has accepted the model license is normally required
        if huggingface_token:
            diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=huggingface_token
            )
        else:
            # Try to load without a token (works only if credentials are cached)
            try:
                diarization_pipeline = Pipeline.from_pretrained(
                    "pyannote/speaker-diarization-3.1",
                    use_auth_token=False
                )
            except Exception as e:
                print(f"Error loading diarization model without token: {e}")
                print("Create a Hugging Face account, accept the model license, and provide a token.")
                return {}

        print("Running diarization...")
        diarization = diarization_pipeline(audio_path)

        # Collect the speech turns of each speaker
        speakers = {}
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            speakers.setdefault(speaker, []).append({
                'start': turn.start,
                'end': turn.end
            })

        # Rename speakers to friendlier labels
        renamed_speakers = {}
        for i, (speaker, turns) in enumerate(speakers.items(), 1):
            renamed_speakers[f"Speaker {i}"] = turns

        print(f"Diarization completed. {len(renamed_speakers)} speakers identified.")
        return renamed_speakers
    except Exception as e:
        print(f"Error in speaker diarization: {e}")
        print("Continuing without diarization...")
        return {}

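# The mapping returned by diarize_speakers() looks roughly like this
# (illustrative values):
#   {"Speaker 1": [{"start": 0.0, "end": 4.2}, {"start": 9.8, "end": 12.1}],
#    "Speaker 2": [{"start": 4.2, "end": 9.8}]}
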
# Transcribe audio with Whisper and word-level timestamps
def transcribe_audio_with_timing(audio_path, model_name="openai/whisper-base", language=None):
    try:
        print(f"Loading Whisper model ({model_name})...")

        # Use the Transformers ASR pipeline for transcription
        transcription_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            chunk_length_s=30,
            device=0 if torch.cuda.is_available() else -1,
            return_timestamps="word"
        )

        print("Transcribing audio with timestamps...")

        # If a language is given, pass it via generate_kwargs (the pipeline does
        # not accept a bare language= argument); otherwise let Whisper detect it
        if language:
            result = transcription_pipeline(
                audio_path, generate_kwargs={"language": language}
            )
        else:
            result = transcription_pipeline(audio_path)

        transcribed_text = result.get("text", "")

        # Group the word-level chunks into sentence-like segments
        segments = []
        chunk_words = result.get("chunks", [])

        current_segment = {"start": 0.0, "end": 0.0, "text": "", "words": []}

        for word_data in chunk_words:
            word = word_data.get("text", "")
            # Timestamps can be missing for the final chunk; guard against None
            ts = word_data.get("timestamp") or (0.0, 0.0)
            start_time = ts[0] if ts[0] is not None else 0.0
            end_time = ts[1] if ts[1] is not None else start_time

            # The first word of a segment sets its start time
            if not current_segment["text"]:
                current_segment["start"] = start_time

            current_segment["text"] += " " + word
            current_segment["words"].append(word_data)
            current_segment["end"] = end_time

            # Start a new segment at sentence-ending punctuation
            if word.endswith((".", "!", "?")):
                current_segment["text"] = current_segment["text"].strip()
                segments.append(current_segment)
                current_segment = {"start": end_time, "end": end_time, "text": "", "words": []}

        # Add the last segment if not empty
        if current_segment["text"]:
            current_segment["text"] = current_segment["text"].strip()
            segments.append(current_segment)

        # The pipeline result does not report the detected language, so return
        # "unknown" and let the caller fall back to langdetect
        detected_language = result.get("language", "unknown")

        print(f"Transcription completed in language: {detected_language}")
        return transcribed_text, segments, detected_language
    except Exception as e:
        print(f"Error in transcription: {e}")
        return "", [], "unknown"

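# With return_timestamps="word", each entry of result["chunks"] is shaped
# roughly like {"text": " hello", "timestamp": (0.0, 0.52)} (illustrative
# values), which is what the grouping loop above relies on.
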
# Assign a speaker to each transcribed segment
def assign_speakers_to_segments(segments, speakers):
    if not speakers:
        # No diarization information: label every segment "Unknown Speaker"
        for segment in segments:
            segment['speaker'] = "Unknown Speaker"
        return segments

    for segment in segments:
        start_time = segment['start']
        end_time = segment['end']

        # Find the speaker whose turns overlap this segment the most
        best_speaker = None
        max_overlap = 0

        for speaker, turns in speakers.items():
            for turn in turns:
                # Overlap of [start_time, end_time] with [turn start, turn end]
                overlap_start = max(start_time, turn['start'])
                overlap_end = min(end_time, turn['end'])
                overlap = max(0, overlap_end - overlap_start)

                if overlap > max_overlap:
                    max_overlap = overlap
                    best_speaker = speaker

        # Assign the best speaker found, or "Unknown Speaker" if none overlaps
        segment['speaker'] = best_speaker if best_speaker else "Unknown Speaker"

    return segments

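# Worked example of the overlap rule above: a segment spanning 2.0-5.0 s and a
# turn spanning 4.0-9.0 s overlap on [4.0, 5.0], so the overlap is
# max(0, min(5.0, 9.0) - max(2.0, 4.0)) = 1.0 s.
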
# Compute per-speaker statistics (how much each one speaks)
def analyze_speaker_stats(segments):
    speaker_stats = {}
    total_duration = 0

    for segment in segments:
        speaker = segment.get('speaker', 'Unknown Speaker')
        duration = segment['end'] - segment['start']
        total_duration += duration

        if speaker not in speaker_stats:
            speaker_stats[speaker] = {
                'total_time': 0,
                'word_count': 0,
                'segments': 0
            }

        speaker_stats[speaker]['total_time'] += duration
        speaker_stats[speaker]['word_count'] += len(segment['text'].split())
        speaker_stats[speaker]['segments'] += 1

    # Calculate percentages (guard against a zero-length transcript)
    for speaker in speaker_stats:
        if total_duration > 0:
            speaker_stats[speaker]['percentage'] = (speaker_stats[speaker]['total_time'] / total_duration) * 100
        else:
            speaker_stats[speaker]['percentage'] = 0.0

    return speaker_stats, total_duration

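# Illustrative numbers: two segments of 6 s ("Speaker 1") and 4 s ("Speaker 2")
# give total_duration = 10 s and percentages of 60.0 and 40.0 respectively.
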
# Generate speaker analysis charts (speaking time and word counts)
def generate_speaker_analysis_charts(speaker_stats, output_path):
    try:
        speakers = list(speaker_stats.keys())
        percentages = [speaker_stats[speaker]['percentage'] for speaker in speakers]
        word_counts = [speaker_stats[speaker]['word_count'] for speaker in speakers]

        # Figure with two subplots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Chart 1: speaking time by speaker (pie)
        ax1.pie(percentages, labels=speakers, autopct='%1.1f%%', startangle=90)
        ax1.set_title('Speaking Time Distribution')

        # Chart 2: word count by speaker (bars)
        ax2.bar(speakers, word_counts)
        ax2.set_title('Word Count by Speaker')
        ax2.set_ylabel('Word Count')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig(output_path)
        plt.close(fig)  # free the figure to avoid leaking memory
        print(f"Analysis charts saved to: {output_path}")
        return True
    except Exception as e:
        print(f"Error generating analysis charts: {e}")
        return False

# Organize segments chronologically or grouped by speaker
def organize_segments(segments, mode="chronological"):
    if mode == "by_speaker":
        # Group segments by speaker
        speakers_content = {}
        for segment in segments:
            speaker = segment.get('speaker', 'Unknown Speaker')
            speakers_content.setdefault(speaker, []).append(segment)

        # Sort each speaker's segments by time
        for speaker in speakers_content:
            speakers_content[speaker].sort(key=lambda x: x['start'])

        return speakers_content
    else:
        # Chronological order (segments are already sorted by time)
        return segments

# Turn segments into document paragraphs according to the organization mode
def process_segments_for_document(segments, mode="chronological"):
    if mode == "by_speaker":
        # One paragraph per speaker, with all of that speaker's text
        speakers_content = organize_segments(segments, "by_speaker")
        paragraphs = []

        for speaker, speaker_segments in speakers_content.items():
            speaker_text = " ".join(segment['text'] for segment in speaker_segments)
            paragraphs.append({
                'speaker': speaker,
                'text': speaker_text
            })

        return paragraphs
    else:
        # Chronological paragraphs, starting a new one on every speaker change
        chronological_paragraphs = []
        current_paragraph = []
        current_speaker = None
        current_timestamp = None

        for segment in segments:
            speaker = segment.get('speaker', 'Unknown Speaker')
            text = segment['text']
            start_time = segment['start']

            # Format the start time as HH:MM:SS
            time_str = format_timestamp(start_time)

            # If the speaker changes, close the current paragraph
            if current_speaker and current_speaker != speaker and current_paragraph:
                chronological_paragraphs.append({
                    'speaker': current_speaker,
                    'text': ' '.join(current_paragraph),
                    'timestamp': current_timestamp
                })
                current_paragraph = []
                current_timestamp = None

            # Stamp a paragraph with the time of its first segment,
            # then accumulate text for the current speaker
            if not current_paragraph:
                current_timestamp = time_str
            current_speaker = speaker
            current_paragraph.append(text)

        # Add the last paragraph if there is content
        if current_paragraph:
            chronological_paragraphs.append({
                'speaker': current_speaker,
                'text': ' '.join(current_paragraph),
                'timestamp': current_timestamp
            })

        return chronological_paragraphs

# Format a time in seconds as HH:MM:SS
def format_timestamp(seconds):
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{int(s):02d}"

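# For example, format_timestamp(3725) returns "01:02:05"
# (3725 s = 1 h + 2 min + 5 s); fractional seconds are truncated.
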
# Improve text style and grammar before saving
def correct_text(text, language="es"):
    try:
        import language_tool_python

        language_code = language[:2].lower()  # keep only the 2-letter code
        supported_languages = ["es", "en", "fr", "de", "pt", "nl"]

        if language_code not in supported_languages:
            print(f"Grammar correction is not available for language {language_code}; using Spanish by default.")
            language_code = "es"

        tool = language_tool_python.LanguageTool(language_code)
        matches = tool.check(text)
        corrected_text = language_tool_python.utils.correct(text, matches)
        return corrected_text
    except Exception as e:
        print(f"Error correcting text: {e}")
        return text  # return the original text on error

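# Note: language_tool_python downloads the LanguageTool server on first use and
# requires a Java runtime. Like load_spacy_model(), this helper is defined but
# not called from main() below.
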
# Create a Word document with the organized transcription
def create_word_document(paragraphs, output_path, include_timestamps=True, stats=None, chart_path=None):
    try:
        doc = Document()

        # Base document style
        style = doc.styles['Normal']
        style.font.name = 'Arial'
        style.font.size = Pt(11)

        # Main title
        title = doc.add_heading('Transcription with Speaker Identification', 0)
        title.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Participation statistics table, if available
        if stats:
            doc.add_heading('Participation Summary', level=1)
            stats_table = doc.add_table(rows=1, cols=5)
            stats_table.style = 'Table Grid'

            # Table headers
            hdr_cells = stats_table.rows[0].cells
            hdr_cells[0].text = 'Speaker'
            hdr_cells[1].text = 'Time (s)'
            hdr_cells[2].text = 'Percentage (%)'
            hdr_cells[3].text = 'Words'
            hdr_cells[4].text = 'Interventions'

            # One row per speaker
            for speaker, data in stats.items():
                row_cells = stats_table.add_row().cells
                row_cells[0].text = speaker
                row_cells[1].text = f"{data['total_time']:.2f}"
                row_cells[2].text = f"{data['percentage']:.2f}"
                row_cells[3].text = f"{data['word_count']}"
                row_cells[4].text = f"{data['segments']}"

            doc.add_paragraph()

        # Analysis chart, if available
        if chart_path and os.path.exists(chart_path):
            doc.add_heading('Graphical Analysis', level=1)
            doc.add_picture(chart_path, width=Pt(450))
            doc.add_paragraph()

        # Transcription body
        doc.add_heading('Complete Transcription', level=1)

        for paragraph in paragraphs:
            speaker = paragraph['speaker']
            text = paragraph['text']

            p = doc.add_paragraph()

            # Timestamp, if available and enabled
            if include_timestamps and 'timestamp' in paragraph:
                timestamp_run = p.add_run(f"[{paragraph['timestamp']}] ")
                timestamp_run.bold = True
                timestamp_run.font.color.rgb = RGBColor(128, 128, 128)

            # Speaker label
            speaker_run = p.add_run(f"{speaker}: ")
            speaker_run.bold = True

            # Color the label per speaker for easier reading
            if "Speaker 1" in speaker:
                speaker_run.font.color.rgb = RGBColor(0, 0, 200)    # blue
            elif "Speaker 2" in speaker:
                speaker_run.font.color.rgb = RGBColor(200, 0, 0)    # red
            elif "Speaker 3" in speaker:
                speaker_run.font.color.rgb = RGBColor(0, 150, 0)    # green
            elif "Speaker 4" in speaker:
                speaker_run.font.color.rgb = RGBColor(128, 0, 128)  # purple

            # Paragraph text
            p.add_run(text)

            # Blank line between paragraphs for readability
            doc.add_paragraph()

        doc.save(output_path)
        print(f"Word document saved to: {output_path}")
        return True
    except Exception as e:
        print(f"Error creating Word document: {e}")
        return False

# Save results as JSON for later processing
def save_json_results(segments, output_path):
    try:
        # Keep only the serializable fields of each segment
        serializable_segments = []
        for segment in segments:
            serializable_segments.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'],
                'speaker': segment.get('speaker', 'Unknown Speaker')
            })

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(serializable_segments, f, ensure_ascii=False, indent=2)

        print(f"Results saved in JSON format: {output_path}")
        return True
    except Exception as e:
        print(f"Error saving results to JSON: {e}")
        return False

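# Each entry of the resulting JSON file looks roughly like (illustrative values):
#   {"start": 0.0, "end": 4.2, "text": "Hello everyone.", "speaker": "Speaker 1"}
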
# Save results as a Hugging Face Dataset, optionally pushing to the Hub
def save_to_huggingface_dataset(segments, output_path=None, push_to_hub=False, repo_id=None, token=None):
    try:
        # Column-oriented data for Dataset.from_dict
        data = {
            "segment_id": [],
            "start_time": [],
            "end_time": [],
            "speaker": [],
            "text": []
        }

        for i, segment in enumerate(segments):
            data["segment_id"].append(i)
            data["start_time"].append(segment["start"])
            data["end_time"].append(segment["end"])
            data["speaker"].append(segment.get("speaker", "Unknown Speaker"))
            data["text"].append(segment["text"])

        dataset = Dataset.from_dict(data)

        # Save locally if a path is provided
        if output_path:
            dataset.save_to_disk(output_path)
            print(f"Dataset saved locally to: {output_path}")

        # Push to the Hugging Face Hub if requested
        if push_to_hub and repo_id:
            dataset.push_to_hub(repo_id, token=token)
            print(f"Dataset pushed to Hugging Face Hub: {repo_id}")

        return dataset
    except Exception as e:
        print(f"Error saving to Hugging Face dataset: {e}")
        return None

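# A dataset saved above can be reloaded later, e.g. (hypothetical path):
#   from datasets import load_from_disk
#   ds = load_from_disk("./output/meeting_20240101_120000_dataset")
#   print(ds[0]["speaker"], ds[0]["text"])
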
# Main entry point
def main():
    parser = argparse.ArgumentParser(description="Audio transcription with speaker diarization using Hugging Face models")
    parser.add_argument("--video", type=str, help="Path to video file")
    parser.add_argument("--audio", type=str, help="Path to audio file (if already extracted)")
    parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save output files")
    parser.add_argument("--model", type=str, default="openai/whisper-base",
                        help="Whisper model to use: openai/whisper-tiny, openai/whisper-base, openai/whisper-small, openai/whisper-medium, openai/whisper-large")
    parser.add_argument("--language", type=str, help="Language code (e.g., 'es' for Spanish)")
    parser.add_argument("--hf_token", type=str, help="Hugging Face API token for speaker diarization")
    parser.add_argument("--organization", type=str, default="chronological",
                        choices=["chronological", "by_speaker"], help="Transcription organization mode")
    parser.add_argument("--push_to_hub", action="store_true", help="Push results to Hugging Face Hub")
    parser.add_argument("--repo_id", type=str, help="Hugging Face repository ID for pushing the dataset")

    args = parser.parse_args()

    # Create the output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    # Timestamp for output file names
    timestamp = time.strftime("%Y%m%d_%H%M%S")

    try:
        print("=== TRANSCRIPTION WITH SPEAKER DETECTION ===")

        # Resolve the input file
        if args.audio:
            audio_path = args.audio
            base_filename = os.path.splitext(os.path.basename(audio_path))[0]
        elif args.video:
            video_path = args.video
            base_filename = os.path.splitext(os.path.basename(video_path))[0]
            audio_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}.wav")

            # Extract the audio track from the video
            if not extract_audio(video_path, audio_path):
                print("Could not extract audio. Process canceled.")
                return
        else:
            print("Error: you must provide either a video file or an audio file.")
            return

        # Output file paths
        word_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_transcription.docx")
        json_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_data.json")
        chart_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_analysis.png")
        dataset_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_dataset")

        print(f"\nProcessing audio: {audio_path}")
        start_time = time.time()

        # Transcribe with Whisper
        print(f"\nStarting transcription with Whisper model {args.model}...")
        transcribed_text, segments, detected_language = transcribe_audio_with_timing(
            audio_path,
            model_name=args.model,
            language=args.language
        )

        if not transcribed_text:
            print("Transcription failed. Process canceled.")
            return

        print(f"Transcription completed: {transcribed_text[:100]}...\n")

        # If no language was specified, fall back to langdetect when Whisper
        # didn't report one
        if not args.language:
            detected_language = detect_language(transcribed_text) if detected_language == "unknown" else detected_language
        else:
            detected_language = args.language

        # Speaker diarization
        print("Starting speaker detection...")
        speakers = diarize_speakers(audio_path, args.hf_token)

        # Assign speakers to segments
        segments_with_speakers = assign_speakers_to_segments(segments, speakers)

        # Speaker statistics
        speaker_stats, total_duration = analyze_speaker_stats(segments_with_speakers)
        print("\n=== PARTICIPATION STATISTICS ===")
        for speaker, stats in speaker_stats.items():
            print(f"{speaker}: {stats['percentage']:.2f}% of time, {stats['word_count']} words, {stats['segments']} interventions")

        # Analysis charts
        generate_speaker_analysis_charts(speaker_stats, chart_output_path)

        # Organize segments according to the selected mode
        paragraphs = process_segments_for_document(segments_with_speakers, args.organization)

        # Save results as JSON
        save_json_results(segments_with_speakers, json_output_path)

        # Create the Word document
        create_word_document(
            paragraphs,
            word_output_path,
            include_timestamps=True,
            stats=speaker_stats,
            chart_path=chart_output_path
        )

        # Save as a Hugging Face Dataset when pushing to the Hub is requested
        if args.push_to_hub:
            save_to_huggingface_dataset(
                segments_with_speakers,
                output_path=dataset_output_path,
                push_to_hub=True,
                repo_id=args.repo_id,
                token=args.hf_token
            )

        # Total processing time
        elapsed_time = time.time() - start_time
        print(f"\nTotal processing time: {elapsed_time:.2f} seconds")

        print("\nProcess completed successfully!")

    except Exception as e:
        print(f"Unexpected error during the process: {e}")

# Run the script
if __name__ == "__main__":
    main()
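
# Illustrative invocations (hypothetical paths, token, and repo id):
#   python app.py --video meeting.mp4 --hf_token hf_xxx --model openai/whisper-small
#   python app.py --audio meeting.wav --language es --organization by_speaker
#   python app.py --audio meeting.wav --push_to_hub --repo_id user/meeting-transcripts --hf_token hf_xxx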