DineshJ96 committed on
Commit
eb74f8a
·
1 Parent(s): 1eabe29

app & req file updated

Browse files
Files changed (2) hide show
  1. app.py +153 -111
  2. requirements.txt +17 -13
app.py CHANGED
@@ -1,60 +1,79 @@
1
  # app.py
2
- # A HYBRID audio processing application using APIs for speed and local models for specialization.
3
- # - Whisper API for Transcription
4
- # - Pyannote (local) for Diarization
5
- # - Gemini API for Translation
6
 
7
  import os
8
  import torch
9
  import gradio as gr
 
10
  import soundfile as sf
 
 
 
11
  import tempfile
12
  import logging
13
  import warnings
14
- import openai
15
- import google.generativeai as genai
16
  from pyannote.audio import Pipeline as PyannotePipeline
 
17
 
18
  # --- 1. Initial Setup & Configuration ---
19
-
20
- # Suppress less important warnings
21
  warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.functional')
22
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
 
24
- # --- API Key Configuration ---
25
- try:
26
- openai.api_key = os.environ["OPENAI_API_KEY"]
27
- genai.configure(api_key=os.environ["GEMINI_API_KEY"])
28
- logging.info("API keys for OpenAI and Gemini loaded successfully.")
29
- except KeyError as e:
30
- logging.error(f"FATAL: Missing API Key - {e}. The application cannot run without it.")
31
- # We will raise a gr.Error in the main function if keys are missing.
32
-
33
- # --- Helper Function for Instructions ---
34
- def get_api_key_instructions():
35
- """Generates instructions for setting the required API keys."""
36
- return """
37
- **IMPORTANT: API Keys Required**
38
- This application uses external AI services and requires three secrets to be set:
39
-
40
- 1. **`OPENAI_API_KEY`**: For speech-to-text via the Whisper API.
41
- - Get it from: [platform.openai.com/api-keys](https://platform.openai.com/api-keys)
42
-
43
- 2. **`GEMINI_API_KEY`**: For language translation.
44
- - Get it from: [aistudio.google.com/app/apikey](https://aistudio.google.com/app/apikey)
45
 
46
- 3. **`HF_TOKEN`**: For the local speaker diarization model.
47
- - Get it from: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
48
- - You must also accept the license for [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1).
49
-
50
- **How to Add Keys to this Space:**
51
- Go to the **Settings** tab, find **Repository secrets**, click **New secret**, and add each of the three secrets listed above. Restart the Space after saving.
 
 
 
52
  """
53
 
54
- # --- 2. Global Model Loading (pyannote only) ---
55
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
56
- logging.info(f"Using device: {DEVICE} for local models.")
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  HF_TOKEN = os.environ.get("HF_TOKEN")
59
  DIARIZATION_PIPELINE = None
60
  if HF_TOKEN:
@@ -68,49 +87,30 @@ if HF_TOKEN:
68
  else:
69
  logging.warning("HF_TOKEN not set. Speaker diarization will be disabled.")
70
 
 
 
 
71
 
72
  # --- 3. Core Processing Functions ---
73
-
74
- def transcribe_with_whisper_api(audio_path):
75
- """Sends audio to OpenAI's Whisper API and gets a verbose transcript."""
76
- logging.info("Sending audio to Whisper API for transcription...")
77
- with open(audio_path, "rb") as audio_file:
78
- transcript = openai.audio.transcriptions.create(
79
- model="whisper-1",
80
- file=audio_file,
81
- response_format="verbose_json",
82
- timestamp_granularities=["word"]
83
- )
84
- logging.info("Received response from Whisper API.")
85
- return transcript.words, transcript.text, transcript.language
86
-
87
- def translate_with_gemini_api(text_to_translate, source_language):
88
- """Sends text to Google's Gemini Pro API for translation."""
89
- logging.info(f"Sending text to Gemini API for translation from '{source_language}'...")
90
- model = genai.GenerativeModel('gemini-1.5-flash')
91
- prompt = f"You are an expert linguist. Translate the following text from {source_language} into clear, natural-sounding English. Maintain the original meaning and tone.\n\nText to Translate:\n---\n{text_to_translate}\n---\n\nEnglish Translation:"
92
-
93
  try:
94
- response = model.generate_content(prompt)
95
- logging.info("Received response from Gemini API.")
96
- return response.text
 
 
 
 
97
  except Exception as e:
98
- logging.error(f"Gemini API call failed: {e}")
99
- return f"Translation failed due to an API error: {e}"
100
-
101
 
102
  def process_audio(audio_input):
103
- """The main hybrid pipeline function."""
104
- # Check if API keys were loaded at startup
105
- if not os.environ.get("OPENAI_API_KEY") or not os.environ.get("GEMINI_API_KEY"):
106
- raise gr.Error("Missing OpenAI or Gemini API Key. Please check the instructions and set the repository secrets.")
107
-
108
  if audio_input is None:
109
- return "Please provide audio.", "", "", gr.update(visible=False)
110
-
111
  temp_audio_path = None
112
  try:
113
- # Step 1: Standardize audio input to a temporary file path
114
  if isinstance(audio_input, tuple):
115
  sample_rate, audio_data = audio_input
116
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
@@ -119,10 +119,34 @@ def process_audio(audio_input):
119
  else:
120
  temp_audio_path = audio_input
121
 
122
- # Step 2: ASR via Whisper API (Fast and Accurate)
123
- word_timestamps, full_text, detected_language_code = transcribe_with_whisper_api(temp_audio_path)
 
 
 
 
 
124
 
125
- # Step 3: Diarization via local Pyannote model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  diarization = None
127
  if DIARIZATION_PIPELINE:
128
  logging.info("Performing speaker diarization...")
@@ -131,52 +155,69 @@ def process_audio(audio_input):
131
  except Exception as e:
132
  logging.error(f"Diarization failed: {e}")
133
 
134
- # Step 4: Align ASR and Diarization results
135
  logging.info("Aligning transcription with speaker segments...")
136
- final_segments = []
137
  if diarization:
138
  speaker_map = [{'start': turn.start, 'end': turn.end, 'speaker': speaker} for turn, _, speaker in diarization.itertracks(yield_label=True)]
139
  for word_info in word_timestamps:
140
- assigned_speaker = "Unknown"
141
- for segment in speaker_map:
142
- if word_info['start'] >= segment['start'] and word_info['end'] <= segment['end']:
143
- assigned_speaker = segment['speaker']
144
- break
145
- final_segments.append({'start': word_info['start'], 'end': word_info['end'], 'text': word_info['word'], 'speaker': assigned_speaker})
146
- else: # Fallback if no diarization
147
  for word_info in word_timestamps:
148
- final_segments.append({'start': word_info['start'], 'end': word_info['end'], 'text': word_info['word'], 'speaker': 'SPEAKER_00'})
149
 
150
- # Merge consecutive words
151
- merged_segments = []
152
- if final_segments:
153
- current_segment = final_segments[0]
154
- current_segment['text'] = current_segment['text'].strip()
155
- for i in range(1, len(final_segments)):
156
- next_segment = final_segments[i]
157
- if next_segment['speaker'] == current_segment['speaker'] and (next_segment['start'] - current_segment['end'] < 0.1):
158
- current_segment['text'] += next_segment['text']
159
- current_segment['end'] = next_segment['end']
160
  else:
161
- merged_segments.append(current_segment)
162
- current_segment = next_segment
163
- current_segment['text'] = current_segment['text'].strip()
164
- merged_segments.append(current_segment)
165
-
166
- diarized_text = "\n".join(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}: {segment['text'].strip()}" for segment in merged_segments)
167
 
168
- # Step 5: Translation via Gemini API
 
 
169
  translation_output = "Source language is English. No translation needed."
170
  if detected_language_code != 'en':
171
- translation_output = translate_with_gemini_api(full_text, detected_language_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
- # Step 6: Generate Report
174
- report_content = f"# Audio Processing Report\n\n## Detected Language\n{detected_language_code}\n\n---\n\n## Diarized Transcription\n{diarized_text}\n\n---\n\n## English Translation\n{translation_output}"
 
 
 
 
 
 
 
 
 
175
  with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding='utf-8') as report_file:
176
  report_file.write(report_content)
177
  report_path = report_file.name
178
 
179
- return detected_language_code, diarized_text, translation_output, gr.update(value=report_path, visible=True)
180
 
181
  except Exception as e:
182
  logging.error(f"An unexpected error occurred: {e}", exc_info=True)
@@ -187,18 +228,19 @@ def process_audio(audio_input):
187
  if DEVICE == "cuda":
188
  torch.cuda.empty_cache()
189
 
 
190
  # --- 4. Gradio User Interface ---
191
- with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Audio Processor") as app:
192
- gr.Markdown("# Hybrid AI Audio Processor")
193
- gr.Markdown("A high-speed tool using Whisper API, local Speaker Diarization, and Gemini API for Translation.")
194
 
195
  with gr.Row():
196
  with gr.Column(scale=1):
197
  gr.Markdown("### 1. Provide Audio")
198
  audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
199
  process_button = gr.Button("Process Audio", variant="primary")
200
- with gr.Accordion("API Key Instructions (IMPORTANT)", open=True):
201
- gr.Markdown(get_api_key_instructions())
202
 
203
  with gr.Column(scale=2):
204
  gr.Markdown("### 2. Processing Results")
@@ -207,7 +249,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Audio Processor") as app:
207
  with gr.TabItem("Diarized Transcription"):
208
  diarized_transcription_output = gr.Textbox(label="Full Transcription (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
209
  with gr.TabItem("Translation (to English)"):
210
- translation_output = gr.Textbox(label="Full Translation by Gemini", lines=15, interactive=False, show_copy_button=True)
211
 
212
  gr.Markdown("### 3. Download Full Report")
213
  download_report_button = gr.File(label="Download Report (.txt)", visible=False, interactive=False)
 
1
  # app.py
2
+ # A 100% OPEN-SOURCE audio processing application.
3
+ # - Local Whisper for Transcription
4
+ # - Local Pyannote for Diarization
5
+ # - Local Helsinki-NLP for Translation
6
 
7
  import os
8
  import torch
9
  import gradio as gr
10
+ import numpy as np
11
  import soundfile as sf
12
+ import torchaudio
13
+ from transformers import pipeline as hf_pipeline
14
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
15
  import tempfile
16
  import logging
17
  import warnings
 
 
18
  from pyannote.audio import Pipeline as PyannotePipeline
19
+ from langdetect import detect, LangDetectException
20
 
21
  # --- 1. Initial Setup & Configuration ---
 
 
22
  warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.functional')
23
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
24
 
25
+ # Language name mapping
26
+ LANGUAGE_NAME_MAPPING = {
27
+ "en": "English", "zh-cn": "Chinese", "de": "German", "es": "Spanish", "ru": "Russian",
28
+ "ko": "Korean", "fr": "French", "ja": "Japanese", "pt": "Portuguese", "tr": "Turkish",
29
+ "pl": "Polish", "ca": "Catalan", "nl": "Dutch", "ar": "Arabic", "sv": "Swedish",
30
+ "it": "Italian", "id": "Indonesian", "hi": "Hindi", "fi": "Finnish", "vi": "Vietnamese",
31
+ "he": "Hebrew", "uk": "Ukrainian", "el": "Greek", "ms": "Malay", "cs": "Czech",
32
+ "ro": "Romanian", "da": "Danish", "hu": "Hungarian", "ta": "Tamil", "no": "Norwegian",
33
+ "th": "Thai", "ur": "Urdu", "hr": "Croatian", "bg": "Bulgarian", "lt": "Lithuanian", "la": "Latin",
34
+ "mi": "Maori", "ml": "Malayalam", "cy": "Welsh", "sk": "Slovak", "te": "Telugu", "pa": "Punjabi",
35
+ "lv": "Latvian", "bn": "Bengali", "sr": "Serbian", "az": "Azerbaijani", "sl": "Slovenian",
36
+ "kn": "Kannada", "et": "Estonian", "mk": "Macedonian", "br": "Breton", "eu": "Basque",
37
+ "is": "Icelandic", "hy": "Armenian", "ne": "Nepali", "mn": "Mongolian", "bs": "Bosnian",
38
+ "kk": "Kazakh", "sq": "Albanian", "sw": "Swahili", "gl": "Galician", "mr": "Marathi",
39
+ "si": "Sinhala", "am": "Amharic", "yo": "Yoruba", "uz": "Uzbek", "af": "Afrikaans",
40
+ "oc": "Occitan", "ka": "Georgian", "be": "Belarusian", "tg": "Tajik", "sd": "Sindhi",
41
+ "gu": "Gujarati", "so": "Somali", "lo": "Lao", "yi": "Yiddish", "ky": "Kyrgyz",
42
+ "tk": "Turkmen", "ht": "Haitian Creole", "ps": "Pashto", "as": "Assamese", "tt": "Tatar",
43
+ "ha": "Hausa", "ba": "Bashkir", "jw": "Javanese", "su": "Sundanese"
44
+ }
 
45
 
46
+ def get_hf_token_instructions():
47
+ """Generates instructions for setting the HF_TOKEN for pyannote."""
48
+ return """
49
+ **IMPORTANT: Authentication Required for Speaker Identification**
50
+ This feature uses the `pyannote/speaker-diarization-3.1` model, which requires a Hugging Face access token.
51
+ **How to Add Your Token:**
52
+ 1. **Accept the model license:** Visit [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) and agree to the terms.
53
+ 2. **Get your token:** Find it in your Hugging Face account settings: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
54
+ 3. **Add the token to this Space:** Go to the **Settings** tab, find **Repository secrets**, click **New secret**, and add a secret named `HF_TOKEN` with your token as the value. Restart the Space after saving.
55
  """
56
 
57
+ # --- 2. Global Model Loading (All Local) ---
58
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
59
+ TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
60
+ logging.info(f"Using device: {DEVICE} with data type: {TORCH_DTYPE}")
61
 
62
+ # ASR Pipeline (Local Whisper)
63
+ ASR_PIPELINE = None
64
+ try:
65
+ logging.info("Loading ASR pipeline (Whisper)...")
66
+ ASR_PIPELINE = hf_pipeline(
67
+ "automatic-speech-recognition",
68
+ model="openai/whisper-large-v3",
69
+ torch_dtype=TORCH_DTYPE,
70
+ device=DEVICE,
71
+ )
72
+ logging.info("ASR pipeline loaded successfully.")
73
+ except Exception as e:
74
+ logging.error(f"Fatal error: Could not load ASR pipeline. {e}")
75
+
76
+ # Speaker Diarization Pipeline (Local Pyannote)
77
  HF_TOKEN = os.environ.get("HF_TOKEN")
78
  DIARIZATION_PIPELINE = None
79
  if HF_TOKEN:
 
87
  else:
88
  logging.warning("HF_TOKEN not set. Speaker diarization will be disabled.")
89
 
90
+ # Translation Model Cache (Local Helsinki-NLP)
91
+ TRANSLATION_MODELS = {}
92
+ logging.info("Translation model cache initialized.")
93
 
94
  # --- 3. Core Processing Functions ---
95
def load_and_resample_audio(audio_path):
    """Load an audio file, downmix it to mono, and resample it to 16 kHz.

    Args:
        audio_path: Filesystem path to the audio file to load.

    Returns:
        A 1-D float numpy array of mono samples at 16 kHz.

    Raises:
        IOError: If the file cannot be read or converted; the original
            exception is chained as the cause.
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_path, channels_first=True)
        # Downmix multi-channel audio to mono by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        # The downstream ASR/diarization models consume 16 kHz input,
        # so resample anything that arrives at a different rate.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        return waveform.squeeze(0).numpy()
    except Exception as e:
        # Chain with `from e` so the root cause survives as __cause__
        # instead of being flattened into the message alone.
        raise IOError(f"Error processing audio file {audio_path}: {e}") from e
 
 
106
 
107
  def process_audio(audio_input):
 
 
 
 
 
108
  if audio_input is None:
109
+ raise gr.Error("Please provide an audio file or record audio.")
110
+
111
  temp_audio_path = None
112
  try:
113
+ # Step 1: Handle audio input
114
  if isinstance(audio_input, tuple):
115
  sample_rate, audio_data = audio_input
116
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
 
119
  else:
120
  temp_audio_path = audio_input
121
 
122
+ logging.info("Standardizing audio...")
123
+ audio_waveform_16k = load_and_resample_audio(temp_audio_path)
124
+
125
+ # Step 2: ASR with local Whisper pipeline
126
+ logging.info("Starting ASR with local Whisper pipeline...")
127
+ if not ASR_PIPELINE:
128
+ raise gr.Error("ASR pipeline not available. The application cannot proceed.")
129
 
130
+ asr_output = ASR_PIPELINE(
131
+ audio_waveform_16k,
132
+ chunk_length_s=30,
133
+ batch_size=8,
134
+ return_timestamps="word"
135
+ )
136
+ word_timestamps = asr_output.get("chunks", [])
137
+ full_text = asr_output.get("text", "").strip()
138
+
139
+ # Step 3: Language Detection
140
+ detected_language_code = "en"
141
+ if full_text:
142
+ try:
143
+ detected_language_code = detect(full_text)
144
+ except LangDetectException:
145
+ logging.warning("Language detection failed, defaulting to English.")
146
+ detected_language_name = LANGUAGE_NAME_MAPPING.get(detected_language_code, "Unknown")
147
+ logging.info(f"Transcription complete. Language: {detected_language_name}")
148
+
149
+ # Step 4: Speaker Diarization
150
  diarization = None
151
  if DIARIZATION_PIPELINE:
152
  logging.info("Performing speaker diarization...")
 
155
  except Exception as e:
156
  logging.error(f"Diarization failed: {e}")
157
 
158
+ # Step 5: Align ASR and Diarization results
159
  logging.info("Aligning transcription with speaker segments...")
160
+ merged_segments = []
161
  if diarization:
162
  speaker_map = [{'start': turn.start, 'end': turn.end, 'speaker': speaker} for turn, _, speaker in diarization.itertracks(yield_label=True)]
163
  for word_info in word_timestamps:
164
+ word_start, word_end = word_info['timestamp']
165
+ assigned_speaker = next((seg['speaker'] for seg in speaker_map if word_start >= seg['start'] and word_end <= seg['end']), "Unknown")
166
+ merged_segments.append({'start': word_start, 'end': word_end, 'text': word_info['text'], 'speaker': assigned_speaker})
167
+ else:
 
 
 
168
  for word_info in word_timestamps:
169
+ merged_segments.append({'start': word_info['timestamp'][0], 'end': word_info['timestamp'][1], 'text': word_info['text'], 'speaker': 'SPEAKER_00'})
170
 
171
+ # Merge consecutive words from the same speaker
172
+ final_segments = []
173
+ if merged_segments:
174
+ current_segment = merged_segments[0]
175
+ for i in range(1, len(merged_segments)):
176
+ next_seg = merged_segments[i]
177
+ if next_seg['speaker'] == current_segment['speaker'] and (next_seg['start'] - current_segment['end'] < 0.5):
178
+ current_segment['text'] += " " + next_seg['text']
179
+ current_segment['end'] = next_seg['end']
 
180
  else:
181
+ final_segments.append(current_segment)
182
+ current_segment = next_seg
183
+ final_segments.append(current_segment)
 
 
 
184
 
185
+ diarized_text = "\n".join(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}: {seg['text'].strip()}" for seg in final_segments)
186
+
187
+ # Step 6: Translation with local Helsinki-NLP models
188
  translation_output = "Source language is English. No translation needed."
189
  if detected_language_code != 'en':
190
+ model_name = 'Helsinki-NLP/opus-mt-tam-en' if detected_language_code == 'ta' else f'Helsinki-NLP/opus-mt-{detected_language_code}-en'
191
+ try:
192
+ if model_name not in TRANSLATION_MODELS:
193
+ logging.info(f"Loading translation model: {model_name}")
194
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
195
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(DEVICE)
196
+ TRANSLATION_MODELS[model_name] = (tokenizer, model)
197
+
198
+ tokenizer, model = TRANSLATION_MODELS[model_name]
199
+
200
+ texts_to_translate = [seg['text'] for seg in final_segments]
201
+ inputs = tokenizer(texts_to_translate, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
202
+ translated_ids = model.generate(**inputs)
203
+ translated_texts = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)
204
 
205
+ # Reconstruct translated output with speaker and timing info
206
+ translation_lines = []
207
+ for i, segment in enumerate(final_segments):
208
+ translation_lines.append(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}: {translated_texts[i]}")
209
+ translation_output = "\n".join(translation_lines)
210
+
211
+ except Exception as e:
212
+ translation_output = f"Translation failed for '{detected_language_name}'. Model may not be available. Error: {e}"
213
+
214
+ # Step 7: Generate Report
215
+ report_content = f"# Audio Processing Report\n\n## Detected Language\n{detected_language_name} ({detected_language_code})\n\n---\n\n## Diarized Transcription\n{diarized_text}\n\n---\n\n## English Translation\n{translation_output}"
216
  with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding='utf-8') as report_file:
217
  report_file.write(report_content)
218
  report_path = report_file.name
219
 
220
+ return (f"{detected_language_name} ({detected_language_code})", diarized_text, translation_output, gr.update(value=report_path, visible=True))
221
 
222
  except Exception as e:
223
  logging.error(f"An unexpected error occurred: {e}", exc_info=True)
 
228
  if DEVICE == "cuda":
229
  torch.cuda.empty_cache()
230
 
231
+
232
  # --- 4. Gradio User Interface ---
233
+ with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Audio Processor") as app:
234
+ gr.Markdown("# Advanced Open-Source Audio Processor")
235
+ gr.Markdown("A 100% cost-free tool for transcribing, identifying speakers, and translating audio.")
236
 
237
  with gr.Row():
238
  with gr.Column(scale=1):
239
  gr.Markdown("### 1. Provide Audio")
240
  audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
241
  process_button = gr.Button("Process Audio", variant="primary")
242
+ with gr.Accordion("Authentication Instructions (Required for Speaker ID)", open=False):
243
+ gr.Markdown(get_hf_token_instructions())
244
 
245
  with gr.Column(scale=2):
246
  gr.Markdown("### 2. Processing Results")
 
249
  with gr.TabItem("Diarized Transcription"):
250
  diarized_transcription_output = gr.Textbox(label="Full Transcription (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
251
  with gr.TabItem("Translation (to English)"):
252
+ translation_output = gr.Textbox(label="Full Translation (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
253
 
254
  gr.Markdown("### 3. Download Full Report")
255
  download_report_button = gr.File(label="Download Report (.txt)", visible=False, interactive=False)
requirements.txt CHANGED
@@ -1,19 +1,23 @@
1
- # UI and Core
2
- gradio
 
 
 
 
3
 
4
- # API Clients for AI Services
5
- openai
6
- google-generativeai
7
 
8
- # Local, Self-Hosted AI for Diarization
9
  pyannote.audio==3.1.1
10
- torch
11
- torchaudio
12
- numpy<2.0
13
  soundfile
 
 
 
 
 
 
 
14
  pyyaml
15
  einops
16
- pytorch-lightning
17
-
18
- # Other Utilities
19
- huggingface_hub
 
1
+ # Core ML Libraries - Pinned for stability
2
+ torch==2.1.2
3
+ torchaudio==2.1.2
4
+ transformers==4.41.2
5
+ accelerate>=0.21.0
6
+ numpy<2.0
7
 
8
+ # Application and UI
9
+ gradio
 
10
 
11
+ # Audio Processing
12
  pyannote.audio==3.1.1
 
 
 
13
  soundfile
14
+
15
+ # Language Detection
16
+ langdetect
17
+
18
+ # Other dependencies
19
+ sentencepiece
20
+ huggingface_hub
21
  pyyaml
22
  einops
23
+ pytorch-lightning