Spaces:
Sleeping
Sleeping
app & req file updated
Browse files- app.py +153 -111
- requirements.txt +17 -13
app.py
CHANGED
|
@@ -1,60 +1,79 @@
|
|
| 1 |
# app.py
|
| 2 |
-
# A
|
| 3 |
-
# - Whisper
|
| 4 |
-
# - Pyannote
|
| 5 |
-
# -
|
| 6 |
|
| 7 |
import os
|
| 8 |
import torch
|
| 9 |
import gradio as gr
|
|
|
|
| 10 |
import soundfile as sf
|
|
|
|
|
|
|
|
|
|
| 11 |
import tempfile
|
| 12 |
import logging
|
| 13 |
import warnings
|
| 14 |
-
import openai
|
| 15 |
-
import google.generativeai as genai
|
| 16 |
from pyannote.audio import Pipeline as PyannotePipeline
|
|
|
|
| 17 |
|
| 18 |
# --- 1. Initial Setup & Configuration ---
|
| 19 |
-
|
| 20 |
-
# Suppress less important warnings
|
| 21 |
warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.functional')
|
| 22 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
"""
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
- Get it from: [aistudio.google.com/app/apikey](https://aistudio.google.com/app/apikey)
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
"""
|
| 53 |
|
| 54 |
-
# --- 2. Global Model Loading (
|
| 55 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 56 |
-
|
|
|
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 59 |
DIARIZATION_PIPELINE = None
|
| 60 |
if HF_TOKEN:
|
|
@@ -68,49 +87,30 @@ if HF_TOKEN:
|
|
| 68 |
else:
|
| 69 |
logging.warning("HF_TOKEN not set. Speaker diarization will be disabled.")
|
| 70 |
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# --- 3. Core Processing Functions ---
|
| 73 |
-
|
| 74 |
-
def transcribe_with_whisper_api(audio_path):
|
| 75 |
-
"""Sends audio to OpenAI's Whisper API and gets a verbose transcript."""
|
| 76 |
-
logging.info("Sending audio to Whisper API for transcription...")
|
| 77 |
-
with open(audio_path, "rb") as audio_file:
|
| 78 |
-
transcript = openai.audio.transcriptions.create(
|
| 79 |
-
model="whisper-1",
|
| 80 |
-
file=audio_file,
|
| 81 |
-
response_format="verbose_json",
|
| 82 |
-
timestamp_granularities=["word"]
|
| 83 |
-
)
|
| 84 |
-
logging.info("Received response from Whisper API.")
|
| 85 |
-
return transcript.words, transcript.text, transcript.language
|
| 86 |
-
|
| 87 |
-
def translate_with_gemini_api(text_to_translate, source_language):
|
| 88 |
-
"""Sends text to Google's Gemini Pro API for translation."""
|
| 89 |
-
logging.info(f"Sending text to Gemini API for translation from '{source_language}'...")
|
| 90 |
-
model = genai.GenerativeModel('gemini-1.5-flash')
|
| 91 |
-
prompt = f"You are an expert linguist. Translate the following text from {source_language} into clear, natural-sounding English. Maintain the original meaning and tone.\n\nText to Translate:\n---\n{text_to_translate}\n---\n\nEnglish Translation:"
|
| 92 |
-
|
| 93 |
try:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
except Exception as e:
|
| 98 |
-
|
| 99 |
-
return f"Translation failed due to an API error: {e}"
|
| 100 |
-
|
| 101 |
|
| 102 |
def process_audio(audio_input):
|
| 103 |
-
"""The main hybrid pipeline function."""
|
| 104 |
-
# Check if API keys were loaded at startup
|
| 105 |
-
if not os.environ.get("OPENAI_API_KEY") or not os.environ.get("GEMINI_API_KEY"):
|
| 106 |
-
raise gr.Error("Missing OpenAI or Gemini API Key. Please check the instructions and set the repository secrets.")
|
| 107 |
-
|
| 108 |
if audio_input is None:
|
| 109 |
-
|
| 110 |
-
|
| 111 |
temp_audio_path = None
|
| 112 |
try:
|
| 113 |
-
# Step 1:
|
| 114 |
if isinstance(audio_input, tuple):
|
| 115 |
sample_rate, audio_data = audio_input
|
| 116 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
|
@@ -119,10 +119,34 @@ def process_audio(audio_input):
|
|
| 119 |
else:
|
| 120 |
temp_audio_path = audio_input
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
diarization = None
|
| 127 |
if DIARIZATION_PIPELINE:
|
| 128 |
logging.info("Performing speaker diarization...")
|
|
@@ -131,52 +155,69 @@ def process_audio(audio_input):
|
|
| 131 |
except Exception as e:
|
| 132 |
logging.error(f"Diarization failed: {e}")
|
| 133 |
|
| 134 |
-
# Step
|
| 135 |
logging.info("Aligning transcription with speaker segments...")
|
| 136 |
-
|
| 137 |
if diarization:
|
| 138 |
speaker_map = [{'start': turn.start, 'end': turn.end, 'speaker': speaker} for turn, _, speaker in diarization.itertracks(yield_label=True)]
|
| 139 |
for word_info in word_timestamps:
|
| 140 |
-
|
| 141 |
-
for
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
break
|
| 145 |
-
final_segments.append({'start': word_info['start'], 'end': word_info['end'], 'text': word_info['word'], 'speaker': assigned_speaker})
|
| 146 |
-
else: # Fallback if no diarization
|
| 147 |
for word_info in word_timestamps:
|
| 148 |
-
|
| 149 |
|
| 150 |
-
# Merge consecutive words
|
| 151 |
-
|
| 152 |
-
if
|
| 153 |
-
current_segment =
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
current_segment['
|
| 159 |
-
current_segment['end'] = next_segment['end']
|
| 160 |
else:
|
| 161 |
-
|
| 162 |
-
current_segment =
|
| 163 |
-
|
| 164 |
-
merged_segments.append(current_segment)
|
| 165 |
-
|
| 166 |
-
diarized_text = "\n".join(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}: {segment['text'].strip()}" for segment in merged_segments)
|
| 167 |
|
| 168 |
-
|
|
|
|
|
|
|
| 169 |
translation_output = "Source language is English. No translation needed."
|
| 170 |
if detected_language_code != 'en':
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding='utf-8') as report_file:
|
| 176 |
report_file.write(report_content)
|
| 177 |
report_path = report_file.name
|
| 178 |
|
| 179 |
-
return detected_language_code, diarized_text, translation_output, gr.update(value=report_path, visible=True)
|
| 180 |
|
| 181 |
except Exception as e:
|
| 182 |
logging.error(f"An unexpected error occurred: {e}", exc_info=True)
|
|
@@ -187,18 +228,19 @@ def process_audio(audio_input):
|
|
| 187 |
if DEVICE == "cuda":
|
| 188 |
torch.cuda.empty_cache()
|
| 189 |
|
|
|
|
| 190 |
# --- 4. Gradio User Interface ---
|
| 191 |
-
with gr.Blocks(theme=gr.themes.Soft(), title="
|
| 192 |
-
gr.Markdown("#
|
| 193 |
-
gr.Markdown("A
|
| 194 |
|
| 195 |
with gr.Row():
|
| 196 |
with gr.Column(scale=1):
|
| 197 |
gr.Markdown("### 1. Provide Audio")
|
| 198 |
audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
|
| 199 |
process_button = gr.Button("Process Audio", variant="primary")
|
| 200 |
-
with gr.Accordion("
|
| 201 |
-
gr.Markdown(
|
| 202 |
|
| 203 |
with gr.Column(scale=2):
|
| 204 |
gr.Markdown("### 2. Processing Results")
|
|
@@ -207,7 +249,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Audio Processor") as app:
|
|
| 207 |
with gr.TabItem("Diarized Transcription"):
|
| 208 |
diarized_transcription_output = gr.Textbox(label="Full Transcription (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
|
| 209 |
with gr.TabItem("Translation (to English)"):
|
| 210 |
-
translation_output = gr.Textbox(label="Full Translation
|
| 211 |
|
| 212 |
gr.Markdown("### 3. Download Full Report")
|
| 213 |
download_report_button = gr.File(label="Download Report (.txt)", visible=False, interactive=False)
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
# A 100% OPEN-SOURCE audio processing application.
|
| 3 |
+
# - Local Whisper for Transcription
|
| 4 |
+
# - Local Pyannote for Diarization
|
| 5 |
+
# - Local Helsinki-NLP for Translation
|
| 6 |
|
| 7 |
import os
|
| 8 |
import torch
|
| 9 |
import gradio as gr
|
| 10 |
+
import numpy as np
|
| 11 |
import soundfile as sf
|
| 12 |
+
import torchaudio
|
| 13 |
+
from transformers import pipeline as hf_pipeline
|
| 14 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 15 |
import tempfile
|
| 16 |
import logging
|
| 17 |
import warnings
|
|
|
|
|
|
|
| 18 |
from pyannote.audio import Pipeline as PyannotePipeline
|
| 19 |
+
from langdetect import detect, LangDetectException
|
| 20 |
|
| 21 |
# --- 1. Initial Setup & Configuration ---
|
|
|
|
|
|
|
| 22 |
warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.functional')
|
| 23 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 24 |
|
| 25 |
+
# Language name mapping
|
| 26 |
+
# Maps a detected language code to a human-readable name for the UI and report.
# process_audio looks codes up with .get(code, "Unknown"), so any code missing
# here is shown to the user as "Unknown".
LANGUAGE_NAME_MAPPING = {
    "en": "English", "zh-cn": "Chinese", "de": "German", "es": "Spanish", "ru": "Russian",
    "ko": "Korean", "fr": "French", "ja": "Japanese", "pt": "Portuguese", "tr": "Turkish",
    "pl": "Polish", "ca": "Catalan", "nl": "Dutch", "ar": "Arabic", "sv": "Swedish",
    "it": "Italian", "id": "Indonesian", "hi": "Hindi", "fi": "Finnish", "vi": "Vietnamese",
    "he": "Hebrew", "uk": "Ukrainian", "el": "Greek", "ms": "Malay", "cs": "Czech",
    "ro": "Romanian", "da": "Danish", "hu": "Hungarian", "ta": "Tamil", "no": "Norwegian",
    "th": "Thai", "ur": "Urdu", "hr": "Croatian", "bg": "Bulgarian", "lt": "Lithuanian", "la": "Latin",
    "mi": "Maori", "ml": "Malayalam", "cy": "Welsh", "sk": "Slovak", "te": "Telugu", "pa": "Punjabi",
    "lv": "Latvian", "bn": "Bengali", "sr": "Serbian", "az": "Azerbaijani", "sl": "Slovenian",
    "kn": "Kannada", "et": "Estonian", "mk": "Macedonian", "br": "Breton", "eu": "Basque",
    "is": "Icelandic", "hy": "Armenian", "ne": "Nepali", "mn": "Mongolian", "bs": "Bosnian",
    "kk": "Kazakh", "sq": "Albanian", "sw": "Swahili", "gl": "Galician", "mr": "Marathi",
    "si": "Sinhala", "am": "Amharic", "yo": "Yoruba", "uz": "Uzbek", "af": "Afrikaans",
    "oc": "Occitan", "ka": "Georgian", "be": "Belarusian", "tg": "Tajik", "sd": "Sindhi",
    "gu": "Gujarati", "so": "Somali", "lo": "Lao", "yi": "Yiddish", "ky": "Kyrgyz",
    "tk": "Turkmen", "ht": "Haitian Creole", "ps": "Pashto", "as": "Assamese", "tt": "Tatar",
    "ha": "Hausa", "ba": "Bashkir", "jw": "Javanese", "su": "Sundanese",
    # Codes that langdetect can return but that were missing from the table.
    "fa": "Persian", "tl": "Tagalog", "zh-tw": "Chinese",
}
|
|
|
|
| 45 |
|
| 46 |
+
def get_hf_token_instructions():
    """Return the Markdown help text for configuring ``HF_TOKEN``.

    The text tells the user how to accept the pyannote model license, obtain
    a Hugging Face access token and store it as a Space secret.

    Returns:
        A Markdown string that starts and ends with a newline.
    """
    markdown_lines = (
        "",  # leading newline, as in the original literal
        "**IMPORTANT: Authentication Required for Speaker Identification**",
        "This feature uses the `pyannote/speaker-diarization-3.1` model, which requires a Hugging Face access token.",
        "**How to Add Your Token:**",
        "1. **Accept the model license:** Visit [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) and agree to the terms.",
        "2. **Get your token:** Find it in your Hugging Face account settings: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).",
        "3. **Add the token to this Space:** Go to the **Settings** tab, find **Repository secrets**, click **New secret**, and add a secret named `HF_TOKEN` with your token as the value. Restart the Space after saving.",
        "",  # trailing newline
    )
    return "\n".join(markdown_lines)
|
| 56 |
|
| 57 |
+
# --- 2. Global Model Loading (All Local) ---
|
| 58 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 59 |
+
TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 60 |
+
logging.info(f"Using device: {DEVICE} with data type: {TORCH_DTYPE}")
|
| 61 |
|
| 62 |
+
# ASR Pipeline (Local Whisper)
|
| 63 |
+
# ASR_PIPELINE stays None when loading fails: the module still imports, and
# process_audio checks for a missing pipeline before transcribing.
ASR_PIPELINE = None
try:
    logging.info("Loading ASR pipeline (Whisper)...")
    ASR_PIPELINE = hf_pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3",
        torch_dtype=TORCH_DTYPE,
        device=DEVICE,
    )
    logging.info("ASR pipeline loaded successfully.")
except Exception as e:
    # exc_info=True records the traceback, matching the handler in
    # process_audio; the message alone rarely explains a load failure.
    logging.error(f"Fatal error: Could not load ASR pipeline. {e}", exc_info=True)
|
| 75 |
+
|
| 76 |
+
# Speaker Diarization Pipeline (Local Pyannote)
|
| 77 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 78 |
DIARIZATION_PIPELINE = None
|
| 79 |
if HF_TOKEN:
|
|
|
|
| 87 |
else:
|
| 88 |
logging.warning("HF_TOKEN not set. Speaker diarization will be disabled.")
|
| 89 |
|
| 90 |
+
# Translation Model Cache (Local Helsinki-NLP)
|
| 91 |
+
TRANSLATION_MODELS = {}
|
| 92 |
+
logging.info("Translation model cache initialized.")
|
| 93 |
|
| 94 |
# --- 3. Core Processing Functions ---
|
| 95 |
+
def load_and_resample_audio(audio_path):
    """Load an audio file as a mono, 16 kHz, one-dimensional NumPy array.

    Args:
        audio_path: Path of the audio file passed to ``torchaudio.load``.

    Returns:
        The waveform as a NumPy array. Multi-channel audio is averaged down
        to one channel, and the signal is resampled to 16 kHz when the
        file's sample rate differs.

    Raises:
        IOError: If loading, down-mixing or resampling fails. The original
            exception is attached as ``__cause__``.
    """
    target_sample_rate = 16000
    try:
        waveform, sample_rate = torchaudio.load(audio_path, channels_first=True)
        # channels_first=True puts channels on dim 0; average them to get mono.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=target_sample_rate
            )
            waveform = resampler(waveform)
        # Drop the channel dimension so callers receive a flat array.
        return waveform.squeeze(0).numpy()
    except Exception as e:
        # Chain explicitly so the root cause survives the re-raise.
        raise IOError(f"Error processing audio file {audio_path}: {e}") from e
|
|
|
|
|
|
|
| 106 |
|
| 107 |
def process_audio(audio_input):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
if audio_input is None:
|
| 109 |
+
raise gr.Error("Please provide an audio file or record audio.")
|
| 110 |
+
|
| 111 |
temp_audio_path = None
|
| 112 |
try:
|
| 113 |
+
# Step 1: Handle audio input
|
| 114 |
if isinstance(audio_input, tuple):
|
| 115 |
sample_rate, audio_data = audio_input
|
| 116 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
|
|
|
| 119 |
else:
|
| 120 |
temp_audio_path = audio_input
|
| 121 |
|
| 122 |
+
logging.info("Standardizing audio...")
|
| 123 |
+
audio_waveform_16k = load_and_resample_audio(temp_audio_path)
|
| 124 |
+
|
| 125 |
+
# Step 2: ASR with local Whisper pipeline
|
| 126 |
+
logging.info("Starting ASR with local Whisper pipeline...")
|
| 127 |
+
if not ASR_PIPELINE:
|
| 128 |
+
raise gr.Error("ASR pipeline not available. The application cannot proceed.")
|
| 129 |
|
| 130 |
+
asr_output = ASR_PIPELINE(
|
| 131 |
+
audio_waveform_16k,
|
| 132 |
+
chunk_length_s=30,
|
| 133 |
+
batch_size=8,
|
| 134 |
+
return_timestamps="word"
|
| 135 |
+
)
|
| 136 |
+
word_timestamps = asr_output.get("chunks", [])
|
| 137 |
+
full_text = asr_output.get("text", "").strip()
|
| 138 |
+
|
| 139 |
+
# Step 3: Language Detection
|
| 140 |
+
detected_language_code = "en"
|
| 141 |
+
if full_text:
|
| 142 |
+
try:
|
| 143 |
+
detected_language_code = detect(full_text)
|
| 144 |
+
except LangDetectException:
|
| 145 |
+
logging.warning("Language detection failed, defaulting to English.")
|
| 146 |
+
detected_language_name = LANGUAGE_NAME_MAPPING.get(detected_language_code, "Unknown")
|
| 147 |
+
logging.info(f"Transcription complete. Language: {detected_language_name}")
|
| 148 |
+
|
| 149 |
+
# Step 4: Speaker Diarization
|
| 150 |
diarization = None
|
| 151 |
if DIARIZATION_PIPELINE:
|
| 152 |
logging.info("Performing speaker diarization...")
|
|
|
|
| 155 |
except Exception as e:
|
| 156 |
logging.error(f"Diarization failed: {e}")
|
| 157 |
|
| 158 |
+
# Step 5: Align ASR and Diarization results
|
| 159 |
logging.info("Aligning transcription with speaker segments...")
|
| 160 |
+
merged_segments = []
|
| 161 |
if diarization:
|
| 162 |
speaker_map = [{'start': turn.start, 'end': turn.end, 'speaker': speaker} for turn, _, speaker in diarization.itertracks(yield_label=True)]
|
| 163 |
for word_info in word_timestamps:
|
| 164 |
+
word_start, word_end = word_info['timestamp']
|
| 165 |
+
assigned_speaker = next((seg['speaker'] for seg in speaker_map if word_start >= seg['start'] and word_end <= seg['end']), "Unknown")
|
| 166 |
+
merged_segments.append({'start': word_start, 'end': word_end, 'text': word_info['text'], 'speaker': assigned_speaker})
|
| 167 |
+
else:
|
|
|
|
|
|
|
|
|
|
| 168 |
for word_info in word_timestamps:
|
| 169 |
+
merged_segments.append({'start': word_info['timestamp'][0], 'end': word_info['timestamp'][1], 'text': word_info['text'], 'speaker': 'SPEAKER_00'})
|
| 170 |
|
| 171 |
+
# Merge consecutive words from the same speaker
|
| 172 |
+
final_segments = []
|
| 173 |
+
if merged_segments:
|
| 174 |
+
current_segment = merged_segments[0]
|
| 175 |
+
for i in range(1, len(merged_segments)):
|
| 176 |
+
next_seg = merged_segments[i]
|
| 177 |
+
if next_seg['speaker'] == current_segment['speaker'] and (next_seg['start'] - current_segment['end'] < 0.5):
|
| 178 |
+
current_segment['text'] += " " + next_seg['text']
|
| 179 |
+
current_segment['end'] = next_seg['end']
|
|
|
|
| 180 |
else:
|
| 181 |
+
final_segments.append(current_segment)
|
| 182 |
+
current_segment = next_seg
|
| 183 |
+
final_segments.append(current_segment)
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
+
diarized_text = "\n".join(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}: {seg['text'].strip()}" for seg in final_segments)
|
| 186 |
+
|
| 187 |
+
# Step 6: Translation with local Helsinki-NLP models
|
| 188 |
translation_output = "Source language is English. No translation needed."
|
| 189 |
if detected_language_code != 'en':
|
| 190 |
+
model_name = 'Helsinki-NLP/opus-mt-tam-en' if detected_language_code == 'ta' else f'Helsinki-NLP/opus-mt-{detected_language_code}-en'
|
| 191 |
+
try:
|
| 192 |
+
if model_name not in TRANSLATION_MODELS:
|
| 193 |
+
logging.info(f"Loading translation model: {model_name}")
|
| 194 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 195 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(DEVICE)
|
| 196 |
+
TRANSLATION_MODELS[model_name] = (tokenizer, model)
|
| 197 |
+
|
| 198 |
+
tokenizer, model = TRANSLATION_MODELS[model_name]
|
| 199 |
+
|
| 200 |
+
texts_to_translate = [seg['text'] for seg in final_segments]
|
| 201 |
+
inputs = tokenizer(texts_to_translate, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
|
| 202 |
+
translated_ids = model.generate(**inputs)
|
| 203 |
+
translated_texts = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)
|
| 204 |
|
| 205 |
+
# Reconstruct translated output with speaker and timing info
|
| 206 |
+
translation_lines = []
|
| 207 |
+
for i, segment in enumerate(final_segments):
|
| 208 |
+
translation_lines.append(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}: {translated_texts[i]}")
|
| 209 |
+
translation_output = "\n".join(translation_lines)
|
| 210 |
+
|
| 211 |
+
except Exception as e:
|
| 212 |
+
translation_output = f"Translation failed for '{detected_language_name}'. Model may not be available. Error: {e}"
|
| 213 |
+
|
| 214 |
+
# Step 7: Generate Report
|
| 215 |
+
report_content = f"# Audio Processing Report\n\n## Detected Language\n{detected_language_name} ({detected_language_code})\n\n---\n\n## Diarized Transcription\n{diarized_text}\n\n---\n\n## English Translation\n{translation_output}"
|
| 216 |
with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding='utf-8') as report_file:
|
| 217 |
report_file.write(report_content)
|
| 218 |
report_path = report_file.name
|
| 219 |
|
| 220 |
+
return (f"{detected_language_name} ({detected_language_code})", diarized_text, translation_output, gr.update(value=report_path, visible=True))
|
| 221 |
|
| 222 |
except Exception as e:
|
| 223 |
logging.error(f"An unexpected error occurred: {e}", exc_info=True)
|
|
|
|
| 228 |
if DEVICE == "cuda":
|
| 229 |
torch.cuda.empty_cache()
|
| 230 |
|
| 231 |
+
|
| 232 |
# --- 4. Gradio User Interface ---
|
| 233 |
+
with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Audio Processor") as app:
|
| 234 |
+
gr.Markdown("# Advanced Open-Source Audio Processor")
|
| 235 |
+
gr.Markdown("A 100% cost-free tool for transcribing, identifying speakers, and translating audio.")
|
| 236 |
|
| 237 |
with gr.Row():
|
| 238 |
with gr.Column(scale=1):
|
| 239 |
gr.Markdown("### 1. Provide Audio")
|
| 240 |
audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
|
| 241 |
process_button = gr.Button("Process Audio", variant="primary")
|
| 242 |
+
with gr.Accordion("Authentication Instructions (Required for Speaker ID)", open=False):
|
| 243 |
+
gr.Markdown(get_hf_token_instructions())
|
| 244 |
|
| 245 |
with gr.Column(scale=2):
|
| 246 |
gr.Markdown("### 2. Processing Results")
|
|
|
|
| 249 |
with gr.TabItem("Diarized Transcription"):
|
| 250 |
diarized_transcription_output = gr.Textbox(label="Full Transcription (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
|
| 251 |
with gr.TabItem("Translation (to English)"):
|
| 252 |
+
translation_output = gr.Textbox(label="Full Translation (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
|
| 253 |
|
| 254 |
gr.Markdown("### 3. Download Full Report")
|
| 255 |
download_report_button = gr.File(label="Download Report (.txt)", visible=False, interactive=False)
|
requirements.txt
CHANGED
|
@@ -1,19 +1,23 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
#
|
| 5 |
-
|
| 6 |
-
google-generativeai
|
| 7 |
|
| 8 |
-
#
|
| 9 |
pyannote.audio==3.1.1
|
| 10 |
-
torch
|
| 11 |
-
torchaudio
|
| 12 |
-
numpy<2.0
|
| 13 |
soundfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
pyyaml
|
| 15 |
einops
|
| 16 |
-
pytorch-lightning
|
| 17 |
-
|
| 18 |
-
# Other Utilities
|
| 19 |
-
huggingface_hub
|
|
|
|
| 1 |
+
# Core ML Libraries - Pinned for stability
|
| 2 |
+
torch==2.1.2
|
| 3 |
+
torchaudio==2.1.2
|
| 4 |
+
transformers==4.41.2
|
| 5 |
+
accelerate>=0.21.0
|
| 6 |
+
numpy<2.0
|
| 7 |
|
| 8 |
+
# Application and UI
|
| 9 |
+
gradio
|
|
|
|
| 10 |
|
| 11 |
+
# Audio Processing
|
| 12 |
pyannote.audio==3.1.1
|
|
|
|
|
|
|
|
|
|
| 13 |
soundfile
|
| 14 |
+
|
| 15 |
+
# Language Detection
|
| 16 |
+
langdetect
|
| 17 |
+
|
| 18 |
+
# Other dependencies
|
| 19 |
+
sentencepiece
|
| 20 |
+
huggingface_hub
|
| 21 |
pyyaml
|
| 22 |
einops
|
| 23 |
+
pytorch-lightning
|
|
|
|
|
|
|
|
|