Spaces:

DineshJ96
/

speaker-diarization

Build error

App Files Community

DineshJ96 committed on Jul 27, 2025

Commit

eb74f8a

1 Parent(s): 1eabe29

app & req file updated

Browse files

Files changed (2) hide show

app.py +153 -111
requirements.txt +17 -13

app.py CHANGED Viewed

@@ -1,60 +1,79 @@
 # app.py
-# A HYBRID audio processing application using APIs for speed and local models for specialization.
-# - Whisper API for Transcription
-# - Pyannote (local) for Diarization
-# - Gemini API for Translation
 import os
 import torch
 import gradio as gr
 import soundfile as sf
 import tempfile
 import logging
 import warnings
-import openai
-import google.generativeai as genai
 from pyannote.audio import Pipeline as PyannotePipeline
 # --- 1. Initial Setup & Configuration ---
-# Suppress less important warnings
 warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.functional')
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-# --- API Key Configuration ---
-try:
-    openai.api_key = os.environ["OPENAI_API_KEY"]
-    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
-    logging.info("API keys for OpenAI and Gemini loaded successfully.")
-except KeyError as e:
-    logging.error(f"FATAL: Missing API Key - {e}. The application cannot run without it.")
-    # We will raise a gr.Error in the main function if keys are missing.
-# --- Helper Function for Instructions ---
-def get_api_key_instructions():
-    """Generates instructions for setting the required API keys."""
-    return """
-    **IMPORTANT: API Keys Required**
-    This application uses external AI services and requires three secrets to be set:
-    1.  **`OPENAI_API_KEY`**: For speech-to-text via the Whisper API.
-        - Get it from: [platform.openai.com/api-keys](https://platform.openai.com/api-keys)
-    2.  **`GEMINI_API_KEY`**: For language translation.
-        - Get it from: [aistudio.google.com/app/apikey](https://aistudio.google.com/app/apikey)
-    3.  **`HF_TOKEN`**: For the local speaker diarization model.
-        - Get it from: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
-        - You must also accept the license for [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1).
-    **How to Add Keys to this Space:**
-    Go to the **Settings** tab, find **Repository secrets**, click **New secret**, and add each of the three secrets listed above. Restart the Space after saving.
     """
-# --- 2. Global Model Loading (pyannote only) ---
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-logging.info(f"Using device: {DEVICE} for local models.")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 DIARIZATION_PIPELINE = None
 if HF_TOKEN:
@@ -68,49 +87,30 @@ if HF_TOKEN:
 else:
     logging.warning("HF_TOKEN not set. Speaker diarization will be disabled.")
 # --- 3. Core Processing Functions ---
-def transcribe_with_whisper_api(audio_path):
-    """Sends audio to OpenAI's Whisper API and gets a verbose transcript."""
-    logging.info("Sending audio to Whisper API for transcription...")
-    with open(audio_path, "rb") as audio_file:
-        transcript = openai.audio.transcriptions.create(
-            model="whisper-1",
-            file=audio_file,
-            response_format="verbose_json",
-            timestamp_granularities=["word"]
-        )
-    logging.info("Received response from Whisper API.")
-    return transcript.words, transcript.text, transcript.language
-def translate_with_gemini_api(text_to_translate, source_language):
-    """Sends text to Google's Gemini Pro API for translation."""
-    logging.info(f"Sending text to Gemini API for translation from '{source_language}'...")
-    model = genai.GenerativeModel('gemini-1.5-flash')
-    prompt = f"You are an expert linguist. Translate the following text from {source_language} into clear, natural-sounding English. Maintain the original meaning and tone.\n\nText to Translate:\n---\n{text_to_translate}\n---\n\nEnglish Translation:"
     try:
-        response = model.generate_content(prompt)
-        logging.info("Received response from Gemini API.")
-        return response.text
     except Exception as e:
-        logging.error(f"Gemini API call failed: {e}")
-        return f"Translation failed due to an API error: {e}"
 def process_audio(audio_input):
-    """The main hybrid pipeline function."""
-    # Check if API keys were loaded at startup
-    if not os.environ.get("OPENAI_API_KEY") or not os.environ.get("GEMINI_API_KEY"):
-        raise gr.Error("Missing OpenAI or Gemini API Key. Please check the instructions and set the repository secrets.")
     if audio_input is None:
-        return "Please provide audio.", "", "", gr.update(visible=False)
     temp_audio_path = None
     try:
-        # Step 1: Standardize audio input to a temporary file path
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
@@ -119,10 +119,34 @@ def process_audio(audio_input):
         else:
             temp_audio_path = audio_input
-        # Step 2: ASR via Whisper API (Fast and Accurate)
-        word_timestamps, full_text, detected_language_code = transcribe_with_whisper_api(temp_audio_path)
-        # Step 3: Diarization via local Pyannote model
         diarization = None
         if DIARIZATION_PIPELINE:
             logging.info("Performing speaker diarization...")
@@ -131,52 +155,69 @@ def process_audio(audio_input):
             except Exception as e:
                 logging.error(f"Diarization failed: {e}")
-        # Step 4: Align ASR and Diarization results
         logging.info("Aligning transcription with speaker segments...")
-        final_segments = []
         if diarization:
             speaker_map = [{'start': turn.start, 'end': turn.end, 'speaker': speaker} for turn, _, speaker in diarization.itertracks(yield_label=True)]
             for word_info in word_timestamps:
-                assigned_speaker = "Unknown"
-                for segment in speaker_map:
-                    if word_info['start'] >= segment['start'] and word_info['end'] <= segment['end']:
-                        assigned_speaker = segment['speaker']
-                        break
-                final_segments.append({'start': word_info['start'], 'end': word_info['end'], 'text': word_info['word'], 'speaker': assigned_speaker})
-        else: # Fallback if no diarization
             for word_info in word_timestamps:
-                 final_segments.append({'start': word_info['start'], 'end': word_info['end'], 'text': word_info['word'], 'speaker': 'SPEAKER_00'})
-        # Merge consecutive words
-        merged_segments = []
-        if final_segments:
-            current_segment = final_segments[0]
-            current_segment['text'] = current_segment['text'].strip()
-            for i in range(1, len(final_segments)):
-                next_segment = final_segments[i]
-                if next_segment['speaker'] == current_segment['speaker'] and (next_segment['start'] - current_segment['end'] < 0.1):
-                    current_segment['text'] += next_segment['text']
-                    current_segment['end'] = next_segment['end']
                 else:
-                    merged_segments.append(current_segment)
-                    current_segment = next_segment
-                    current_segment['text'] = current_segment['text'].strip()
-            merged_segments.append(current_segment)
-        diarized_text = "\n".join(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}: {segment['text'].strip()}" for segment in merged_segments)
-        # Step 5: Translation via Gemini API
         translation_output = "Source language is English. No translation needed."
         if detected_language_code != 'en':
-            translation_output = translate_with_gemini_api(full_text, detected_language_code)
-        # Step 6: Generate Report
-        report_content = f"# Audio Processing Report\n\n## Detected Language\n{detected_language_code}\n\n---\n\n## Diarized Transcription\n{diarized_text}\n\n---\n\n## English Translation\n{translation_output}"
         with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding='utf-8') as report_file:
             report_file.write(report_content)
             report_path = report_file.name
-        return detected_language_code, diarized_text, translation_output, gr.update(value=report_path, visible=True)
     except Exception as e:
         logging.error(f"An unexpected error occurred: {e}", exc_info=True)
@@ -187,18 +228,19 @@ def process_audio(audio_input):
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
 # --- 4. Gradio User Interface ---
-with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Audio Processor") as app:
-    gr.Markdown("# Hybrid AI Audio Processor")
-    gr.Markdown("A high-speed tool using Whisper API, local Speaker Diarization, and Gemini API for Translation.")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Provide Audio")
             audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
             process_button = gr.Button("Process Audio", variant="primary")
-            with gr.Accordion("API Key Instructions (IMPORTANT)", open=True):
-                gr.Markdown(get_api_key_instructions())
         with gr.Column(scale=2):
             gr.Markdown("### 2. Processing Results")
@@ -207,7 +249,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hybrid Audio Processor") as app:
                 with gr.TabItem("Diarized Transcription"):
                     diarized_transcription_output = gr.Textbox(label="Full Transcription (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
                 with gr.TabItem("Translation (to English)"):
-                    translation_output = gr.Textbox(label="Full Translation by Gemini", lines=15, interactive=False, show_copy_button=True)
             gr.Markdown("### 3. Download Full Report")
             download_report_button = gr.File(label="Download Report (.txt)", visible=False, interactive=False)

 # app.py
+# A 100% OPEN-SOURCE audio processing application.
+# - Local Whisper for Transcription
+# - Local Pyannote for Diarization
+# - Local Helsinki-NLP for Translation
 import os
 import torch
 import gradio as gr
+import numpy as np
 import soundfile as sf
+import torchaudio
+from transformers import pipeline as hf_pipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import tempfile
 import logging
 import warnings
 from pyannote.audio import Pipeline as PyannotePipeline
+from langdetect import detect, LangDetectException
 # --- 1. Initial Setup & Configuration ---
 warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.functional')
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Language name mapping
+LANGUAGE_NAME_MAPPING = {
+    "en": "English", "zh-cn": "Chinese", "de": "German", "es": "Spanish", "ru": "Russian",
+    "ko": "Korean", "fr": "French", "ja": "Japanese", "pt": "Portuguese", "tr": "Turkish",
+    "pl": "Polish", "ca": "Catalan", "nl": "Dutch", "ar": "Arabic", "sv": "Swedish",
+    "it": "Italian", "id": "Indonesian", "hi": "Hindi", "fi": "Finnish", "vi": "Vietnamese",
+    "he": "Hebrew", "uk": "Ukrainian", "el": "Greek", "ms": "Malay", "cs": "Czech",
+    "ro": "Romanian", "da": "Danish", "hu": "Hungarian", "ta": "Tamil", "no": "Norwegian",
+    "th": "Thai", "ur": "Urdu", "hr": "Croatian", "bg": "Bulgarian", "lt": "Lithuanian", "la": "Latin",
+    "mi": "Maori", "ml": "Malayalam", "cy": "Welsh", "sk": "Slovak", "te": "Telugu", "pa": "Punjabi",
+    "lv": "Latvian", "bn": "Bengali", "sr": "Serbian", "az": "Azerbaijani", "sl": "Slovenian",
+    "kn": "Kannada", "et": "Estonian", "mk": "Macedonian", "br": "Breton", "eu": "Basque",
+    "is": "Icelandic", "hy": "Armenian", "ne": "Nepali", "mn": "Mongolian", "bs": "Bosnian",
+    "kk": "Kazakh", "sq": "Albanian", "sw": "Swahili", "gl": "Galician", "mr": "Marathi",
+    "si": "Sinhala", "am": "Amharic", "yo": "Yoruba", "uz": "Uzbek", "af": "Afrikaans",
+    "oc": "Occitan", "ka": "Georgian", "be": "Belarusian", "tg": "Tajik", "sd": "Sindhi",
+    "gu": "Gujarati", "so": "Somali", "lo": "Lao", "yi": "Yiddish", "ky": "Kyrgyz",
+    "tk": "Turkmen", "ht": "Haitian Creole", "ps": "Pashto", "as": "Assamese", "tt": "Tatar",
+    "ha": "Hausa", "ba": "Bashkir", "jw": "Javanese", "su": "Sundanese"
+}
+def get_hf_token_instructions():
+    """Generates instructions for setting the HF_TOKEN for pyannote."""
+    return """
+    **IMPORTANT: Authentication Required for Speaker Identification**
+    This feature uses the `pyannote/speaker-diarization-3.1` model, which requires a Hugging Face access token.
+    **How to Add Your Token:**
+    1.  **Accept the model license:** Visit [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) and agree to the terms.
+    2.  **Get your token:** Find it in your Hugging Face account settings: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
+    3.  **Add the token to this Space:** Go to the **Settings** tab, find **Repository secrets**, click **New secret**, and add a secret named `HF_TOKEN` with your token as the value. Restart the Space after saving.
     """
+# --- 2. Global Model Loading (All Local) ---
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+logging.info(f"Using device: {DEVICE} with data type: {TORCH_DTYPE}")
+# ASR Pipeline (Local Whisper)
+ASR_PIPELINE = None
+try:
+    logging.info("Loading ASR pipeline (Whisper)...")
+    ASR_PIPELINE = hf_pipeline(
+        "automatic-speech-recognition",
+        model="openai/whisper-large-v3",
+        torch_dtype=TORCH_DTYPE,
+        device=DEVICE,
+    )
+    logging.info("ASR pipeline loaded successfully.")
+except Exception as e:
+    logging.error(f"Fatal error: Could not load ASR pipeline. {e}")
+# Speaker Diarization Pipeline (Local Pyannote)
 HF_TOKEN = os.environ.get("HF_TOKEN")
 DIARIZATION_PIPELINE = None
 if HF_TOKEN:
 else:
     logging.warning("HF_TOKEN not set. Speaker diarization will be disabled.")
+# Translation Model Cache (Local Helsinki-NLP)
+TRANSLATION_MODELS = {}
+logging.info("Translation model cache initialized.")
 # --- 3. Core Processing Functions ---
+def load_and_resample_audio(audio_path):
     try:
+        waveform, sample_rate = torchaudio.load(audio_path, channels_first=True)
+        if waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+            waveform = resampler(waveform)
+        return waveform.squeeze(0).numpy()
     except Exception as e:
+        raise IOError(f"Error processing audio file {audio_path}: {e}")
 def process_audio(audio_input):
     if audio_input is None:
+        raise gr.Error("Please provide an audio file or record audio.")
     temp_audio_path = None
     try:
+        # Step 1: Handle audio input
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
         else:
             temp_audio_path = audio_input
+        logging.info("Standardizing audio...")
+        audio_waveform_16k = load_and_resample_audio(temp_audio_path)
+        # Step 2: ASR with local Whisper pipeline
+        logging.info("Starting ASR with local Whisper pipeline...")
+        if not ASR_PIPELINE:
+            raise gr.Error("ASR pipeline not available. The application cannot proceed.")
+        asr_output = ASR_PIPELINE(
+            audio_waveform_16k,
+            chunk_length_s=30,
+            batch_size=8,
+            return_timestamps="word"
+        )
+        word_timestamps = asr_output.get("chunks", [])
+        full_text = asr_output.get("text", "").strip()
+        # Step 3: Language Detection
+        detected_language_code = "en"
+        if full_text:
+            try:
+                detected_language_code = detect(full_text)
+            except LangDetectException:
+                logging.warning("Language detection failed, defaulting to English.")
+        detected_language_name = LANGUAGE_NAME_MAPPING.get(detected_language_code, "Unknown")
+        logging.info(f"Transcription complete. Language: {detected_language_name}")
+        # Step 4: Speaker Diarization
         diarization = None
         if DIARIZATION_PIPELINE:
             logging.info("Performing speaker diarization...")
             except Exception as e:
                 logging.error(f"Diarization failed: {e}")
+        # Step 5: Align ASR and Diarization results
         logging.info("Aligning transcription with speaker segments...")
+        merged_segments = []
         if diarization:
             speaker_map = [{'start': turn.start, 'end': turn.end, 'speaker': speaker} for turn, _, speaker in diarization.itertracks(yield_label=True)]
             for word_info in word_timestamps:
+                word_start, word_end = word_info['timestamp']
+                assigned_speaker = next((seg['speaker'] for seg in speaker_map if word_start >= seg['start'] and word_end <= seg['end']), "Unknown")
+                merged_segments.append({'start': word_start, 'end': word_end, 'text': word_info['text'], 'speaker': assigned_speaker})
+        else:
             for word_info in word_timestamps:
+                merged_segments.append({'start': word_info['timestamp'][0], 'end': word_info['timestamp'][1], 'text': word_info['text'], 'speaker': 'SPEAKER_00'})
+        # Merge consecutive words from the same speaker
+        final_segments = []
+        if merged_segments:
+            current_segment = merged_segments[0]
+            for i in range(1, len(merged_segments)):
+                next_seg = merged_segments[i]
+                if next_seg['speaker'] == current_segment['speaker'] and (next_seg['start'] - current_segment['end'] < 0.5):
+                    current_segment['text'] += " " + next_seg['text']
+                    current_segment['end'] = next_seg['end']
                 else:
+                    final_segments.append(current_segment)
+                    current_segment = next_seg
+            final_segments.append(current_segment)
+        diarized_text = "\n".join(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}: {seg['text'].strip()}" for seg in final_segments)
+        # Step 6: Translation with local Helsinki-NLP models
         translation_output = "Source language is English. No translation needed."
         if detected_language_code != 'en':
+            model_name = 'Helsinki-NLP/opus-mt-tam-en' if detected_language_code == 'ta' else f'Helsinki-NLP/opus-mt-{detected_language_code}-en'
+            try:
+                if model_name not in TRANSLATION_MODELS:
+                    logging.info(f"Loading translation model: {model_name}")
+                    tokenizer = AutoTokenizer.from_pretrained(model_name)
+                    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(DEVICE)
+                    TRANSLATION_MODELS[model_name] = (tokenizer, model)
+                tokenizer, model = TRANSLATION_MODELS[model_name]
+                texts_to_translate = [seg['text'] for seg in final_segments]
+                inputs = tokenizer(texts_to_translate, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
+                translated_ids = model.generate(**inputs)
+                translated_texts = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)
+                # Reconstruct translated output with speaker and timing info
+                translation_lines = []
+                for i, segment in enumerate(final_segments):
+                    translation_lines.append(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['speaker']}: {translated_texts[i]}")
+                translation_output = "\n".join(translation_lines)
+            except Exception as e:
+                translation_output = f"Translation failed for '{detected_language_name}'. Model may not be available. Error: {e}"
+        # Step 7: Generate Report
+        report_content = f"# Audio Processing Report\n\n## Detected Language\n{detected_language_name} ({detected_language_code})\n\n---\n\n## Diarized Transcription\n{diarized_text}\n\n---\n\n## English Translation\n{translation_output}"
         with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding='utf-8') as report_file:
             report_file.write(report_content)
             report_path = report_file.name
+        return (f"{detected_language_name} ({detected_language_code})", diarized_text, translation_output, gr.update(value=report_path, visible=True))
     except Exception as e:
         logging.error(f"An unexpected error occurred: {e}", exc_info=True)
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
 # --- 4. Gradio User Interface ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Audio Processor") as app:
+    gr.Markdown("# Advanced Open-Source Audio Processor")
+    gr.Markdown("A 100% cost-free tool for transcribing, identifying speakers, and translating audio.")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Provide Audio")
             audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
             process_button = gr.Button("Process Audio", variant="primary")
+            with gr.Accordion("Authentication Instructions (Required for Speaker ID)", open=False):
+                gr.Markdown(get_hf_token_instructions())
         with gr.Column(scale=2):
             gr.Markdown("### 2. Processing Results")
                 with gr.TabItem("Diarized Transcription"):
                     diarized_transcription_output = gr.Textbox(label="Full Transcription (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
                 with gr.TabItem("Translation (to English)"):
+                    translation_output = gr.Textbox(label="Full Translation (with speaker labels)", lines=15, interactive=False, show_copy_button=True)
             gr.Markdown("### 3. Download Full Report")
             download_report_button = gr.File(label="Download Report (.txt)", visible=False, interactive=False)

requirements.txt CHANGED Viewed

@@ -1,19 +1,23 @@
-# UI and Core
-gradio
-# API Clients for AI Services
-openai
-google-generativeai
-# Local, Self-Hosted AI for Diarization
 pyannote.audio==3.1.1
-torch
-torchaudio
-numpy<2.0
 soundfile
 pyyaml
 einops
-pytorch-lightning
-# Other Utilities
-huggingface_hub

+# Core ML Libraries - Pinned for stability
+torch==2.1.2
+torchaudio==2.1.2
+transformers==4.41.2
+accelerate>=0.21.0
+numpy<2.0
+# Application and UI
+gradio
+# Audio Processing
 pyannote.audio==3.1.1
 soundfile
+# Language Detection
+langdetect
+# Other dependencies
+sentencepiece
+huggingface_hub
 pyyaml
 einops
+pytorch-lightning