quynhthames committed on
Commit eeeeb9c · 1 Parent(s): 81607f6
Files changed (2)
  1. app.py +334 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,334 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ import json
+ import html
+ from itertools import groupby
+ from sentence_transformers import SentenceTransformer, util
+ from underthesea import sent_tokenize
+ from transformers import pipeline
+ import tempfile
+ import os
+ import gc
+
+ # === Setup Models & Tokens ===
+
+ HF_TOKEN = "REMOVED_SECRET"
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load Whisper lazily inside the transcription function to save startup time
+ whisper_model = None
+
+ # Speaker diarization pipeline
+ from pyannote.audio import Pipeline
+ diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HF_TOKEN)
+ diarization_pipeline.to(device)
+
+ # Vietnamese punctuation corrector
+ corrector = pipeline("text2text-generation", model="bmd1905/vietnamese-correction-v2", device=0 if torch.cuda.is_available() else -1)
+
+ # SentenceTransformer for embeddings
+ embedding_model = SentenceTransformer("VoVanPhuc/sup-SimCSE-VietNamese-phobert-base", device=str(device))
+
+ # Cache for embeddings and transcript
+ cached_transcript_segments = None
+ cached_embeddings = None
+
+ # Dynamic color generator
+ def generate_color_palette(n):
+     import colorsys
+     hues = np.linspace(0, 1, n, endpoint=False)
+     colors = []
+     for h in hues:
+         r, g, b = colorsys.hsv_to_rgb(h, 0.6, 0.9)
+         colors.append(f"rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, 0.5)")
+     return colors
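+ # For example, generate_color_palette(3) yields three translucent rgba strings,
+ # the first being "rgba(229, 91, 91, 0.5)" (hue 0 at s=0.6, v=0.9).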
+
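+ # Note: the conversion step below shells out to the ffmpeg binary, which must
+ # be installed on the host system (it is not a pip dependency).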
+ # Step 1: Audio conversion
+ def convert_to_wav(audio_file):
+     import subprocess
+     if not audio_file:
+         return None, "No audio provided."
+     # gr.Audio(type="filepath") passes the upload as a path string
+     input_path = audio_file
+     # mkstemp instead of the deprecated tempfile.mktemp
+     fd, output_path = tempfile.mkstemp(suffix=".wav")
+     os.close(fd)
+     # Always re-encode so downstream models get mono 16 kHz WAV
+     try:
+         # ffmpeg command: 1 channel, 16000 Hz sample rate, wav format
+         subprocess.run(
+             ["ffmpeg", "-y", "-i", input_path, "-ac", "1", "-ar", "16000", output_path],
+             stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
+     except Exception as e:
+         return None, f"Error converting audio: {e}"
+     return output_path, "Audio converted to WAV."
+
+ # Step 2: Transcription
+ def transcribe_audio(wav_path, progress=gr.Progress()):
+     global whisper_model
+     if whisper_model is None:
+         import whisper
+         whisper_model = whisper.load_model("large", device=str(device))
+     progress(0.1, desc="Transcribing audio...")
+     result = whisper_model.transcribe(wav_path, language="vi")
+     progress(1.0, desc="Transcription complete.")
+     return result
+
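+ # result["segments"] is a list of dicts whose "start"/"end" hold timestamps in
+ # seconds and whose "text" holds the recognized text; the merge step below
+ # relies on exactly those keys.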
+ # Step 3: Diarization
+ def diarize_audio(wav_path, progress=gr.Progress()):
+     progress(0.1, desc="Running diarization...")
+     diarization = diarization_pipeline(wav_path)
+     progress(1.0, desc="Diarization complete.")
+     return diarization
+
+ # Assign each Whisper segment to the diarization turn with the greatest
+ # temporal overlap, then merge consecutive segments of the same speaker
+ def merge_transcript_with_speakers(transcript_segments, diarization):
+     merged = []
+     for seg in transcript_segments:
+         start = seg["start"]
+         end = seg["end"]
+         text = seg["text"].strip()
+         speaker = "Unknown"
+         max_overlap = 0
+         for turn, _, label in diarization.itertracks(yield_label=True):
+             overlap = max(0, min(end, turn.end) - max(start, turn.start))
+             if overlap > max_overlap:
+                 speaker = label
+                 max_overlap = overlap
+         merged.append((speaker, text))
+     grouped = [
+         {"speaker": speaker, "text": ' '.join(text for _, text in group)}
+         for speaker, group in groupby(merged, key=lambda x: x[0])
+     ]
+     return grouped
+
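+ # Illustrative shape of the merged result (labels and text are hypothetical):
+ # [{"speaker": "SPEAKER_00", "text": "Xin chào quý vị..."},
+ #  {"speaker": "SPEAKER_01", "text": "Vâng, cảm ơn anh..."}]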
+ # Step 4: Punctuation correction
+ def correct_punctuation(transcript, progress=gr.Progress()):
+     MAX_LENGTH = 4096
+     BATCH_SIZE = 8
+     texts = [turn['text'] for turn in transcript]
+
+     def batch(lst, batch_size):
+         for i in range(0, len(lst), batch_size):
+             yield lst[i:i + batch_size]
+
+     corrected_texts = []
+     total_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE
+     for i, text_batch in enumerate(batch(texts, BATCH_SIZE)):
+         progress(i / total_batches, desc="Correcting punctuation...")
+         predictions = corrector(text_batch, max_length=MAX_LENGTH)
+         corrected_texts.extend([p['generated_text'] for p in predictions])
+     progress(1.0, desc="Punctuation correction complete.")
+
+     for turn, corrected_text in zip(transcript, corrected_texts):
+         turn['text'] = corrected_text
+     return transcript
+
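+ # total_batches is a ceiling division: 17 turns with BATCH_SIZE=8 give
+ # (17 + 8 - 1) // 8 = 3 batches.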
+ # Step 5: Content analysis (keyword highlighting)
+ def highlight_transcript(transcript, keywords, percentile):
+     global cached_transcript_segments, cached_embeddings
+
+     # Only keep sentences with enough alphabetic words
+     def is_relevant_sentence(text, min_word_count=6):
+         words = [w for w in text.split() if w.isalpha()]
+         return len(words) >= min_word_count
+
+     # Flatten turns into sentences, remembering which turn each came from
+     flattened = []
+     for idx, turn in enumerate(transcript):
+         if is_relevant_sentence(turn["text"]):
+             for sent in sent_tokenize(turn["text"]):
+                 sent = sent.strip()
+                 if is_relevant_sentence(sent):
+                     flattened.append({"speaker": turn["speaker"], "text": sent, "turn_idx": idx})
+
+     # Sliding windows of consecutive sentences
+     def sliding_windows(sentences, window_size=2, step=1):
+         windows = []
+         for i in range(0, len(sentences) - window_size + 1, step):
+             chunk = sentences[i:i + window_size]
+             windows.append({
+                 "start_idx": i,
+                 "end_idx": i + window_size,
+                 "speakers": [s["speaker"] for s in chunk],
+                 "text": " ".join(s["text"] for s in chunk)
+             })
+         return windows
+
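+     # With window_size=2 and step=1, four sentences yield windows over
+     # sentence indices [0:2], [1:3] and [2:4].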
+     windows = sliding_windows(flattened)
+
+     # Re-embed only when the transcript changed since the last call;
+     # otherwise reuse the cached window embeddings
+     if cached_transcript_segments is None or cached_transcript_segments != transcript:
+         window_texts = [w["text"] for w in windows]
+         cached_embeddings = embedding_model.encode(window_texts, convert_to_tensor=True)
+         cached_transcript_segments = transcript
+
+     # Generate colors dynamically for keywords
+     unique_keywords = list(set(k.strip().lower() for k in keywords if k.strip() != ""))
+     colors = generate_color_palette(len(unique_keywords))
+     keyword_color_map = dict(zip(unique_keywords, colors))
+
+     matched_windows = []
+     for keyword in unique_keywords:
+         keyword_embedding = embedding_model.encode([keyword], convert_to_tensor=True)
+         sims = util.cos_sim(cached_embeddings, keyword_embedding).squeeze()
+         top_indices, threshold = auto_top_k(sims.cpu().numpy(), percentile=percentile)
+         for i in top_indices:
+             matched_windows.append({
+                 "start": windows[i]["start_idx"],
+                 "end": windows[i]["end_idx"],
+                 "keywords": [{
+                     "keyword": keyword,
+                     "color": keyword_color_map[keyword],
+                     "score": sims[i].item()
+                 }]
+             })
+
+     # Merge overlapping windows
+     matched_windows.sort(key=lambda x: x["start"])
+     merged = []
+     for w in matched_windows:
+         if not merged or w["start"] > merged[-1]["end"]:
+             merged.append(w.copy())
+         else:
+             merged[-1]["end"] = max(merged[-1]["end"], w["end"])
+             merged[-1]["keywords"].extend(w["keywords"])
+
+     # Build highlight map
+     highlight_map = {}
+     for mw in merged:
+         for idx in range(mw["start"], mw["end"]):
+             sent_info = flattened[idx]
+             turn_idx = sent_info["turn_idx"]
+             if turn_idx not in highlight_map:
+                 highlight_map[turn_idx] = []
+             highlight_map[turn_idx].extend(mw["keywords"])
+
+     # Compose HTML transcript with highlights, speaker colors & tooltip similarity scores
+     # Assign a color per speaker
+     speakers = list(set(turn["speaker"] for turn in transcript))
+     speaker_colors = generate_color_palette(len(speakers))
+     speaker_color_map = dict(zip(speakers, speaker_colors))
+
+     html_lines = []
+     for i, turn in enumerate(transcript):
+         sp = turn["speaker"]
+         sp_color = speaker_color_map.get(sp, "black")
+         text = html.escape(turn["text"])
+         # Apply highlights for keywords in this turn
+         if i in highlight_map:
+             keywords_info = highlight_map[i]
+             # Combine same keywords (by name)
+             combined = {}
+             for k in keywords_info:
+                 combined[k["keyword"]] = k
+             # Sort keywords by score desc
+             sorted_kw = sorted(combined.values(), key=lambda x: x["score"], reverse=True)
+             tooltip_text = ", ".join(f'{kw["keyword"]} ({kw["score"]:.3f})' for kw in sorted_kw)
+             # Wrap keywords with colored background spans
+             for kw in sorted_kw:
+                 # Replace all keyword occurrences (case insensitive)
+                 text = replace_case_insensitive(text, kw["keyword"], f'<span style="background-color:{kw["color"]};" title="{tooltip_text}">{kw["keyword"]}</span>')
+         # Speaker label with color
+         html_lines.append(f'<p><b><span style="color:{sp_color};">Speaker: {sp}</span></b><br>{text}</p>')
+     final_html = "<br>".join(html_lines)
+     return final_html
+
+ def auto_top_k(similarities, percentile=90):
+     threshold = np.percentile(similarities, percentile)
+     top_indices = np.where(similarities >= threshold)[0]
+     return top_indices, threshold
+
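+ # With the default percentile=90, auto_top_k keeps roughly the top 10% most
+ # similar windows; the threshold adapts to each keyword's score distribution.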
+ def replace_case_insensitive(text, keyword, replacement):
+     import re
+     pattern = re.compile(re.escape(keyword), re.IGNORECASE)
+     return pattern.sub(replacement, text)
+
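+ # Example of the pasted-transcript JSON accepted below (illustrative values):
+ # [{"speaker": "SPEAKER_00", "text": "Xin chào quý vị."},
+ #  {"speaker": "SPEAKER_01", "text": "Cảm ơn anh."}]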
+ # Write the transcript JSON to a temp file so Gradio can serve it for download
+ def save_transcript_json(transcript):
+     fd, path = tempfile.mkstemp(suffix=".json")
+     with os.fdopen(fd, "w", encoding="utf-8") as f:
+         json.dump(transcript, f, ensure_ascii=False, indent=2)
+     return path
+
+ # Main app function
+ def run_pipeline(audio_file, keywords_raw, percentile, transcript_input):
+     keywords = [k.strip().lower() for k in keywords_raw.split(",") if k.strip() != ""]
+     if transcript_input.strip():
+         # Use the pasted transcript; expected format is a JSON list like
+         # [{"speaker": "spk1", "text": "..."}]
+         try:
+             transcript = json.loads(transcript_input)
+         except json.JSONDecodeError:
+             return "", "Invalid transcript JSON format.", None
+         transcript_html = highlight_transcript(transcript, keywords, percentile)
+         # Prepare JSON for download
+         json_path = save_transcript_json(transcript)
+         return transcript_html, "Loaded transcript and analyzed.", json_path
+     if not audio_file:
+         return "", "Please upload an audio file or paste a transcript.", None
+     # Convert audio
+     wav_path, msg = convert_to_wav(audio_file)
+     if not wav_path:
+         return "", msg, None
+     # Transcribe
+     result = transcribe_audio(wav_path)
+     segments = result["segments"]
+     # Diarize
+     diarization = diarize_audio(wav_path)
+     # Merge transcript with speakers
+     merged = merge_transcript_with_speakers(segments, diarization)
+     # Punctuation correction
+     merged = correct_punctuation(merged)
+     # Content analysis + highlighting
+     transcript_html = highlight_transcript(merged, keywords, percentile)
+     # Save JSON for download
+     json_path = save_transcript_json(merged)
+
+     # Cleanup temp files
+     try:
+         os.remove(wav_path)
+     except OSError:
+         pass
+     gc.collect()
+     return transcript_html, "Processing complete.", json_path
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## Vietnamese Audio Transcript & Keyword Analysis")
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             audio_input = gr.Audio(label="Upload or record audio (16kHz mono WAV recommended)", source="upload", type="filepath")
+             transcript_input = gr.Textbox(label="Or paste final transcript JSON (skip upload & transcription)", lines=6, placeholder='Paste JSON here')
+             keywords_input = gr.Textbox(label="Enter keywords separated by commas", value="hoa hồng, chiến lược giá")
+             percentile_slider = gr.Slider(50, 100, value=90, step=1, label="Similarity percentile threshold for keyword matching")
+             proceed_btn = gr.Button("Proceed")
+
+         with gr.Column(scale=3):
+             output_html = gr.HTML()
+             status_text = gr.Textbox(label="Status", interactive=False)
+             transcript_file = gr.File(label="Download transcript JSON")
+
+     # A Button is not a valid input component, so it is not passed to run_pipeline
+     proceed_btn.click(
+         run_pipeline,
+         inputs=[audio_input, keywords_input, percentile_slider, transcript_input],
+         outputs=[output_html, status_text, transcript_file],
+     )
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ git+https://github.com/openai/whisper.git
+ pyannote.audio
+ sentence_transformers
+ underthesea
+ pyvi
+ torch
+ numpy
+ transformers
+ gradio