samir72 committed on
Commit
4dff2f5
·
1 Parent(s): e81615e

Feature: summarization from Youtube

Browse files
Youtubetranscription_summarizer.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import hashlib
import json
import os
import re
import shutil
import subprocess
import tempfile
import time
from pathlib import Path
from typing import Any, Callable, Optional

import yt_dlp
from faster_whisper import WhisperModel
6
+
7
+
8
def main(url: str):
    """Fetch a YouTube video's audio and return its transcript.

    Args:
        url: Full YouTube URL (watch, shorts, live, embed or youtu.be link).

    Returns:
        dict: {"segments": [{"start", "end", "text"}, ...]} as produced by
        transcribe_faster_whisper.

    Raises:
        ValueError: if url is empty or None.
        RuntimeError / YTDLPError: if ffmpeg is missing or the download fails.
    """
    if not url:
        raise ValueError("A YouTube URL is required.")
    ensure_ffmpeg()
    # Keep the extracted id for logging/diagnostics only.
    video_id = get_video_id(url)
    print(f"Processing video id: {video_id}")
    # Bug fix: the downloader needs the real URL. Previously the extracted id
    # (or, for unrecognised URLs, a hash fallback) was passed instead, which
    # yt-dlp cannot resolve for arbitrary inputs.
    wav_path = download_youtube_audio_wav16k_api(url)
    # Transcribe the 16 kHz mono WAV produced above.
    transcript = transcribe_faster_whisper(wav_path, model_name="base.en")
    return transcript
+
21
def get_video_id(url: str) -> str:
    """Extract the video id from a YouTube URL.

    Supports watch (?v=), /shorts/, /live/, /embed/ and youtu.be short links
    (the youtu.be form was previously unmatched and fell through to the
    fallback). For unrecognised URLs, returns a deterministic MD5-based token:
    the old ``str(abs(hash(url)))`` fallback changed between interpreter runs
    because of string-hash randomisation (PYTHONHASHSEED).

    Args:
        url: A YouTube URL (or any string).

    Returns:
        The 6+ character video id, or a stable 16-hex-char fallback token.
    """
    m = re.search(
        r"(?:v=|/shorts/|/live/|/embed/|youtu\.be/)([A-Za-z0-9_-]{6,})", url
    )
    if m:
        return m.group(1)
    # Stable, process-independent fallback identifier.
    return hashlib.md5(url.encode("utf-8")).hexdigest()[:16]
32
+
33
def ensure_ffmpeg():
    """
    Verify that ffmpeg is available in PATH.
    Raises RuntimeError with helpful guidance if missing.
    Prints ffmpeg version to logs if found.
    """
    located = shutil.which("ffmpeg")
    if located is None:
        raise RuntimeError(
            "FFmpeg not found in PATH.\n\n"
            "👉 For Hugging Face Spaces:\n"
            " • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
            " • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
            "Without ffmpeg, yt-dlp cannot extract/convert audio."
        )

    try:
        probe = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=False,
        )
        print("✅ ffmpeg found at:", located)
        # Only the first line of the multi-line version banner is interesting.
        print(probe.stdout.splitlines()[0])
    except Exception as e:
        raise RuntimeError(f"ffmpeg was found at {located} but could not run: {e}")
61
+
62
+
63
class YTDLPError(RuntimeError):
    """Raised when the yt-dlp download or its ffmpeg post-processing fails."""
65
+
66
def _require(bin_name: str):
    """Raise YTDLPError unless the executable *bin_name* resolves on PATH."""
    located = shutil.which(bin_name)
    if located is not None:
        return
    raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")
69
+
70
def download_youtube_audio_wav16k_api(
    youtube_url: str,
    out_dir: Optional[str] = None,
    target_sr: int = 16000,
    target_channels: int = 1,
    quiet: bool = True,
    keep_intermediate: bool = False,
    progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
) -> str:
    """
    Download YouTube audio via yt_dlp's Python API, extract to WAV,
    and post-process with ffmpeg to 16 kHz mono. Returns path to the final WAV.

    Args
    ----
    youtube_url : str
    out_dir : Optional[str] Directory for outputs (temp dir if None).
    target_sr : int Sample rate for final WAV (default 16000).
    target_channels : int Channels for final WAV (default 1 = mono).
    quiet : bool Suppress yt-dlp logs if True.
    keep_intermediate : bool Keep the pre-downsampled WAV if True.
    progress_hook : callable Optional yt-dlp progress hook.

    Raises
    ------
    YTDLPError on failure.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        raise ValueError("youtube_url must be a non-empty string.")

    _require("ffmpeg")  # we call ffmpeg ourselves for the resample stage

    work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # First stage: let yt-dlp extract WAV (whatever SR/channels).
    # Title is truncated to 100 bytes to keep filenames filesystem-safe.
    out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
    hooks = [progress_hook] if progress_hook else []

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": out_template,
        "noplaylist": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "0",
            }
        ],
        "quiet": quiet,
        "no_warnings": quiet,
        "progress_hooks": hooks,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_url, download=True)
    except Exception as e:
        raise YTDLPError(f"yt-dlp API failed: {e}") from e

    # Locate the produced WAV (pre-downsampled).
    # Bug fix: when out_dir is reused across runs, the newest *.wav could be a
    # previous run's already-resampled final file — exclude those explicitly.
    final_suffix = f".{target_sr}Hz.{target_channels}ch.wav"
    pre_wavs = [
        p for p in work_dir.glob("*.wav") if not p.name.endswith(final_suffix)
    ]
    if not pre_wavs:
        raise YTDLPError("yt-dlp completed but no WAV was found.")
    pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

    # Second stage: force 16 kHz mono via ffmpeg.
    final_wav = pre_wav.with_name(pre_wav.stem + final_suffix)
    try:
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(pre_wav),
                "-ac", str(target_channels),
                "-ar", str(target_sr),
                str(final_wav),
            ],
            check=True,
            stdout=subprocess.PIPE if quiet else None,
            stderr=subprocess.PIPE if quiet else None,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise YTDLPError(f"ffmpeg failed to resample: {e.stderr or e.stdout}") from e

    # Clean up intermediates if desired (best-effort; never fail the download).
    if not keep_intermediate:
        try:
            if pre_wav.exists() and pre_wav != final_wav:
                pre_wav.unlink()
        except Exception:
            pass

    return str(final_wav)
166
+
167
+
168
def transcribe_faster_whisper(wav_path: str, model_name="base.en"):
    """Transcribe *wav_path* with faster-whisper.

    Returns {"segments": [{"start", "end", "text"}, ...]}; the detected
    language is intentionally omitted from the result.
    """
    model = WhisperModel(model_name)
    segments, _info = model.transcribe(wav_path, beam_size=1, vad_filter=True)
    return {
        "segments": [
            {"start": seg.start, "end": seg.end, "text": seg.text}
            for seg in segments
        ]
    }
176
+
177
def summarize_with_phi(transcript_segments, sysprompt, userprompt, phi_client):
    """Map-reduce summarisation over transcript segments.

    Map: pack segments into ~10-minute chunks and summarise each with
    phi_client.summarize. Reduce: merge the per-chunk summaries into one
    final summary via a second summarize call.
    """
    chunk_limit_sec = 600  # ~10min per chunk as a starting point

    # Greedily pack segments into chunks until the spoken time passes the limit.
    chunks = []
    current, elapsed = [], 0.0
    for segment in transcript_segments:
        current.append(segment)
        elapsed += segment["end"] - segment["start"]
        if elapsed >= chunk_limit_sec:
            chunks.append(current)
            current, elapsed = [], 0.0
    if current:
        chunks.append(current)

    # Map phase: one summary per chunk, each line stamped [MM:SS].
    partials = []
    for idx, chunk in enumerate(chunks, 1):
        stamped = []
        for s in chunk:
            minutes = int(s["start"] // 60)
            seconds = int(s["start"] % 60)
            stamped.append(f"[{minutes:02d}:{seconds:02d}] {s['text']}")
        text = "\n".join(stamped)
        prompt = f"{userprompt}\n\nTRANSCRIPT CHUNK {idx}:\n{text}\n\nReturn: bullet summary + key timestamps."
        partials.append(phi_client.summarize(sysprompt, prompt))

    # Reduce phase: fold the partial summaries into one concise result.
    merged_prompt = f"Merge the {len(partials)} chunk summaries into one concise summary + top 5 timestamps."
    return phi_client.summarize(sysprompt, merged_prompt + "\n\n" + "\n\n".join(partials))
195
+
196
if __name__ == "__main__":
    # CLI entry point for local testing. The previous code called
    # main(url=None), which always crashed (re.search on None in
    # get_video_id); take the URL from the command line instead.
    import sys

    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        print("Usage: python Youtubetranscription_summarizer.py <YouTube URL>")
__pycache__/Youtubetranscription_summarizer.cpython-313.pyc ADDED
Binary file (9.29 kB). View file
 
app.py CHANGED
@@ -7,12 +7,15 @@ import gradio as gr
7
  from dotenv import load_dotenv
8
  from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
  import json
 
 
 
10
 
11
  # --- LLM call (Azure OpenAI with API key) -----------------------------------
12
 
13
- def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> str:
14
  """
15
- Calls Azure OpenAI Chat Completions with audio input (base64 mp3).
16
  """
17
  load_dotenv()
18
 
@@ -23,8 +26,8 @@ def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> st
23
 
24
  if not endpoint or not api_key or not deployment:
25
  return "Server misconfiguration: required env vars missing."
26
-
27
-
28
  try:
29
  client = AzureOpenAI(
30
  api_key=api_key,
@@ -35,30 +38,56 @@ def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> st
35
  system_message = sys_prompt.strip() if sys_prompt else (
36
  "You are an AI assistant with a charter to clearly analyze the customer enquiry."
37
  )
38
- user_text = user_prompt.strip() if user_prompt else "Summarize the audio content."
 
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  response = client.chat.completions.create(
41
  model=deployment,
42
  messages=[
43
  {"role": "system", "content": system_message},
44
- {
45
- "role": "user",
46
- "content": [
47
- {"type": "text", "text": user_text},
48
- {
49
- "type": "input_audio",
50
- "input_audio": {"data": audio_b64, "format": "mp3"},
51
- },
52
- ],
53
- },
54
  ],
55
  )
56
- print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt)}, audio_size={len(audio_b64)}")
 
57
  return response.choices[0].message.content
58
 
59
  except Exception as ex:
60
  return print(f"Error from Azure OpenAI: {ex}")
61
- #pass
62
 
63
  #----Retrieve meta data from metadata.json file------------------------------
64
  def retrieve_file_path(file_name):
@@ -101,6 +130,8 @@ def download_to_temp_mp3(url: str) -> str:
101
 
102
  def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
103
  tmp_to_cleanup = []
 
 
104
  try:
105
  audio_path = None
106
  if upload_path:
@@ -108,14 +139,21 @@ def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
108
  elif record_path:
109
  audio_path = record_path
110
  elif url and url.strip():
111
- audio_path = download_to_temp_mp3(url.strip())
112
- tmp_to_cleanup.append(audio_path)
113
-
114
- if not audio_path:
115
- return "Please provide an audio file via upload, recording, or URL."
116
-
117
- audio_b64 = encode_audio_from_path(audio_path)
118
- return summarize_audio_b64(audio_b64, sys_prompt, user_prompt)
 
 
 
 
 
 
 
119
 
120
  except Exception as e:
121
  return print(f"Error processing audio at {datetime.now()}: prompt_length={len(user_prompt)}, audio_path={audio_path}: {str(e)}")
@@ -134,7 +172,8 @@ def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
134
 
135
  with gr.Blocks(title="Audio Summarizer") as demo:
136
  gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
137
- gr.Markdown("Upload a mp3, record audio, or paste a URL. The app sends base64 audio to Azure OpenAI.")
 
138
 
139
  with gr.Row():
140
  with gr.Column():
@@ -142,7 +181,7 @@ with gr.Blocks(title="Audio Summarizer") as demo:
142
  with gr.Column():
143
  record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
144
  with gr.Column():
145
- url_input = gr.Textbox(label="mp3 URL", placeholder="https://example.com/audio.mp3")
146
 
147
  ### Get system and user prompts from metadata.json file
148
  file_name = 'metadata.json'
 
7
  from dotenv import load_dotenv
8
  from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
  import json
10
+ import subprocess
11
+ import Youtubetranscription_summarizer
12
+ import re
13
 
14
  # --- LLM call (Azure OpenAI with API key) -----------------------------------
15
 
16
+ def summarize_input(audio_b64: str = None, text_input: str = None, sys_prompt: str = None, user_prompt: str = None) -> str:
17
  """
18
+ Calls Azure OpenAI Chat Completions with audio input (base64 mp3) or text input, or both.
19
  """
20
  load_dotenv()
21
 
 
26
 
27
  if not endpoint or not api_key or not deployment:
28
  return "Server misconfiguration: required env vars missing."
29
+ # Reset json_text for logging
30
+ json_text = ""
31
  try:
32
  client = AzureOpenAI(
33
  api_key=api_key,
 
38
  system_message = sys_prompt.strip() if sys_prompt else (
39
  "You are an AI assistant with a charter to clearly analyze the customer enquiry."
40
  )
41
+ user_text = user_prompt.strip() if user_prompt else (
42
+ "Summarize the provided content." if audio_b64 or text_input else "No input provided."
43
+ )
44
 
45
+ content = [{"type": "text", "text": user_text}]
46
+
47
+ if audio_b64:
48
+ content.append({
49
+ "type": "input_audio",
50
+ "input_audio": {"data": audio_b64, "format": "mp3"},
51
+ })
52
+ if text_input is not None:
53
+ # Debugging: Print the type and value of text_input
54
+ #print(f"Debug: text_input type={type(text_input)}, value={text_input}")
55
+ if isinstance(text_input, str):
56
+ try:
57
+ # Try to parse the string as JSON to see if it's a list or dict
58
+ parsed = json.loads(text_input)
59
+ if isinstance(parsed, (list, dict)):
60
+ # If it's a list or dict, convert back to JSON string
61
+ content.append({"type": "text", "text": json.dumps(parsed)})
62
+ else:
63
+ # If it's a string but not a JSON list/dict, use it as-is
64
+ content.append({"type": "text", "text": text_input})
65
+ except json.JSONDecodeError:
66
+ # If it's not valid JSON, treat it as a regular string
67
+ content.append({"type": "text", "text": text_input})
68
+ elif isinstance(text_input, (list, dict)):
69
+ try:
70
+ # Convert list or dict to JSON-formatted string
71
+ json_text = json.dumps(text_input)
72
+ content.append({"type": "text", "text": json_text})
73
+ except (TypeError, ValueError):
74
+ return "Error: text_input (list or dict) could not be converted to JSON."
75
+ else:
76
+ return f"Error: text_input must be a string, list, or dict, got {type(text_input)}."
77
+
78
  response = client.chat.completions.create(
79
  model=deployment,
80
  messages=[
81
  {"role": "system", "content": system_message},
82
+ {"role": "user", "content": content},
 
 
 
 
 
 
 
 
 
83
  ],
84
  )
85
+ print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt or '')}, "
86
+ f"audio_size={len(audio_b64 or '')}, text_input_size={len(json_text or '')}")
87
  return response.choices[0].message.content
88
 
89
  except Exception as ex:
90
  return print(f"Error from Azure OpenAI: {ex}")
 
91
 
92
  #----Retrieve meta data from metadata.json file------------------------------
93
  def retrieve_file_path(file_name):
 
130
 
131
  def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
132
  tmp_to_cleanup = []
133
+ audio_b64 = None
134
+ text_input = None
135
  try:
136
  audio_path = None
137
  if upload_path:
 
139
  elif record_path:
140
  audio_path = record_path
141
  elif url and url.strip():
142
+ #Check if it's a youtube url
143
+ CheckURL = re.search(r"Youtube", url, re.IGNORECASE)
144
+ if CheckURL:
145
+ # Get the transcription from youtube
146
+ text_input = Youtubetranscription_summarizer.main(url.strip()) # Youtube files are transcribed and summarized
147
+ tmp_to_cleanup.append(text_input)
148
+ else:
149
+ audio_path = download_to_temp_mp3(url.strip())
150
+ tmp_to_cleanup.append(audio_path)
151
+ if not audio_path and text_input is None:
152
+ return "Please provide content via upload, recording, or URL."
153
+ # If we have an audio file, encode it
154
+ if audio_path:
155
+ audio_b64 = encode_audio_from_path(audio_path)
156
+ return summarize_input(audio_b64, text_input, sys_prompt, user_prompt)
157
 
158
  except Exception as e:
159
  return print(f"Error processing audio at {datetime.now()}: prompt_length={len(user_prompt)}, audio_path={audio_path}: {str(e)}")
 
172
 
173
  with gr.Blocks(title="Audio Summarizer") as demo:
174
  gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
175
+ gr.Markdown("Upload an mp3(**YouTube is the new feature add**), record audio, or paste a URL, use the default user prompt and system prompt and click 'Summarize'.")
176
+ gr.Markdown("Users are encouraged to modify the user and system prompts to suit their needs.")
177
 
178
  with gr.Row():
179
  with gr.Column():
 
181
  with gr.Column():
182
  record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
183
  with gr.Column():
184
+ url_input = gr.Textbox(label="YouTube or standard mp3 URL", placeholder="https://example.com/audio.mp3")
185
 
186
  ### Get system and user prompts from metadata.json file
187
  file_name = 'metadata.json'
gradio_client_audichattranscriber.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import gradio as gr
3
+ from dotenv import load_dotenv
4
+ from gradio_client import Client # Gradio client for Hugging Face models
5
+
6
def main():
    """
    Calls Gradio app hosted on Hugging Face using Gradio client.
    """
    load_dotenv()  # Load .env file for HF token if needed

    try:
        # Hugging Face Space exposing the /process_audio endpoint.
        hf_client = Client("samir72/AudioChatTranscriber")
        summary = hf_client.predict(
            upload_path=None,
            record_path=None,
            url="https://audio-samples.github.io/samples/mp3/blizzard_biased/sample-0.mp3",
            sys_prompt="You are an AI assistant with a listening charter to clearly analyze the customer enquiry.",
            user_prompt="Summarize the audio content",
            api_name="/process_audio",
        )
        print(f"Gradio API call at {datetime.now()}")
        print(f"Summarized Output : {summary}")
        return summary
    except Exception as ex:
        # Best-effort logging; returns None on failure.
        return print(f"Error calling Gradio app: {ex}")


if __name__ == "__main__":
    main()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt CHANGED
@@ -1,7 +1,9 @@
1
- python-dotenv==1.1.1
2
  gradio==5.45.0
3
  requests==2.32.5
4
  azure-identity==1.25.0
5
  azure-ai-projects==1.0.0
6
  numpy==1.26.4
7
- openai==1.107.3
 
 
 
1
+ python-dotenv==1.1.1
2
  gradio==5.45.0
3
  requests==2.32.5
4
  azure-identity==1.25.0
5
  azure-ai-projects==1.0.0
6
  numpy==1.26.4
7
+ openai==1.107.3
8
+ yt_dlp==2025.9.5
9
+ faster_whisper==1.2.0