Spaces:
Sleeping
Sleeping
samir72
committed on
Commit
·
4dff2f5
1
Parent(s):
e81615e
Feature: summarization from Youtube
Browse files- Youtubetranscription_summarizer.py +197 -0
- __pycache__/Youtubetranscription_summarizer.cpython-313.pyc +0 -0
- app.py +66 -27
- gradio_client_audichattranscriber.py +35 -0
- packages.txt +1 -0
- requirements.txt +4 -2
Youtubetranscription_summarizer.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, tempfile, subprocess, json, re, time, shutil
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Optional, Callable, Any
|
| 4 |
+
import yt_dlp
|
| 5 |
+
from faster_whisper import WhisperModel
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def main(url: str):
    """
    End-to-end pipeline: download a YouTube video's audio, convert it to a
    16 kHz mono WAV, and transcribe it with faster-whisper.

    Parameters
    ----------
    url : str
        A YouTube URL (watch / shorts / live / embed / youtu.be forms).
        Must be a non-empty string.

    Returns
    -------
    dict
        {"segments": [{"start", "end", "text"}, ...]} from the transcriber.

    Raises
    ------
    ValueError
        If `url` is empty or not a string. Previously `main(url=None)`
        crashed with a TypeError inside get_video_id (re.search on None).
    RuntimeError
        If ffmpeg is not available on PATH (raised by ensure_ffmpeg).
    """
    if not url or not isinstance(url, str):
        raise ValueError("url must be a non-empty string pointing at a YouTube video.")
    # Fail fast: both the download and the resampling stages need ffmpeg.
    ensure_ffmpeg()
    # yt-dlp accepts a bare video id as well as a full URL, so the extracted
    # id can be passed straight to the downloader.
    video_id = get_video_id(url)
    # Download the audio track and post-process it to 16 kHz mono WAV.
    wav_path = download_youtube_audio_wav16k_api(video_id)
    # Transcribe the WAV; summarization happens downstream (in app.py).
    return transcribe_faster_whisper(wav_path, model_name="base.en")
|
| 20 |
+
|
| 21 |
+
def get_video_id(url: str) -> str:
    """
    Extract the video id from a YouTube URL.

    Handles watch (?v=), /shorts/, /live/, /embed/ and short youtu.be/ links
    (the youtu.be form was previously not matched at all, so short links
    went to yt-dlp as a useless hash token).

    Falls back to a hash-derived token when no id can be found, so callers
    always receive a filesystem-safe string; note that due to hash
    randomization this fallback is only stable within one interpreter run.
    """
    m = re.search(
        r"(?:v=|/shorts/|/live/|/embed/|youtu\.be/)([A-Za-z0-9_-]{6,})", url
    )
    return m.group(1) if m else str(abs(hash(url)))
|
| 32 |
+
|
| 33 |
+
def ensure_ffmpeg():
    """
    Verify that ffmpeg is available in PATH.

    Raises RuntimeError with deployment guidance if it is missing, or if it
    resolves but cannot be executed. On success, prints the resolved path
    and the first line of `ffmpeg -version` output for the logs.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is None:
        raise RuntimeError(
            "FFmpeg not found in PATH.\n\n"
            "👉 For Hugging Face Spaces:\n"
            " • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
            " • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
            "Without ffmpeg, yt-dlp cannot extract/convert audio."
        )

    try:
        result = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=False,
        )
        print("✅ ffmpeg found at:", ffmpeg_path)
        # Guard against empty output: the old code indexed splitlines()[0]
        # unconditionally, which raised IndexError and was then re-reported
        # as a misleading "could not run" RuntimeError.
        version_lines = result.stdout.splitlines()
        if version_lines:
            print(version_lines[0])  # first line carries the version string
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"ffmpeg was found at {ffmpeg_path} but could not run: {e}") from e
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class YTDLPError(RuntimeError):
    """Raised when the YouTube audio pipeline fails (yt-dlp download or ffmpeg conversion stage)."""
    pass
|
| 65 |
+
|
| 66 |
+
def _require(bin_name: str):
|
| 67 |
+
if shutil.which(bin_name) is None:
|
| 68 |
+
raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")
|
| 69 |
+
|
| 70 |
+
def download_youtube_audio_wav16k_api(
    youtube_url: str,
    out_dir: Optional[str] = None,
    target_sr: int = 16000,
    target_channels: int = 1,
    quiet: bool = True,
    keep_intermediate: bool = False,
    progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
) -> str:
    """
    Download YouTube audio via yt_dlp's Python API, extract to WAV,
    and post-process with ffmpeg to 16 kHz mono. Returns path to the final WAV.

    Args
    ----
    youtube_url : str        Full URL or bare video id (yt-dlp accepts both).
    out_dir : Optional[str]  Directory for outputs (temp dir if None).
    target_sr : int          Sample rate for final WAV (default 16000).
    target_channels : int    Channels for final WAV (default 1 = mono).
    quiet : bool             Suppress yt-dlp logs if True.
    keep_intermediate : bool Keep the pre-downsampled WAV if True.
    progress_hook : callable Optional yt-dlp progress hook.

    Raises
    ------
    YTDLPError on failure (download, missing output, or ffmpeg resample).
    ValueError if youtube_url is empty or not a string.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        raise ValueError("youtube_url must be a non-empty string.")

    _require("ffmpeg")  # we call ffmpeg ourselves
    # yt-dlp bundles ffmpeg via postprocessors, but we still run ffmpeg explicitly

    # NOTE(review): when out_dir is supplied and reused across calls, stale
    # *.wav files from earlier runs can confuse the glob/mtime pick below —
    # verify callers pass a fresh directory.
    work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # First stage: let yt-dlp extract WAV (whatever SR/channels)
    # %(title).100B truncates the title to 100 bytes to keep filenames safe.
    out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
    hooks = [progress_hook] if progress_hook else []

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": out_template,
        "noplaylist": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                # "0" = best quality for the intermediate extract.
                "preferredquality": "0",
            }
        ],
        "quiet": quiet,
        "no_warnings": quiet,
        "progress_hooks": hooks,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_url, download=True)
    except Exception as e:
        raise YTDLPError(f"yt-dlp API failed: {e}") from e

    # Locate the produced WAV (pre-downsampled)
    pre_wavs = list(work_dir.glob("*.wav"))
    if not pre_wavs:
        raise YTDLPError("yt-dlp completed but no WAV was found.")
    # Pick the most recently modified WAV — assumed to be this download's output.
    pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

    # Second stage: force 16 kHz mono via ffmpeg
    final_wav = pre_wav.with_name(pre_wav.stem + f".{target_sr}Hz.{target_channels}ch.wav")
    try:
        subprocess.run(
            [
                # -y: overwrite any existing output file without prompting.
                "ffmpeg", "-y",
                "-i", str(pre_wav),
                "-ac", str(target_channels),
                "-ar", str(target_sr),
                str(final_wav),
            ],
            check=True,
            # Capture output only when quiet, so errors can be surfaced below.
            stdout=subprocess.PIPE if quiet else None,
            stderr=subprocess.PIPE if quiet else None,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise YTDLPError(f"ffmpeg failed to resample: {e.stderr or e.stdout}") from e

    # Clean up intermediates if desired
    if not keep_intermediate:
        try:
            if pre_wav.exists() and pre_wav != final_wav:
                pre_wav.unlink()
        except Exception:
            # Best-effort cleanup: a leftover intermediate is harmless.
            pass

    return str(final_wav)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def transcribe_faster_whisper(wav_path: str, model_name: str = "base.en",
                              beam_size: int = 1, vad_filter: bool = True) -> dict:
    """
    Transcribe a WAV file with faster-whisper.

    Parameters
    ----------
    wav_path : str
        Path to the audio file to transcribe (the pipeline feeds it a
        16 kHz mono WAV).
    model_name : str
        faster-whisper model identifier (default "base.en").
    beam_size : int
        Decoder beam width; 1 = greedy, fastest. Previously hard-coded.
    vad_filter : bool
        Apply voice-activity-detection filtering to skip silence.
        Previously hard-coded.

    Returns
    -------
    dict
        {"segments": [{"start": float, "end": float, "text": str}, ...]}.
        The detected language (info.language) is deliberately not included,
        matching existing callers that expect only "segments".
    """
    model = WhisperModel(model_name)
    segments, _info = model.transcribe(wav_path, beam_size=beam_size, vad_filter=vad_filter)
    # `segments` is a lazy generator; materializing it here is what actually
    # runs the transcription.
    out = [{"start": s.start, "end": s.end, "text": s.text} for s in segments]
    return {"segments": out}
|
| 176 |
+
|
| 177 |
+
def summarize_with_phi(transcript_segments, sysprompt, userprompt, phi_client):
    """
    Map-reduce summarization of a transcript.

    Map: group segments into ~10-minute chunks (by summed segment duration)
    and ask the model to summarize each chunk with timestamps.
    Reduce: merge the per-chunk summaries into one final summary.

    transcript_segments is an iterable of {"start", "end", "text"} dicts;
    phi_client must expose summarize(system_prompt, user_prompt) -> str.
    """
    chunk_seconds = 600  # ~10 min of audio per chunk as a starting point

    # --- Map: partition segments into chunks of roughly chunk_seconds each.
    chunks = []
    current, elapsed = [], 0.0
    for segment in transcript_segments:
        current.append(segment)
        elapsed += segment["end"] - segment["start"]
        if elapsed >= chunk_seconds:
            chunks.append(current)
            current, elapsed = [], 0.0
    if current:
        chunks.append(current)

    # Summarize each chunk independently, prefixing every transcript line
    # with a [MM:SS] timestamp so the model can cite positions.
    partials = []
    for idx, chunk in enumerate(chunks, 1):
        stamped_lines = []
        for s in chunk:
            minutes = int(s["start"] // 60)
            seconds = int(s["start"] % 60)
            stamped_lines.append(f"[{minutes:02d}:{seconds:02d}] {s['text']}")
        text = "\n".join(stamped_lines)
        prompt = f"{userprompt}\n\nTRANSCRIPT CHUNK {idx}:\n{text}\n\nReturn: bullet summary + key timestamps."
        partials.append(phi_client.summarize(sysprompt, prompt))

    # --- Reduce: fold the partial summaries into a single final answer.
    merged_prompt = f"Merge the {len(partials)} chunk summaries into one concise summary + top 5 timestamps."
    return phi_client.summarize(sysprompt, merged_prompt + "\n\n" + "\n\n".join(partials))
|
| 195 |
+
|
| 196 |
+
if __name__ == "__main__":
    # Local testing entry point. Previously this called main(url=None),
    # which always crashed (re.search on None inside get_video_id).
    # Accept the URL from the command line instead.
    import sys
    if len(sys.argv) > 1:
        print(main(sys.argv[1]))
    else:
        print("Usage: python Youtubetranscription_summarizer.py <youtube-url>")
|
__pycache__/Youtubetranscription_summarizer.cpython-313.pyc
ADDED
|
Binary file (9.29 kB). View file
|
|
|
app.py
CHANGED
|
@@ -7,12 +7,15 @@ import gradio as gr
|
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
|
| 9 |
import json
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# --- LLM call (Azure OpenAI with API key) -----------------------------------
|
| 12 |
|
| 13 |
-
def
|
| 14 |
"""
|
| 15 |
-
Calls Azure OpenAI Chat Completions with audio input (base64 mp3).
|
| 16 |
"""
|
| 17 |
load_dotenv()
|
| 18 |
|
|
@@ -23,8 +26,8 @@ def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> st
|
|
| 23 |
|
| 24 |
if not endpoint or not api_key or not deployment:
|
| 25 |
return "Server misconfiguration: required env vars missing."
|
| 26 |
-
|
| 27 |
-
|
| 28 |
try:
|
| 29 |
client = AzureOpenAI(
|
| 30 |
api_key=api_key,
|
|
@@ -35,30 +38,56 @@ def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> st
|
|
| 35 |
system_message = sys_prompt.strip() if sys_prompt else (
|
| 36 |
"You are an AI assistant with a charter to clearly analyze the customer enquiry."
|
| 37 |
)
|
| 38 |
-
user_text = user_prompt.strip() if user_prompt else
|
|
|
|
|
|
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
response = client.chat.completions.create(
|
| 41 |
model=deployment,
|
| 42 |
messages=[
|
| 43 |
{"role": "system", "content": system_message},
|
| 44 |
-
{
|
| 45 |
-
"role": "user",
|
| 46 |
-
"content": [
|
| 47 |
-
{"type": "text", "text": user_text},
|
| 48 |
-
{
|
| 49 |
-
"type": "input_audio",
|
| 50 |
-
"input_audio": {"data": audio_b64, "format": "mp3"},
|
| 51 |
-
},
|
| 52 |
-
],
|
| 53 |
-
},
|
| 54 |
],
|
| 55 |
)
|
| 56 |
-
print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt)},
|
|
|
|
| 57 |
return response.choices[0].message.content
|
| 58 |
|
| 59 |
except Exception as ex:
|
| 60 |
return print(f"Error from Azure OpenAI: {ex}")
|
| 61 |
-
#pass
|
| 62 |
|
| 63 |
#----Retrieve meta data from metadata.json file------------------------------
|
| 64 |
def retrieve_file_path(file_name):
|
|
@@ -101,6 +130,8 @@ def download_to_temp_mp3(url: str) -> str:
|
|
| 101 |
|
| 102 |
def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
|
| 103 |
tmp_to_cleanup = []
|
|
|
|
|
|
|
| 104 |
try:
|
| 105 |
audio_path = None
|
| 106 |
if upload_path:
|
|
@@ -108,14 +139,21 @@ def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
|
|
| 108 |
elif record_path:
|
| 109 |
audio_path = record_path
|
| 110 |
elif url and url.strip():
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
except Exception as e:
|
| 121 |
return print(f"Error processing audio at {datetime.now()}: prompt_length={len(user_prompt)}, audio_path={audio_path}: {str(e)}")
|
|
@@ -134,7 +172,8 @@ def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
|
|
| 134 |
|
| 135 |
with gr.Blocks(title="Audio Summarizer") as demo:
|
| 136 |
gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
|
| 137 |
-
gr.Markdown("Upload
|
|
|
|
| 138 |
|
| 139 |
with gr.Row():
|
| 140 |
with gr.Column():
|
|
@@ -142,7 +181,7 @@ with gr.Blocks(title="Audio Summarizer") as demo:
|
|
| 142 |
with gr.Column():
|
| 143 |
record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
|
| 144 |
with gr.Column():
|
| 145 |
-
url_input = gr.Textbox(label="mp3 URL", placeholder="https://example.com/audio.mp3")
|
| 146 |
|
| 147 |
### Get system and user prompts from metadata.json file
|
| 148 |
file_name = 'metadata.json'
|
|
|
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
|
| 9 |
import json
|
| 10 |
+
import subprocess
|
| 11 |
+
import Youtubetranscription_summarizer
|
| 12 |
+
import re
|
| 13 |
|
| 14 |
# --- LLM call (Azure OpenAI with API key) -----------------------------------
|
| 15 |
|
| 16 |
+
def summarize_input(audio_b64: str = None, text_input: str = None, sys_prompt: str = None, user_prompt: str = None) -> str:
|
| 17 |
"""
|
| 18 |
+
Calls Azure OpenAI Chat Completions with audio input (base64 mp3) or text input, or both.
|
| 19 |
"""
|
| 20 |
load_dotenv()
|
| 21 |
|
|
|
|
| 26 |
|
| 27 |
if not endpoint or not api_key or not deployment:
|
| 28 |
return "Server misconfiguration: required env vars missing."
|
| 29 |
+
# Reset json_text for logging
|
| 30 |
+
json_text = ""
|
| 31 |
try:
|
| 32 |
client = AzureOpenAI(
|
| 33 |
api_key=api_key,
|
|
|
|
| 38 |
system_message = sys_prompt.strip() if sys_prompt else (
|
| 39 |
"You are an AI assistant with a charter to clearly analyze the customer enquiry."
|
| 40 |
)
|
| 41 |
+
user_text = user_prompt.strip() if user_prompt else (
|
| 42 |
+
"Summarize the provided content." if audio_b64 or text_input else "No input provided."
|
| 43 |
+
)
|
| 44 |
|
| 45 |
+
content = [{"type": "text", "text": user_text}]
|
| 46 |
+
|
| 47 |
+
if audio_b64:
|
| 48 |
+
content.append({
|
| 49 |
+
"type": "input_audio",
|
| 50 |
+
"input_audio": {"data": audio_b64, "format": "mp3"},
|
| 51 |
+
})
|
| 52 |
+
if text_input is not None:
|
| 53 |
+
# Debugging: Print the type and value of text_input
|
| 54 |
+
#print(f"Debug: text_input type={type(text_input)}, value={text_input}")
|
| 55 |
+
if isinstance(text_input, str):
|
| 56 |
+
try:
|
| 57 |
+
# Try to parse the string as JSON to see if it's a list or dict
|
| 58 |
+
parsed = json.loads(text_input)
|
| 59 |
+
if isinstance(parsed, (list, dict)):
|
| 60 |
+
# If it's a list or dict, convert back to JSON string
|
| 61 |
+
content.append({"type": "text", "text": json.dumps(parsed)})
|
| 62 |
+
else:
|
| 63 |
+
# If it's a string but not a JSON list/dict, use it as-is
|
| 64 |
+
content.append({"type": "text", "text": text_input})
|
| 65 |
+
except json.JSONDecodeError:
|
| 66 |
+
# If it's not valid JSON, treat it as a regular string
|
| 67 |
+
content.append({"type": "text", "text": text_input})
|
| 68 |
+
elif isinstance(text_input, (list, dict)):
|
| 69 |
+
try:
|
| 70 |
+
# Convert list or dict to JSON-formatted string
|
| 71 |
+
json_text = json.dumps(text_input)
|
| 72 |
+
content.append({"type": "text", "text": json_text})
|
| 73 |
+
except (TypeError, ValueError):
|
| 74 |
+
return "Error: text_input (list or dict) could not be converted to JSON."
|
| 75 |
+
else:
|
| 76 |
+
return f"Error: text_input must be a string, list, or dict, got {type(text_input)}."
|
| 77 |
+
|
| 78 |
response = client.chat.completions.create(
|
| 79 |
model=deployment,
|
| 80 |
messages=[
|
| 81 |
{"role": "system", "content": system_message},
|
| 82 |
+
{"role": "user", "content": content},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
],
|
| 84 |
)
|
| 85 |
+
print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt or '')}, "
|
| 86 |
+
f"audio_size={len(audio_b64 or '')}, text_input_size={len(json_text or '')}")
|
| 87 |
return response.choices[0].message.content
|
| 88 |
|
| 89 |
except Exception as ex:
|
| 90 |
return print(f"Error from Azure OpenAI: {ex}")
|
|
|
|
| 91 |
|
| 92 |
#----Retrieve meta data from metadata.json file------------------------------
|
| 93 |
def retrieve_file_path(file_name):
|
|
|
|
| 130 |
|
| 131 |
def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
|
| 132 |
tmp_to_cleanup = []
|
| 133 |
+
audio_b64 = None
|
| 134 |
+
text_input = None
|
| 135 |
try:
|
| 136 |
audio_path = None
|
| 137 |
if upload_path:
|
|
|
|
| 139 |
elif record_path:
|
| 140 |
audio_path = record_path
|
| 141 |
elif url and url.strip():
|
| 142 |
+
#Check if it's a youtube url
|
| 143 |
+
CheckURL = re.search(r"Youtube", url, re.IGNORECASE)
|
| 144 |
+
if CheckURL:
|
| 145 |
+
# Get the transcription from youtube
|
| 146 |
+
text_input = Youtubetranscription_summarizer.main(url.strip()) # Youtube files are transcribed and summarized
|
| 147 |
+
tmp_to_cleanup.append(text_input)
|
| 148 |
+
else:
|
| 149 |
+
audio_path = download_to_temp_mp3(url.strip())
|
| 150 |
+
tmp_to_cleanup.append(audio_path)
|
| 151 |
+
if not audio_path and text_input is None:
|
| 152 |
+
return "Please provide content via upload, recording, or URL."
|
| 153 |
+
# If we have an audio file, encode it
|
| 154 |
+
if audio_path:
|
| 155 |
+
audio_b64 = encode_audio_from_path(audio_path)
|
| 156 |
+
return summarize_input(audio_b64, text_input, sys_prompt, user_prompt)
|
| 157 |
|
| 158 |
except Exception as e:
|
| 159 |
return print(f"Error processing audio at {datetime.now()}: prompt_length={len(user_prompt)}, audio_path={audio_path}: {str(e)}")
|
|
|
|
| 172 |
|
| 173 |
with gr.Blocks(title="Audio Summarizer") as demo:
|
| 174 |
gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
|
| 175 |
+
gr.Markdown("Upload an mp3(**YouTube is the new feature add**), record audio, or paste a URL, use the default user prompt and system prompt and click 'Summarize'.")
|
| 176 |
+
gr.Markdown("Users are encouraged to modify the user and system prompts to suit their needs.")
|
| 177 |
|
| 178 |
with gr.Row():
|
| 179 |
with gr.Column():
|
|
|
|
| 181 |
with gr.Column():
|
| 182 |
record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
|
| 183 |
with gr.Column():
|
| 184 |
+
url_input = gr.Textbox(label="YouTube or standard mp3 URL", placeholder="https://example.com/audio.mp3")
|
| 185 |
|
| 186 |
### Get system and user prompts from metadata.json file
|
| 187 |
file_name = 'metadata.json'
|
gradio_client_audichattranscriber.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from gradio_client import Client # Gradio client for Hugging Face models
|
| 5 |
+
|
| 6 |
+
def main():
    """
    Smoke-test client: invoke the AudioChatTranscriber Gradio Space on
    Hugging Face with a sample mp3 URL and print the summary it returns.

    Returns the Space's response, or None if the call failed (the error is
    printed rather than raised, matching the rest of this file's style).
    """
    load_dotenv()  # Load .env file for HF token if needed

    space_id = "samir72/AudioChatTranscriber"
    sample_url = "https://audio-samples.github.io/samples/mp3/blizzard_biased/sample-0.mp3"
    try:
        remote = Client(space_id)  # Hugging Face model with Gradio app
        # remote.view_api() lists the available endpoints when debugging.
        summary = remote.predict(
            upload_path=None,
            record_path=None,
            url=sample_url,
            sys_prompt="You are an AI assistant with a listening charter to clearly analyze the customer enquiry.",
            user_prompt="Summarize the audio content",
            api_name="/process_audio",
        )
    except Exception as ex:
        return print(f"Error calling Gradio app: {ex}")
    print(f"Gradio API call at {datetime.now()}")
    print(f"Summarized Output : {summary}")
    return summary
|
| 30 |
+
#pass
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Run the smoke test only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
requirements.txt
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
-
|
| 2 |
gradio==5.45.0
|
| 3 |
requests==2.32.5
|
| 4 |
azure-identity==1.25.0
|
| 5 |
azure-ai-projects==1.0.0
|
| 6 |
numpy==1.26.4
|
| 7 |
-
openai==1.107.3
|
|
|
|
|
|
|
|
|
| 1 |
+
dotenv==0.9.9
|
| 2 |
gradio==5.45.0
|
| 3 |
requests==2.32.5
|
| 4 |
azure-identity==1.25.0
|
| 5 |
azure-ai-projects==1.0.0
|
| 6 |
numpy==1.26.4
|
| 7 |
+
openai==1.107.3
|
| 8 |
+
yt_dlp==2025.9.5
|
| 9 |
+
faster_whisper==1.2.0
|