samir72 committed on
Commit
4dff2f5
·
1 Parent(s): e81615e

Feature: summarization from Youtube

Browse files
Youtubetranscription_summarizer.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import hashlib
import json
import os
import re
import shutil
import subprocess
import tempfile
import time
from pathlib import Path
from typing import Any, Callable, Optional

import yt_dlp
from faster_whisper import WhisperModel
6
+
7
+
8
def main(url: str):
    """Fetch a YouTube video's audio and return its transcript.

    Args:
        url: Full YouTube URL (watch, shorts, live, embed or youtu.be link).

    Returns:
        dict: {"segments": [{"start", "end", "text"}, ...]} as produced by
        transcribe_faster_whisper.

    Raises:
        ValueError: if url is empty or None.
        RuntimeError / YTDLPError: if ffmpeg is missing or the download fails.
    """
    if not url:
        raise ValueError("A YouTube URL is required.")
    ensure_ffmpeg()
    # Keep the extracted id for logging/diagnostics only.
    video_id = get_video_id(url)
    print(f"Processing video id: {video_id}")
    # Bug fix: the downloader needs the real URL. Previously the extracted id
    # (or, for unrecognised URLs, a hash fallback) was passed instead, which
    # yt-dlp cannot resolve for arbitrary inputs.
    wav_path = download_youtube_audio_wav16k_api(url)
    # Transcribe the 16 kHz mono WAV produced above.
    transcript = transcribe_faster_whisper(wav_path, model_name="base.en")
    return transcript
+
21
def get_video_id(url: str) -> str:
    """Extract the video id from a YouTube URL.

    Supports watch (?v=), /shorts/, /live/, /embed/ and youtu.be short links
    (the youtu.be form was previously unmatched and fell through to the
    fallback). For unrecognised URLs, returns a deterministic MD5-based token:
    the old ``str(abs(hash(url)))`` fallback changed between interpreter runs
    because of string-hash randomisation (PYTHONHASHSEED).

    Args:
        url: A YouTube URL (or any string).

    Returns:
        The 6+ character video id, or a stable 16-hex-char fallback token.
    """
    m = re.search(
        r"(?:v=|/shorts/|/live/|/embed/|youtu\.be/)([A-Za-z0-9_-]{6,})", url
    )
    if m:
        return m.group(1)
    # Stable, process-independent fallback identifier.
    return hashlib.md5(url.encode("utf-8")).hexdigest()[:16]
32
+
33
def ensure_ffmpeg():
    """
    Verify that ffmpeg is available in PATH.
    Raises RuntimeError with helpful guidance if missing.
    Prints ffmpeg version to logs if found.
    """
    located = shutil.which("ffmpeg")
    if located is None:
        raise RuntimeError(
            "FFmpeg not found in PATH.\n\n"
            "👉 For Hugging Face Spaces:\n"
            " • If using Gradio/Streamlit template → add a `packages.txt` file at repo root with a line: ffmpeg\n"
            " • If using Docker template → add `apt-get install -y ffmpeg` in your Dockerfile\n\n"
            "Without ffmpeg, yt-dlp cannot extract/convert audio."
        )

    try:
        probe = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=False,
        )
        print("✅ ffmpeg found at:", located)
        # Only the first line of the multi-line version banner is interesting.
        print(probe.stdout.splitlines()[0])
    except Exception as e:
        raise RuntimeError(f"ffmpeg was found at {located} but could not run: {e}")
61
+
62
+
63
class YTDLPError(RuntimeError):
    """Raised when the yt-dlp download or its ffmpeg post-processing fails."""
65
+
66
def _require(bin_name: str):
    """Raise YTDLPError unless the executable *bin_name* resolves on PATH."""
    located = shutil.which(bin_name)
    if located is not None:
        return
    raise YTDLPError(f"Required executable '{bin_name}' not found in PATH.")
69
+
70
def download_youtube_audio_wav16k_api(
    youtube_url: str,
    out_dir: Optional[str] = None,
    target_sr: int = 16000,
    target_channels: int = 1,
    quiet: bool = True,
    keep_intermediate: bool = False,
    progress_hook: Optional[Callable[[dict[str, Any]], None]] = None,
) -> str:
    """
    Download YouTube audio via yt_dlp's Python API, extract to WAV,
    and post-process with ffmpeg to 16 kHz mono. Returns path to the final WAV.

    Args
    ----
    youtube_url : str
    out_dir : Optional[str] Directory for outputs (temp dir if None).
    target_sr : int Sample rate for final WAV (default 16000).
    target_channels : int Channels for final WAV (default 1 = mono).
    quiet : bool Suppress yt-dlp logs if True.
    keep_intermediate : bool Keep the pre-downsampled WAV if True.
    progress_hook : callable Optional yt-dlp progress hook.

    Raises
    ------
    YTDLPError on failure.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        raise ValueError("youtube_url must be a non-empty string.")

    _require("ffmpeg")  # we call ffmpeg ourselves for the resample stage

    work_dir = Path(out_dir or tempfile.mkdtemp(prefix="ytwav_")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # First stage: let yt-dlp extract WAV (whatever SR/channels).
    # Title is truncated to 100 bytes to keep filenames filesystem-safe.
    out_template = str(work_dir / "%(title).100B [%(id)s].%(ext)s")
    hooks = [progress_hook] if progress_hook else []

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": out_template,
        "noplaylist": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "0",
            }
        ],
        "quiet": quiet,
        "no_warnings": quiet,
        "progress_hooks": hooks,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(youtube_url, download=True)
    except Exception as e:
        raise YTDLPError(f"yt-dlp API failed: {e}") from e

    # Locate the produced WAV (pre-downsampled).
    # Bug fix: when out_dir is reused across runs, the newest *.wav could be a
    # previous run's already-resampled final file — exclude those explicitly.
    final_suffix = f".{target_sr}Hz.{target_channels}ch.wav"
    pre_wavs = [
        p for p in work_dir.glob("*.wav") if not p.name.endswith(final_suffix)
    ]
    if not pre_wavs:
        raise YTDLPError("yt-dlp completed but no WAV was found.")
    pre_wav = max(pre_wavs, key=lambda p: p.stat().st_mtime)

    # Second stage: force 16 kHz mono via ffmpeg.
    final_wav = pre_wav.with_name(pre_wav.stem + final_suffix)
    try:
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(pre_wav),
                "-ac", str(target_channels),
                "-ar", str(target_sr),
                str(final_wav),
            ],
            check=True,
            stdout=subprocess.PIPE if quiet else None,
            stderr=subprocess.PIPE if quiet else None,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise YTDLPError(f"ffmpeg failed to resample: {e.stderr or e.stdout}") from e

    # Clean up intermediates if desired (best-effort; never fail the download).
    if not keep_intermediate:
        try:
            if pre_wav.exists() and pre_wav != final_wav:
                pre_wav.unlink()
        except Exception:
            pass

    return str(final_wav)
166
+
167
+
168
def transcribe_faster_whisper(wav_path: str, model_name="base.en"):
    """Transcribe *wav_path* with faster-whisper.

    Returns {"segments": [{"start", "end", "text"}, ...]}; the detected
    language is intentionally omitted from the result.
    """
    model = WhisperModel(model_name)
    segments, _info = model.transcribe(wav_path, beam_size=1, vad_filter=True)
    return {
        "segments": [
            {"start": seg.start, "end": seg.end, "text": seg.text}
            for seg in segments
        ]
    }
176
+
177
def summarize_with_phi(transcript_segments, sysprompt, userprompt, phi_client):
    """Map-reduce summarisation over transcript segments.

    Map: pack segments into ~10-minute chunks and summarise each with
    phi_client.summarize. Reduce: merge the per-chunk summaries into one
    final summary via a second summarize call.
    """
    chunk_limit_sec = 600  # ~10min per chunk as a starting point

    # Greedily pack segments into chunks until the spoken time passes the limit.
    chunks = []
    current, elapsed = [], 0.0
    for segment in transcript_segments:
        current.append(segment)
        elapsed += segment["end"] - segment["start"]
        if elapsed >= chunk_limit_sec:
            chunks.append(current)
            current, elapsed = [], 0.0
    if current:
        chunks.append(current)

    # Map phase: one summary per chunk, each line stamped [MM:SS].
    partials = []
    for idx, chunk in enumerate(chunks, 1):
        stamped = []
        for s in chunk:
            minutes = int(s["start"] // 60)
            seconds = int(s["start"] % 60)
            stamped.append(f"[{minutes:02d}:{seconds:02d}] {s['text']}")
        text = "\n".join(stamped)
        prompt = f"{userprompt}\n\nTRANSCRIPT CHUNK {idx}:\n{text}\n\nReturn: bullet summary + key timestamps."
        partials.append(phi_client.summarize(sysprompt, prompt))

    # Reduce phase: fold the partial summaries into one concise result.
    merged_prompt = f"Merge the {len(partials)} chunk summaries into one concise summary + top 5 timestamps."
    return phi_client.summarize(sysprompt, merged_prompt + "\n\n" + "\n\n".join(partials))
195
+
196
if __name__ == "__main__":
    # CLI entry point for local testing. The previous code called
    # main(url=None), which always crashed (re.search on None in
    # get_video_id); take the URL from the command line instead.
    import sys

    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        print("Usage: python Youtubetranscription_summarizer.py <YouTube URL>")
__pycache__/Youtubetranscription_summarizer.cpython-313.pyc ADDED
Binary file (9.29 kB). View file
 
app.py CHANGED
@@ -7,12 +7,15 @@ import gradio as gr
7
  from dotenv import load_dotenv
8
  from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
  import json
 
 
 
10
 
11
  # --- LLM call (Azure OpenAI with API key) -----------------------------------
12
 
13
- def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> str:
14
  """
15
- Calls Azure OpenAI Chat Completions with audio input (base64 mp3).
16
  """
17
  load_dotenv()
18
 
@@ -23,8 +26,8 @@ def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> st
23
 
24
  if not endpoint or not api_key or not deployment:
25
  return "Server misconfiguration: required env vars missing."
26
-
27
-
28
  try:
29
  client = AzureOpenAI(
30
  api_key=api_key,
@@ -35,30 +38,56 @@ def summarize_audio_b64(audio_b64: str, sys_prompt: str, user_prompt: str) -> st
35
  system_message = sys_prompt.strip() if sys_prompt else (
36
  "You are an AI assistant with a charter to clearly analyze the customer enquiry."
37
  )
38
- user_text = user_prompt.strip() if user_prompt else "Summarize the audio content."
 
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  response = client.chat.completions.create(
41
  model=deployment,
42
  messages=[
43
  {"role": "system", "content": system_message},
44
- {
45
- "role": "user",
46
- "content": [
47
- {"type": "text", "text": user_text},
48
- {
49
- "type": "input_audio",
50
- "input_audio": {"data": audio_b64, "format": "mp3"},
51
- },
52
- ],
53
- },
54
  ],
55
  )
56
- print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt)}, audio_size={len(audio_b64)}")
 
57
  return response.choices[0].message.content
58
 
59
  except Exception as ex:
60
  return print(f"Error from Azure OpenAI: {ex}")
61
- #pass
62
 
63
  #----Retrieve meta data from metadata.json file------------------------------
64
  def retrieve_file_path(file_name):
@@ -101,6 +130,8 @@ def download_to_temp_mp3(url: str) -> str:
101
 
102
  def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
103
  tmp_to_cleanup = []
 
 
104
  try:
105
  audio_path = None
106
  if upload_path:
@@ -108,14 +139,21 @@ def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
108
  elif record_path:
109
  audio_path = record_path
110
  elif url and url.strip():
111
- audio_path = download_to_temp_mp3(url.strip())
112
- tmp_to_cleanup.append(audio_path)
113
-
114
- if not audio_path:
115
- return "Please provide an audio file via upload, recording, or URL."
116
-
117
- audio_b64 = encode_audio_from_path(audio_path)
118
- return summarize_audio_b64(audio_b64, sys_prompt, user_prompt)
 
 
 
 
 
 
 
119
 
120
  except Exception as e:
121
  return print(f"Error processing audio at {datetime.now()}: prompt_length={len(user_prompt)}, audio_path={audio_path}: {str(e)}")
@@ -134,7 +172,8 @@ def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
134
 
135
  with gr.Blocks(title="Audio Summarizer") as demo:
136
  gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
137
- gr.Markdown("Upload a mp3, record audio, or paste a URL. The app sends base64 audio to Azure OpenAI.")
 
138
 
139
  with gr.Row():
140
  with gr.Column():
@@ -142,7 +181,7 @@ with gr.Blocks(title="Audio Summarizer") as demo:
142
  with gr.Column():
143
  record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
144
  with gr.Column():
145
- url_input = gr.Textbox(label="mp3 URL", placeholder="https://example.com/audio.mp3")
146
 
147
  ### Get system and user prompts from metadata.json file
148
  file_name = 'metadata.json'
 
7
  from dotenv import load_dotenv
8
  from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
9
  import json
10
+ import subprocess
11
+ import Youtubetranscription_summarizer
12
+ import re
13
 
14
  # --- LLM call (Azure OpenAI with API key) -----------------------------------
15
 
16
+ def summarize_input(audio_b64: str = None, text_input: str = None, sys_prompt: str = None, user_prompt: str = None) -> str:
17
  """
18
+ Calls Azure OpenAI Chat Completions with audio input (base64 mp3) or text input, or both.
19
  """
20
  load_dotenv()
21
 
 
26
 
27
  if not endpoint or not api_key or not deployment:
28
  return "Server misconfiguration: required env vars missing."
29
+ # Reset json_text for logging
30
+ json_text = ""
31
  try:
32
  client = AzureOpenAI(
33
  api_key=api_key,
 
38
  system_message = sys_prompt.strip() if sys_prompt else (
39
  "You are an AI assistant with a charter to clearly analyze the customer enquiry."
40
  )
41
+ user_text = user_prompt.strip() if user_prompt else (
42
+ "Summarize the provided content." if audio_b64 or text_input else "No input provided."
43
+ )
44
 
45
+ content = [{"type": "text", "text": user_text}]
46
+
47
+ if audio_b64:
48
+ content.append({
49
+ "type": "input_audio",
50
+ "input_audio": {"data": audio_b64, "format": "mp3"},
51
+ })
52
+ if text_input is not None:
53
+ # Debugging: Print the type and value of text_input
54
+ #print(f"Debug: text_input type={type(text_input)}, value={text_input}")
55
+ if isinstance(text_input, str):
56
+ try:
57
+ # Try to parse the string as JSON to see if it's a list or dict
58
+ parsed = json.loads(text_input)
59
+ if isinstance(parsed, (list, dict)):
60
+ # If it's a list or dict, convert back to JSON string
61
+ content.append({"type": "text", "text": json.dumps(parsed)})
62
+ else:
63
+ # If it's a string but not a JSON list/dict, use it as-is
64
+ content.append({"type": "text", "text": text_input})
65
+ except json.JSONDecodeError:
66
+ # If it's not valid JSON, treat it as a regular string
67
+ content.append({"type": "text", "text": text_input})
68
+ elif isinstance(text_input, (list, dict)):
69
+ try:
70
+ # Convert list or dict to JSON-formatted string
71
+ json_text = json.dumps(text_input)
72
+ content.append({"type": "text", "text": json_text})
73
+ except (TypeError, ValueError):
74
+ return "Error: text_input (list or dict) could not be converted to JSON."
75
+ else:
76
+ return f"Error: text_input must be a string, list, or dict, got {type(text_input)}."
77
+
78
  response = client.chat.completions.create(
79
  model=deployment,
80
  messages=[
81
  {"role": "system", "content": system_message},
82
+ {"role": "user", "content": content},
 
 
 
 
 
 
 
 
 
83
  ],
84
  )
85
+ print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt or '')}, "
86
+ f"audio_size={len(audio_b64 or '')}, text_input_size={len(json_text or '')}")
87
  return response.choices[0].message.content
88
 
89
  except Exception as ex:
90
  return print(f"Error from Azure OpenAI: {ex}")
 
91
 
92
  #----Retrieve meta data from metadata.json file------------------------------
93
  def retrieve_file_path(file_name):
 
130
 
131
  def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
132
  tmp_to_cleanup = []
133
+ audio_b64 = None
134
+ text_input = None
135
  try:
136
  audio_path = None
137
  if upload_path:
 
139
  elif record_path:
140
  audio_path = record_path
141
  elif url and url.strip():
142
+ #Check if it's a youtube url
143
+ CheckURL = re.search(r"Youtube", url, re.IGNORECASE)
144
+ if CheckURL:
145
+ # Get the transcription from youtube
146
+ text_input = Youtubetranscription_summarizer.main(url.strip()) # Youtube files are transcribed and summarized
147
+ tmp_to_cleanup.append(text_input)
148
+ else:
149
+ audio_path = download_to_temp_mp3(url.strip())
150
+ tmp_to_cleanup.append(audio_path)
151
+ if not audio_path and text_input is None:
152
+ return "Please provide content via upload, recording, or URL."
153
+ # If we have an audio file, encode it
154
+ if audio_path:
155
+ audio_b64 = encode_audio_from_path(audio_path)
156
+ return summarize_input(audio_b64, text_input, sys_prompt, user_prompt)
157
 
158
  except Exception as e:
159
  return print(f"Error processing audio at {datetime.now()}: prompt_length={len(user_prompt)}, audio_path={audio_path}: {str(e)}")
 
172
 
173
  with gr.Blocks(title="Audio Summarizer") as demo:
174
  gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
175
+ gr.Markdown("Upload an mp3(**YouTube is the new feature add**), record audio, or paste a URL, use the default user prompt and system prompt and click 'Summarize'.")
176
+ gr.Markdown("Users are encouraged to modify the user and system prompts to suit their needs.")
177
 
178
  with gr.Row():
179
  with gr.Column():
 
181
  with gr.Column():
182
  record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
183
  with gr.Column():
184
+ url_input = gr.Textbox(label="YouTube or standard mp3 URL", placeholder="https://example.com/audio.mp3")
185
 
186
  ### Get system and user prompts from metadata.json file
187
  file_name = 'metadata.json'
gradio_client_audichattranscriber.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import gradio as gr
3
+ from dotenv import load_dotenv
4
+ from gradio_client import Client # Gradio client for Hugging Face models
5
+
6
def main():
    """
    Calls Gradio app hosted on Hugging Face using Gradio client.
    """
    load_dotenv()  # Load .env file for HF token if needed

    try:
        # Hugging Face Space exposing the /process_audio endpoint.
        hf_client = Client("samir72/AudioChatTranscriber")
        summary = hf_client.predict(
            upload_path=None,
            record_path=None,
            url="https://audio-samples.github.io/samples/mp3/blizzard_biased/sample-0.mp3",
            sys_prompt="You are an AI assistant with a listening charter to clearly analyze the customer enquiry.",
            user_prompt="Summarize the audio content",
            api_name="/process_audio",
        )
        print(f"Gradio API call at {datetime.now()}")
        print(f"Summarized Output : {summary}")
        return summary
    except Exception as ex:
        # Best-effort logging; returns None on failure.
        return print(f"Error calling Gradio app: {ex}")


if __name__ == "__main__":
    main()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt CHANGED
@@ -1,7 +1,9 @@
1
- python-dotenv==1.1.1
2
  gradio==5.45.0
3
  requests==2.32.5
4
  azure-identity==1.25.0
5
  azure-ai-projects==1.0.0
6
  numpy==1.26.4
7
- openai==1.107.3
 
 
 
1
+ python-dotenv==1.1.1
2
  gradio==5.45.0
3
  requests==2.32.5
4
  azure-identity==1.25.0
5
  azure-ai-projects==1.0.0
6
  numpy==1.26.4
7
+ openai==1.107.3
8
+ yt_dlp==2025.9.5
9
+ faster_whisper==1.2.0