# Lucky Nkosi
# update YouTube TranscriptApi
# fdb1dfc
# app.py
# A feature-rich web app that summarizes YouTube videos using multiple models,
# chunking for long transcripts, and user-controlled output length.
import time
from urllib.parse import parse_qs, urlparse

import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
# Model and Splitter Setup
# Maps the label shown in the UI dropdown to the model id handed to
# transformers.pipeline(). Insertion order is the order shown in the dropdown.
MODELS = {
    "DistilBART (Default, Fast)": "sshleifer/distilbart-cnn-12-6",
    "BART (Large, More Accurate)": "facebook/bart-large-cnn",
    "Pegasus (Best for News/Articles)": "google/pegasus-xsum",
}

# Pipelines that have already been built, keyed by the same UI label as
# MODELS, so each model is only loaded once per process.
loaded_models = {}
def get_model(model_name):
    """
    Return the summarization pipeline registered under ``model_name``.

    ``model_name`` is a key of ``MODELS`` (the UI label). The pipeline is
    built on first request, stored in ``loaded_models`` and served from
    there on later calls.

    Raises:
        ValueError: if ``model_name`` is not a key of ``MODELS``.
    """
    if model_name not in MODELS:
        raise ValueError(f"Unknown Model: {model_name}")

    if model_name not in loaded_models:
        # Cache miss: build the pipeline, time it, and remember it.
        print(f"Loading new model: {model_name} (this may take a minute)...")
        start_time = time.time()
        loaded_models[model_name] = pipeline("summarization", model=MODELS[model_name])
        end_time = time.time()
        print(f"Model '{model_name}' loaded in {end_time - start_time:.2f} seconds.")
        return loaded_models[model_name]

    print(f"Returning cached model: {model_name}")
    return loaded_models[model_name]
# Instantiate the Text Splitter
# One shared splitter for every request: pieces of at most 1000 units with a
# 100-unit overlap between neighbours (units are whatever the splitter's
# length function counts -- presumably characters; confirm against langchain).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
# URL and transcript helpers
def _extract_video_id(youtube_url):
    """
    Return the video ID contained in ``youtube_url``, or ``None`` if none is found.

    Accepted shapes (scheme optional, any ``*.youtube.com`` subdomain):
    ``youtube.com/watch?v=ID`` with ``v`` anywhere in the query string,
    ``youtu.be/ID`` and ``youtube.com/{shorts,embed,live,v}/ID``.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        return None

    candidate = youtube_url.strip()
    if "://" not in candidate:
        # urlparse only recognises the host part when a scheme is present.
        candidate = "https://" + candidate
    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        return None

    segments = [part for part in parsed.path.split("/") if part]
    video_id = None
    if host in ("youtu.be", "www.youtu.be"):
        video_id = segments[0] if segments else None
    elif host == "youtube.com" or host.endswith(".youtube.com"):
        if segments == ["watch"]:
            # parse_qs drops blank values, so "watch?v=" yields None here.
            video_id = parse_qs(parsed.query).get("v", [None])[0]
        elif len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
            video_id = segments[1]
    return video_id or None


def _fetch_transcript_text(video_id):
    """
    Download the transcript for ``video_id`` and return it as a single string.

    Exceptions raised by youtube-transcript-api propagate to the caller.
    """
    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        # Static helper: returns a list of dicts with a 'text' key.
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(entry["text"] for entry in entries)
    # Releases of youtube-transcript-api without the static helper fetch
    # through an instance and yield snippet objects exposing `.text`
    # (per the library's documentation).
    snippets = YouTubeTranscriptApi().fetch(video_id)
    return " ".join(snippet.text for snippet in snippets)


# Core Summarization Function
def youtube_summarizer(model_name, youtube_url, min_len, max_len):
    """
    Summarize the transcript of a YouTube video.

    The transcript is split with ``text_splitter``, each chunk is summarized
    on its own, and the concatenated chunk summaries are summarized once more
    to produce the final text.

    Args:
        model_name: UI label of the model; must be a key of ``MODELS``.
        youtube_url: Link to the video (watch, youtu.be, shorts, embed, live).
        min_len: Minimum length of the final summary, passed as ``min_length``.
        max_len: Maximum length of the final summary, passed as ``max_length``.

    Returns:
        The summary text, or a human-readable error message. This function
        never raises: it is the callback behind the UI, so every failure is
        reported as a string.
    """
    # Cheap input checks come first so a bad request never triggers a
    # (potentially minutes-long) model load.
    video_id = _extract_video_id(youtube_url)
    if video_id is None:
        return "Please enter a valid YouTube video URL."

    try:
        # UI sliders may deliver floats; generation lengths must be ints.
        min_len, max_len = int(min_len), int(max_len)
    except (TypeError, ValueError):
        return "Error: Summary lengths must be whole numbers."
    if min_len > max_len:
        return "Error: Minimum summary length cannot be greater than the maximum summary length."

    try:
        summarizer = get_model(model_name)
    except Exception as e:
        return f"Error loading model: {e}"

    try:
        print(f"Fetching transcript for video ID: {video_id}")
        transcript = _fetch_transcript_text(video_id)
        print("Transcript fetched successfully.")
        if not transcript.strip():
            return "Error: The transcript for this video is empty."

        print("Splitting transcript into chunks...")
        chunks = text_splitter.split_text(transcript)

        print("Summarizing individual chunks...")
        # truncation=True keeps an over-long input from crashing the model.
        initial_summaries = [
            summarizer(
                chunk,
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
            for chunk in chunks
        ]

        print("Creating final summary...")
        combined_summary_text = " ".join(initial_summaries)
        # For long videos the combined chunk summaries can exceed the model's
        # input limit; truncation=True trims instead of raising.
        final_summary = summarizer(
            combined_summary_text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )
        return final_summary[0]['summary_text']
    except TranscriptsDisabled:
        return "Error: Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "Error: No English transcript found for this video. The API may not support auto-generated captions."
    except Exception as e:
        # UI boundary: report anything unexpected instead of crashing the app.
        return f"An unexpected error occurred: {e}"
# Gradio Interface
print("Creating Gradio interface...")

# Input widgets, in the same order as youtube_summarizer's parameters.
_model_dropdown = gr.Dropdown(
    label="Select Summarization Model",
    choices=list(MODELS),
    value="DistilBART (Default, Fast)",
)
_url_textbox = gr.Textbox(
    label="YouTube URL",
    placeholder="e.g., https://www.youtube.com/watch?v=...",
)
_min_length_slider = gr.Slider(
    label="Minimum Final Summary Length",
    minimum=30,
    maximum=200,
    value=70,
    step=10,
)
_max_length_slider = gr.Slider(
    label="Maximum Final Summary Length",
    minimum=100,
    maximum=500,
    value=350,
    step=10,
)

# Markdown shown under the title.
_app_description = """
**Summarize any YouTube video with your choice of AI model!**
1. Select a summarization model from the dropdown.
2. Paste a YouTube video URL.
3. Adjust the sliders to control the length of the final summary.
*Note: Loading a new model for the first time may take a minute or two. Subsequent uses will be much faster.*
"""

# Each row: model label, video URL, minimum length, maximum length.
_example_rows = [
    ["DistilBART (Default, Fast)", "https://www.youtube.com/watch?v=jaYN-iwgw2g", 50, 150],
    ["BART (Large, More Accurate)", "https://www.youtube.com/watch?v=Yo22h_i_5kY", 100, 300],
]

demo = gr.Interface(
    fn=youtube_summarizer,
    inputs=[_model_dropdown, _url_textbox, _min_length_slider, _max_length_slider],
    outputs=gr.Textbox(label="Video Summary", lines=10),
    title="Advanced YouTube Video Summarizer",
    description=_app_description,
    allow_flagging="never",
    examples=_example_rows,
)

print("Launching Gradio app...")
demo.launch()