Spaces:

RBLucky
/

YouTube-AI-Summarizer

Sleeping

Lucky Nkosi

update YouTube TranscriptApi

fdb1dfc 11 months ago

4.7 kB

	# app.py
	# A feature-rich web app that summarizes YouTube videos using multiple models,
	# chunking for long transcripts, and user-controlled output length.

	import gradio as gr
	from transformers import pipeline
	from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import time

	# Model and Splitter Setup

	# (UI label, Hugging Face model id) pairs, in the order they appear in the dropdown.
	_MODEL_CHOICES = (
	    ("DistilBART (Default, Fast)", "sshleifer/distilbart-cnn-12-6"),
	    ("BART (Large, More Accurate)", "facebook/bart-large-cnn"),
	    ("Pegasus (Best for News/Articles)", "google/pegasus-xsum"),
	)

	# Lookup from the label the user selects to the model id passed to pipeline().
	MODELS = dict(_MODEL_CHOICES)

	# Pipelines that have already been constructed, keyed by UI label.
	# Starts empty and is filled lazily by get_model().
	loaded_models = dict()

	def get_model(model_name):
	    """Return the summarization pipeline registered under ``model_name``.

	    A pipeline is constructed at most once per label: the first request
	    builds it and stores it in the module-level ``loaded_models`` dict,
	    and every later request is answered from that dict.

	    Args:
	        model_name: A key of ``MODELS`` (the label shown in the UI).

	    Returns:
	        The object produced by ``pipeline("summarization", model=...)``.

	    Raises:
	        ValueError: If ``model_name`` is not a key of ``MODELS``.
	    """
	    if model_name not in MODELS:
	        raise ValueError(f"Unknown Model: {model_name}")

	    if model_name not in loaded_models:
	        # Cache miss: build the pipeline and report how long it took.
	        print(f"Loading new model: {model_name} (this may take a minute)...")
	        started_at = time.time()

	        loaded_models[model_name] = pipeline("summarization", model=MODELS[model_name])

	        elapsed = time.time() - started_at
	        print(f"Model '{model_name}' loaded in {elapsed:.2f} seconds.")
	        return loaded_models[model_name]

	    # Cache hit: reuse the pipeline built by an earlier call.
	    print(f"Returning cached model: {model_name}")
	    return loaded_models[model_name]

	# Instantiate the Text Splitter
	# Shared splitter that cuts a transcript into overlapping pieces before
	# summarization. Sizes are in the splitter's length units (characters under
	# LangChain's default length function - confirm if that is ever customized).
	_SPLITTER_OPTIONS = {"chunk_size": 1000, "chunk_overlap": 100}
	text_splitter = RecursiveCharacterTextSplitter(**_SPLITTER_OPTIONS)

	# Core Summarization Function
	def _extract_video_id(youtube_url):
	    """Return the video ID contained in ``youtube_url``, or ``None``.

	    Recognizes ``youtube.com/watch?v=ID`` (with ``v`` anywhere in the query
	    string), ``youtu.be/ID`` and ``/shorts/ID``, ``/embed/ID``, ``/live/ID``
	    and ``/v/ID`` paths, with or without a scheme or a ``www.`` prefix.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import parse_qs, urlparse

	    if not youtube_url or not isinstance(youtube_url, str):
	        return None

	    candidate_url = youtube_url.strip()
	    if "://" not in candidate_url:
	        # urlparse only populates the host when a scheme is present.
	        candidate_url = "https://" + candidate_url

	    try:
	        parsed = urlparse(candidate_url)
	        host = (parsed.hostname or "").lower()
	    except ValueError:
	        # urlparse rejects some malformed hosts (e.g. unbalanced brackets).
	        return None
	    if host.startswith("www."):
	        host = host[len("www."):]

	    if host == "youtu.be":
	        video_id = parsed.path.strip("/").split("/")[0]
	    elif host in ("youtube.com", "m.youtube.com", "music.youtube.com", "youtube-nocookie.com"):
	        path_parts = parsed.path.strip("/").split("/")
	        if path_parts[0] == "watch":
	            video_id = parse_qs(parsed.query).get("v", [""])[0]
	        elif len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live", "v"):
	            video_id = path_parts[1]
	        else:
	            video_id = ""
	    else:
	        return None

	    return video_id or None


	def _fetch_transcript_text(video_id):
	    """Download the transcript for ``video_id`` and return it as one string."""
	    if hasattr(YouTubeTranscriptApi, "fetch"):
	        # Newer youtube-transcript-api releases: instance API whose result
	        # iterates over snippet objects exposing a ``text`` attribute.
	        fetched = YouTubeTranscriptApi().fetch(video_id)
	        return " ".join(snippet.text for snippet in fetched)

	    # Older releases: static helper returning a list of dicts.
	    entries = YouTubeTranscriptApi.get_transcript(video_id)
	    return " ".join(entry["text"] for entry in entries)


	def youtube_summarizer(model_name, youtube_url, min_len, max_len):
	    """Summarize the transcript of a YouTube video.

	    The transcript is split with ``text_splitter``, each chunk is summarized
	    on its own, and the joined chunk summaries are summarized once more to
	    produce the final text.

	    Args:
	        model_name: Key of ``MODELS`` selecting the summarization pipeline.
	        youtube_url: URL of the video (watch, youtu.be, shorts, embed, ...).
	        min_len: Forwarded as ``min_length`` to the final summarization call.
	        max_len: Forwarded as ``max_length`` to the final summarization call.

	    Returns:
	        The final summary, or a human-readable error message. Errors are
	        returned rather than raised so the UI can display them.
	    """
	    # Cheap input checks run first so bad input never triggers a model load.
	    video_id = _extract_video_id(youtube_url)
	    if video_id is None:
	        return "Please enter a valid YouTube video URL."

	    try:
	        # UI sliders may deliver floats; generation lengths must be integers.
	        min_len, max_len = int(min_len), int(max_len)
	    except (TypeError, ValueError):
	        return "Error: Summary lengths must be whole numbers."
	    if min_len > max_len:
	        # The two slider ranges overlap, so this combination is reachable.
	        return "Error: Minimum summary length cannot be greater than the maximum summary length."

	    try:
	        summarizer = get_model(model_name)
	    except Exception as e:
	        return f"Error loading model: {e}"

	    try:
	        print(f"Fetching transcript for video ID: {video_id}")
	        transcript = _fetch_transcript_text(video_id)
	        print("Transcript fetched successfully.")

	        print("Splitting transcript into chunks...")
	        chunks = text_splitter.split_text(transcript)
	        if not chunks:
	            return "Error: The transcript for this video is empty."

	        print("Summarizing individual chunks...")
	        # truncation=True asks the pipeline to cut over-long inputs down to
	        # the model's input limit instead of failing on them.
	        initial_summaries = [
	            summarizer(
	                chunk,
	                max_length=150,
	                min_length=30,
	                do_sample=False,
	                truncation=True,
	            )[0]["summary_text"]
	            for chunk in chunks
	        ]

	        print("Creating final summary...")
	        combined_summary_text = " ".join(initial_summaries)
	        # For long videos the joined chunk summaries can themselves exceed
	        # the model's input limit, hence truncation here as well.
	        final_summary = summarizer(
	            combined_summary_text,
	            max_length=max_len,
	            min_length=min_len,
	            do_sample=False,
	            truncation=True,
	        )

	        return final_summary[0]["summary_text"]

	    except TranscriptsDisabled:
	        return "Error: Transcripts are disabled for this video."
	    except NoTranscriptFound:
	        return "Error: No English transcript found for this video. The API may not support auto-generated captions."
	    except Exception as e:
	        # Top-level boundary for the UI: report the failure instead of crashing.
	        return f"An unexpected error occurred: {e}"

	# Gradio Interface
	print("Creating Gradio interface...")

	# Input widgets, listed in the positional order of youtube_summarizer's
	# parameters: model_name, youtube_url, min_len, max_len.
	_model_dropdown = gr.Dropdown(
	    choices=list(MODELS),
	    value="DistilBART (Default, Fast)",
	    label="Select Summarization Model",
	)
	_url_textbox = gr.Textbox(
	    label="YouTube URL",
	    placeholder="e.g., https://www.youtube.com/watch?v=...",
	)
	_min_len_slider = gr.Slider(
	    minimum=30, maximum=200, value=70, step=10,
	    label="Minimum Final Summary Length",
	)
	_max_len_slider = gr.Slider(
	    minimum=100, maximum=500, value=350, step=10,
	    label="Maximum Final Summary Length",
	)

	# Help text rendered under the title; kept verbatim.
	_DESCRIPTION = """
	Summarize any YouTube video with your choice of AI model!

	1. Select a summarization model from the dropdown.
	2. Paste a YouTube video URL.
	3. Adjust the sliders to control the length of the final summary.

	Note: Loading a new model for the first time may take a minute or two. Subsequent uses will be much faster.
	"""

	# Pre-filled example rows: [model label, video URL, min length, max length].
	_EXAMPLES = [
	    ["DistilBART (Default, Fast)", "https://www.youtube.com/watch?v=jaYN-iwgw2g", 50, 150],
	    ["BART (Large, More Accurate)", "https://www.youtube.com/watch?v=Yo22h_i_5kY", 100, 300],
	]

	demo = gr.Interface(
	    fn=youtube_summarizer,
	    inputs=[_model_dropdown, _url_textbox, _min_len_slider, _max_len_slider],
	    outputs=gr.Textbox(label="Video Summary", lines=10),
	    title="Advanced YouTube Video Summarizer",
	    description=_DESCRIPTION,
	    allow_flagging="never",
	    examples=_EXAMPLES,
	)

	print("Launching Gradio app...")
	demo.launch()