# app.py
# A feature-rich web app that summarizes YouTube videos using multiple models,
# chunking for long transcripts, and user-controlled output length.

import re
import time
from typing import Optional
from urllib.parse import parse_qs, urlparse

import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    YouTubeTranscriptApi,
)

# Model and Splitter Setup
MODELS = {
    "DistilBART (Default, Fast)": "sshleifer/distilbart-cnn-12-6",
    "BART (Large, More Accurate)": "facebook/bart-large-cnn",
    "Pegasus (Best for News/Articles)": "google/pegasus-xsum",
}

# Model cache to store loaded models (display name -> loaded pipeline).
loaded_models = {}

# Length bounds (in tokens) used for the per-chunk summaries.
CHUNK_SUMMARY_MAX_LEN = 150
CHUNK_SUMMARY_MIN_LEN = 30

# Rough character budget for the text handed to the final summarization
# pass. The smallest configured model (Pegasus) accepts about 512 tokens, so
# the combined chunk summaries are reduced until they are near this size.
MAX_FINAL_INPUT_CHARS = 2000

# Upper bound on extra reduction passes so the loop always terminates even
# if the model stops compressing the text.
MAX_REDUCE_ROUNDS = 5

# Hosts on which a "/watch?v=<id>" or "/shorts/<id>"-style path is accepted.
_YOUTUBE_HOSTS = frozenset(
    {
        "youtube.com",
        "www.youtube.com",
        "m.youtube.com",
        "music.youtube.com",
        "www.youtube-nocookie.com",
    }
)

# Path prefixes of the form "/<prefix>/<video_id>".
_ID_PATH_PREFIXES = ("shorts", "embed", "live", "v")

# Characters permitted in a video ID; rejects obviously malformed values.
_VIDEO_ID_RE = re.compile(r"[A-Za-z0-9_-]+")


def get_model(model_name):
    """
    Loads a model pipeline if not already loaded and returns it.
    Uses 'loaded_models' dictionary as a cache for performance.

    Args:
        model_name: A key of ``MODELS`` (the label shown in the dropdown).

    Returns:
        The summarization pipeline for the requested model.

    Raises:
        ValueError: If ``model_name`` is not a key of ``MODELS``.
    """
    if model_name not in MODELS:
        raise ValueError(f"Unknown Model: {model_name}")

    if model_name in loaded_models:
        print(f"Returning cached model: {model_name}")
        return loaded_models[model_name]

    print(f"Loading new model: {model_name} (this may take a minute)...")
    # perf_counter is monotonic, so the elapsed time cannot go negative if
    # the wall clock is adjusted while the model is loading.
    start_time = time.perf_counter()
    summarizer = pipeline("summarization", model=MODELS[model_name])
    loaded_models[model_name] = summarizer
    elapsed = time.perf_counter() - start_time
    print(f"Model '{model_name}' loaded in {elapsed:.2f} seconds.")
    return summarizer


# Instantiate the Text Splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)


def _extract_video_id(youtube_url) -> Optional[str]:
    """
    Return the video ID contained in a YouTube URL, or None if there is none.

    Accepts ``youtube.com/watch?v=<id>`` (with the ``v`` parameter in any
    position), ``youtu.be/<id>``, and ``/shorts/``, ``/embed/``, ``/live/``
    and ``/v/`` paths. A missing scheme is tolerated.
    """
    if not youtube_url or not isinstance(youtube_url, str):
        return None

    url = youtube_url.strip()
    if not url:
        return None
    if "://" not in url:
        # urlparse only fills in the host when a scheme is present.
        url = "https://" + url

    try:
        parsed = urlparse(url)
    except ValueError:
        return None

    host = (parsed.hostname or "").lower()
    path_parts = [part for part in parsed.path.split("/") if part]
    candidate = None

    if host == "youtu.be":
        if path_parts:
            candidate = path_parts[0]
    elif host in _YOUTUBE_HOSTS:
        if path_parts == ["watch"]:
            candidate = parse_qs(parsed.query).get("v", [None])[0]
        elif len(path_parts) >= 2 and path_parts[0] in _ID_PATH_PREFIXES:
            candidate = path_parts[1]

    if candidate and _VIDEO_ID_RE.fullmatch(candidate):
        return candidate
    return None


def _summarize_text(summarizer, text, max_length, min_length) -> str:
    """Run one deterministic summarization pass and return the summary text."""
    result = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Clip over-long inputs to the model's limit instead of failing.
        truncation=True,
    )
    return result[0]['summary_text']


def _summarize_chunks(summarizer, chunks) -> str:
    """Summarize each chunk independently and join the results with spaces."""
    return " ".join(
        _summarize_text(
            summarizer, chunk, CHUNK_SUMMARY_MAX_LEN, CHUNK_SUMMARY_MIN_LEN
        )
        for chunk in chunks
    )


# Core Summarization Function
def youtube_summarizer(model_name, youtube_url, min_len, max_len):
    """
    Main function to orchestrate the summarization process.

    Cheap checks (URL, length bounds, transcript availability) run before the
    model is loaded so that bad input never triggers a model download.

    Args:
        model_name: A key of ``MODELS`` selecting the summarization model.
        youtube_url: URL of the video to summarize.
        min_len: Minimum length of the final summary, passed as ``min_length``.
        max_len: Maximum length of the final summary, passed as ``max_length``.

    Returns:
        The final summary, or a human-readable error message. Errors are
        returned rather than raised so they show up in the output textbox.
    """
    video_id = _extract_video_id(youtube_url)
    if video_id is None:
        return "Please enter a valid YouTube video URL."

    # The sliders may deliver floats; the generation parameters must be ints.
    try:
        min_len = int(min_len)
        max_len = int(max_len)
    except (TypeError, ValueError):
        return "Summary lengths must be whole numbers."
    if min_len > max_len:
        # The slider ranges overlap (min up to 200, max down to 100).
        return "Minimum summary length cannot exceed the maximum summary length."

    try:
        print(f"Fetching transcript for video ID: {video_id}")
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        transcript = " ".join(d['text'] for d in transcript_list).strip()
        print("Transcript fetched successfully.")
    except TranscriptsDisabled:
        return "Error: Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "Error: No English transcript found for this video. The API may not support auto-generated captions."
    except Exception as e:
        return f"An unexpected error occurred: {e}"

    if not transcript:
        return "Error: The transcript for this video is empty."

    try:
        summarizer = get_model(model_name)
    except Exception as e:
        return f"Error loading model: {e}"

    # Top-level UI boundary: report any failure as text in the output box.
    try:
        print("Splitting transcript into chunks...")
        chunks = text_splitter.split_text(transcript)

        print("Summarizing individual chunks...")
        combined_summary_text = _summarize_chunks(summarizer, chunks)

        # For long videos the joined chunk summaries can still be larger than
        # the model's input window, so keep reducing them chunk by chunk.
        rounds = 0
        while (
            len(combined_summary_text) > MAX_FINAL_INPUT_CHARS
            and rounds < MAX_REDUCE_ROUNDS
        ):
            reduced = _summarize_chunks(
                summarizer, text_splitter.split_text(combined_summary_text)
            )
            if len(reduced) >= len(combined_summary_text):
                # No further compression; truncation guards the final pass.
                break
            combined_summary_text = reduced
            rounds += 1

        print("Creating final summary...")
        return _summarize_text(
            summarizer, combined_summary_text, max_len, min_len
        )
    except Exception as e:
        return f"An unexpected error occurred: {e}"


# Gradio Interface
print("Creating Gradio interface...")
demo = gr.Interface(
    fn=youtube_summarizer,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value="DistilBART (Default, Fast)",
            label="Select Summarization Model"
        ),
        gr.Textbox(label="YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=..."),
        gr.Slider(minimum=30, maximum=200, value=70, step=10, label="Minimum Final Summary Length"),
        gr.Slider(minimum=100, maximum=500, value=350, step=10, label="Maximum Final Summary Length")
    ],
    outputs=gr.Textbox(label="Video Summary", lines=10),
    title="Advanced YouTube Video Summarizer",
    description="""
    **Summarize any YouTube video with your choice of AI model!**
    1. Select a summarization model from the dropdown.
    2. Paste a YouTube video URL.
    3. Adjust the sliders to control the length of the final summary.

    *Note: Loading a new model for the first time may take a minute or two. Subsequent uses will be much faster.*
    """,
    allow_flagging="never",
    examples=[
        ["DistilBART (Default, Fast)", "https://www.youtube.com/watch?v=jaYN-iwgw2g", 50, 150],
        ["BART (Large, More Accurate)", "https://www.youtube.com/watch?v=Yo22h_i_5kY", 100, 300]
    ]
)

# Only start the server when run as a script, so importing this module
# (for tooling or tests) does not block on a running web server.
if __name__ == "__main__":
    print("Launching Gradio app...")
    demo.launch()