Spaces:
Sleeping
Sleeping
Update app.py
Browse files: fourth changes
app.py
CHANGED
|
@@ -1,36 +1,125 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pytube
|
| 3 |
from transformers import pipeline
|
|
|
|
|
|
|
| 4 |
|
| 5 |
# Initialize pipelines
|
| 6 |
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base", chunk_length_s=30)
|
| 7 |
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Build Gradio app
|
| 29 |
-
with gr.Blocks() as demo:
|
| 30 |
-
gr.Markdown("## π Multi
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
btn.click(summarize_youtube, inputs=url_input, outputs=[vid, txt, summ])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pytube
|
| 3 |
from transformers import pipeline
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
|
| 7 |
# Initialize pipelines once at import time; summarize_youtube() reuses these module-level objects on every call
|
| 8 |
# Speech-to-text: "openai/whisper-base" checkpoint, audio processed in 30-second chunks (chunk_length_s=30)
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base", chunk_length_s=30)
|
| 9 |
# Text summarizer: "facebook/bart-large-cnn" checkpoint
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 10 |
|
| 11 |
+
def extract_video_id(url):
    """Extract the 11-character video ID from various YouTube URL formats.

    Patterns are tried from most to least specific so that a path segment
    which merely *looks* like an ID (for example ``/watch_popup``) cannot
    shadow the real ID carried in the ``v=`` query parameter.

    Args:
        url: The URL (or URL-like string) to inspect.

    Returns:
        The 11-character video ID as a string, or ``None`` when ``url`` is
        not a string or no ID-like token is found.
    """
    if not isinstance(url, str):
        return None

    video_id = r'([0-9A-Za-z_-]{11})'
    patterns = (
        # 1. Explicit query parameter: watch?v=ID, watch_popup?v=ID, ...&v=ID
        r'[?&]v=' + video_id,
        # 2. Well-known path forms: youtu.be/ID, /embed/ID, /shorts/ID, /live/ID, /v/ID
        r'(?:youtu\.be/|/embed/|/shorts/|/live/|/v/)' + video_id,
        # 3. Original catch-all, kept last so every URL that used to match
        #    still yields a result.
        r'(?:v=|/)' + video_id,
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
|
| 23 |
|
| 24 |
+
def summarize_youtube(url):
    """Download a YouTube video's audio, transcribe it and summarize the transcript.

    Args:
        url: YouTube video URL as entered by the user.

    Returns:
        A 3-tuple of strings ``(embed_html, transcript, summary)``. On any
        failure the same three positions carry an error message, a short
        hint and a placeholder instead, so the result can always be bound
        to three output components. No exception escapes this function.
    """
    audio_file = None
    try:
        # Clean up any existing audio file
        if os.path.exists("audio.mp4"):
            os.remove("audio.mp4")

        # Create YouTube object (raises on malformed URLs / unavailable videos)
        yt = pytube.YouTube(url, use_oauth=False, allow_oauth_cache=False)

        # Prefer mp4 audio; fall back to any audio-only stream
        audio_streams = yt.streams.filter(only_audio=True, file_extension='mp4')
        if not audio_streams:
            audio_streams = yt.streams.filter(only_audio=True)

        if not audio_streams:
            return "β Error: No audio streams available", "Could not extract audio from video", "No summary available"

        stream = audio_streams.first()

        # download() returns the path it actually wrote; that path is what
        # gets transcribed and later removed in the ``finally`` clause.
        audio_file = stream.download(filename="audio")

        # Transcribe
        result = asr(audio_file)
        transcript = result["text"]

        # Too little text to produce a meaningful summary
        if len(transcript.split()) < 10:
            return "β Error: Transcript too short", transcript, "Cannot summarize - transcript too brief"

        # Threshold is compared against the transcript's *character* count.
        # NOTE(review): the original comment calls this "BART's max input
        # length"; that limit is presumably in tokens, so this is a
        # conservative proxy - confirm if the threshold is ever tuned.
        max_chunk = 1024
        if len(transcript) > max_chunk:
            # Split the transcript into 200-word chunks
            words = transcript.split()
            chunks = [' '.join(words[i:i+200]) for i in range(0, len(words), 200)]
            summaries = []

            for chunk in chunks[:3]:  # Limit to first 3 chunks to avoid timeout
                if len(chunk.strip()) > 50:
                    chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
                    summaries.append(chunk_summary)

            summary = " ".join(summaries)
        else:
            summary = summarizer(transcript, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]

        # Create embed HTML
        v_id = extract_video_id(url)
        if v_id:
            embed_html = f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{v_id}" frameborder="0" allowfullscreen></iframe>'
        else:
            embed_html = "β Could not extract video ID"

        return embed_html, transcript, summary

    except pytube.exceptions.RegexMatchError:
        return "β Error: Invalid YouTube URL", "Please check the URL format", "No summary available"
    except pytube.exceptions.VideoUnavailable:
        return "β Error: Video unavailable", "Video may be private or deleted", "No summary available"
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app
        return f"β Error: {str(e)}", "An error occurred during processing", "No summary available"
    finally:
        # Remove the downloaded audio on every exit path, including failures
        # during transcription. Best effort: a cleanup failure must not
        # replace the result that is already being returned.
        if audio_file and os.path.exists(audio_file):
            try:
                os.remove(audio_file)
            except OSError:
                pass
|
| 91 |
|
| 92 |
# Build Gradio app
|
| 93 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header
    gr.Markdown("## π Multi-lingual YouTube Summarizer (Hindi / Hinglish / English)")
    gr.Markdown("Enter a YouTube URL to get an AI-generated summary of the video content.")

    # Input row: URL box with the trigger button underneath
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...", lines=1)
            btn = gr.Button("π Summarize Video", variant="primary")

    # Output row: embedded player on the left, transcript and summary on the right
    with gr.Row():
        with gr.Column():
            vid = gr.HTML(label="Video Player")
        with gr.Column():
            with gr.Accordion("π Transcript", open=False):
                txt = gr.Textbox(label="Full Transcript", lines=10, max_lines=15)
            summ = gr.Textbox(label="π Summary", lines=5)

    # Output order mirrors summarize_youtube's return tuple:
    # (embed HTML, transcript, summary)
    btn.click(fn=summarize_youtube, inputs=url_input, outputs=[vid, txt, summ])

    # Add examples
    gr.Examples(
        examples=[["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]],  # Replace with actual examples
        inputs=url_input,
    )

# Start the app only when this file is run as a script
if __name__ == "__main__":
    demo.launch(share=True)
|