# Provenance: app.py as shown in the Hugging Face file view — uploaded by daafa999,
# commit d6b24f8 ("Update app.py", verified), 9.18 kB.
import streamlit as st
from gtts import gTTS
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptUnavailable, NoTranscriptFound
import unicodedata
from deepmultilingualpunctuation import PunctuationModel
from transformers import pipeline
import io
import re
# =============================================================================
# CACHE MODELS - so they are not reloaded every time the form is submitted
# =============================================================================
@st.cache_resource
def load_punctuation_model():
    """Load the punctuation-restoration model once and cache it.

    The ``st.cache_resource`` decorator keeps the constructed model so it is
    not rebuilt each time the form is submitted.

    Returns:
        The ``PunctuationModel`` instance used to restore punctuation.
    """
    # The published checkpoint id uses single hyphens; the previous
    # "multilingual--large" spelling does not name an existing model.
    return PunctuationModel("oliverguhr/fullstop-punctuation-multilingual-large")
@st.cache_resource
def load_summarization_pipeline():
    """Load the T5 summarization pipeline once and cache it.

    The device is chosen from actual GPU availability: the first CUDA device
    when PyTorch reports one, otherwise the CPU (``-1``).

    Returns:
        A ``transformers`` summarization pipeline backed by ``t5-base``.
    """
    try:
        # Local import: torch is only needed here to probe for a GPU.
        import torch

        device = 0 if torch.cuda.is_available() else -1
    except ImportError:
        # No PyTorch importable -> let the pipeline run on CPU.
        device = -1

    return pipeline(
        "summarization",
        model="t5-base",
        tokenizer="t5-base",
        device=device,
    )
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def extract_video_id(url):
    """Extract the video ID from a YouTube URL or a bare video ID.

    Supported inputs:
      * a bare 11-character ID (``dQw4w9WgXcQ``)
      * short links (``youtu.be/<id>``), with or without a scheme
      * watch links (``youtube.com/watch?v=<id>``)
      * path-based links (``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``,
        ``/v/<id>``)

    The URL is dispatched on its parsed host and path rather than on raw
    substrings, so a query parameter such as ``feature=youtu.be`` no longer
    causes a watch URL to be treated as a short link.

    Args:
        url: The user-supplied URL or video ID.

    Returns:
        The video ID string, or ``None`` when none can be found.
    """
    if not url:
        return None

    candidate = url.strip()

    # A bare ID is checked first so IDs that happen to contain words like
    # "watch" or "embed" are not misrouted.
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
        return candidate

    # urlparse only fills in the host when a scheme is present.
    if '://' not in candidate:
        candidate = 'https://' + candidate.lstrip('/')

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or '').lower()
    except ValueError:
        return None

    segments = [part for part in parsed.path.split('/') if part]

    if host == 'youtu.be' or host.endswith('.youtu.be'):
        video_id = segments[0] if segments else None
    elif len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
        video_id = segments[1]
    else:
        video_id = parse_qs(parsed.query).get('v', [None])[0]

    # Normalize empty strings to None so callers get a single "not found" value.
    return video_id or None
def get_transcript_text(video_id, language='en'):
    """Fetch a video's transcript and return it as one string.

    The transcript list is requested once. The preferred ``language`` is tried
    first, then English; if neither exists, the first transcript the video
    offers is used so that "any available transcript" really is the fallback.

    Args:
        video_id: The YouTube video ID.
        language: Preferred transcript language code.

    Returns:
        All transcript segments joined with single spaces.

    Raises:
        TranscriptUnavailable, NoTranscriptFound: When the video offers no
            transcript at all. Errors raised while listing transcripts
            propagate unchanged.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    # Language codes are tried in priority order.
    preferred = [language] if language == 'en' else [language, 'en']
    try:
        transcript = transcript_list.find_transcript(preferred)
    except (TranscriptUnavailable, NoTranscriptFound):
        # Fallback: take whatever transcript the video has, in any language.
        transcript = next(iter(transcript_list), None)
        if transcript is None:
            raise

    transcript_data = transcript.fetch()

    # NOTE(review): segments are dicts in the releases this code was written
    # for; newer releases appear to return objects with a ``.text`` attribute —
    # both shapes are accepted here, confirm against the installed version.
    text_segments = [
        item["text"] if isinstance(item, dict) else item.text
        for item in transcript_data
    ]
    return " ".join(text_segments)
def _split_oversized_sentence(sentence, limit):
    """Break one over-long sentence into pieces of at most ``limit`` chars.

    Pieces are cut at whitespace where possible; a single word longer than
    ``limit`` is sliced by character count.
    """
    pieces = []
    current = ""
    for word in sentence.split():
        # A word that cannot fit in any piece is sliced into limit-sized parts.
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= limit:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text, max_chars=4000):
    """Split text into chunks of at most ``max_chars``, on sentence boundaries.

    Sentences (split after ``.``, ``!`` or ``?``) are packed greedily into
    chunks. A sentence that is itself longer than ``max_chars`` — typical for
    text without punctuation — is first broken at word boundaries so that no
    chunk silently exceeds the limit.

    Args:
        text: The text to split.
        max_chars: Maximum characters per chunk. Non-positive values disable
            the splitting of over-long sentences.

    Returns:
        A list of chunk strings; ``[text]`` if no chunk was produced.
    """
    chunks = []
    parts = []  # sentences collected for the chunk being built
    used = 0    # length of the current chunk, counting one separator per part

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if 0 < max_chars < len(sentence):
            pieces = _split_oversized_sentence(sentence, max_chars)
        else:
            pieces = [sentence]

        for piece in pieces:
            if used + len(piece) <= max_chars:
                parts.append(piece)
                used += len(piece) + 1
            else:
                if parts:
                    chunks.append(" ".join(parts).strip())
                parts = [piece]
                used = len(piece) + 1

    if parts:
        chunks.append(" ".join(parts).strip())

    return chunks if chunks else [text]  # Fallback if no sentences found
def normalize_text(text):
    """Return ``text`` converted to Unicode normalization form NFKD."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed
# =============================================================================
# MAIN SUMMARIZATION FUNCTION
# =============================================================================
def summarize_video(url, language='en'):
    """Summarize a YouTube video from its transcript.

    Steps: extract the video ID, fetch the transcript, normalize the text,
    restore punctuation, split into chunks, summarize each chunk with the
    cached T5 pipeline, and join the partial summaries.

    Args:
        url: YouTube URL (or video ID) entered by the user.
        language: Preferred transcript language code.

    Returns:
        The per-chunk summaries joined with single spaces.

    Raises:
        ValueError: If no video ID can be extracted, or the transcript is
            missing or shorter than 50 characters.
    """
    # Extract video ID
    video_id = extract_video_id(url)
    if not video_id:
        raise ValueError("Invalid YouTube URL. Please check the link and try again.")

    # Get transcript
    with st.spinner("πŸ“ Mengambil transkrip video..."):
        video_transcript = get_transcript_text(video_id, language)
    if not video_transcript or len(video_transcript.strip()) < 50:
        raise ValueError("Transkrip tidak ditemukan atau terlalu pendek. Video mungkin tidak memiliki subtitle.")

    # Normalize text
    normalized_text = normalize_text(video_transcript)

    # Add punctuation
    with st.spinner("✏️ Menambahkan tanda baca..."):
        punctuation_model = load_punctuation_model()
        punctuated_text = punctuation_model.restore_punctuation(normalized_text)

    # Summarization
    with st.spinner("πŸ€– Meringkas konten..."):
        summarizer = load_summarization_pipeline()
        # T5-base max input: ~512 tokens (~2000-3000 chars safe limit).
        # NOTE(review): truncation=True below cuts anything beyond the model
        # limit, so a 3000-char chunk may lose its tail — confirm the budget.
        chunks = chunk_text(punctuated_text, max_chars=3000)

        # One progress bar, updated in place; calling st.progress() inside the
        # loop would add a new bar to the page for every chunk.
        progress_bar = st.progress(0.0)
        summaries = []
        for done, chunk in enumerate(chunks, start=1):
            # The model is always t5-base here, which uses the "summarize: " prefix.
            summary_result = summarizer(
                f"summarize: {chunk}",
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary_result[0]['summary_text'])
            progress_bar.progress(min(done / len(chunks), 1.0))
        # Remove the bar once all chunks are processed.
        progress_bar.empty()

    return " ".join(summaries)
# =============================================================================
# STREAMLIT APP
# =============================================================================
def _guess_tts_language(summary):
    """Return 'id' when the summary looks Indonesian, otherwise 'en'.

    Marker words are matched as whole words, and two distinct markers are
    required. Substring matching flagged ordinary English text as Indonesian
    ('di' in "did", 'ke' in "like", 'dan' in "dance").
    """
    words = set(re.findall(r"[a-z]+", summary.lower()))
    marker_hits = words.intersection(('dan', 'yang', 'di', 'ke'))
    return 'id' if len(marker_hits) >= 2 else 'en'


def _render_sidebar():
    """Draw the static information sidebar."""
    # NOTE(review): the divider and caption are assumed to belong inside the
    # sidebar; the original indentation was not available to confirm this.
    with st.sidebar:
        st.header("ℹ️ Informasi")
        st.markdown("""
        **Fitur:**
        - βœ… Ekstrak transkrip otomatis
        - βœ… Penambahan tanda baca AI
        - βœ… Ringkasan multi-bahasa
        - βœ… Export ke audio MP3
        **Batasan:**
        - Video harus memiliki subtitle/closed caption
        - Durasi video sangat panjang mungkin diproses per bagian
        - Model T5-base optimal untuk teks bahasa Inggris
        **Tips:**
        - Gunakan video dengan subtitle resmi untuk hasil terbaik
        - Untuk video bahasa Indonesia, pastikan subtitle tersedia
        """)
        st.markdown("---")
        st.caption("Dibuat dengan ❀️ menggunakan Streamlit & Hugging Face")


def _summarize_and_render(video_url, language):
    """Summarize the video and show the text, the MP3 download and any error."""
    try:
        # Generate summary ('auto' is mapped to English)
        summary = summarize_video(video_url, language if language != 'auto' else 'en')

        # Display results
        st.success("βœ… Ringkasan berhasil dibuat!")
        st.subheader("πŸ“„ Hasil Ringkasan")
        st.markdown(f"> {summary}")

        # Text-to-Speech
        with st.spinner("πŸ”Š Membuat audio..."):
            tts = gTTS(text=summary, lang=_guess_tts_language(summary), slow=False)
            # Keep the audio in memory for the download button (no file I/O)
            audio_buffer = io.BytesIO()
            tts.write_to_fp(audio_buffer)
            audio_buffer.seek(0)

        # Download button
        st.download_button(
            label="πŸ“₯ Download Ringkasan Audio (MP3)",
            data=audio_buffer,
            file_name="youtube_summary.mp3",
            mime="audio/mpeg",
            use_container_width=True,
        )

        # Copy summary to clipboard hint
        st.code(summary, language="text")
        st.caption("πŸ’‘ Tip: Klik teks di atas untuk menyalin ringkasan.")
    except Exception as e:
        # Top-level UI boundary: show the message plus the full traceback.
        st.error(f"❌ Terjadi kesalahan: {str(e)}")
        with st.expander("πŸ” Detail Error (untuk debugging)"):
            st.exception(e)


def main():
    """Render the Streamlit page: input form, summary results and sidebar.

    The sidebar is drawn on every run, including when the form is submitted
    with an empty URL (an early ``return`` used to skip it in that case).
    """
    st.set_page_config(page_title="YouTube Summarizer", page_icon="🎬", layout="centered")
    st.title("🎬 YouTube Video Summarizer")
    st.markdown("""
    Masukkan URL video YouTube untuk mendapatkan ringkasan otomatis berbasis AI.
    Mendukung video dengan subtitle/closed caption.
    """)

    # Input form
    with st.form(key="summarizer_form"):
        video_url = st.text_input(
            "πŸ”— URL Video YouTube",
            placeholder="https://www.youtube.com/watch?v=..."
        )
        language = st.selectbox(
            "🌐 Bahasa Transkrip (opsional)",
            options=['en', 'id', 'es', 'fr', 'de', 'pt', 'auto'],
            index=0,
            help="Pilih bahasa transkrip. 'auto' akan mencoba mendeteksi otomatis."
        )
        # Only the narrow first column is used, to keep the button small.
        col1, _ = st.columns([1, 3])
        with col1:
            submit_button = st.form_submit_button("πŸš€ Ringkas", use_container_width=True)

    # Process submission
    if submit_button:
        if video_url.strip():
            _summarize_and_render(video_url, language)
        else:
            st.error("⚠️ Harap masukkan URL video YouTube yang valid.")

    # Sidebar info
    _render_sidebar()
# Start the app only when this file is run as the entry script, not when imported.
if __name__ == "__main__":
    main()