"""Streamlit app that summarizes a YouTube video from its transcript.

Pipeline: fetch the transcript -> normalize unicode -> restore punctuation ->
summarize chunk by chunk with T5 -> offer the summary as text and as MP3.
"""

import io
import re
import textwrap
import unicodedata
from urllib.parse import parse_qs, urlparse

import streamlit as st
from deepmultilingualpunctuation import PunctuationModel
from gtts import gTTS
from transformers import pipeline
# NOTE(review): the original also imported `TranscriptUnavailable`, which is
# not among the exceptions documented by youtube-transcript-api and is not
# needed here; it was dropped so that a missing name cannot break startup.
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

# =============================================================================
# CONSTANTS
# =============================================================================

# Hugging Face model ids (the punctuation id uses a single hyphen before "large").
PUNCTUATION_MODEL_NAME = "oliverguhr/fullstop-punctuation-multilingual-large"
SUMMARIZATION_MODEL_NAME = "t5-base"

# T5-base accepts ~512 input tokens. 2000 characters is the lower end of the
# "safe" range; larger chunks get their tail silently cut by truncation=True.
SUMMARY_CHUNK_CHARS = 2000

# YouTube video ids are 11 characters from this alphabet.
_VIDEO_ID_RE = re.compile(r"[A-Za-z0-9_-]{11}")
_URL_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.-]*://")
_YOUTUBE_DOMAINS = ("youtube.com", "youtube-nocookie.com")
_SHORT_LINK_HOSTS = frozenset({"youtu.be", "www.youtu.be"})
# Path layouts of the form /<prefix>/<VIDEO_ID>.
_ID_PATH_PREFIXES = frozenset({"embed", "shorts", "live", "v"})

# Whole-word markers used to guess whether a summary is Indonesian.
_INDONESIAN_MARKERS = frozenset(
    {"dan", "yang", "di", "ke", "dari", "untuk", "dengan", "tidak"}
)
_MIN_MARKER_HITS = 2
_MIN_MARKER_RATIO = 0.03

# st.session_state key under which the latest result is kept across reruns.
_RESULT_KEY = "summary_result"

_INTRO_TEXT = """
Masukkan URL video YouTube untuk mendapatkan ringkasan otomatis berbasis AI.
Mendukung video dengan subtitle/closed caption.
"""

_SIDEBAR_TEXT = """
**Fitur:**
- ✅ Ekstrak transkrip otomatis
- ✅ Penambahan tanda baca AI
- ✅ Ringkasan multi-bahasa
- ✅ Export ke audio MP3

**Batasan:**
- Video harus memiliki subtitle/closed caption
- Durasi video sangat panjang mungkin diproses per bagian
- Model T5-base optimal untuk teks bahasa Inggris

**Tips:**
- Gunakan video dengan subtitle resmi untuk hasil terbaik
- Untuk video bahasa Indonesia, pastikan subtitle tersedia
"""


# =============================================================================
# CACHED MODELS - loaded once so they are not reloaded on every form submit
# =============================================================================

def _select_device():
    """Return the pipeline device index: 0 if a CUDA GPU is usable, else -1 (CPU)."""
    try:
        import torch  # local import: only needed to probe for a GPU
    except ImportError:
        return -1
    return 0 if torch.cuda.is_available() else -1


@st.cache_resource
def load_punctuation_model():
    """Load the punctuation-restoration model once and cache it."""
    return PunctuationModel(PUNCTUATION_MODEL_NAME)


@st.cache_resource
def load_summarization_pipeline():
    """Load the summarization pipeline once and cache it."""
    return pipeline(
        "summarization",
        model=SUMMARIZATION_MODEL_NAME,
        tokenizer=SUMMARIZATION_MODEL_NAME,
        device=_select_device(),
    )


# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def _is_youtube_host(host):
    """Return True if *host* is a YouTube domain or one of its subdomains."""
    return any(
        host == domain or host.endswith("." + domain)
        for domain in _YOUTUBE_DOMAINS
    )


def extract_video_id(url):
    """Extract the video id from a YouTube URL or a bare 11-character id.

    Supported forms: ``youtu.be/ID``, ``youtube.com/watch?v=ID``,
    ``youtube.com/embed/ID``, ``/shorts/ID``, ``/live/ID``, ``/v/ID`` (with or
    without scheme, ``www.``/``m.`` subdomains, extra query parameters or a
    trailing slash) and a bare id.

    Returns:
        The 11-character id, or ``None`` when no valid id can be found.
    """
    if not url:
        return None
    candidate = url.strip()

    # A bare id needs no parsing.
    if _VIDEO_ID_RE.fullmatch(candidate):
        return candidate

    # urlparse only fills in the host when a scheme is present; without one
    # "youtu.be/ID" would be parsed entirely as a path.
    if not _URL_SCHEME_RE.match(candidate):
        candidate = "https://" + candidate.lstrip("/")

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Raised by urllib for malformed network locations.
        return None

    segments = [part for part in parsed.path.split("/") if part]
    video_id = None
    if host in _SHORT_LINK_HOSTS:
        video_id = segments[0] if segments else None
    elif _is_youtube_host(host):
        if len(segments) >= 2 and segments[0] in _ID_PATH_PREFIXES:
            video_id = segments[1]
        else:
            video_id = parse_qs(parsed.query).get("v", [None])[0]

    if video_id and _VIDEO_ID_RE.fullmatch(video_id):
        return video_id
    return None


def _list_transcripts(video_id):
    """Return the transcript list for *video_id*.

    NOTE(review): newer youtube-transcript-api releases expose an instance
    method ``list`` and deprecate/remove the static ``list_transcripts``;
    both spellings are supported here - confirm against the pinned version.
    """
    if hasattr(YouTubeTranscriptApi, "list"):
        return YouTubeTranscriptApi().list(video_id)
    return YouTubeTranscriptApi.list_transcripts(video_id)


def _segment_text(segment):
    """Return the text of one transcript segment (dict or snippet object)."""
    if isinstance(segment, dict):
        return segment["text"]
    return segment.text


def get_transcript_text(video_id, language='en'):
    """Fetch a transcript and join its segments into one string.

    The transcript in *language* is preferred. If the video has none, the
    first transcript the video offers (in any language) is used instead.

    Raises:
        NoTranscriptFound: the video offers no transcript at all.
        Exception: any other error raised by youtube-transcript-api while
            listing or fetching (e.g. transcripts disabled) propagates.
    """
    transcript_list = _list_transcripts(video_id)
    try:
        transcript = transcript_list.find_transcript([language])
    except NoTranscriptFound:
        # Fall back to whatever is available rather than failing outright.
        transcript = next(iter(transcript_list), None)
        if transcript is None:
            raise

    return " ".join(_segment_text(segment) for segment in transcript.fetch())


def chunk_text(text, max_chars=4000):
    """Split *text* into chunks of at most *max_chars* characters.

    Chunks end on sentence boundaries where possible. A single sentence longer
    than *max_chars* is wrapped on whitespace (and, as a last resort, inside a
    word) so that no chunk exceeds the limit.

    Returns:
        A list of chunks; ``[text]`` when *text* yields no non-empty chunk.

    Raises:
        ValueError: if *max_chars* is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    pieces = []      # sentences collected for the chunk being built
    length = 0       # len(" ".join(pieces))

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) <= max_chars:
            parts = [sentence]
        else:
            parts = textwrap.wrap(
                sentence,
                width=max_chars,
                break_long_words=True,
                break_on_hyphens=False,
            )
        for part in parts:
            part = part.strip()
            if not part:
                continue
            grown = length + 1 + len(part) if pieces else len(part)
            if grown > max_chars:
                # `part` is at most max_chars long, so `pieces` is non-empty.
                chunks.append(" ".join(pieces))
                pieces, length = [part], len(part)
            else:
                pieces.append(part)
                length = grown

    if pieces:
        chunks.append(" ".join(pieces))
    return chunks if chunks else [text]


def normalize_text(text):
    """Return *text* in Unicode NFKD normal form."""
    return unicodedata.normalize('NFKD', text)


def _guess_tts_language(text):
    """Return ``'id'`` if *text* looks Indonesian, otherwise ``'en'``.

    Only whole words are compared: substring matching would flag English
    words such as "did", "like" or "dance".
    """
    words = re.findall(r"[a-z]+", text.lower())
    if not words:
        return 'en'
    hits = sum(word in _INDONESIAN_MARKERS for word in words)
    if hits >= _MIN_MARKER_HITS and hits / len(words) >= _MIN_MARKER_RATIO:
        return 'id'
    return 'en'


def _synthesize_audio(summary):
    """Convert *summary* to speech and return the MP3 bytes (no file I/O)."""
    speech = gTTS(text=summary, lang=_guess_tts_language(summary), slow=False)
    buffer = io.BytesIO()
    speech.write_to_fp(buffer)
    return buffer.getvalue()


# =============================================================================
# MAIN SUMMARIZATION FUNCTION
# =============================================================================

def summarize_video(url, language='en'):
    """Summarize the YouTube video at *url* and return the summary text.

    Args:
        url: YouTube URL or bare video id.
        language: preferred transcript language code.

    Raises:
        ValueError: the URL contains no valid video id, or the transcript is
            missing or shorter than 50 characters.
    """
    video_id = extract_video_id(url)
    if not video_id:
        raise ValueError("Invalid YouTube URL. Please check the link and try again.")

    with st.spinner("📝 Mengambil transkrip video..."):
        video_transcript = get_transcript_text(video_id, language)

    if not video_transcript or len(video_transcript.strip()) < 50:
        raise ValueError(
            "Transkrip tidak ditemukan atau terlalu pendek. "
            "Video mungkin tidak memiliki subtitle."
        )

    normalized_text = normalize_text(video_transcript)

    with st.spinner("✍️ Menambahkan tanda baca..."):
        punctuation_model = load_punctuation_model()
        punctuated_text = punctuation_model.restore_punctuation(normalized_text)

    with st.spinner("🤖 Meringkas konten..."):
        summarizer = load_summarization_pipeline()
        chunks = chunk_text(punctuated_text, max_chars=SUMMARY_CHUNK_CHARS)

        # T5 checkpoints are trained with a task prefix.
        # NOTE(review): the transformers pipeline may already prepend the
        # prefix from the model config - confirm and drop this if so.
        prefix = "summarize: " if "t5" in SUMMARIZATION_MODEL_NAME else ""

        summaries = []
        # One bar updated in place; calling st.progress() inside the loop
        # would add a new bar to the page for every chunk.
        progress_bar = st.progress(0.0)
        for done, chunk in enumerate(chunks, start=1):
            result = summarizer(
                prefix + chunk,
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
            progress_bar.progress(min(done / len(chunks), 1.0))
        progress_bar.empty()

    return " ".join(summaries)


# =============================================================================
# STREAMLIT APP
# =============================================================================

def _render_sidebar():
    """Render the static information sidebar."""
    with st.sidebar:
        st.header("ℹ️ Informasi")
        st.markdown(_SIDEBAR_TEXT)
        st.markdown("---")
        st.caption("Dibuat dengan ❤️ menggunakan Streamlit & Hugging Face")


def _show_error(error):
    """Show *error* to the user with an expandable traceback."""
    st.error(f"❌ Terjadi kesalahan: {str(error)}")
    with st.expander("🔍 Detail Error (untuk debugging)"):
        st.exception(error)


def _render_result(result):
    """Render the stored summary, its MP3 download and the copyable text.

    *result* is the dict kept in session state; the audio is synthesized the
    first time it is rendered and cached in the same dict afterwards.
    """
    summary = result["summary"]
    st.subheader("📄 Hasil Ringkasan")
    st.markdown(f"> {summary}")

    if result.get("audio") is None:
        with st.spinner("🔊 Membuat audio..."):
            result["audio"] = _synthesize_audio(summary)

    st.download_button(
        label="📥 Download Ringkasan Audio (MP3)",
        data=result["audio"],
        file_name="youtube_summary.mp3",
        mime="audio/mpeg",
        use_container_width=True,
    )

    st.code(summary, language="text")
    st.caption("💡 Tip: Klik teks di atas untuk menyalin ringkasan.")


def main():
    """Build the page: input form, summarization on submit, result display."""
    st.set_page_config(page_title="YouTube Summarizer", page_icon="🎬", layout="centered")

    st.title("🎬 YouTube Video Summarizer")
    st.markdown(_INTRO_TEXT)

    # Rendered up front so it is shown regardless of how the submission ends.
    _render_sidebar()

    with st.form(key="summarizer_form"):
        video_url = st.text_input(
            "🔗 URL Video YouTube",
            placeholder="https://www.youtube.com/watch?v=..."
        )
        language = st.selectbox(
            "🌐 Bahasa Transkrip (opsional)",
            options=['en', 'id', 'es', 'fr', 'de', 'pt', 'auto'],
            index=0,
            help="Pilih bahasa transkrip. 'auto' akan mencoba mendeteksi otomatis."
        )
        button_column, _ = st.columns([1, 3])
        with button_column:
            submit_button = st.form_submit_button("🚀 Ringkas", use_container_width=True)

    if submit_button:
        if not video_url.strip():
            st.error("⚠️ Harap masukkan URL video YouTube yang valid.")
        else:
            # Forget the previous result so a failure does not show stale data.
            st.session_state.pop(_RESULT_KEY, None)
            try:
                summary = summarize_video(
                    video_url, 'en' if language == 'auto' else language
                )
            except Exception as error:  # top-level UI boundary: report, don't crash
                _show_error(error)
            else:
                st.session_state[_RESULT_KEY] = {"summary": summary, "audio": None}
                st.success("✅ Ringkasan berhasil dibuat!")

    # Rendering from session state keeps the result visible on the rerun that
    # Streamlit triggers when the download button is clicked.
    result = st.session_state.get(_RESULT_KEY)
    if result:
        try:
            _render_result(result)
        except Exception as error:  # e.g. gTTS network failure
            _show_error(error)


if __name__ == "__main__":
    main()