Spaces:
Build error
Build error
| import streamlit as st | |
| from gtts import gTTS | |
| from urllib.parse import urlparse, parse_qs | |
| from youtube_transcript_api import YouTubeTranscriptApi, TranscriptUnavailable, NoTranscriptFound | |
| import unicodedata | |
| from deepmultilingualpunctuation import PunctuationModel | |
| from transformers import pipeline | |
| import io | |
| import re | |
| # ============================================================================= | |
| # CACHE MODELS - Agar tidak reload setiap kali form disubmit | |
| # ============================================================================= | |
@st.cache_resource
def load_punctuation_model():
    """Load the multilingual punctuation-restoration model exactly once.

    Streamlit re-executes the whole script on every interaction, so the
    model is wrapped in ``st.cache_resource``: the first call builds it and
    every later rerun/session reuses the same instance.

    Returns:
        PunctuationModel: the shared punctuation model instance.
    """
    # The published model id uses a single hyphen before "large".
    return PunctuationModel("oliverguhr/fullstop-punctuation-multilingual-large")
@st.cache_resource
def load_summarization_pipeline():
    """Build the T5 summarization pipeline exactly once.

    The pipeline is cached with ``st.cache_resource`` so that Streamlit
    reruns reuse the already-loaded model instead of reloading it.

    Returns:
        The ``transformers`` summarization pipeline for ``t5-base``, placed
        on the first CUDA device when one is available and on CPU otherwise.
    """
    # Pick the device from what the hardware actually offers; the Streamlit
    # server configuration says nothing about GPU availability.
    try:
        import torch

        device = 0 if torch.cuda.is_available() else -1
    except ImportError:
        device = -1  # CPU fallback when PyTorch is not the installed backend

    return pipeline(
        "summarization",
        model="t5-base",
        tokenizer="t5-base",
        device=device,
    )
| # ============================================================================= | |
| # HELPER FUNCTIONS | |
| # ============================================================================= | |
# A bare YouTube video ID: exactly 11 URL-safe characters.
_VIDEO_ID_PATTERN = re.compile(r'^[a-zA-Z0-9_-]{11}$')

# Path prefixes whose next segment is the video ID (/embed/ID, /shorts/ID, ...).
_VIDEO_PATH_PREFIXES = ('embed', 'shorts', 'live', 'v')


def extract_video_id(url):
    """Extract the video ID from the common YouTube URL formats.

    Supported inputs (with or without a scheme, surrounding whitespace
    ignored):

    * a bare 11-character video ID
    * ``youtu.be/VIDEO_ID``
    * ``youtube.com/watch?v=VIDEO_ID`` (any extra query parameters allowed)
    * ``youtube.com/embed/VIDEO_ID``, ``/shorts/VIDEO_ID``,
      ``/live/VIDEO_ID`` and ``/v/VIDEO_ID``

    Args:
        url: the text entered by the user.

    Returns:
        The video ID as a string, or ``None`` when no ID can be found.
    """
    if not url or not isinstance(url, str):
        return None

    candidate = url.strip()

    # Check the bare-ID form first so IDs that happen to contain words such
    # as "watch" or "embed" are not mistaken for URLs.
    if _VIDEO_ID_PATTERN.match(candidate):
        return candidate

    # urlparse only recognises the host when a scheme is present; without
    # one the host ends up inside the path ("youtu.be/ID").
    if '://' not in candidate:
        candidate = 'https://' + candidate.lstrip('/')

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or '').lower()
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets).
        return None

    # Drop empty segments so trailing slashes do not produce an empty ID.
    segments = [part for part in parsed.path.split('/') if part]

    if host in ('youtu.be', 'www.youtu.be'):
        return segments[0] if segments else None

    video_ids = parse_qs(parsed.query).get('v')
    if video_ids:
        return video_ids[0]

    if len(segments) >= 2 and segments[0] in _VIDEO_PATH_PREFIXES:
        return segments[1]

    return None
def get_transcript_text(video_id, language='en'):
    """Fetch a video's transcript and return it as one space-joined string.

    The transcript is chosen in this order: the requested ``language``,
    then English, then the first transcript the video offers at all.

    Args:
        video_id: YouTube video ID.
        language: preferred transcript language code (default ``'en'``).

    Returns:
        All transcript segments joined with single spaces.

    Raises:
        TranscriptUnavailable, NoTranscriptFound: when the video exposes no
            transcript at all. Other errors raised by the transcript
            library while listing or fetching propagate unchanged.
    """
    # List once and reuse the listing for the fallback.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    try:
        # find_transcript takes languages in priority order.
        transcript = transcript_list.find_transcript([language, 'en'])
    except (TranscriptUnavailable, NoTranscriptFound):
        # Fallback: take whatever transcript is available, in any language.
        transcript = next(iter(transcript_list), None)
        if transcript is None:
            raise

    transcript_data = transcript.fetch()

    # NOTE(review): assumes each fetched segment is dict-like with a "text"
    # key -- confirm against the pinned youtube_transcript_api version.
    return " ".join(item["text"] for item in transcript_data)
# Sentence boundary: whitespace that follows terminal punctuation.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _split_oversized(piece, max_chars):
    """Break *piece* into parts of at most *max_chars*, preferring spaces."""
    parts = []
    remaining = piece
    while len(remaining) > max_chars:
        # Last space that still keeps the part within the limit.
        cut = remaining.rfind(" ", 0, max_chars + 1)
        if cut <= 0:
            cut = max_chars  # no usable space: hard cut
        part = remaining[:cut].rstrip()
        if part:
            parts.append(part)
        remaining = remaining[cut:].lstrip()
    if remaining:
        parts.append(remaining)
    return parts


def chunk_text(text, max_chars=4000):
    """Split *text* into chunks of at most *max_chars* characters.

    Chunks are built from whole sentences where possible. A single sentence
    longer than *max_chars* (typical for unpunctuated transcripts) is split
    on spaces, or hard-cut when it contains none, so no chunk ever exceeds
    the limit.

    Args:
        text: the text to split.
        max_chars: maximum length of each returned chunk; must be positive.

    Returns:
        A list of stripped chunks, or ``[text]`` when nothing was produced.

    Raises:
        ValueError: if *max_chars* is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    current = []
    # Length of " ".join(current) plus one trailing separator.
    current_len = 0

    for sentence in _SENTENCE_BOUNDARY.split(text):
        if len(sentence) <= max_chars:
            pieces = [sentence]
        else:
            pieces = _split_oversized(sentence, max_chars)

        for piece in pieces:
            if current_len + len(piece) <= max_chars:
                current.append(piece)
                current_len += len(piece) + 1
            else:
                if current:
                    chunks.append(" ".join(current).strip())
                current = [piece]
                current_len = len(piece) + 1

    if current:
        chunks.append(" ".join(current).strip())

    return chunks if chunks else [text]
def normalize_text(text):
    """Return *text* converted to Unicode NFKD form.

    NFKD applies compatibility decomposition: ligatures and other
    compatibility characters are expanded, and accented letters become a
    base letter followed by combining marks.
    """
    normalization_form = 'NFKD'
    decomposed = unicodedata.normalize(normalization_form, text)
    return decomposed
| # ============================================================================= | |
| # MAIN SUMMARIZATION FUNCTION | |
| # ============================================================================= | |
def summarize_video(url, language='en'):
    """Summarize a YouTube video from its transcript.

    Steps: extract the video ID, fetch the transcript, normalize it,
    restore punctuation, split the text into chunks, summarize each chunk
    with the T5 pipeline and join the partial summaries.

    Args:
        url: YouTube URL (or bare video ID) entered by the user.
        language: preferred transcript language code.

    Returns:
        The chunk summaries joined with single spaces.

    Raises:
        ValueError: if no video ID can be extracted, or the transcript is
            missing or shorter than 50 characters.
    """
    # Extract video ID
    video_id = extract_video_id(url)
    if not video_id:
        raise ValueError("Invalid YouTube URL. Please check the link and try again.")

    # Get transcript
    with st.spinner("π Mengambil transkrip video..."):
        video_transcript = get_transcript_text(video_id, language)

    if not video_transcript or len(video_transcript.strip()) < 50:
        raise ValueError("Transkrip tidak ditemukan atau terlalu pendek. Video mungkin tidak memiliki subtitle.")

    # Normalize text
    normalized_text = normalize_text(video_transcript)

    # Add punctuation
    with st.spinner("βοΈ Menambahkan tanda baca..."):
        punctuation_model = load_punctuation_model()
        punctuated_text = punctuation_model.restore_punctuation(normalized_text)

    # Summarization
    with st.spinner("π€ Meringkas konten..."):
        summarizer = load_summarization_pipeline()
        # T5-base accepts roughly 512 input tokens; longer input is cut off
        # by truncation=True, so the text is summarized chunk by chunk.
        chunks = chunk_text(punctuated_text, max_chars=3000)
        total_chunks = len(chunks)

        # Create ONE progress bar and update it; calling st.progress() in
        # the loop would add a new bar to the page for every chunk.
        progress_bar = st.progress(0.0)
        summaries = []
        for index, chunk in enumerate(chunks, start=1):
            # T5 is prompted with its task prefix.
            summary_result = summarizer(
                f"summarize: {chunk}",
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary_result[0]['summary_text'])
            progress_bar.progress(min(index / total_chunks, 1.0))
        progress_bar.empty()

    return " ".join(summaries)
| # ============================================================================= | |
| # STREAMLIT APP | |
| # ============================================================================= | |
# Common Indonesian function words used to guess the summary language.
_INDONESIAN_MARKERS = frozenset({'dan', 'yang', 'di', 'ke'})


def _detect_tts_language(text):
    """Return 'id' when *text* looks Indonesian, otherwise 'en'."""
    # Compare whole words only: substring checks match English words such
    # as "audio" ('di') or "like" ('ke'). Two hits are required so a lone
    # "Dan" (the name) does not flip the result.
    words = re.findall(r"[a-z]+", text.lower())
    hits = sum(1 for word in words if word in _INDONESIAN_MARKERS)
    return 'id' if hits >= 2 else 'en'


def _render_sidebar():
    """Render the static feature/limitation/tips panel in the sidebar."""
    with st.sidebar:
        st.header("βΉοΈ Informasi")
        # Blank lines separate the sections so each heading is not rendered
        # as a continuation of the preceding list item.
        st.markdown(
            "**Fitur:**\n"
            "- β Ekstrak transkrip otomatis\n"
            "- β Penambahan tanda baca AI\n"
            "- β Ringkasan multi-bahasa\n"
            "- β Export ke audio MP3\n"
            "\n"
            "**Batasan:**\n"
            "- Video harus memiliki subtitle/closed caption\n"
            "- Durasi video sangat panjang mungkin diproses per bagian\n"
            "- Model T5-base optimal untuk teks bahasa Inggris\n"
            "\n"
            "**Tips:**\n"
            "- Gunakan video dengan subtitle resmi untuk hasil terbaik\n"
            "- Untuk video bahasa Indonesia, pastikan subtitle tersedia\n"
        )
        st.markdown("---")
        st.caption("Dibuat dengan β€οΈ menggunakan Streamlit & Hugging Face")


def main():
    """Run the Streamlit page: input form, summary output and MP3 export."""
    # set_page_config has to be the first Streamlit call of the script.
    st.set_page_config(page_title="YouTube Summarizer", page_icon="π¬", layout="centered")

    # Render the sidebar up front so it is shown on every run, including
    # the early exits below.
    _render_sidebar()

    st.title("π¬ YouTube Video Summarizer")
    st.markdown(
        "Masukkan URL video YouTube untuk mendapatkan ringkasan otomatis berbasis AI.\n"
        "Mendukung video dengan subtitle/closed caption."
    )

    # Input form
    with st.form(key="summarizer_form"):
        video_url = st.text_input(
            "π URL Video YouTube",
            placeholder="https://www.youtube.com/watch?v=...",
        )
        language = st.selectbox(
            "π Bahasa Transkrip (opsional)",
            options=['en', 'id', 'es', 'fr', 'de', 'pt', 'auto'],
            index=0,
            help="Pilih bahasa transkrip. 'auto' akan mencoba mendeteksi otomatis.",
        )
        # The second column only acts as a spacer next to the button.
        col1, _ = st.columns([1, 3])
        with col1:
            submit_button = st.form_submit_button("π Ringkas", use_container_width=True)

    # Process submission
    if not submit_button:
        return

    if not video_url.strip():
        st.error("β οΈ Harap masukkan URL video YouTube yang valid.")
        return

    # Top-level UI boundary: any failure is reported to the user instead of
    # crashing the page.
    try:
        # Generate summary ('auto' currently falls back to English)
        summary = summarize_video(video_url, language if language != 'auto' else 'en')

        # Display results
        st.success("β Ringkasan berhasil dibuat!")
        st.subheader("π Hasil Ringkasan")
        st.markdown(f"> {summary}")

        # Text-to-Speech
        with st.spinner("π Membuat audio..."):
            tts = gTTS(text=summary, lang=_detect_tts_language(summary), slow=False)
            # Keep the MP3 in memory for download (no file I/O)
            audio_buffer = io.BytesIO()
            tts.write_to_fp(audio_buffer)
            audio_buffer.seek(0)

        # Download button
        st.download_button(
            label="π₯ Download Ringkasan Audio (MP3)",
            data=audio_buffer,
            file_name="youtube_summary.mp3",
            mime="audio/mpeg",
            use_container_width=True,
        )

        # Copy summary to clipboard hint
        st.code(summary, language="text")
        st.caption("π‘ Tip: Klik teks di atas untuk menyalin ringkasan.")
    except Exception as e:
        st.error(f"β Terjadi kesalahan: {e}")
        with st.expander("π Detail Error (untuk debugging)"):
            st.exception(e)


if __name__ == "__main__":
    main()