Files changed (1) hide show
  1. app.py +238 -99
app.py CHANGED
@@ -1,105 +1,244 @@
1
  import streamlit as st
2
- import requests
3
  from gtts import gTTS
4
  from urllib.parse import urlparse, parse_qs
5
- from youtube_transcript_api import YouTubeTranscriptApi
6
  import unicodedata
7
  from deepmultilingualpunctuation import PunctuationModel
8
  from transformers import pipeline
9
-
10
-
11
def summarize_video(url):
    """Summarize the transcript of a YouTube video.

    The transcript is fetched, Unicode-normalized, re-punctuated and then
    summarized chunk by chunk with a ``t5-base`` summarization pipeline.

    Args:
        url: A ``youtube.com/watch?v=...`` or ``youtu.be/...`` URL.

    Returns:
        The per-chunk summaries joined with single spaces.

    Raises:
        ValueError: If no ``v`` query parameter can be found in the URL.
    """
    # Short links carry the id in the path; rewrite them to the watch form
    # so the id can be read from the query string below.
    if "watch" not in url:
        url = url.replace("youtu.be/", "www.youtube.com/watch?v=")

    video_ids = parse_qs(urlparse(url).query).get('v')
    if not video_ids:
        # Previously this surfaced as a bare KeyError('v').
        raise ValueError("Could not find a video id in the given YouTube URL.")
    video_id = video_ids[0]

    # Get the transcript and join its segments into one paragraph.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    video_transcript = " ".join(entry["text"] for entry in transcript)
    print("Text transcript created")

    # Text normalization
    normalized_transcript = unicodedata.normalize('NFKD', video_transcript)
    print("Text normalized")

    # Add punctuation. The normalized text is used here; the earlier code
    # computed it and then punctuated the raw transcript instead.
    model = PunctuationModel()
    result = model.restore_punctuation(normalized_transcript)
    print("Punctuation restored")

    # Instantiate the summarization pipeline.
    summarization_pipeline = pipeline(
        "summarization",
        model="t5-base",
        tokenizer="t5-base",
    )

    # Split the punctuated text into fixed-size character chunks.
    chunk_size = 5000
    chunks = [result[i:i + chunk_size] for i in range(0, len(result), chunk_size)]

    # Summarize each chunk separately. truncation=True keeps a chunk that
    # tokenizes to more than the model accepts from failing the whole run.
    summaries = [
        summarization_pipeline(
            chunk,
            max_length=200,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        for chunk in chunks
    ]

    # Combine the summaries of all chunks into a single summary.
    return " ".join(summaries)
71
-
72
- # Define the Streamlit app
73
import io  # in-memory buffer for the generated MP3

# Define the Streamlit app
st.title("YouTube Summarizer")

# Define the input form
form = st.form(key="input_form")

# Get the video URL from the user
video_url = form.text_input("Enter a YouTube video URL")

# Submit button
submit_button = form.form_submit_button("Summarize Video")

# Handle form submissions
if submit_button:
    if not video_url.strip():
        # Nothing to summarize; avoid failing inside URL parsing.
        st.error("Please enter a YouTube video URL.")
    else:
        # Call the summarize_video function to get the summary
        summary = summarize_video(video_url)

        # Display the summary to the user
        st.subheader("Summary")
        st.write(summary)

        # Convert text summary into audio. The MP3 is kept in memory rather
        # than written to a fixed 'Summary.mp3' path, so concurrent sessions
        # cannot overwrite each other's file.
        tts = gTTS(summary)
        print("converting text to audio")
        audio_buffer = io.BytesIO()
        tts.write_to_fp(audio_buffer)

        # Download audio transcript
        st.download_button(
            'Download mp3',
            audio_buffer.getvalue(),
            file_name='Summary.mp3',
        )
101
-
102
-
103
-
104
-
105
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  from gtts import gTTS
3
  from urllib.parse import urlparse, parse_qs
4
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptUnavailable, NoTranscriptFound
5
  import unicodedata
6
  from deepmultilingualpunctuation import PunctuationModel
7
  from transformers import pipeline
8
+ import io
9
+ import re
10
+
11
+ # =============================================================================
12
+ # CACHE MODELS - Agar tidak reload setiap kali form disubmit
13
+ # =============================================================================
14
@st.cache_resource
def load_punctuation_model():
    """Load the punctuation-restoration model once and cache it.

    Wrapped in ``st.cache_resource`` so the model is not reloaded every time
    the form is submitted.

    Returns:
        A ``PunctuationModel`` built with the library's default checkpoint.
    """
    # The previously hard-coded checkpoint id ended in "multilingual--large";
    # the doubled hyphen looks like a typo (NOTE(review): confirm the intended
    # checkpoint). Falling back to the package default avoids the bad id.
    return PunctuationModel()
18
+
19
@st.cache_resource
def load_summarization_pipeline():
    """Load the ``t5-base`` summarization pipeline once and cache it.

    Wrapped in ``st.cache_resource`` so the pipeline is not rebuilt every
    time the form is submitted.

    Returns:
        A transformers ``summarization`` pipeline placed on the first CUDA
        device when one is available, otherwise on the CPU.
    """
    # Pick the device from actual GPU availability. The earlier expression
    # derived it from Streamlit's "server.headless" option, which says
    # nothing about the hardware.
    try:
        import torch

        device = 0 if torch.cuda.is_available() else -1
    except ImportError:
        device = -1  # CPU fallback when torch is not installed

    return pipeline(
        "summarization",
        model="t5-base",
        tokenizer="t5-base",
        device=device,
    )
28
+
29
+ # =============================================================================
30
+ # HELPER FUNCTIONS
31
+ # =============================================================================
32
def extract_video_id(url):
    """Extract the video id from the common YouTube URL formats.

    Supported inputs:
      * a bare 11-character video id
      * ``youtu.be/<id>`` short links
      * ``youtube.com/watch?v=<id>`` (any subdomain, any parameter order)
      * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``

    URLs may omit the scheme and may carry extra query parameters or a
    trailing slash.

    Args:
        url: The text entered by the user.

    Returns:
        The video id as a string, or ``None`` when none can be found.
    """
    if not url:
        return None

    candidate = url.strip()

    # A bare video id was pasted instead of a URL.
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
        return candidate

    # Without a scheme urlparse() puts the host into the path, so add one.
    if "://" not in candidate:
        candidate = "https://" + candidate

    try:
        parsed = urlparse(candidate)
    except ValueError:
        return None

    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split('/') if segment]

    def host_is(domain):
        # Match the domain itself or any of its subdomains (www., m., ...).
        return host == domain or host.endswith("." + domain)

    video_id = None
    if host_is("youtu.be"):
        # Short link: the id is the first path segment.
        video_id = segments[0] if segments else None
    elif host_is("youtube.com") or host_is("youtube-nocookie.com"):
        if len(segments) > 1 and segments[0] in ("embed", "shorts", "live", "v"):
            # Path-style link: the id follows the marker segment.
            video_id = segments[1]
        else:
            # Standard watch link: the id is the "v" query parameter.
            video_id = parse_qs(parsed.query).get('v', [None])[0]

    return video_id or None
55
+
56
def get_transcript_text(video_id, language='en'):
    """Fetch a video's transcript and return it as one string.

    The transcript in ``language`` is preferred. If it is missing, English
    is tried next, and finally the first transcript the video offers.

    Args:
        video_id: The YouTube video id.
        language: Preferred transcript language code.

    Returns:
        All transcript segments joined with single spaces.

    Raises:
        TranscriptUnavailable, NoTranscriptFound: When the video offers no
            transcript at all. Other errors raised while listing or fetching
            transcripts propagate unchanged.
    """
    # List once and reuse the result for every lookup below.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    try:
        transcript = transcript_list.find_transcript([language])
    except (TranscriptUnavailable, NoTranscriptFound):
        try:
            # Prefer English next (manual or auto-generated).
            transcript = transcript_list.find_transcript(['en'])
        except (TranscriptUnavailable, NoTranscriptFound):
            # Last resort: whatever transcript the video has.
            transcript = next(iter(transcript_list), None)
            if transcript is None:
                raise

    transcript_data = transcript.fetch()

    # Items are dicts with a "text" key here; the attribute branch covers
    # releases of the library that return snippet objects instead
    # (NOTE(review): confirm against the pinned library version).
    text_segments = [
        item["text"] if isinstance(item, dict) else item.text
        for item in transcript_data
    ]
    return " ".join(text_segments)
70
+
71
def _split_oversized(piece, max_chars):
    """Break one over-long piece at word boundaries into parts <= max_chars."""
    parts = []
    current = ""
    for word in piece.split():
        # A single word longer than the limit has to be cut mid-word.
        while len(word) > max_chars:
            if current:
                parts.append(current)
                current = ""
            parts.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_chars:
            current = candidate
        else:
            parts.append(current)
            current = word
    if current:
        parts.append(current)
    return parts


def chunk_text(text, max_chars=4000):
    """Split text into chunks of at most ``max_chars`` characters.

    Chunks are built from whole sentences where possible. A sentence that
    is itself longer than ``max_chars`` (common when the text has little
    punctuation) is split at word boundaries so no chunk exceeds the limit.

    Args:
        text: The text to split.
        max_chars: Maximum length of each returned chunk; must be positive.

    Returns:
        A list of chunk strings. When the text yields no chunks (for example
        an empty string), the list contains the original text.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")

    # Split by sentences (basic approach), then make every piece fit.
    pieces = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) > max_chars:
            pieces.extend(_split_oversized(sentence, max_chars))
        else:
            pieces.append(sentence)

    # Greedily pack pieces into chunks, joined by single spaces.
    chunks = []
    current_chunk = ""
    for piece in pieces:
        candidate = f"{current_chunk} {piece}" if current_chunk else piece
        if len(candidate) <= max_chars:
            current_chunk = candidate
        else:
            chunks.append(current_chunk)
            current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    return chunks if chunks else [text]  # Fallback if no sentences found
91
+
92
def normalize_text(text):
    """Return ``text`` converted to Unicode normalization form NFKD."""
    normalization_form = 'NFKD'
    decomposed = unicodedata.normalize(normalization_form, text)
    return decomposed
95
+
96
+ # =============================================================================
97
+ # MAIN SUMMARIZATION FUNCTION
98
+ # =============================================================================
99
def summarize_video(url, language='en'):
    """Summarize a YouTube video from its transcript.

    Steps: extract the video id, fetch the transcript, normalize it, restore
    punctuation, split it into chunks and summarize each chunk. Progress is
    reported through Streamlit spinners and a progress bar.

    Args:
        url: YouTube URL (or bare video id) entered by the user.
        language: Preferred transcript language code.

    Returns:
        The per-chunk summaries joined with single spaces.

    Raises:
        ValueError: If no video id can be extracted, or the transcript is
            missing or shorter than 50 characters. Errors raised while
            fetching the transcript or running the models propagate.
    """
    # Extract video ID
    video_id = extract_video_id(url)
    if not video_id:
        raise ValueError("Invalid YouTube URL. Please check the link and try again.")

    # Get transcript
    with st.spinner("📝 Mengambil transkrip video..."):
        video_transcript = get_transcript_text(video_id, language)

    if not video_transcript or len(video_transcript.strip()) < 50:
        raise ValueError("Transkrip tidak ditemukan atau terlalu pendek. Video mungkin tidak memiliki subtitle.")

    # Normalize text
    normalized_text = normalize_text(video_transcript)

    # Add punctuation
    with st.spinner("✏️ Menambahkan tanda baca..."):
        punctuation_model = load_punctuation_model()
        punctuated_text = punctuation_model.restore_punctuation(normalized_text)

    # Summarization
    with st.spinner("🤖 Meringkas konten..."):
        summarizer = load_summarization_pipeline()

        # T5-base accepts roughly 512 input tokens; truncation=True below
        # cuts anything beyond that.
        # NOTE(review): 3000 characters may tokenize to more than 512 tokens,
        # in which case the tail of a chunk is dropped - confirm the limit.
        chunks = chunk_text(punctuated_text, max_chars=3000)
        summaries = []

        # One bar, updated in place. Calling st.progress() inside the loop
        # would add a new bar to the page for every chunk.
        progress_bar = st.progress(0.0)

        for index, chunk in enumerate(chunks, start=1):
            # The model is always t5-base, so the task prefix is always added.
            # NOTE(review): the pipeline may already prepend the model's
            # configured prefix - confirm it is not applied twice.
            input_text = f"summarize: {chunk}"

            summary_result = summarizer(
                input_text,
                max_length=150,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary_result[0]['summary_text'])
            progress_bar.progress(min(index / len(chunks), 1.0))

        # Remove the finished bar so it does not linger above the results.
        progress_bar.empty()

    return " ".join(summaries)
147
+
148
+ # =============================================================================
149
+ # STREAMLIT APP
150
+ # =============================================================================
151
def _detect_tts_language(summary):
    """Return 'id' if common Indonesian words occur as whole words, else 'en'."""
    # Compare whole words only. Substring matching treated "did", "like" or
    # "media" as Indonesian and picked the wrong voice for English text.
    words = set(re.findall(r"[a-z]+", summary.lower()))
    return 'id' if words & {'dan', 'yang', 'di', 'ke'} else 'en'


def _render_sidebar():
    """Render the static information panel in the sidebar."""
    with st.sidebar:
        st.header("ℹ️ Informasi")
        st.markdown(
            "**Fitur:**\n"
            "- ✅ Ekstrak transkrip otomatis\n"
            "- ✅ Penambahan tanda baca AI\n"
            "- ✅ Ringkasan multi-bahasa\n"
            "- ✅ Export ke audio MP3\n"
            "\n"
            "**Batasan:**\n"
            "- Video harus memiliki subtitle/closed caption\n"
            "- Durasi video sangat panjang mungkin diproses per bagian\n"
            "- Model T5-base optimal untuk teks bahasa Inggris\n"
            "\n"
            "**Tips:**\n"
            "- Gunakan video dengan subtitle resmi untuk hasil terbaik\n"
            "- Untuk video bahasa Indonesia, pastikan subtitle tersedia\n"
        )

        st.markdown("---")
        st.caption("Dibuat dengan ❤️ menggunakan Streamlit & Hugging Face")


def main():
    """Run the Streamlit page: input form, summary, audio download, sidebar.

    On submit the video is summarized with ``summarize_video``; the summary
    is shown, converted to speech with gTTS and offered as an MP3 download.
    Any exception raised while processing is shown to the user instead of
    aborting the script.
    """
    st.set_page_config(page_title="YouTube Summarizer", page_icon="🎬", layout="centered")

    # Rendered first so the early returns below cannot skip it.
    _render_sidebar()

    st.title("🎬 YouTube Video Summarizer")
    st.markdown(
        "Masukkan URL video YouTube untuk mendapatkan ringkasan otomatis berbasis AI.\n"
        "Mendukung video dengan subtitle/closed caption."
    )

    # Input form
    with st.form(key="summarizer_form"):
        video_url = st.text_input(
            "🔗 URL Video YouTube",
            placeholder="https://www.youtube.com/watch?v=...",
        )
        language = st.selectbox(
            "🌐 Bahasa Transkrip (opsional)",
            options=['en', 'id', 'es', 'fr', 'de', 'pt', 'auto'],
            index=0,
            help="Pilih bahasa transkrip. 'auto' akan mencoba mendeteksi otomatis.",
        )
        # The second column only keeps the button narrow.
        submit_column, _ = st.columns([1, 3])
        with submit_column:
            submit_button = st.form_submit_button("🚀 Ringkas", use_container_width=True)

    # Process submission
    if not submit_button:
        return

    if not video_url.strip():
        st.error("⚠️ Harap masukkan URL video YouTube yang valid.")
        return

    try:
        # 'auto' is passed on as English, the preferred transcript language.
        summary = summarize_video(video_url, language if language != 'auto' else 'en')

        # Display results
        st.success("✅ Ringkasan berhasil dibuat!")
        st.subheader("📄 Hasil Ringkasan")
        st.markdown(f"> {summary}")

        # Text-to-Speech
        with st.spinner("🔊 Membuat audio..."):
            tts_lang = _detect_tts_language(summary)
            tts = gTTS(text=summary, lang=tts_lang, slow=False)

            # Keep the MP3 in memory for download (no file I/O).
            audio_buffer = io.BytesIO()
            tts.write_to_fp(audio_buffer)

        # Download button
        st.download_button(
            label="📥 Download Ringkasan Audio (MP3)",
            data=audio_buffer.getvalue(),
            file_name="youtube_summary.mp3",
            mime="audio/mpeg",
            use_container_width=True,
        )

        # Copy summary to clipboard hint
        st.code(summary, language="text")
        st.caption("💡 Tip: Klik teks di atas untuk menyalin ringkasan.")

    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        st.error(f"❌ Terjadi kesalahan: {str(e)}")
        with st.expander("🔍 Detail Error (untuk debugging)"):
            st.exception(e)


if __name__ == "__main__":
    main()