"""ViralScope AI: a Streamlit app that scores a YouTube video's viral potential.

The app fetches the video's transcript, scores it with lightweight keyword
heuristics and lists the highest-scoring time windows as candidates for
short-form clips.
"""

import json
import re
import time
from urllib.parse import parse_qs, urlparse

import plotly.graph_objects as go
import requests
import streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi

# =============================================================================
# CONFIG
# =============================================================================
st.set_page_config(page_title="🔥 ViralScope AI", page_icon="🎬", layout="wide")

# Browser-like headers used for the direct HTTP fallback.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9,id;q=0.8',
}

# Transcript languages in descending order of preference.
PREFERRED_LANGUAGES = ['id', 'en', 'es', 'de', 'fr', 'pt', 'ru', 'ja', 'ko']

# A library transcript with this many entries or fewer is treated as unusable.
MIN_USEFUL_ENTRIES = 3

TEST_VIDEO_URL = "https://www.youtube.com/watch?v=jNQXAC9IVRw"

# Session-state key of the URL text input (also written by the test button).
URL_INPUT_KEY = "video_url"

MAX_SEGMENTS = 5

TRANSCRIPT_ERROR_HELP = """
🚫 **Transcript tidak terambil meskipun CC terlihat di YouTube.**

🔍 **Penyebab Umum:**
• IP Hugging Face Spaces diblokir/di-throttle YouTube
• Video hanya punya "Auto-generated" yang dilindungi anti-bot
• Region Indonesia terkena rate limit YouTube API

💡 **Solusi Cepat:**
1. Coba video dengan subtitle **Manual** (bukan auto-generated)
2. Gunakan tombol 🧪 *Video Test* di atas (video lama YouTube, pasti lolos)
3. Deploy alternatif di [Streamlit Community Cloud](https://streamlit.io/cloud) (IP lebih stabil)
"""


# =============================================================================
# ROBUST TRANSCRIPT FETCHER (Multi-Layer Fallback)
# =============================================================================
class TranscriptUnavailableError(Exception):
    """Raised when no fetch layer produced a usable transcript.

    ``__cause__`` is set when the failure was an error (network, parsing)
    rather than the video simply exposing no caption track.
    """


def _fetch_via_library(video_id: str) -> list[dict] | None:
    """Layer 1: fetch through youtube-transcript-api; return None on any failure."""
    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            # Pre-1.x API: static helper returning a list of dicts.
            transcript = YouTubeTranscriptApi.get_transcript(
                video_id, languages=PREFERRED_LANGUAGES
            )
        else:
            # 1.x API: instance method returning an object with to_raw_data().
            fetched = YouTubeTranscriptApi().fetch(
                video_id, languages=PREFERRED_LANGUAGES
            )
            transcript = fetched.to_raw_data()
    except Exception:
        # Deliberately best-effort: the library's exception hierarchy differs
        # between versions, and any failure here just moves on to layer 2.
        return None
    if transcript and len(transcript) > MIN_USEFUL_ENTRIES:
        return transcript
    return None


def _extract_caption_tracks(html: str) -> list[dict]:
    """Return the ``captionTracks`` array embedded in a watch page, or ``[]``.

    Raises:
        ValueError: if the marker is present but is not followed by valid JSON.
    """
    marker = '"captionTracks":'
    position = html.find(marker)
    if position == -1:
        return []
    # raw_decode parses exactly one JSON value, so nested arrays inside a
    # track (which a non-greedy "[...]" regex would cut short) are handled.
    tracks, _ = json.JSONDecoder().raw_decode(html, position + len(marker))
    return tracks if isinstance(tracks, list) else []


def _pick_caption_track(tracks: list[dict]) -> dict | None:
    """Pick the usable track that best matches ``PREFERRED_LANGUAGES``."""
    usable = [t for t in tracks if isinstance(t, dict) and t.get('baseUrl')]
    for lang in PREFERRED_LANGUAGES:
        for track in usable:
            if track.get('languageCode') == lang:
                return track
    return usable[0] if usable else None


def _with_json3_format(track_url: str) -> str:
    """Return ``track_url`` with its ``fmt`` query parameter forced to ``json3``."""
    # Edit the string minimally instead of re-encoding the whole query, so
    # the other (signed) parameters stay byte-identical.
    if re.search(r'[?&]fmt=', track_url):
        return re.sub(r'([?&]fmt=)[^&]*', r'\1json3', track_url)
    separator = '&' if '?' in track_url else '?'
    return f"{track_url}{separator}fmt=json3"


def _parse_json3_events(payload: dict) -> list[dict]:
    """Convert a json3 caption payload into ``text``/``start``/``duration`` dicts."""
    entries = []
    for event in payload.get('events', []):
        if 'segs' not in event or 'tStartMs' not in event:
            continue
        text = ''.join(seg.get('utf8', '') for seg in event['segs']).strip()
        if text:
            entries.append({
                'text': text,
                'start': event['tStartMs'] / 1000,
                'duration': event.get('dDurationMs', 0) / 1000,
            })
    return entries


def _fetch_via_timedtext(video_id: str) -> list[dict] | None:
    """Layer 2: read the caption track URL from the watch page and download it.

    Returns None when the page exposes no usable caption track or the track
    is empty. Network and parsing errors propagate to the caller.
    """
    page = requests.get(
        f"https://www.youtube.com/watch?v={video_id}",
        headers=REQUEST_HEADERS,
        timeout=10,
    )
    page.raise_for_status()

    track = _pick_caption_track(_extract_caption_tracks(page.text))
    if track is None:
        return None

    response = requests.get(
        _with_json3_format(track['baseUrl']),
        headers=REQUEST_HEADERS,
        timeout=10,
    )
    response.raise_for_status()
    return _parse_json3_events(response.json()) or None


@st.cache_data(ttl=1800)
def _fetch_transcript_cached(video_id: str) -> list[dict]:
    """Fetch a transcript through both layers; raise when none is available.

    Failures are reported by raising so that only successful results are
    stored in the cache and a later retry is not answered with a stale miss.

    Raises:
        TranscriptUnavailableError: if neither layer produced a transcript.
    """
    transcript = _fetch_via_library(video_id)
    if transcript:
        return transcript
    try:
        transcript = _fetch_via_timedtext(video_id)
    except (requests.RequestException, ValueError, KeyError,
            TypeError, AttributeError) as exc:
        raise TranscriptUnavailableError(video_id) from exc
    if not transcript:
        raise TranscriptUnavailableError(video_id)
    return transcript


def fetch_transcript_robust(video_id: str) -> list[dict] | None:
    """Return the transcript for ``video_id``, or None when it cannot be fetched.

    Layer 1: youtube-transcript-api with the preferred-language list.
    Layer 2: direct HTTP request to the caption track found on the watch page.
    Layer 3: a warning naming the error type when layer 2 fails with an error.

    Args:
        video_id: The 11-character YouTube video ID.

    Returns:
        A list of dicts with ``text``, ``start`` and ``duration`` keys, or
        None if no transcript could be retrieved.
    """
    try:
        return _fetch_transcript_cached(video_id)
    except TranscriptUnavailableError as exc:
        if exc.__cause__ is not None:
            st.warning(f"⚠️ Semua metode gagal. Detail: {type(exc.__cause__).__name__}")
        return None


_VIDEO_ID_RE = re.compile(r'[a-zA-Z0-9_-]{11}')
_YOUTUBE_DOMAINS = ('youtube.com', 'youtube-nocookie.com')
_ID_PATH_PREFIXES = ('shorts', 'embed', 'live', 'v')
# Loose patterns kept as a fallback so any input accepted before still works;
# '?' and '#' were added as terminators for URLs like /shorts/<id>?feature=share.
_LEGACY_ID_PATTERNS = (
    re.compile(r'(?:v=|\/)([a-zA-Z0-9_-]{11})(?:[&\/?#]|$)'),
    re.compile(r'youtu\.be\/([a-zA-Z0-9_-]{11})'),
)


def _host_matches(host: str, domain: str) -> bool:
    """Return True if ``host`` is ``domain`` or one of its subdomains."""
    return host == domain or host.endswith('.' + domain)


def extract_video_id(url: str) -> str | None:
    """Extract the 11-character video ID from a YouTube URL or a bare ID.

    Args:
        url: A watch, youtu.be, shorts, embed or live URL (the scheme is
            optional), or the video ID itself.

    Returns:
        The video ID, or None if none can be found.
    """
    candidate = (url or "").strip()
    if not candidate:
        return None
    if _VIDEO_ID_RE.fullmatch(candidate):
        return candidate

    try:
        # Without a scheme, urlparse would put the host into the path.
        parsed = urlparse(candidate if '://' in candidate else f'https://{candidate}')
        host = (parsed.hostname or '').lower()
    except ValueError:
        parsed, host = None, ''

    if parsed is not None:
        segments = [part for part in parsed.path.split('/') if part]
        found = ''
        if _host_matches(host, 'youtu.be'):
            found = segments[0] if segments else ''
        elif any(_host_matches(host, domain) for domain in _YOUTUBE_DOMAINS):
            found = parse_qs(parsed.query).get('v', [''])[0]
            if not found and len(segments) >= 2 and segments[0] in _ID_PATH_PREFIXES:
                found = segments[1]
        if _VIDEO_ID_RE.fullmatch(found):
            return found

    for pattern in _LEGACY_ID_PATTERNS:
        match = pattern.search(candidate)
        if match:
            return match.group(1)
    return None


def format_time(seconds: float) -> str:
    """Format a duration as ``MM:SS``, or ``H:MM:SS`` from one hour upward.

    Negative input is clamped to zero.
    """
    minutes, secs = divmod(max(0, int(seconds)), 60)
    hours, minutes = divmod(minutes, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"


# =============================================================================
# ANALYSIS ENGINE (Lightweight, Pure Python)
# =============================================================================
HIGH_AROUSAL_TERMS = (
    'shocking', 'viral', 'wow', 'amazing', 'gila', 'luar biasa', 'terkejut',
    'marah', 'sedih', 'rahasia', 'ternyata', 'twist', 'unbelievable',
    'mind-blowing', 'breakthrough', 'first time',
)
# One whole-word alternation. Matching against the text instead of single
# tokens is what lets the multi-word and hyphenated terms match at all.
_HIGH_AROUSAL_RE = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(term) for term in sorted(HIGH_AROUSAL_TERMS, key=len, reverse=True))
    + r')\b'
)

HOOK_KEYWORDS = (
    'rahasia', 'ternyata', 'jangan', 'peringatan', 'viral', 'shocking',
    'unbelievable', 'you won\'t believe', 'wait until', 'plot twist',
)

SEGMENT_BOOST_KEYWORDS = ('shocking', 'rahasia', 'viral', 'twist')


def calculate_emotional_intensity(text: str) -> float:
    """Score ``text`` from 20 to 100 by counting high-arousal terms.

    Each whole-word, case-insensitive occurrence of a term in
    ``HIGH_AROUSAL_TERMS`` adds 16 points to a base of 20, capped at 100.
    """
    occurrences = len(_HIGH_AROUSAL_RE.findall(text.lower()))
    return min(100, 20 + occurrences * 16)


def detect_hook_strength(transcript: list[dict], first_n_seconds: int = 30) -> float:
    """Score the opening of the transcript from 30 to 100.

    Each distinct keyword from ``HOOK_KEYWORDS`` found (as a substring) in
    the entries starting within ``first_n_seconds`` adds 15 points to a base
    of 30, capped at 100.
    """
    hook_text = " ".join(
        seg['text'].lower()
        for seg in transcript
        if seg.get('start', 0) <= first_n_seconds
    )
    hits = sum(1 for keyword in HOOK_KEYWORDS if keyword in hook_text)
    return min(100, hits * 15 + 30)


def identify_viral_segments(transcript: list[dict], window: int = 30) -> list[dict]:
    """Return up to five highest-scoring, non-overlapping time windows.

    A candidate window starts at each transcript entry and gathers the text
    of the entries beginning within ``window`` seconds of it. Candidates are
    ranked by score, and one that overlaps an already chosen window is
    skipped, so the result covers distinct moments rather than five
    near-identical shifts of the same one.

    Args:
        transcript: Entries with ``text`` and ``start`` keys, assumed to be
            in chronological order.
        window: Window length in seconds.

    Returns:
        Dicts with ``start``, ``end``, ``preview``, ``score`` and ``emotion``
        keys, best score first.
    """
    candidates = []
    for index, seg in enumerate(transcript):
        start = seg['start']
        texts = []
        for later in transcript[index:]:
            if later['start'] - start > window:
                break  # chronological order: nothing further can fit
            texts.append(later['text'])
        window_text = " ".join(texts)
        if not window_text.strip():
            continue

        emotion = calculate_emotional_intensity(window_text)
        lowered = window_text.lower()
        hook_boost = 20 if any(kw in lowered for kw in SEGMENT_BOOST_KEYWORDS) else 0
        score = min(100, max(0, emotion * 0.6 + hook_boost + 25))
        candidates.append({
            "start": start,
            "end": start + window,
            "preview": window_text[:120] + "...",
            "score": round(score, 1),
            "emotion": round(emotion, 1),
        })

    # sorted() is stable, so among equal scores the earliest window wins.
    ranked = sorted(candidates, key=lambda c: c['score'], reverse=True)
    selected = []
    for candidate in ranked:
        is_separate = all(
            candidate['start'] >= chosen['end'] or candidate['end'] <= chosen['start']
            for chosen in selected
        )
        if is_separate:
            selected.append(candidate)
            if len(selected) == MAX_SEGMENTS:
                break
    return selected


def calculate_overall_score(transcript: list[dict]) -> dict:
    """Combine the heuristics into one 0-100 score plus its breakdown.

    The score is a base of 40 plus emotion * 0.25, hook * 0.20 and
    density * 0.15, capped at 100. Density is one point per five words,
    capped at 100.

    Returns:
        A dict with ``score`` and a ``breakdown`` dict of the components.
    """
    full_text = " ".join(entry['text'] for entry in transcript)
    emotion = calculate_emotional_intensity(full_text)
    hook = detect_hook_strength(transcript)
    density = min(100, len(full_text.split()) // 5)
    weighted = emotion * 0.25 + hook * 0.20 + density * 0.15 + 40
    return {
        "score": min(100, round(weighted, 1)),
        "breakdown": {
            "Emotional Intensity": round(emotion, 1),
            "Hook Strength": round(hook, 1),
            "Content Density": round(density, 1),
        },
    }


# =============================================================================
# STREAMLIT UI
# =============================================================================
def _use_test_video() -> None:
    """Button callback: put the known-good test URL into the URL input."""
    # Callbacks run before the script reruns, which is the point at which a
    # keyed widget's value may still be assigned through session state.
    st.session_state[URL_INPUT_KEY] = TEST_VIDEO_URL


def main():
    """Render the page and run the analysis when the user requests it."""
    st.title("🔥 ViralScope AI")
    st.markdown("Analisis potensi viral & temukan segment terbaik untuk Shorts/Reels secara instan.")

    col1, col2 = st.columns([3, 1])
    with col1:
        url = st.text_input(
            "🔗 URL YouTube",
            placeholder="https://www.youtube.com/watch?v=...",
            key=URL_INPUT_KEY,
        )
    with col2:
        # The URL is written into the keyed input so that it survives the
        # rerun caused by the later click on the analyse button.
        st.button("🧪 Video Test", use_container_width=True, on_click=_use_test_video)

    if not st.button("🚀 Analisis Sekarang", type="primary", use_container_width=True):
        return

    if not url:
        st.error("❌ Masukkan URL YouTube terlebih dahulu.")
        return

    vid_id = extract_video_id(url)
    if not vid_id:
        st.error("❌ Format URL tidak valid.")
        return

    with st.spinner("📡 Menghubungkan ke YouTube (Layer 1 & 2)..."):
        transcript = fetch_transcript_robust(vid_id)

    if not transcript:
        st.error(TRANSCRIPT_ERROR_HELP)
        return

    with st.spinner("🧠 Menganalisis potensi viral..."):
        # Short pause kept from the original; presumably there so the
        # spinner is visible, since the analysis itself is near-instant.
        time.sleep(0.3)
        overall = calculate_overall_score(transcript)
        segments = identify_viral_segments(transcript)

    st.divider()
    col1, col2 = st.columns([1, 2])
    with col1:
        fig = go.Figure(go.Indicator(
            mode="gauge+number",
            value=overall['score'],
            gauge={
                'axis': {'range': [0, 100]},
                'bar': {'color': 'crimson'},
                'steps': [
                    {'range': [0, 50], 'color': '#ffebee'},
                    {'range': [50, 75], 'color': '#fff3e0'},
                    {'range': [75, 100], 'color': '#e8f5e9'},
                ],
            },
        ))
        fig.update_layout(height=220, margin=dict(l=10, r=10, t=30, b=10))
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        st.subheader("📊 Detail Skor")
        for label, value in overall['breakdown'].items():
            st.metric(label, f"{value}/100")

    st.subheader("🎯 Top 5 Segment Viral")
    for rank, seg in enumerate(segments, 1):
        title = (
            f"#{rank} | Score: {seg['score']} | "
            f"⏱️ {format_time(seg['start'])}-{format_time(seg['end'])}"
        )
        with st.expander(title, expanded=(rank == 1)):
            st.markdown(f"*\"{seg['preview']}\"*")
            st.caption(f"Emosi: {seg['emotion']}/100 | 💡 Cocok untuk Reels/TikTok/Shorts")

    st.divider()
    st.success("✅ Analisis selesai! Gunakan segment di atas sebagai hook konten pendek Anda.")


if __name__ == "__main__":
    main()