# Source: Hugging Face Space "daafa999/ViralScopeAI" (status: Sleeping; file size: 9,839 bytes)

import streamlit as st
import re
import time
import json
import requests
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi
import plotly.graph_objects as go

# =============================================================================
# CONFIG
# =============================================================================
# Page-level Streamlit settings: browser-tab title, tab icon, and wide layout.
st.set_page_config(page_title="🔥 ViralScope AI", page_icon="🎬", layout="wide")

# =============================================================================
# ROBUST TRANSCRIPT FETCHER (Multi-Layer Fallback)
# =============================================================================
def _fetch_via_library(video_id: str, lang: str) -> list[dict] | None:
    """Fetch one language's transcript through youtube-transcript-api.

    Uses the legacy static ``get_transcript`` call when the installed release
    still provides it, and otherwise the instance-based ``fetch`` API, whose
    result is converted back to the same list-of-dicts shape.
    """
    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        return YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=[lang])
    return fetched.to_raw_data()


def _extract_caption_tracks(page_html: str) -> list[dict]:
    """Return the ``captionTracks`` array embedded in a watch page, or ``[]``.

    The array contains nested arrays/objects, so it is decoded with a real
    JSON parser starting at the opening bracket instead of a non-greedy regex
    that would stop at the first ``]``.
    """
    marker = re.search(r'"captionTracks":\s*(?=\[)', page_html)
    if not marker:
        return []
    tracks, _ = json.JSONDecoder().raw_decode(page_html, marker.end())
    return tracks if isinstance(tracks, list) else []


def _parse_json3_events(data: dict) -> list[dict]:
    """Convert a JSON3 caption payload into ``text``/``start``/``duration`` dicts."""
    result = []
    for event in data.get('events', []):
        if 'segs' not in event or 'tStartMs' not in event:
            continue
        text = ''.join(seg.get('utf8', '') for seg in event.get('segs', []))
        if text.strip():
            result.append({
                'text': text.strip(),
                'start': event['tStartMs'] / 1000,  # milliseconds -> seconds
                'duration': event.get('dDurationMs', 0) / 1000
            })
    return result


@st.cache_data(ttl=1800)
def fetch_transcript_robust(video_id: str) -> list[dict] | None:
    """
    Fetch a video's transcript, trying several strategies in order.

    Layer 1: youtube-transcript-api, one preferred language at a time.
    Layer 2: direct HTTP request to the caption track URL found in the
             watch page, requested in JSON3 format.
    Layer 3: error handler that shows a warning with the exception type.

    Args:
        video_id: The YouTube video ID.

    Returns:
        A list of dicts with ``text``, ``start`` and ``duration`` keys
        (seconds), or ``None`` when no transcript could be obtained.

    NOTE(review): the cache decorator presumably memoizes a ``None`` result
    as well, so a transient failure may persist for the TTL — confirm.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9,id;q=0.8'
    }

    # LAYER 1: the library directly, in language-priority order.
    languages = ['id', 'en', 'es', 'de', 'fr', 'pt', 'ru', 'ja', 'ko']
    for lang in languages:
        try:
            transcript = _fetch_via_library(video_id, lang)
            if transcript and len(transcript) > 3:
                return transcript
        except Exception:
            # Best effort: any failure for this language just moves on to the next.
            continue

    # LAYER 2: direct HTTP fallback to the YouTube caption endpoint.
    try:
        # Load the watch page to extract the caption track URL.
        video_page = requests.get(f"https://www.youtube.com/watch?v={video_id}", headers=headers, timeout=10)
        video_page.raise_for_status()

        caption_tracks = _extract_caption_tracks(video_page.text)
        if not caption_tracks:
            return None

        # Use the first track (usually auto-generated or manual).
        track_url = caption_tracks[0].get('baseUrl')
        if not track_url:
            return None

        # Explicitly request JSON3; without it the response is not parsed as
        # JSON by the code below.
        transcript_resp = requests.get(track_url, params={'fmt': 'json3'}, headers=headers, timeout=10)
        transcript_resp.raise_for_status()

        # Convert to the same shape the library returns.
        result = _parse_json3_events(transcript_resp.json())
        return result if result else None

    except Exception as e:
        st.warning(f"⚠️ Semua metode gagal. Detail: {type(e).__name__}")
        return None

def extract_video_id(url: str) -> str | None:
    patterns = [
        r'(?:v=|\/)([a-zA-Z0-9_-]{11})(?:\&|\/|$)',
        r'youtu\.be\/([a-zA-Z0-9_-]{11})',
        r'^([a-zA-Z0-9_-]{11})$'
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def format_time(seconds: float) -> str:
    """Render a duration in seconds as a zero-padded ``MM:SS`` string.

    The fractional part is truncated; minutes are not wrapped into hours.
    """
    whole_seconds = int(seconds)
    minutes = whole_seconds // 60
    remainder = whole_seconds % 60
    return "{:02d}:{:02d}".format(minutes, remainder)

# =============================================================================
# ANALYSIS ENGINE (Lightweight, Pure Python)
# =============================================================================
def calculate_emotional_intensity(text: str) -> float:
    """Score how emotionally charged ``text`` is, from 20 to 100.

    Counts every whole-word occurrence of a high-arousal keyword or phrase
    (case-insensitive) and adds 16 points per hit to a baseline of 20,
    capped at 100.

    Multi-word phrases ("luar biasa", "first time") and hyphenated terms
    ("mind-blowing") are matched against the text itself rather than against
    single-word tokens, which could never equal them.

    Args:
        text: Transcript text to analyse.

    Returns:
        The intensity score.
    """
    high_arousal = ['shocking', 'viral', 'wow', 'amazing', 'gila', 'luar biasa',
                    'terkejut', 'marah', 'sedih', 'rahasia', 'ternyata', 'twist',
                    'unbelievable', 'mind-blowing', 'breakthrough', 'first time']
    # Words inside a phrase may be separated by any run of whitespace.
    alternatives = "|".join(
        r"\s+".join(re.escape(part) for part in term.split())
        for term in high_arousal
    )
    # Lookarounds require a non-word character (or the text edge) on both
    # sides, which is the same condition as matching a whole \w+ token.
    pattern = rf"(?<!\w)(?:{alternatives})(?!\w)"
    matches = len(re.findall(pattern, text.lower()))
    return min(100, 20 + matches * 16)

def detect_hook_strength(transcript: list[dict], first_n_seconds: int = 30) -> float:
    """Rate the opening of a transcript for attention-grabbing phrases.

    Joins the lower-cased text of every segment whose ``start`` (default 0
    when missing) is at most ``first_n_seconds``. Each hook phrase found as
    a substring adds 15 points to a baseline of 30, capped at 100; a phrase
    counts once no matter how often it appears.
    """
    hook_kw = ('rahasia', 'ternyata', 'jangan', 'peringatan', 'viral', 'shocking',
               'unbelievable', "you won't believe", 'wait until', 'plot twist')

    opening_parts = []
    for entry in transcript:
        if entry.get('start', 0) <= first_n_seconds:
            opening_parts.append(entry['text'].lower())
    opening = " ".join(opening_parts)

    hits = 0
    for phrase in hook_kw:
        if phrase in opening:
            hits += 1

    return min(100, 30 + 15 * hits)

def identify_viral_segments(transcript: list[dict], window: int = 30, top_n: int = 5) -> list[dict]:
    """Find the highest-scoring, non-overlapping windows of a transcript.

    A candidate window starts at every transcript segment and covers all
    segments that begin within ``window`` seconds of it. Each candidate is
    scored from its emotional intensity plus a bonus when a hook keyword is
    present. Candidates are then picked best-first, skipping any whose start
    lies within ``window`` seconds of an already selected one, so the result
    does not consist of near-identical neighbouring windows.

    Args:
        transcript: Segments with ``text`` and ``start`` (seconds) keys.
        window: Window length in seconds.
        top_n: Maximum number of segments to return.

    Returns:
        Up to ``top_n`` dicts with ``start``, ``end``, ``preview``, ``score``
        and ``emotion`` keys, ordered by descending score. Fewer are returned
        when the transcript is too short for ``top_n`` distinct windows.
    """
    # Sorting is a no-op for the usual chronological transcript and lets the
    # window end advance monotonically instead of rescanning the tail.
    ordered = sorted(transcript, key=lambda seg: seg['start'])
    total = len(ordered)

    candidates = []
    window_end = 0
    for i, seg in enumerate(ordered):
        start = seg['start']
        window_end = max(window_end, i)
        while window_end < total and ordered[window_end]['start'] - start <= window:
            window_end += 1

        window_text = " ".join(t['text'] for t in ordered[i:window_end])
        if not window_text.strip():
            continue

        emotion = calculate_emotional_intensity(window_text)
        lowered = window_text.lower()
        hook_boost = 20 if any(kw in lowered for kw in ['shocking', 'rahasia', 'viral', 'twist']) else 0
        score = min(100, max(0, emotion * 0.6 + hook_boost + 25))

        candidates.append({
            "start": start,
            "end": start + window,
            "preview": window_text[:120] + "...",
            "score": round(score, 1),
            "emotion": round(emotion, 1)
        })

    # Stable sort: among equal scores the earlier window is preferred.
    ranked = sorted(candidates, key=lambda x: x['score'], reverse=True)

    selected = []
    for candidate in ranked:
        if len(selected) >= top_n:
            break
        if all(abs(candidate['start'] - kept['start']) >= window for kept in selected):
            selected.append(candidate)
    return selected

def calculate_overall_score(transcript: list[dict]) -> dict:
    """Combine the per-aspect metrics into one overall score.

    The score is a weighted sum of emotional intensity (0.25), hook strength
    (0.20) and content density (0.15) plus a constant 40, rounded to one
    decimal and capped at 100. Content density is one point per five words
    of the whole transcript, capped at 100.

    Returns:
        A dict with ``score`` and a ``breakdown`` mapping of the three
        component values.
    """
    combined_text = " ".join(entry['text'] for entry in transcript)

    emotion = calculate_emotional_intensity(combined_text)
    hook = detect_hook_strength(transcript)
    word_count = len(combined_text.split())
    density = min(100, word_count // 5)

    weighted = emotion * 0.25 + hook * 0.20 + density * 0.15 + 40
    breakdown = {
        "Emotional Intensity": round(emotion, 1),
        "Hook Strength": round(hook, 1),
        "Content Density": round(density, 1),
    }
    return {"score": min(100, round(weighted, 1)), "breakdown": breakdown}

# =============================================================================
# STREAMLIT UI
# =============================================================================
def _load_test_url() -> None:
    """Button callback: pre-fill the URL input with the sample video."""
    st.session_state["video_url"] = "https://www.youtube.com/watch?v=jNQXAC9IVRw"


def main():
    """Render the ViralScope AI page and run the analysis on demand.

    Flow: read a YouTube URL, extract the video ID, fetch the transcript,
    compute the overall score and top segments, then render a gauge, the
    score breakdown and one expander per segment. Each failure path shows an
    error message and stops.
    """
    st.title("🔥 ViralScope AI")
    st.markdown("Analisis potensi viral & temukan segment terbaik untuk Shorts/Reels secara instan.")

    col1, col2 = st.columns([3, 1])
    with col1:
        # The key lets the test button's callback fill this input through
        # session state, so the value persists across reruns.
        url = st.text_input("🔗 URL YouTube", placeholder="https://www.youtube.com/watch?v=...", key="video_url")
    with col2:
        # The callback stores the sample URL under the input's key; the
        # previous approach dropped the URL one rerun before the analyse
        # button could use it.
        st.button("🧪 Video Test", use_container_width=True, on_click=_load_test_url)

    if st.button("🚀 Analisis Sekarang", type="primary", use_container_width=True):
        if not url:
            st.error("❌ Masukkan URL YouTube terlebih dahulu.")
            return

        vid_id = extract_video_id(url)
        if not vid_id:
            st.error("❌ Format URL tidak valid.")
            return

        with st.spinner("📡 Menghubungkan ke YouTube (Layer 1 & 2)..."):
            transcript = fetch_transcript_robust(vid_id)

        if not transcript:
            st.error("""
            🚫 **Transcript tidak terambil meskipun CC terlihat di YouTube.**
            
            🔍 **Penyebab Umum:**
            • IP Hugging Face Spaces diblokir/di-throttle YouTube
            • Video hanya punya "Auto-generated" yang dilindungi anti-bot
            • Region Indonesia terkena rate limit YouTube API
            
            💡 **Solusi Cepat:**
            1. Coba video dengan subtitle **Manual** (bukan auto-generated)
            2. Gunakan tombol 🧪 *Video Test* di atas (video lama YouTube, pasti lolos)
            3. Deploy alternatif di [Streamlit Community Cloud](https://streamlit.io/cloud) (IP lebih stabil)
            """)
            return

        with st.spinner("🧠 Menganalisis potensi viral..."):
            time.sleep(0.3)
            overall = calculate_overall_score(transcript)
            segments = identify_viral_segments(transcript)

        st.divider()
        col1, col2 = st.columns([1, 2])
        with col1:
            # Gauge with three background bands: 0-50, 50-75, 75-100.
            fig = go.Figure(go.Indicator(
                mode="gauge+number", value=overall['score'],
                gauge={'axis': {'range': [0, 100]}, 'bar': {'color': 'crimson'},
                       'steps': [{'range': [0, 50], 'color': '#ffebee'},
                                 {'range': [50, 75], 'color': '#fff3e0'},
                                 {'range': [75, 100], 'color': '#e8f5e9'}]}
            ))
            fig.update_layout(height=220, margin=dict(l=10, r=10, t=30, b=10))
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("📊 Detail Skor")
            for k, v in overall['breakdown'].items():
                st.metric(k, f"{v}/100")

        st.subheader("🎯 Top 5 Segment Viral")
        for i, seg in enumerate(segments, 1):
            # Only the best-ranked segment starts expanded.
            with st.expander(f"#{i} | Score: {seg['score']} | ⏱️ {format_time(seg['start'])}-{format_time(seg['end'])}", expanded=(i==1)):
                st.markdown(f"*\"{seg['preview']}\"*")
                st.caption(f"Emosi: {seg['emotion']}/100 | 💡 Cocok untuk Reels/TikTok/Shorts")

        st.divider()
        st.success("✅ Analisis selesai! Gunakan segment di atas sebagai hook konten pendek Anda.")

# Build the UI only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()