Spaces:

Vikrant-Honbute
/

SnapSummaryAI

Running

App Files Files Community

Vikrant-Honbute committed on Apr 23

Commit

4f0c12e

1 Parent(s): a14c409

huggingface space issue fix

Browse files

Files changed (2) hide show

app.py +235 -27
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,7 +1,12 @@
 import os
 from urllib.parse import parse_qs, urlparse
 import certifi
 import streamlit as st
 import validators
 from langchain_community.document_loaders import UnstructuredURLLoader
@@ -23,6 +28,7 @@ st.subheader("Summarize URL")
 # Sidebar
 with st.sidebar:
     groq_api_key = st.text_input("Groq API Key", value="", type="password")
 generic_url = st.text_input("URL", label_visibility="collapsed")
@@ -32,6 +38,9 @@ Content: {text}
 """
 prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
 def is_youtube_url(url: str) -> bool:
     """Return True when the host of *url* is YouTube.

     The decision is made on the parsed hostname rather than on a substring
     of the whole URL, so a page that merely mentions "youtube.com" in its
     path or query string (or a look-alike host such as "notyoutube.com")
     is not treated as a video link.

     Args:
         url: The URL typed by the user; a scheme is optional.

     Returns:
         True for ``youtube.com``, ``youtu.be`` and any of their subdomains,
         False otherwise (including empty or unparseable input).
     """
     candidate = (url or "").strip()
     if not candidate:
         return False
     try:
         parsed = urlparse(candidate)
         if not parsed.netloc:
             # Without a scheme urlparse puts the host into the path;
             # a leading "//" makes it parse as a network location.
             parsed = urlparse(f"//{candidate}")
         host = (parsed.hostname or "").lower().rstrip(".")
     except ValueError:
         # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
         return False
     return host in ("youtube.com", "youtu.be") or host.endswith(
         (".youtube.com", ".youtu.be")
     )
@@ -70,45 +79,244 @@ def build_web_loader(url: str) -> UnstructuredURLLoader:
     )
-def load_youtube_docs(url: str) -> list[Document]:
-    video_id = extract_video_id(url)
-    if not video_id:
-        raise ValueError("Could not parse YouTube video id from URL.")
-    transcript = None
     try:
-        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en", "en-US"])
-    except Exception:
         try:
-            transcript = YouTubeTranscriptApi.get_transcript(video_id)
         except Exception:
-            st.warning(
-                "YouTube transcript fetch failed (often network/IP restriction). "
-                "Trying video metadata fallback..."
             )
-    if transcript:
-        transcript_text = " ".join(item.get("text", "") for item in transcript).strip()
         if transcript_text:
             return [
                 Document(
                     page_content=transcript_text,
-                    metadata={"source": url, "kind": "transcript"},
                 )
             ]
-    try:
-        with YoutubeDL(
-            {
-                "quiet": True,
-                "skip_download": True,
-                "noplaylist": True,
-                "socket_timeout": 20,
-            }
-        ) as ydl:
-            video_info = ydl.extract_info(url, download=False)
-    except Exception as exc:
-        raise RuntimeError(f"YouTube transcript and metadata fetch both failed: {exc}") from exc
     title = (video_info.get("title") or "").strip()
     description = (video_info.get("description") or "").strip()
@@ -142,7 +350,7 @@ if st.button("Summarize the Content from YT or Website"):
             is_youtube = is_youtube_url(generic_url)
             try:
                 if is_youtube:
-                    docs = load_youtube_docs(generic_url)
                 else:
                     docs = build_web_loader(generic_url).load()
             except RequestException:

 import os
+import html
+import json
+import re
+import xml.etree.ElementTree as ET
 from urllib.parse import parse_qs, urlparse
 import certifi
+import httpx
 import streamlit as st
 import validators
 from langchain_community.document_loaders import UnstructuredURLLoader
 # Sidebar
 with st.sidebar:
     groq_api_key = st.text_input("Groq API Key", value="", type="password")
+    youtube_debug = st.checkbox("Debug YouTube Reachability", value=False)
 generic_url = st.text_input("URL", label_visibility="collapsed")
 """
 prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
+LANGUAGE_PREFERENCE = ["en", "en-US", "en-orig", "en-GB"]
+SUBTITLE_EXT_PREFERENCE = ["json3", "srv3", "vtt", "ttml", "srv1", "srv2"]
 def is_youtube_url(url: str) -> bool:
     """Return True when the host of *url* is YouTube.

     The decision is made on the parsed hostname rather than on a substring
     of the whole URL, so a page that merely mentions "youtube.com" in its
     path or query string (or a look-alike host such as "notyoutube.com")
     is not treated as a video link.

     Args:
         url: The URL typed by the user; a scheme is optional.

     Returns:
         True for ``youtube.com``, ``youtu.be`` and any of their subdomains,
         False otherwise (including empty or unparseable input).
     """
     candidate = (url or "").strip()
     if not candidate:
         return False
     try:
         parsed = urlparse(candidate)
         if not parsed.netloc:
             # Without a scheme urlparse puts the host into the path;
             # a leading "//" makes it parse as a network location.
             parsed = urlparse(f"//{candidate}")
         host = (parsed.hostname or "").lower().rstrip(".")
     except ValueError:
         # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
         return False
     return host in ("youtube.com", "youtu.be") or host.endswith(
         (".youtube.com", ".youtu.be")
     )
     )
def normalize_text(text: str) -> str:
    """Decode HTML entities in *text* and squeeze whitespace.

    Every run of whitespace becomes a single space and the result is
    trimmed at both ends. A falsy *text* (``None`` or ``""``) yields ``""``.
    """
    decoded = html.unescape(text if text else "")
    single_spaced = re.sub(r"\s+", " ", decoded)
    return single_spaced.strip()
def debug_youtube_reachability(video_id: str) -> None:
    """Probe the YouTube watch page for *video_id* and report the outcome in the UI.

    Shows an info box with the HTTP status code when a response comes back,
    or a warning box with the exception text when the request (or the
    reporting itself) raises.
    """
    try:
        watch_url = f"https://www.youtube.com/watch?v={video_id}"
        status = httpx.get(watch_url, timeout=10).status_code
        st.info(f"YouTube reachable: {status}")
    except Exception as err:
        st.warning(f"YouTube blocked or unreachable: {err}")
def parse_json3_subtitle(payload: str) -> str:
    """Extract plain text from a ``json3`` caption payload.

    The payload is expected to be a JSON object with an ``events`` list whose
    items carry a ``segs`` list of ``{"utf8": ...}`` segments. Anything that
    does not have that shape is ignored rather than raising, so this parser
    can safely be used while sniffing an unknown payload.

    Args:
        payload: Raw response body of the caption track.

    Returns:
        The caption text with whitespace normalized, or ``""`` when the
        payload is not JSON, not a JSON object, or contains no text.
    """
    try:
        data = json.loads(payload)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a None payload.
        return ""
    if not isinstance(data, dict):
        # Valid JSON that is not an object (e.g. "[]" or "123") has no events.
        return ""

    events = data.get("events")
    if not isinstance(events, list):
        return ""

    parts = []
    for event in events:
        if not isinstance(event, dict):
            continue
        segments = event.get("segs")
        if not isinstance(segments, list):
            # Events without text carry no "segs" (or a null one).
            continue
        for segment in segments:
            if not isinstance(segment, dict):
                continue
            piece = segment.get("utf8")
            if isinstance(piece, str) and piece:
                parts.append(piece.replace("\n", " "))

    if not parts:
        return ""
    return normalize_text(" ".join(parts))
def parse_xml_subtitle(payload: str) -> str:
    """Extract plain text from an XML caption payload.

    Text is collected from elements whose local name (namespace stripped,
    case-insensitive) is exactly ``text`` or ``p``. Matching the exact name
    matters: a suffix test would also match container elements such as a
    ``timedtext`` root, whose ``itertext()`` already spans the whole
    document, so every cue would be emitted twice.

    Args:
        payload: Raw response body of the caption track.

    Returns:
        The caption text with whitespace normalized, or ``""`` when the
        payload is not well-formed XML or contains no cue text.
    """
    # NOTE(review): the payload comes from a remote server and the stdlib
    # ElementTree parser is not hardened against malicious XML - confirm the
    # source is trusted or consider a hardened parser.
    try:
        root = ET.fromstring(payload)
    except ET.ParseError:
        return ""

    chunks = []
    for node in root.iter():
        if not isinstance(node.tag, str):
            # Comments and processing instructions have a non-string tag.
            continue
        # "{namespace}p" -> "p"; tags without a namespace are left unchanged.
        local_name = node.tag.rpartition("}")[2].lower()
        if local_name not in ("text", "p"):
            continue
        text_value = "".join(node.itertext())
        if text_value:
            chunks.append(text_value.replace("\n", " "))

    if not chunks:
        return ""
    return normalize_text(" ".join(chunks))
def parse_vtt_subtitle(payload: str) -> str:
    """Extract the spoken text from a WebVTT caption payload.

    Only lines that follow a cue timing line (one containing ``-->``) within
    the same blank-line-delimited block are kept. This drops the ``WEBVTT``
    header and its metadata lines (e.g. ``Kind:`` / ``Language:``), the full
    bodies of ``NOTE`` / ``STYLE`` / ``REGION`` blocks, and cue identifiers,
    while keeping cue text that happens to be numeric or to start with
    "NOTE". Inline cue tags such as ``<c>`` or ``<00:00:01.000>`` are removed.

    Args:
        payload: Raw response body of the caption track.

    Returns:
        The caption text with whitespace normalized, or ``""`` when the
        payload contains no cue text.
    """
    cue_lines = []
    inside_cue = False
    for raw_line in (payload or "").splitlines():
        line = raw_line.strip().lstrip("\ufeff")
        if not line:
            # A blank line terminates the current block.
            inside_cue = False
            continue
        if "-->" in line:
            # Timing line: the cue payload starts on the next line.
            inside_cue = True
            continue
        if not inside_cue:
            # Header, metadata, comment/style body or cue identifier.
            continue
        text = re.sub(r"<[^>]*>", "", line).strip()
        if text:
            cue_lines.append(text)

    if not cue_lines:
        return ""
    return normalize_text(" ".join(cue_lines))
def fetch_subtitle_track(track_url: str, ext: str) -> str:
    """Download one caption track and convert it to plain text.

    The parser is chosen from *ext*. ``srv1`` and ``srv2`` are routed to the
    XML parser together with ``srv3`` / ``ttml`` / ``xml``: they are XML
    timed-text documents, so a line-based WebVTT parser would return their
    markup as if it were caption text. For an unrecognized extension each
    parser is tried in turn and the first non-empty result wins.

    Args:
        track_url: URL of the caption track.
        ext: Format label of the track (case-insensitive); may be empty.

    Returns:
        The caption text, or ``""`` when no parser produced any text.

    Raises:
        httpx.HTTPError: If the request fails or ``raise_for_status``
            rejects the response status.
    """
    response = httpx.get(track_url, timeout=15)
    response.raise_for_status()
    payload = response.text

    ext_lower = (ext or "").lower()
    if ext_lower == "json3":
        return parse_json3_subtitle(payload)
    if ext_lower in {"srv1", "srv2", "srv3", "ttml", "xml"}:
        return parse_xml_subtitle(payload)
    if ext_lower == "vtt":
        return parse_vtt_subtitle(payload)

    # Unknown label: sniff the content, strictest format first.
    for parser in (parse_json3_subtitle, parse_xml_subtitle, parse_vtt_subtitle):
        text = parser(payload)
        if text:
            return text
    return ""
def extract_ydlp_subtitles(video_info: dict) -> str:
    """Return the text of the best caption track listed in *video_info*.

    Tracks are gathered from the ``subtitles`` map and then the
    ``automatic_captions`` map, ordered by language preference and then by
    format preference (unlisted languages/formats sort last), and fetched in
    that order. The first track that yields non-empty text wins; tracks whose
    download or parsing raises are skipped. Returns ``""`` when nothing works.
    """
    unlisted_lang_rank = len(LANGUAGE_PREFERENCE)
    unlisted_ext_rank = len(SUBTITLE_EXT_PREFERENCE)
    ext_rank = dict(zip(SUBTITLE_EXT_PREFERENCE, range(unlisted_ext_rank)))

    track_sources = [
        video_info.get("subtitles", {}) or {},
        video_info.get("automatic_captions", {}) or {},
    ]

    ranked_tracks = []
    for tracks_by_lang in track_sources:
        for lang, entries in tracks_by_lang.items():
            try:
                lang_rank = LANGUAGE_PREFERENCE.index(lang)
            except ValueError:
                lang_rank = unlisted_lang_rank
            if not isinstance(entries, list):
                continue
            for entry in entries:
                track_url = entry.get("url")
                ext = (entry.get("ext") or "").lower()
                if track_url:
                    ranked_tracks.append(
                        (lang_rank, ext_rank.get(ext, unlisted_ext_rank), track_url, ext)
                    )

    # Stable sort: ties keep insertion order, so manual subtitles stay ahead
    # of automatic captions of the same language and format.
    ranked_tracks.sort(key=lambda track: track[:2])

    for _lang_rank, _ext_rank, track_url, ext in ranked_tracks:
        try:
            text = fetch_subtitle_track(track_url, ext)
        except Exception:
            continue
        if text:
            return text
    return ""
def fetch_transcript_with_proxy(video_id: str) -> str:
    """Fetch a transcript via youtube-transcript-api, using a proxy when configured.

    Configuration is read from the environment:

    * ``WEBSHARE_PROXY_USERNAME`` / ``WEBSHARE_PROXY_PASSWORD`` - when both
      are set, the instance API with a ``WebshareProxyConfig`` is tried first.
    * ``YOUTUBE_PROXY_URL`` - when set, passed as the ``proxies`` mapping to
      the static ``get_transcript`` call.

    Every strategy is best effort: a failure moves on to the next one.

    Args:
        video_id: The YouTube video id.

    Returns:
        The transcript text with whitespace normalized, or ``""`` when every
        strategy failed or produced no text.
    """
    proxy_url = os.getenv("YOUTUBE_PROXY_URL", "").strip()
    proxy_username = os.getenv("WEBSHARE_PROXY_USERNAME", "").strip()
    proxy_password = os.getenv("WEBSHARE_PROXY_PASSWORD", "").strip()

    if proxy_username and proxy_password:
        try:
            from youtube_transcript_api.proxies import WebshareProxyConfig

            # The constructor keyword for a proxy config object is
            # ``proxy_config``; passing it as ``proxies`` raises TypeError,
            # which the blanket except below would hide, silently disabling
            # this branch.
            transcript_api = YouTubeTranscriptApi(
                proxy_config=WebshareProxyConfig(
                    proxy_username=proxy_username,
                    proxy_password=proxy_password,
                )
            )
            transcript_data = transcript_api.fetch(video_id)
            transcript_text = normalize_text(
                " ".join(
                    item.text if hasattr(item, "text") else item.get("text", "")
                    for item in transcript_data
                )
            )
            if transcript_text:
                return transcript_text
        except Exception:
            # Deliberate best effort: an installed library version without
            # the instance/proxy API, or a failed fetch, falls through to
            # the static-API attempts below.
            pass

    proxy_kwargs = {}
    if proxy_url:
        proxy_kwargs["proxies"] = {"https": proxy_url, "http": proxy_url}

    # Without a proxy URL the first and last candidates are identical, so
    # duplicates are dropped to avoid repeating the same request.
    attempts = []
    for candidate in (
        {"languages": ["en", "en-US"], **proxy_kwargs},
        {"languages": ["en"], **proxy_kwargs},
        {"languages": ["en", "en-US"]},
    ):
        if candidate not in attempts:
            attempts.append(candidate)

    for kwargs in attempts:
        call_kwargs = dict(kwargs)
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, **call_kwargs)
        except TypeError:
            # The installed version may not accept ``proxies``; retry without it.
            call_kwargs.pop("proxies", None)
            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, **call_kwargs)
            except Exception:
                continue
        except Exception:
            continue
        transcript_text = normalize_text(" ".join(item.get("text", "") for item in transcript))
        if transcript_text:
            return transcript_text
    return ""
+def load_youtube_docs(url: str, debug_reachability: bool = False) -> list[Document]:
+    video_id = extract_video_id(url)
+    if not video_id:
+        raise ValueError("Could not parse YouTube video id from URL.")
+    if debug_reachability:
+        debug_youtube_reachability(video_id)
+    try:
+        ydl_options = {
+            "quiet": True,
+            "skip_download": True,
+            "noplaylist": True,
+            "socket_timeout": 20,
+            "writesubtitles": True,
+            "writeautomaticsub": True,
+            "subtitleslangs": ["en", "en-US"],
+            "subtitlesformat": "json3",
+        }
+        with YoutubeDL(ydl_options) as ydl:
+            video_info = ydl.extract_info(url, download=False)
+    except Exception as exc:
+        transcript_text = fetch_transcript_with_proxy(video_id)
         if transcript_text:
             return [
                 Document(
                     page_content=transcript_text,
+                    metadata={"source": url, "kind": "transcript_proxy"},
                 )
             ]
+        raise RuntimeError(f"YouTube fetch failed: {exc}") from exc
+    transcript_text = extract_ydlp_subtitles(video_info)
+    if transcript_text:
+        return [
+            Document(
+                page_content=transcript_text,
+                metadata={"source": url, "kind": "transcript"},
+            )
+        ]
+    transcript_text = fetch_transcript_with_proxy(video_id)
+    if transcript_text:
+        return [
+            Document(
+                page_content=transcript_text,
+                metadata={"source": url, "kind": "transcript_proxy"},
+            )
+        ]
+    st.warning("No subtitles found. Falling back to video metadata.")
     title = (video_info.get("title") or "").strip()
     description = (video_info.get("description") or "").strip()
             is_youtube = is_youtube_url(generic_url)
             try:
                 if is_youtube:
+                    docs = load_youtube_docs(generic_url, debug_reachability=youtube_debug)
                 else:
                     docs = build_web_loader(generic_url).load()
             except RequestException:

requirements.txt CHANGED Viewed

@@ -10,4 +10,5 @@ unstructured==0.16.4
 youtube-transcript-api==0.6.3
 yt-dlp==2025.3.31
 requests==2.32.3
 certifi==2025.1.31

 youtube-transcript-api==0.6.3
 yt-dlp==2025.3.31
 requests==2.32.3
+httpx==0.27.2
 certifi==2025.1.31