Spaces:

ATInc1
/

AIdea-Server

Running

App Files Files Community

Ali Hashhash committed on Apr 14

Commit

5985dfd

1 Parent(s): 226ed2d

update time

Browse files

Files changed (1) hide show

src/api/notes_routes.py +70 -15

src/api/notes_routes.py CHANGED Viewed

@@ -29,40 +29,95 @@ tasks: Dict[str, Dict] = {}
 # ==========================================
-# ⏱️ YouTube Duration Scraper (stdlib only)
 # ==========================================
 def get_youtube_duration(url: str) -> int:
     """Return a YouTube video's duration in seconds by scraping its page.

     Downloads ``url`` using only the Python standard library
     (``urllib.request`` + ``re``) and reads the ``lengthSeconds`` value
     embedded in the page HTML. The value is accepted both quoted
     (``"lengthSeconds":"3661"``) and unquoted (``"lengthSeconds":3661``).

     Args:
         url: Address of the page to fetch.

     Returns:
         The duration as an integer number of seconds, or 0 when the value
         is not present or when fetching/decoding raises any exception
         (the failure is logged as a warning, never propagated).
     """
     try:
         req = urllib.request.Request(
             url,
             headers={
                 # Browser-like User-Agent instead of urllib's default one.
                 "User-Agent": (
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                     "Chrome/124.0.0.0 Safari/537.36"
                 )
             },
         )
         with urllib.request.urlopen(req, timeout=10) as response:
             html = response.read().decode("utf-8", errors="ignore")
         # The page embeds the duration as "lengthSeconds":"<value>"; the
         # opening quote is optional so the unquoted numeric form matches too.
         match = re.search(r'"lengthSeconds"\s*:\s*"?(\d+)', html)
         if match:
             duration = int(match.group(1))
             logger.info(f"⏱️ Extracted video duration: {duration}s")
             return duration
         logger.warning("⚠️ lengthSeconds not found in YouTube page HTML.")
         return 0
     except Exception as e:
         # Deliberate best-effort: callers get 0 instead of an exception.
         logger.warning(f"⚠️ Could not fetch YouTube duration: {e}")
         return 0

 # ==========================================
+# ⏱️ YouTube Duration Scraper (stdlib only — robust multi-strategy)
 # ==========================================
 def get_youtube_duration(url: str) -> int:
     """Return a YouTube video's duration in seconds, or 0 on failure.

     Fetches ``url`` with ``urllib.request`` (browser-like headers, 15 s
     timeout), decompresses the body when the response declares gzip or
     deflate, and scans the HTML with a waterfall of patterns, stopping at
     the first hit:

       1. "lengthSeconds":"<digits>"  - quoted form
       2. "lengthSeconds":<digits>    - unquoted form
       3. "approxDurationMs":"<ms>"   - milliseconds, floored to seconds
       4. ISO 8601 ``PT<h>H<m>M<s>S`` - as the value of a ``duration``
          key/attribute, or as the ``content`` attribute that follows
          ``itemprop="duration"`` in a ``<meta>`` tag

     Args:
         url: Address of the page to fetch.

     Returns:
         Duration in whole seconds, or 0 when no strategy matched or when
         fetching/decoding raised any exception (logged as a warning,
         never propagated).
     """
     # Function-scope imports: only needed for compressed responses.
     import gzip
     import zlib

     try:
         req = urllib.request.Request(
             url,
             headers={
                 # Browser-like Chrome 124 headers, intended to make the
                 # request look like an ordinary page load.
                 "User-Agent": (
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                     "Chrome/124.0.0.0 Safari/537.36"
                 ),
                 "Accept-Language": "en-US,en;q=0.9",
                 "Accept": (
                     "text/html,application/xhtml+xml,application/xml;"
                     "q=0.9,image/avif,image/webp,*/*;q=0.8"
                 ),
                 # Advertise only the codings decoded below. Offering "br"
                 # invites a Brotli body, which the standard library cannot
                 # decompress, so every such response would end up as 0.
                 "Accept-Encoding": "gzip, deflate",
                 "Connection": "keep-alive",
                 "DNT": "1",
                 "Upgrade-Insecure-Requests": "1",
             },
         )
         with urllib.request.urlopen(req, timeout=15) as response:
             raw = response.read()
             # Header values are case-insensitive tokens; normalise first.
             encoding = (
                 response.headers.get("Content-Encoding") or ""
             ).strip().lower()

         if encoding in ("gzip", "x-gzip"):
             raw = gzip.decompress(raw)
         elif encoding == "deflate":
             # "deflate" should be zlib-wrapped, but some servers send a raw
             # DEFLATE stream; retry without the zlib header in that case.
             try:
                 raw = zlib.decompress(raw)
             except zlib.error:
                 raw = zlib.decompress(raw, wbits=-zlib.MAX_WBITS)
         html = raw.decode("utf-8", errors="ignore")

         # -- Strategy 1: "lengthSeconds":"3661"  (quoted)
         m = re.search(r'"lengthSeconds"\s*:\s*"(\d+)"', html)
         if m:
             duration = int(m.group(1))
             logger.info(f"⏱️ [S1-quoted] Extracted duration: {duration}s")
             return duration

         # -- Strategy 2: "lengthSeconds":3661  (unquoted)
         m = re.search(r'"lengthSeconds"\s*:\s*(\d+)', html)
         if m:
             duration = int(m.group(1))
             logger.info(f"⏱️ [S2-unquoted] Extracted duration: {duration}s")
             return duration

         # -- Strategy 3: "approxDurationMs":"3661000"  -> ms to whole seconds
         m = re.search(r'"approxDurationMs"\s*:\s*"(\d+)"', html)
         if m:
             duration = int(m.group(1)) // 1000
             logger.info(f"⏱️ [S3-approxMs] Extracted duration: {duration}s")
             return duration

         # -- Strategy 4: ISO 8601 duration. Matches `"duration":"PT1H1M1S"`,
         # `duration="PT1H1M1S"` and
         # `<meta itemprop="duration" content="PT1H1M1S">`. The lookahead
         # rejects a bare "PT", and every candidate is examined so a zero
         # value does not hide a later usable one.
         iso_matches = re.finditer(
             r'["\s]duration["\s]*(?:[=:]|content\s*=)["\s]*'
             r'PT(?=\d)(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?',
             html,
             re.IGNORECASE,
         )
         for m in iso_matches:
             hours, minutes, seconds = (int(part or 0) for part in m.groups())
             duration = hours * 3600 + minutes * 60 + seconds
             if duration > 0:
                 logger.info(f"⏱️ [S4-ISO8601] Extracted duration: {duration}s")
                 return duration

         logger.warning("⚠️ All duration strategies failed for URL: %s", url)
         return 0
     except Exception as e:
         # Deliberate best-effort: callers get 0 instead of an exception.
         logger.warning(f"⚠️ Could not fetch YouTube duration: {e}")
         return 0