Ali Hashhash committed on
Commit
5985dfd
·
1 Parent(s): 226ed2d

update time

Browse files
Files changed (1) hide show
  1. src/api/notes_routes.py +70 -15
src/api/notes_routes.py CHANGED
@@ -29,40 +29,95 @@ tasks: Dict[str, Dict] = {}
29
 
30
 
31
  # ==========================================
32
- # ⏱️ YouTube Duration Scraper (stdlib only)
33
  # ==========================================
34
 
35
def get_youtube_duration(url: str) -> int:
    """
    Look up a YouTube video's length in seconds by scraping its watch page.

    The page is downloaded with ``urllib.request`` (desktop Chrome User-Agent,
    10-second timeout) and searched with ``re`` for the
    ``"lengthSeconds":"<digits>"`` field embedded in the HTML.

    Returns the parsed integer, or 0 when the field is absent or any step
    raises.
    """
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        )
    }
    try:
        page_request = urllib.request.Request(url, headers=browser_headers)
        with urllib.request.urlopen(page_request, timeout=10) as page:
            markup = page.read().decode("utf-8", errors="ignore")

        # The duration appears in the page as: "lengthSeconds":"<value>"
        found = re.search(r'"lengthSeconds"\s*:\s*"(\d+)"', markup)
        if found is None:
            logger.warning("⚠️ lengthSeconds not found in YouTube page HTML.")
            return 0

        seconds = int(found.group(1))
        logger.info(f"⏱️ Extracted video duration: {seconds}s")
        return seconds
    except Exception as err:
        # Best-effort lookup: every failure is logged and reported as 0.
        logger.warning(f"⚠️ Could not fetch YouTube duration: {err}")
        return 0
 
29
 
30
 
31
  # ==========================================
32
+ # ⏱️ YouTube Duration Scraper (stdlib only — robust multi-strategy)
33
  # ==========================================
34
 
35
def _decode_response_body(raw: bytes, content_encoding: str) -> str:
    """Decompress *raw* per its Content-Encoding header and decode it as UTF-8.

    Handles ``gzip``/``x-gzip`` and ``deflate``. Any other value (including an
    empty one) is treated as an uncompressed body. Undecodable bytes are
    dropped rather than raising.
    """
    import gzip
    import zlib

    # Header values are case-insensitive and may carry stray whitespace.
    encoding = (content_encoding or "").strip().lower()
    if encoding in ("gzip", "x-gzip"):
        raw = gzip.decompress(raw)
    elif encoding == "deflate":
        try:
            # Spec-compliant servers wrap the DEFLATE stream in a zlib header...
            raw = zlib.decompress(raw)
        except zlib.error:
            # ...but some send a raw DEFLATE stream with no header at all.
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw.decode("utf-8", errors="ignore")


def _extract_duration_seconds(html: str):
    """Return ``(seconds, strategy_label)`` for the first matching strategy, else ``None``."""
    # Strategy 1: "lengthSeconds":"3661" (quoted, most common)
    m = re.search(r'"lengthSeconds"\s*:\s*"(\d+)"', html)
    if m:
        return int(m.group(1)), "S1-quoted"

    # Strategy 2: "lengthSeconds":3661 (unquoted variant)
    m = re.search(r'"lengthSeconds"\s*:\s*(\d+)', html)
    if m:
        return int(m.group(1)), "S2-unquoted"

    # Strategy 3: "approxDurationMs":"3661000", converted from ms to whole seconds
    m = re.search(r'"approxDurationMs"\s*:\s*"(\d+)"', html)
    if m:
        return int(m.group(1)) // 1000, "S3-approxMs"

    # Strategy 4: ISO 8601 duration (PT1H1M1S).
    iso_patterns = (
        # <meta itemprop="duration" content="PT1H1M1S">
        r'itemprop\s*=\s*["\']duration["\'][^>]*?\bcontent\s*=\s*["\']'
        r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?',
        # "duration":"PT1H1M1S" or duration="PT1H1M1S"
        r'["\s]duration["\s]*[=:]["\s]*PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?',
    )
    for pattern in iso_patterns:
        # Every component is optional, so a bare "PT" matches with a zero
        # total; skip those and keep scanning for a usable value.
        for m in re.finditer(pattern, html, re.IGNORECASE):
            hours, minutes, seconds = (int(part or 0) for part in m.groups())
            total = hours * 3600 + minutes * 60 + seconds
            if total > 0:
                return total, "S4-ISO8601"

    return None


def get_youtube_duration(url: str) -> int:
    """
    Robustly fetch the YouTube video duration in seconds using only the
    Python standard library (urllib.request + re).

    Strategy waterfall (tries each in order, stops on first hit):
      1. "lengthSeconds":"<digits>"   — quoted form
      2. "lengthSeconds":<digits>     — unquoted variant
      3. "approxDurationMs":"<ms>"    — millisecond field, converted to seconds
      4. PT<h>H<m>M<s>S               — ISO 8601 duration, e.g. in
                                        <meta itemprop="duration" content="...">

    Args:
        url: Address of the video page to download.

    Returns:
        The duration as an integer number of seconds, or 0 when no strategy
        matches or any step (network, decompression, parsing) raises.
    """
    # NOTE(review): `url` is handed to urlopen unvalidated; confirm callers
    # restrict it to YouTube hosts if it can come from untrusted input.
    try:
        req = urllib.request.Request(
            url,
            headers={
                # Realistic Chrome 124 headers, intended to look like a browser
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/124.0.0.0 Safari/537.36"
                ),
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": (
                    "text/html,application/xhtml+xml,application/xml;"
                    "q=0.9,image/avif,image/webp,*/*;q=0.8"
                ),
                # Only advertise encodings the standard library can decode;
                # Brotli ("br") has no stdlib decoder, so it is not offered.
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "DNT": "1",
                "Upgrade-Insecure-Requests": "1",
            },
        )
        with urllib.request.urlopen(req, timeout=15) as response:
            raw = response.read()
            content_encoding = response.headers.get("Content-Encoding", "")

        # urllib does not decompress response bodies on its own.
        html = _decode_response_body(raw, content_encoding)

        found = _extract_duration_seconds(html)
        if found is None:
            logger.warning("⚠️ All duration strategies failed for URL: %s", url)
            return 0

        duration, label = found
        logger.info("⏱️ [%s] Extracted duration: %ss", label, duration)
        return duration

    except Exception as e:
        # Deliberate best-effort: callers get 0 instead of an exception.
        logger.warning(f"⚠️ Could not fetch YouTube duration: {e}")
        return 0