ALI7ADEL committed on
Commit
87d01c8
·
verified ·
1 Parent(s): d07c690

Update src/api/downloader.py

Browse files
Files changed (1) hide show
  1. src/api/downloader.py +158 -28
src/api/downloader.py CHANGED
@@ -1,39 +1,169 @@
1
- import yt_dlp
2
  import logging
3
  import os
 
 
 
 
4
  from pathlib import Path
5
 
6
  logger = logging.getLogger(__name__)
7
 
8
- # Temporary audio storage location
9
- _AUDIO_DIR = Path("/tmp/yt_audio")
 
 
10
 
11
  class YouTubeDownloader:
 
12
  def __init__(self):
13
  _AUDIO_DIR.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- def download_audio(self, url: str, video_id: str):
16
- output_template = str(_AUDIO_DIR / f"{video_id}.%(ext)s")
17
-
18
- ydl_opts = {
19
- 'format': 'bestaudio/best',
20
- 'outtmpl': output_template,
21
- 'verbose': True, # Very important so we can see the "cipher solving" in the logs
22
- 'extractor_args': {
23
- 'youtube': {
24
- 'player_client': ['web'],
25
- # This line is what sends the token to the server you set up (pot_server.py)
26
- 'youtubepot-bgutilhttp:base_url': 'http://127.0.0.1:4416'
27
- }
28
- },
29
- }
30
-
31
- logger.info(f"โ–ถ Starting download for video: {video_id}")
32
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
33
- ydl.download([url])
34
-
35
- # Find the downloaded file (whatever its extension: m4a, webm, etc)
36
- matches = list(_AUDIO_DIR.glob(f"{video_id}.*"))
37
- if matches:
38
- return matches[0]
39
- return None
 
 
1
  import logging
2
  import os
3
+ import re
4
+ import subprocess
5
+ import tempfile
6
+ import time
7
  from pathlib import Path
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
+ # --- Path settings ---
12
+ _AUDIO_DIR = Path(tempfile.gettempdir()) / "yt_audio"
13
+ _COOKIES_PATH = Path("/tmp/yt_cookies.txt")
14
+ _POT_BASE_URL = "http://127.0.0.1:4416"
15
 
16
  class YouTubeDownloader:
17
+
18
  def __init__(self):
19
  _AUDIO_DIR.mkdir(parents=True, exist_ok=True)
20
+ self._write_cookies()
21
+
22
+ # ู‚ุฑุงุกุฉ ู…ูุงุชูŠุญ ุงู„ู€ API ู…ู† ุงู„ู€ Secrets
23
+ self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
24
+ self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
25
+
26
+ logger.info("๐Ÿš€ Pipeline Status Check:")
27
+ logger.info(f" [1] YouTube Transcript API : โœ… Active")
28
+ logger.info(f" [2] AssemblyAI : {'โœ… Key Set' if self._assemblyai_key else 'โŒ Key Missing'}")
29
+ logger.info(f" [3] yt-dlp + POT Solver : โœ… Active (Fallback)")
30
+
31
+ # โ”€โ”€ ุงู„ู…ุฏุฎู„ ุงู„ุฑุฆูŠุณูŠ (Public API) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
32
+
33
+ def get_transcript(self, url: str) -> str:
34
+ """
35
+ ุงู„ู…ุญุฑูƒ ุงู„ุฑุฆูŠุณูŠ ู„ุฌู„ุจ ุงู„ู†ุต. ุจูŠุฌุฑุจ 4 ุทุฑู‚ ูˆุฑุง ุจุนุถ ู„ุญุฏ ู…ุง ูŠู†ุฌุญ.
36
+ """
37
+ video_id = self._extract_video_id(url)
38
+ logger.info("=" * 55)
39
+ logger.info(f"๐Ÿ” Starting transcription pipeline for: {video_id}")
40
+
41
+ # ุงู„ุฎุทุฉ (1) โ€” ุงู„ุชุฑุฌู…ุฉ ุงู„ุฌุงู‡ุฒุฉ ู…ู† ูŠูˆุชูŠูˆุจ (ู…ุฌุงู†ูŠุฉ ูˆุณุฑูŠุนุฉ)
42
+ result = self._try_transcript_api(video_id)
43
+ if result: return result
44
+
45
+ # ุงู„ุฎุทุฉ (2) โ€” AssemblyAI (ุงู„ู…ู†ู‚ุฐ ู„ุชุฎุทูŠ ุญุธุฑ ุงู„ู€ IP)
46
+ # ู‡ู†ุง ุจุชุจุนุช ุงู„ุฑุงุจุท ู„ูŠู‡ู… ูˆู‡ู… ุจูŠุญู…ู„ูˆุง ุงู„ุตูˆุช ุนู„ู‰ ุณูŠุฑูุฑุงุชู‡ู…
47
+ result = self._try_assemblyai(url, video_id)
48
+ if result: return result
49
+
50
+ # ุงู„ุฎุทุฉ (3) โ€” ุงู„ุชุญู…ูŠู„ ุงู„ู…ุญู„ูŠ ุจุงุณุชุฎุฏุงู… POT Solver (ู„ูˆ ุงู„ู€ IP ููƒ ุญุธุฑู‡)
51
+ result = self._try_ytdlp_download(url, video_id)
52
+ if result: return result
53
+
54
+ # ุงู„ุฎุทุฉ (4) โ€” Supadata (ุงุญุชูŠุงุทูŠ ุฃุฎูŠุฑ)
55
+ result = self._try_supadata(url, video_id)
56
+ if result: return result
57
+
58
+ raise RuntimeError(f"โŒ All strategies failed for video {video_id}. Check logs.")
59
+
60
+ # โ”€โ”€ ุงู„ุฎุทุฉ 1: ุงู„ุชุฑุฌู…ุฉ ุงู„ุฑุณู…ูŠุฉ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
61
+
62
+ def _try_transcript_api(self, video_id: str) -> str | None:
63
+ try:
64
+ from youtube_transcript_api import YouTubeTranscriptApi
65
+ entries = YouTubeTranscriptApi.get_transcript(video_id)
66
+ text = " ".join(e["text"] for e in entries).strip()
67
+ logger.info("โœ… [1/Transcript-API] Success!")
68
+ return text
69
+ except Exception as e:
70
+ logger.warning(f"โš ๏ธ [1/Transcript-API] Failed: {e}")
71
+ return None
72
+
73
+ # โ”€โ”€ ุงู„ุฎุทุฉ 2: AssemblyAI (ุงู„ุญู„ ุงู„ุฌุฐุฑูŠ) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
74
+
75
+ def _try_assemblyai(self, url: str, video_id: str) -> str | None:
76
+ if not self._assemblyai_key:
77
+ return None
78
+ try:
79
+ import assemblyai as aai
80
+ aai.settings.api_key = self._assemblyai_key
81
+ logger.info(f"๐Ÿš€ [2/AssemblyAI] Submitting URL to external infra...")
82
+
83
+ config = aai.TranscriptionConfig(language_detection=True, punctuate=True)
84
+ transcriber = aai.Transcriber(config=config)
85
+
86
+ # ุงู„ุทู„ุจ ุฏู‡ ุจูŠุจุนุช ุงู„ุฑุงุจุท ู„ู€ AssemblyAI ูˆู‡ู… ุจูŠุชุตุฑููˆุง
87
+ transcript = transcriber.transcribe(url)
88
+
89
+ if transcript.status == aai.TranscriptStatus.error:
90
+ logger.error(f"โŒ [2/AssemblyAI] Error: {transcript.error}")
91
+ return None
92
+
93
+ logger.info("โœ… [2/AssemblyAI] Transcription Success!")
94
+ return transcript.text
95
+ except Exception as e:
96
+ logger.warning(f"โš ๏ธ [2/AssemblyAI] Failed: {e}")
97
+ return None
98
+
99
+ # โ”€โ”€ ุงู„ุฎุทุฉ 3: ุงู„ุชุญู…ูŠู„ ุงู„ู…ุญู„ูŠ (yt-dlp + POT) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
100
+
101
+ def _try_ytdlp_download(self, url: str, video_id: str) -> str | None:
102
+ # ู…ุณุญ ุฃูŠ ู…ู„ูุงุช ู‚ุฏูŠู…ุฉ
103
+ for f in _AUDIO_DIR.glob(f"{video_id}.*"): f.unlink(missing_ok=True)
104
+
105
+ output_tmpl = str(_AUDIO_DIR / f"{video_id}.%(ext)s")
106
+ cmd = [
107
+ "yt-dlp", "--no-playlist", "--format", "bestaudio/best",
108
+ "--output", output_tmpl,
109
+ "--extractor-args", f"youtube:player_client=web;youtubepot-bgutilhttp:base_url={_POT_BASE_URL}",
110
+ "--verbose", "--no-check-certificate"
111
+ ]
112
+
113
+ logger.info(f"๐Ÿ“‚ [3/yt-dlp] Attempting local download...")
114
+ try:
115
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
116
+ if result.returncode != 0:
117
+ logger.warning("โš ๏ธ [3/yt-dlp] Blocked by YouTube IP Ban.")
118
+ return None
119
+
120
+ # ู„ูˆ ู†ุฒู„ุŒ ู†ุณุชุฎุฏู… Whisper ู…ุญู„ูŠุงู‹
121
+ matches = list(_AUDIO_DIR.glob(f"{video_id}.*"))
122
+ if matches:
123
+ return self._transcribe_whisper(str(matches[0]))
124
+ except Exception as e:
125
+ logger.warning(f"โš ๏ธ [3/yt-dlp] Failed: {e}")
126
+ return None
127
+
128
+ def _transcribe_whisper(self, audio_path: str) -> str | None:
129
+ try:
130
+ import whisper
131
+ model = whisper.load_model("base")
132
+ result = model.transcribe(audio_path)
133
+ return result["text"].strip()
134
+ finally:
135
+ if os.path.exists(audio_path): os.remove(audio_path)
136
+
137
+ # โ”€โ”€ ุงู„ุฎุทุฉ 4: Supadata โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
138
+
139
+ def _try_supadata(self, url: str, video_id: str) -> str | None:
140
+ if not self._supadata_key: return None
141
+ try:
142
+ import urllib.request, json
143
+ req = urllib.request.Request(
144
+ f"https://api.supadata.ai/v1/youtube/transcript?url={url}&text=true",
145
+ headers={"x-api-key": self._supadata_key}
146
+ )
147
+ with urllib.request.urlopen(req, timeout=30) as resp:
148
+ data = json.loads(resp.read())
149
+ return data.get("content", "").strip()
150
+ except: return None
151
+
152
+ # โ”€โ”€ ุฏุงู„ุงุช ู…ุณุงุนุฏุฉ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
153
+
154
+ def cleanup(self, file_path: Path):
155
+ """ู…ู‡ู…ุฉ ู„ู„ู…ุญุงูุธุฉ ุนู„ู‰ ู…ุณุงุญุฉ ุงู„ุณูŠุฑูุฑ"""
156
+ try:
157
+ if file_path and os.path.exists(file_path):
158
+ os.remove(file_path)
159
+ logger.info(f"๐Ÿงน Cleaned up: {file_path}")
160
+ except: pass
161
+
162
+ def _extract_video_id(self, url: str) -> str:
163
+ match = re.search(r"(?:v=|youtu\.be/)([A-Za-z0-9_-]{11})", url)
164
+ return match.group(1) if match else "unknown"
165
 
166
+ def _write_cookies(self):
167
+ data = os.environ.get("YOUTUBE_COOKIES", "").strip()
168
+ if data:
169
+ _COOKIES_PATH.write_text(data)