ALI7ADEL committed on
Commit
9b94974
·
verified ·
1 Parent(s): 187ff0c

Update src/api/downloader.py

Browse files
Files changed (1) hide show
  1. src/api/downloader.py +33 -17
src/api/downloader.py CHANGED
@@ -9,32 +9,40 @@ logger = logging.getLogger(__name__)
9
 
10
  class YouTubeDownloader:
11
  def __init__(self):
 
12
  self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
13
  self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
14
 
15
  def get_transcript(self, url: str) -> str:
16
  video_id = self._extract_video_id(url)
17
- logger.info("=" * 55)
18
- logger.info(f"🔍 AIdea Pipeline for Video: {video_id}")
19
 
20
- # 1. الخطة أ: YouTube Transcript API (سريع ومجاني)
21
  try:
22
  from youtube_transcript_api import YouTubeTranscriptApi
23
  entries = YouTubeTranscriptApi.get_transcript(video_id)
 
24
  return " ".join(e["text"] for e in entries).strip()
25
- except:
26
- logger.warning("⚠️ Plan A (Official API) failed.")
27
 
28
- # 2. الخطة ب: Supadata (المنقذ القوي للـ Transcripts)
29
  if self._supadata_key:
30
  try:
31
- logger.info("🚀 Plan B: Trying Supadata API...")
32
- # تنظيف الرابط
33
  clean_url = f"https://www.youtube.com/watch?v={video_id}"
 
 
 
 
 
 
 
34
  req = urllib.request.Request(
35
  f"https://api.supadata.ai/v1/youtube/transcript?url={clean_url}&text=true",
36
- headers={"x-api-key": self._supadata_key}
37
  )
 
38
  with urllib.request.urlopen(req, timeout=30) as resp:
39
  data = json.loads(resp.read())
40
  text = data.get("content", "").strip()
@@ -42,25 +50,33 @@ class YouTubeDownloader:
42
  logger.info("✅ Plan B Success!")
43
  return text
44
  except Exception as e:
45
- logger.error(f"❌ Plan B (Supadata) Error: {e}")
46
 
47
- # 3. الخطة ج: AssemblyAI (كاحتياطي)
48
  if self._assemblyai_key:
49
  try:
 
50
  import assemblyai as aai
51
  aai.settings.api_key = self._assemblyai_key
52
  transcriber = aai.Transcriber()
53
- transcript = transcriber.transcribe(url)
 
 
 
54
  if transcript.status != aai.TranscriptStatus.error:
 
55
  return transcript.text
56
- except:
57
- logger.warning("⚠️ Plan C (AssemblyAI) failed.")
 
 
58
 
59
- raise RuntimeError(f"❌ All strategies exhausted for {video_id}")
60
 
61
  def _extract_video_id(self, url: str) -> str:
62
- match = re.search(r"(?:v=|youtu\.be/|shorts/)([A-Za-z0-9_-]{11})", str(url))
 
63
  return match.group(1) if match else "unknown"
64
 
65
- def cleanup(self, path): # للحفاظ على التوافق
66
  pass
 
9
 
10
  class YouTubeDownloader:
11
  def __init__(self):
12
+ # سحب المفاتيح من الـ Environment
13
  self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
14
  self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
15
 
16
  def get_transcript(self, url: str) -> str:
17
  video_id = self._extract_video_id(url)
18
+ logger.info(f"🔍 Pipeline for video ID: {video_id}")
 
19
 
20
+ # 1. الخطة أ: YouTube Transcript API (لو فيه ترجمة جاهزة)
21
  try:
22
  from youtube_transcript_api import YouTubeTranscriptApi
23
  entries = YouTubeTranscriptApi.get_transcript(video_id)
24
+ logger.info("✅ Plan A (Official API) Success!")
25
  return " ".join(e["text"] for e in entries).strip()
26
+ except Exception as e:
27
+ logger.warning(f"⚠️ Plan A Failed: {e}")
28
 
29
+ # 2. الخطة ب: Supadata (المنقذ الأول - بإصلاح الـ User-Agent)
30
  if self._supadata_key:
31
  try:
32
+ logger.info("🚀 Plan B: Calling Supadata...")
 
33
  clean_url = f"https://www.youtube.com/watch?v={video_id}"
34
+
35
+ # إضافة Headers عشان نهرب من الـ 403 Forbidden
36
+ headers = {
37
+ "x-api-key": self._supadata_key,
38
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
39
+ }
40
+
41
  req = urllib.request.Request(
42
  f"https://api.supadata.ai/v1/youtube/transcript?url={clean_url}&text=true",
43
+ headers=headers
44
  )
45
+
46
  with urllib.request.urlopen(req, timeout=30) as resp:
47
  data = json.loads(resp.read())
48
  text = data.get("content", "").strip()
 
50
  logger.info("✅ Plan B Success!")
51
  return text
52
  except Exception as e:
53
+ logger.error(f"❌ Plan B (Supadata) failed: {e}")
54
 
55
+ # 3. الخطة ج: AssemblyAI (المنقذ الثاني)
56
  if self._assemblyai_key:
57
  try:
58
+ logger.info("🚀 Plan C: Calling AssemblyAI...")
59
  import assemblyai as aai
60
  aai.settings.api_key = self._assemblyai_key
61
  transcriber = aai.Transcriber()
62
+ # نبعت الرابط المطول لضمان القبول
63
+ clean_url = f"https://www.youtube.com/watch?v={video_id}"
64
+ transcript = transcriber.transcribe(clean_url)
65
+
66
  if transcript.status != aai.TranscriptStatus.error:
67
+ logger.info("✅ Plan C Success!")
68
  return transcript.text
69
+ else:
70
+ logger.warning(f"⚠️ Plan C API Error: {transcript.error}")
71
+ except Exception as e:
72
+ logger.error(f"❌ Plan C (AssemblyAI) failed: {e}")
73
 
74
+ raise RuntimeError(f"❌ All strategies exhausted for {video_id}. No transcript found.")
75
 
76
  def _extract_video_id(self, url: str) -> str:
77
+ # يدعم كل أنواع روابط يوتيوب
78
+ match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
79
  return match.group(1) if match else "unknown"
80
 
81
+ def cleanup(self, path=None):
82
  pass