eshameo045 committed on
Commit
339760f
·
1 Parent(s): 65203a5
Files changed (2) hide show
  1. utils/llm_handler.py +1 -1
  2. utils/transcript_handler.py +72 -37
utils/llm_handler.py CHANGED
@@ -8,7 +8,7 @@ load_dotenv()
8
 
9
  class LLMHandler:
10
  def __init__(self):
11
- api_key = os.environ.get("OPENAI_API_KEY")
12
  if not api_key:
13
  raise ValueError("OPENAI_API_KEY environment variable not set!")
14
  self.client = OpenAI(api_key=api_key)
 
8
 
9
  class LLMHandler:
10
  def __init__(self):
11
+ api_key = os.environ.get("SUPADATA_API_KEY")
12
  if not api_key:
13
  raise ValueError("OPENAI_API_KEY environment variable not set!")
14
  self.client = OpenAI(api_key=api_key)
utils/transcript_handler.py CHANGED
@@ -1,7 +1,7 @@
1
- from youtube_transcript_api import YouTubeTranscriptApi
2
  import re
3
- import yt_dlp
4
-
5
 
6
  def extract_video_id(url: str) -> str:
7
  patterns = [
@@ -15,41 +15,66 @@ def extract_video_id(url: str) -> str:
15
  return match.group(1)
16
  return None
17
 
18
-
19
- def clean_transcript(text: str) -> str:
20
- text = re.sub(r'\[.*?\]', '', text)
21
- text = re.sub(r'\(.*?\)', '', text)
22
- text = re.sub(r'\s+', ' ', text).strip()
23
- text = text.replace('♪', '').replace('♫', '')
24
- return text
25
-
26
-
27
  def get_transcript(url: str) -> dict:
28
  video_id = extract_video_id(url)
29
-
30
  if not video_id:
31
  return {"success": False, "error": "Invalid YouTube URL."}
32
-
33
  try:
34
- # Pehle naya syntax try karo
35
- try:
36
- ytt_api = YouTubeTranscriptApi()
37
- transcript_data = ytt_api.fetch(video_id, languages=['en', 'hi', 'ur', 'en-US', 'en-GB'])
38
- full_transcript = " ".join([entry.text for entry in transcript_data.snippets])
39
- except:
40
- # Purana syntax try karo
41
- fetched = YouTubeTranscriptApi.get_transcript(video_id)
42
- full_transcript = " ".join([entry['text'] for entry in fetched])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  full_transcript = clean_transcript(full_transcript)
45
 
 
 
 
 
46
  try:
47
- ydl_opts = {'quiet': True, 'skip_download': True, 'no_warnings': True}
48
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
49
- info = ydl.extract_info(url, download=False)
50
- video_title = info.get('title', f'Video {video_id}')
 
 
 
 
 
 
51
  except:
52
- video_title = f'Video {video_id}'
53
 
54
  return {
55
  "success": True,
@@ -59,12 +84,22 @@ def get_transcript(url: str) -> dict:
59
  }
60
 
61
  except Exception as e:
62
- error_msg = str(e)
63
- if "Subtitles are disabled" in error_msg or "subtitles" in error_msg.lower():
64
- return {"success": False, "error": "⚠️ This video has no captions. Please use a video with CC enabled!"}
65
- elif "No transcripts were found" in error_msg:
66
- return {"success": False, "error": "⚠️ No transcript found. Please use a lecture video with captions!"}
67
- elif "live streaming" in error_msg or "live" in error_msg.lower():
68
- return {"success": False, "error": "⚠️ Live streams not supported. Please use a recorded lecture!"}
69
- else:
70
- return {"success": False, "error": f"⚠️ Error: {error_msg}"}
 
 
 
 
 
 
 
 
 
 
 
1
+ from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
2
  import re
3
+ import requests
4
+ import os
5
 
6
  def extract_video_id(url: str) -> str:
7
  patterns = [
 
15
  return match.group(1)
16
  return None
17
 
 
 
 
 
 
 
 
 
 
18
  def get_transcript(url: str) -> dict:
19
  video_id = extract_video_id(url)
20
+
21
  if not video_id:
22
  return {"success": False, "error": "Invalid YouTube URL."}
23
+
24
  try:
25
+ api_key = os.environ.get("SUPADATA_API_KEY")
26
+ if not api_key:
27
+ return {"success": False, "error": "SUPADATA_API_KEY not set!"}
28
+
29
+ # Fetch the transcript from Supadata
30
+ response = requests.get(
31
+ "https://api.supadata.ai/v1/youtube/transcript",
32
+ params={"url": f"https://www.youtube.com/watch?v={video_id}", "text": True},
33
+ headers={"x-api-key": api_key},
34
+ timeout=30
35
+ )
36
+
37
+ if response.status_code != 200:
38
+ error_data = response.json()
39
+ details = error_data.get("details", error_data.get("message", "Unknown error"))
40
+ if "unavailable" in str(details).lower():
41
+ return {"success": False, "error": "⚠️ No transcript found. Please use a lecture video with captions!"}
42
+ elif "live" in str(details).lower():
43
+ return {"success": False, "error": "⚠️ Live streams not supported!"}
44
+ else:
45
+ return {"success": False, "error": f"⚠️ {details}"}
46
+
47
+ data = response.json()
48
+
49
+ # Join the transcript text segments
50
+ content = data.get("content", "")
51
+ if isinstance(content, list):
52
+ full_transcript = " ".join([
53
+ item.get("text", "") if isinstance(item, dict) else str(item)
54
+ for item in content
55
+ ])
56
+ else:
57
+ full_transcript = str(content)
58
 
59
  full_transcript = clean_transcript(full_transcript)
60
 
61
+ if not full_transcript.strip():
62
+ return {"success": False, "error": "⚠️ Transcript empty or not available."}
63
+
64
+ # Fetch the video title
65
  try:
66
+ title_response = requests.get(
67
+ "https://api.supadata.ai/v1/youtube/video",
68
+ params={"url": f"https://www.youtube.com/watch?v={video_id}"},
69
+ headers={"x-api-key": api_key},
70
+ timeout=15
71
+ )
72
+ if title_response.status_code == 200:
73
+ video_title = title_response.json().get("title", f"Video {video_id}")
74
+ else:
75
+ video_title = f"Video {video_id}"
76
  except:
77
+ video_title = f"Video {video_id}"
78
 
79
  return {
80
  "success": True,
 
84
  }
85
 
86
  except Exception as e:
87
+ return {"success": False, "error": f"⚠️ Error: {str(e)}"}
88
+
89
+
90
def clean_transcript(text: str) -> str:
    """Strip caption annotations and music symbols from transcript text.

    Removes square-bracketed and parenthesised spans (non-greedy, single
    line), drops the music-note characters, then collapses every run of
    whitespace to a single space and trims both ends.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned transcript as a single-spaced string.
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    # Remove the music symbols BEFORE normalising whitespace; doing it
    # afterwards leaves double spaces (or leading/trailing spaces) where a
    # symbol used to sit between two words.
    text = text.replace('♪', '').replace('♫', '')
    return re.sub(r'\s+', ' ', text).strip()
96
+
97
+
98
def chunk_transcript(transcript: str, chunk_size: int = 500, overlap: int = 50) -> list:
    """Split a transcript into overlapping, word-based chunks.

    The transcript is split on whitespace; each chunk holds up to
    ``chunk_size`` words and consecutive chunks start
    ``chunk_size - overlap`` words apart.

    Args:
        transcript: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (empty when the transcript has no words).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A zero step makes range() raise an opaque error and a negative step
    # silently yields no chunks at all, so reject both up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    words = transcript.split()
    step = chunk_size - overlap
    # Every start index is < len(words), so each slice is non-empty.
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]