Spaces:

eshameo045
/

LectureLens_AI

Sleeping

App Files Files Community

eshameo045 committed 20 days ago

Commit

339760f

1 Parent(s): 65203a5

update

Browse files

Files changed (2) hide show

utils/llm_handler.py +1 -1
utils/transcript_handler.py +72 -37

utils/llm_handler.py CHANGED Viewed

@@ -8,7 +8,7 @@ load_dotenv()
 class LLMHandler:
     def __init__(self):
-        api_key = os.environ.get("OPENAI_API_KEY")
         if not api_key:
             raise ValueError("OPENAI_API_KEY environment variable not set!")
         self.client = OpenAI(api_key=api_key)

 class LLMHandler:
     def __init__(self):
+        api_key = os.environ.get("SUPADATA_API_KEY")
         if not api_key:
             raise ValueError("OPENAI_API_KEY environment variable not set!")
         self.client = OpenAI(api_key=api_key)

utils/transcript_handler.py CHANGED Viewed

@@ -1,7 +1,7 @@
-from youtube_transcript_api import YouTubeTranscriptApi
 import re
-import yt_dlp
 def extract_video_id(url: str) -> str:
     patterns = [
@@ -15,41 +15,66 @@ def extract_video_id(url: str) -> str:
             return match.group(1)
     return None
-def clean_transcript(text: str) -> str:
-    text = re.sub(r'\[.*?\]', '', text)
-    text = re.sub(r'\(.*?\)', '', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    text = text.replace('♪', '').replace('♫', '')
-    return text
 def get_transcript(url: str) -> dict:
     video_id = extract_video_id(url)
     if not video_id:
         return {"success": False, "error": "Invalid YouTube URL."}
     try:
-        # Pehle naya syntax try karo
-        try:
-            ytt_api = YouTubeTranscriptApi()
-            transcript_data = ytt_api.fetch(video_id, languages=['en', 'hi', 'ur', 'en-US', 'en-GB'])
-            full_transcript = " ".join([entry.text for entry in transcript_data.snippets])
-        except:
-            # Purana syntax try karo
-            fetched = YouTubeTranscriptApi.get_transcript(video_id)
-            full_transcript = " ".join([entry['text'] for entry in fetched])
         full_transcript = clean_transcript(full_transcript)
         try:
-            ydl_opts = {'quiet': True, 'skip_download': True, 'no_warnings': True}
-            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                info = ydl.extract_info(url, download=False)
-                video_title = info.get('title', f'Video {video_id}')
         except:
-            video_title = f'Video {video_id}'
         return {
             "success": True,
@@ -59,12 +84,22 @@ def get_transcript(url: str) -> dict:
         }
     except Exception as e:
-        error_msg = str(e)
-        if "Subtitles are disabled" in error_msg or "subtitles" in error_msg.lower():
-            return {"success": False, "error": "⚠️ This video has no captions. Please use a video with CC enabled!"}
-        elif "No transcripts were found" in error_msg:
-            return {"success": False, "error": "⚠️ No transcript found. Please use a lecture video with captions!"}
-        elif "live streaming" in error_msg or "live" in error_msg.lower():
-            return {"success": False, "error": "⚠️ Live streams not supported. Please use a recorded lecture!"}
-        else:
-            return {"success": False, "error": f"⚠️ Error: {error_msg}"}

+from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
 import re
+import requests
+import os
 def extract_video_id(url: str) -> str:
     patterns = [
             return match.group(1)
     return None
 def get_transcript(url: str) -> dict:
     video_id = extract_video_id(url)
     if not video_id:
         return {"success": False, "error": "Invalid YouTube URL."}
     try:
+        api_key = os.environ.get("SUPADATA_API_KEY")
+        if not api_key:
+            return {"success": False, "error": "SUPADATA_API_KEY not set!"}
+        # Supadata se transcript lo
+        response = requests.get(
+            "https://api.supadata.ai/v1/youtube/transcript",
+            params={"url": f"https://www.youtube.com/watch?v={video_id}", "text": True},
+            headers={"x-api-key": api_key},
+            timeout=30
+        )
+        if response.status_code != 200:
+            error_data = response.json()
+            details = error_data.get("details", error_data.get("message", "Unknown error"))
+            if "unavailable" in str(details).lower():
+                return {"success": False, "error": "⚠️ No transcript found. Please use a lecture video with captions!"}
+            elif "live" in str(details).lower():
+                return {"success": False, "error": "⚠️ Live streams not supported!"}
+            else:
+                return {"success": False, "error": f"⚠️ {details}"}
+        data = response.json()
+        # Transcript text join karo
+        content = data.get("content", "")
+        if isinstance(content, list):
+            full_transcript = " ".join([
+                item.get("text", "") if isinstance(item, dict) else str(item)
+                for item in content
+            ])
+        else:
+            full_transcript = str(content)
         full_transcript = clean_transcript(full_transcript)
+        if not full_transcript.strip():
+            return {"success": False, "error": "⚠️ Transcript empty or not available."}
+        # Video title lo
         try:
+            title_response = requests.get(
+                "https://api.supadata.ai/v1/youtube/video",
+                params={"url": f"https://www.youtube.com/watch?v={video_id}"},
+                headers={"x-api-key": api_key},
+                timeout=15
+            )
+            if title_response.status_code == 200:
+                video_title = title_response.json().get("title", f"Video {video_id}")
+            else:
+                video_title = f"Video {video_id}"
         except:
+            video_title = f"Video {video_id}"
         return {
             "success": True,
         }
     except Exception as e:
+        return {"success": False, "error": f"⚠️ Error: {str(e)}"}
def clean_transcript(text: str) -> str:
    """Strip caption noise from a transcript and normalise its whitespace.

    Removes square-bracketed annotations (e.g. ``[Music]``), parenthesised
    asides (e.g. ``(applause)``) and the music-note symbols, then collapses
    every run of whitespace into a single space and trims both ends.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned transcript; an empty string if nothing but noise was
        present.
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    # Drop the music symbols *before* normalising whitespace: removing them
    # afterwards would leave double spaces (or leading/trailing spaces)
    # behind wherever a symbol stood between two words.
    text = text.replace('♪', '').replace('♫', '')
    return re.sub(r'\s+', ' ', text).strip()
def chunk_transcript(transcript: str, chunk_size: int = 500, overlap: int = 50) -> list:
    """Split a transcript into overlapping, word-based chunks.

    The transcript is split on whitespace; each chunk holds up to
    ``chunk_size`` words and shares its first ``overlap`` words with the end
    of the previous chunk.

    Args:
        transcript: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order; empty when the transcript
        contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Validate up front: a zero step makes range() raise an opaque error, a
    # negative step silently yields no chunks, and a negative overlap would
    # skip words between chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    words = transcript.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would only
        # repeat the tail of this chunk, so stop here.
        if end >= len(words):
            break
    return chunks