Spaces:
Sleeping
Sleeping
Update fetcher.py
Browse files- fetcher.py +18 -18
fetcher.py
CHANGED
|
@@ -7,9 +7,9 @@ import requests
|
|
| 7 |
import pandas as pd
|
| 8 |
|
| 9 |
|
| 10 |
-
|
| 11 |
# Video ID extraction
|
| 12 |
-
|
| 13 |
|
| 14 |
def extract_video_id(url_or_id: str) -> str | None:
|
| 15 |
"""Return an 11-char YouTube video ID, or None if not found."""
|
|
@@ -24,9 +24,9 @@ def extract_video_id(url_or_id: str) -> str | None:
|
|
| 24 |
return None
|
| 25 |
|
| 26 |
|
| 27 |
-
|
| 28 |
# Duration parser
|
| 29 |
-
|
| 30 |
|
| 31 |
def _parse_duration(iso: str) -> str:
|
| 32 |
m = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", iso or "PT0S")
|
|
@@ -36,9 +36,9 @@ def _parse_duration(iso: str) -> str:
|
|
| 36 |
return f"{h}:{mn:02d}:{s:02d}" if h else f"{mn}:{s:02d}"
|
| 37 |
|
| 38 |
|
| 39 |
-
|
| 40 |
# Metadata
|
| 41 |
-
|
| 42 |
|
| 43 |
def fetch_video_metadata(video_id: str, api_key: str) -> tuple[dict | None, str | None]:
|
| 44 |
"""Return (meta_dict, error_string). One will be None."""
|
|
@@ -88,9 +88,9 @@ def fetch_video_metadata(video_id: str, api_key: str) -> tuple[dict | None, str
|
|
| 88 |
return None, str(exc)
|
| 89 |
|
| 90 |
|
| 91 |
-
|
| 92 |
# Transcript
|
| 93 |
-
|
| 94 |
|
| 95 |
def fetch_transcript(video_id: str) -> tuple[str, str]:
|
| 96 |
"""Return (text, status_message)."""
|
|
@@ -98,15 +98,15 @@ def fetch_transcript(video_id: str) -> tuple[str, str]:
|
|
| 98 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 99 |
segments = YouTubeTranscriptApi.get_transcript(video_id)
|
| 100 |
text = " ".join(s["text"] for s in segments)
|
| 101 |
-
return text, f"
|
| 102 |
except Exception as exc:
|
| 103 |
short = str(exc)[:80]
|
| 104 |
-
return "", f"
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 108 |
# Comments
|
| 109 |
-
|
| 110 |
|
| 111 |
def fetch_comments(
|
| 112 |
video_id: str,
|
|
@@ -155,20 +155,20 @@ def fetch_comments(
|
|
| 155 |
break
|
| 156 |
|
| 157 |
if not rows:
|
| 158 |
-
return pd.DataFrame(), "
|
| 159 |
|
| 160 |
df = pd.DataFrame(rows)
|
| 161 |
-
return df, f"
|
| 162 |
|
| 163 |
except requests.exceptions.Timeout:
|
| 164 |
-
return pd.DataFrame(), "
|
| 165 |
except Exception as exc:
|
| 166 |
-
return pd.DataFrame(), f"
|
|
|
|
| 167 |
|
| 168 |
|
| 169 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 170 |
# Search by keyword
|
| 171 |
-
|
| 172 |
|
| 173 |
def search_videos_by_title(
|
| 174 |
keyword: str,
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
|
| 9 |
|
| 10 |
+
|
| 11 |
# Video ID extraction
|
| 12 |
+
|
| 13 |
|
| 14 |
def extract_video_id(url_or_id: str) -> str | None:
|
| 15 |
"""Return an 11-char YouTube video ID, or None if not found."""
|
|
|
|
| 24 |
return None
|
| 25 |
|
| 26 |
|
| 27 |
+
|
| 28 |
# Duration parser
|
| 29 |
+
|
| 30 |
|
| 31 |
def _parse_duration(iso: str) -> str:
|
| 32 |
m = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", iso or "PT0S")
|
|
|
|
| 36 |
return f"{h}:{mn:02d}:{s:02d}" if h else f"{mn}:{s:02d}"
|
| 37 |
|
| 38 |
|
| 39 |
+
|
| 40 |
# Metadata
|
| 41 |
+
|
| 42 |
|
| 43 |
def fetch_video_metadata(video_id: str, api_key: str) -> tuple[dict | None, str | None]:
|
| 44 |
"""Return (meta_dict, error_string). One will be None."""
|
|
|
|
| 88 |
return None, str(exc)
|
| 89 |
|
| 90 |
|
| 91 |
+
|
| 92 |
# Transcript
|
| 93 |
+
|
| 94 |
|
| 95 |
def fetch_transcript(video_id: str) -> tuple[str, str]:
|
| 96 |
"""Return (text, status_message)."""
|
|
|
|
| 98 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 99 |
segments = YouTubeTranscriptApi.get_transcript(video_id)
|
| 100 |
text = " ".join(s["text"] for s in segments)
|
| 101 |
+
return text, f" Transcript: {len(text.split())} words"
|
| 102 |
except Exception as exc:
|
| 103 |
short = str(exc)[:80]
|
| 104 |
+
return "", f" Transcript unavailable: {short}"
|
| 105 |
+
|
| 106 |
|
| 107 |
|
|
|
|
| 108 |
# Comments
|
| 109 |
+
|
| 110 |
|
| 111 |
def fetch_comments(
|
| 112 |
video_id: str,
|
|
|
|
| 155 |
break
|
| 156 |
|
| 157 |
if not rows:
|
| 158 |
+
return pd.DataFrame(), " No comments fetched (comments may be disabled)"
|
| 159 |
|
| 160 |
df = pd.DataFrame(rows)
|
| 161 |
+
return df, f" Comments: {len(df)} fetched"
|
| 162 |
|
| 163 |
except requests.exceptions.Timeout:
|
| 164 |
+
return pd.DataFrame(), " Comments request timed out"
|
| 165 |
except Exception as exc:
|
| 166 |
+
return pd.DataFrame(), f" Comments error: {str(exc)[:80]}"
|
| 167 |
+
|
| 168 |
|
| 169 |
|
|
|
|
| 170 |
# Search by keyword
|
| 171 |
+
|
| 172 |
|
| 173 |
def search_videos_by_title(
|
| 174 |
keyword: str,
|