Taylor Fox Dahlin committed on
[Feature] Video metadata (#809)
Browse files* Added accept-language to request headers to coerce certain strings sent by YouTube into English (e.g. 'This video is private.')
* Implemented metadata class.
- .github/workflows/ci.yml +1 -1
- pytube/__main__.py +25 -15
- pytube/extract.py +57 -0
- pytube/metadata.py +48 -0
- pytube/request.py +1 -1
- tests/test_extract.py +10 -0
- tests/test_metadata.py +18 -0
.github/workflows/ci.yml
CHANGED
|
@@ -13,7 +13,7 @@ jobs:
|
|
| 13 |
|
| 14 |
strategy:
|
| 15 |
matrix:
|
| 16 |
-
python: [3.
|
| 17 |
|
| 18 |
steps:
|
| 19 |
- name: Checkout repo
|
|
|
|
| 13 |
|
| 14 |
strategy:
|
| 15 |
matrix:
|
| 16 |
+
python: [3.6, 3.7, 3.8, 3.9]
|
| 17 |
|
| 18 |
steps:
|
| 19 |
- name: Checkout repo
|
pytube/__main__.py
CHANGED
|
@@ -27,6 +27,7 @@ from pytube.extract import apply_descrambler
|
|
| 27 |
from pytube.extract import apply_signature
|
| 28 |
from pytube.extract import get_ytplayer_config
|
| 29 |
from pytube.helpers import install_proxy
|
|
|
|
| 30 |
from pytube.monostate import Monostate
|
| 31 |
from pytube.monostate import OnComplete
|
| 32 |
from pytube.monostate import OnProgress
|
|
@@ -60,23 +61,17 @@ class YouTube:
|
|
| 60 |
|
| 61 |
"""
|
| 62 |
self.js: Optional[str] = None # js fetched by js_url
|
| 63 |
-
self.js_url: Optional[
|
| 64 |
-
str
|
| 65 |
-
] = None # the url to the js, parsed from watch html
|
| 66 |
|
| 67 |
# note: vid_info may eventually be removed. It sounds like it once had
|
| 68 |
# additional formats, but that doesn't appear to still be the case.
|
| 69 |
|
| 70 |
# the url to vid info, parsed from watch html
|
| 71 |
self.vid_info_url: Optional[str] = None
|
| 72 |
-
self.vid_info_raw: Optional[
|
| 73 |
-
str
|
| 74 |
-
] = None # content fetched by vid_info_url
|
| 75 |
self.vid_info: Optional[Dict] = None # parsed content of vid_info_raw
|
| 76 |
|
| 77 |
-
self.watch_html: Optional[
|
| 78 |
-
str
|
| 79 |
-
] = None # the html of /watch?v=<video_id>
|
| 80 |
self.embed_html: Optional[str] = None
|
| 81 |
self.player_config_args: Dict = {} # inline js in the html containing
|
| 82 |
self.player_response: Dict = {}
|
|
@@ -85,6 +80,10 @@ class YouTube:
|
|
| 85 |
|
| 86 |
self.fmt_streams: List[Stream] = []
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# video_id part of /watch?v=<video_id>
|
| 89 |
self.video_id = extract.video_id(url)
|
| 90 |
|
|
@@ -187,6 +186,9 @@ class YouTube:
|
|
| 187 |
video_id=self.video_id, watch_url=self.watch_url
|
| 188 |
)
|
| 189 |
|
|
|
|
|
|
|
|
|
|
| 190 |
self.vid_info_raw = request.get(self.vid_info_url)
|
| 191 |
if not self.age_restricted:
|
| 192 |
self.js_url = extract.js_url(self.watch_html)
|
|
@@ -287,9 +289,7 @@ class YouTube:
|
|
| 287 |
:rtype: str
|
| 288 |
|
| 289 |
"""
|
| 290 |
-
return self.player_response.get("videoDetails", {}).get(
|
| 291 |
-
"shortDescription"
|
| 292 |
-
)
|
| 293 |
|
| 294 |
@property
|
| 295 |
def rating(self) -> float:
|
|
@@ -298,9 +298,7 @@ class YouTube:
|
|
| 298 |
:rtype: float
|
| 299 |
|
| 300 |
"""
|
| 301 |
-
return self.player_response.get("videoDetails", {}).get(
|
| 302 |
-
"averageRating"
|
| 303 |
-
)
|
| 304 |
|
| 305 |
@property
|
| 306 |
def length(self) -> int:
|
|
@@ -338,6 +336,18 @@ class YouTube:
|
|
| 338 |
"author", "unknown"
|
| 339 |
)
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
def register_on_progress_callback(self, func: OnProgress):
|
| 342 |
"""Register a download progress callback function post initialization.
|
| 343 |
|
|
|
|
| 27 |
from pytube.extract import apply_signature
|
| 28 |
from pytube.extract import get_ytplayer_config
|
| 29 |
from pytube.helpers import install_proxy
|
| 30 |
+
from pytube.metadata import YouTubeMetadata
|
| 31 |
from pytube.monostate import Monostate
|
| 32 |
from pytube.monostate import OnComplete
|
| 33 |
from pytube.monostate import OnProgress
|
|
|
|
| 61 |
|
| 62 |
"""
|
| 63 |
self.js: Optional[str] = None # js fetched by js_url
|
| 64 |
+
self.js_url: Optional[str] = None # the url to the js, parsed from watch html
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# note: vid_info may eventually be removed. It sounds like it once had
|
| 67 |
# additional formats, but that doesn't appear to still be the case.
|
| 68 |
|
| 69 |
# the url to vid info, parsed from watch html
|
| 70 |
self.vid_info_url: Optional[str] = None
|
| 71 |
+
self.vid_info_raw: Optional[str] = None # content fetched by vid_info_url
|
|
|
|
|
|
|
| 72 |
self.vid_info: Optional[Dict] = None # parsed content of vid_info_raw
|
| 73 |
|
| 74 |
+
self.watch_html: Optional[str] = None # the html of /watch?v=<video_id>
|
|
|
|
|
|
|
| 75 |
self.embed_html: Optional[str] = None
|
| 76 |
self.player_config_args: Dict = {} # inline js in the html containing
|
| 77 |
self.player_response: Dict = {}
|
|
|
|
| 80 |
|
| 81 |
self.fmt_streams: List[Stream] = []
|
| 82 |
|
| 83 |
+
self.initial_data_raw = None
|
| 84 |
+
self.initial_data = {}
|
| 85 |
+
self._metadata: Optional[YouTubeMetadata] = None
|
| 86 |
+
|
| 87 |
# video_id part of /watch?v=<video_id>
|
| 88 |
self.video_id = extract.video_id(url)
|
| 89 |
|
|
|
|
| 186 |
video_id=self.video_id, watch_url=self.watch_url
|
| 187 |
)
|
| 188 |
|
| 189 |
+
self.initial_data_raw = extract.initial_data(self.watch_html)
|
| 190 |
+
self.initial_data = json.loads(self.initial_data_raw)
|
| 191 |
+
|
| 192 |
self.vid_info_raw = request.get(self.vid_info_url)
|
| 193 |
if not self.age_restricted:
|
| 194 |
self.js_url = extract.js_url(self.watch_html)
|
|
|
|
| 289 |
:rtype: str
|
| 290 |
|
| 291 |
"""
|
| 292 |
+
return self.player_response.get("videoDetails", {}).get("shortDescription")
|
|
|
|
|
|
|
| 293 |
|
| 294 |
@property
|
| 295 |
def rating(self) -> float:
|
|
|
|
| 298 |
:rtype: float
|
| 299 |
|
| 300 |
"""
|
| 301 |
+
return self.player_response.get("videoDetails", {}).get("averageRating")
|
|
|
|
|
|
|
| 302 |
|
| 303 |
@property
|
| 304 |
def length(self) -> int:
|
|
|
|
| 336 |
"author", "unknown"
|
| 337 |
)
|
| 338 |
|
| 339 |
+
@property
def metadata(self) -> Optional[YouTubeMetadata]:
    """Get the metadata for the video.

    The metadata object is built from ``self.initial_data`` on first
    access and cached on the instance for later accesses.

    :rtype: YouTubeMetadata
    """
    # Compare against None explicitly so the cache check does not depend
    # on the truthiness of the cached object.
    if self._metadata is None:
        self._metadata = extract.metadata(self.initial_data)
    return self._metadata
|
| 350 |
+
|
| 351 |
def register_on_progress_callback(self, func: OnProgress):
|
| 352 |
"""Register a download progress callback function post initialization.
|
| 353 |
|
pytube/extract.py
CHANGED
|
@@ -8,6 +8,7 @@ from datetime import datetime
|
|
| 8 |
from typing import Any
|
| 9 |
from typing import Dict
|
| 10 |
from typing import List
|
|
|
|
| 11 |
from typing import Tuple
|
| 12 |
from urllib.parse import parse_qs
|
| 13 |
from urllib.parse import parse_qsl
|
|
@@ -19,6 +20,7 @@ from pytube.cipher import Cipher
|
|
| 19 |
from pytube.exceptions import LiveStreamError
|
| 20 |
from pytube.exceptions import RegexMatchError
|
| 21 |
from pytube.helpers import regex_search
|
|
|
|
| 22 |
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
|
@@ -396,3 +398,58 @@ def apply_descrambler(stream_data: Dict, key: str) -> None:
|
|
| 396 |
]
|
| 397 |
|
| 398 |
logger.debug("applying descrambler")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from typing import Any
|
| 9 |
from typing import Dict
|
| 10 |
from typing import List
|
| 11 |
+
from typing import Optional
|
| 12 |
from typing import Tuple
|
| 13 |
from urllib.parse import parse_qs
|
| 14 |
from urllib.parse import parse_qsl
|
|
|
|
| 20 |
from pytube.exceptions import LiveStreamError
|
| 21 |
from pytube.exceptions import RegexMatchError
|
| 22 |
from pytube.helpers import regex_search
|
| 23 |
+
from pytube.metadata import YouTubeMetadata
|
| 24 |
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
|
|
|
| 398 |
]
|
| 399 |
|
| 400 |
logger.debug("applying descrambler")
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def initial_data(watch_html: str) -> str:
    """Extract the ytInitialData json from the watch_html page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    @param watch_html: Html of the watch page
    @return: The raw text assigned to ``window["ytInitialData"]`` with the
        trailing statement terminator removed, or ``"{}"`` when the
        assignment cannot be found in the page.
    """
    initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*([^\n]+)"
    try:
        match = regex_search(initial_data_pattern, watch_html, 1)
    except RegexMatchError:
        return "{}"

    # The capture group runs to the end of the line, so it normally ends
    # with the ";" that terminates the assignment. Strip trailing whitespace
    # (e.g. "\r") and then the ";" instead of blindly dropping the last
    # character, which would corrupt the JSON whenever the line does not
    # end in exactly one ";".
    return match.rstrip().rstrip(";")
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
def metadata(initial_data) -> Optional[YouTubeMetadata]:
    """Get the informational metadata for the video.

    e.g.:
    [
        {
            'Song': '강남스타일(Gangnam Style)',
            'Artist': 'PSY',
            'Album': 'PSY SIX RULES Pt.1',
            'Licensed to YouTube by': 'YG Entertainment Inc. [...]'
        }
    ]

    :param dict initial_data:
        Parsed ytInitialData of the watch page.
    :rtype: YouTubeMetadata
    :returns:
        A metadata object built from the metadata rows; it wraps an empty
        list when the rows cannot be located in ``initial_data``.
    """
    try:
        results = initial_data["contents"]["twoColumnWatchNextResults"][
            "results"]["results"]
        secondary_info = results["contents"][1]["videoSecondaryInfoRenderer"]
        container = secondary_info["metadataRowContainer"][
            "metadataRowContainerRenderer"]
        metadata_rows: List = container["rows"]
    except (KeyError, IndexError, TypeError):
        # If there's an exception accessing this data, it probably doesn't
        # exist. TypeError covers an intermediate node that is None or is
        # not the expected container type.
        return YouTubeMetadata([])

    # Rows appear to only have "metadataRowRenderer" or
    # "metadataRowHeaderRenderer" and we only care about the former, so we
    # keep those and unwrap the renderer in a single pass.
    return YouTubeMetadata([
        row["metadataRowRenderer"]
        for row in metadata_rows
        if "metadataRowRenderer" in row
    ])
|
pytube/metadata.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""This module contains the YouTubeMetadata class."""
|
| 3 |
+
import json
|
| 4 |
+
from typing import Dict
|
| 5 |
+
from typing import List
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class YouTubeMetadata:
    """Grouped view over the raw metadata rows of a video.

    Rows that have a ``simpleText`` title are collected into dicts mapping
    that title to the row's text. A new dict is started after every such
    row that is flagged with ``hasDividerLine``.
    """

    def __init__(self, metadata: List):
        """Build the grouped metadata from the raw rows.

        :param list metadata:
            Raw metadata row dicts. The list itself is kept and exposed
            through :attr:`raw_metadata`.
        """
        self._raw_metadata: List = metadata
        self._metadata: List[Dict] = [{}]

        for el in metadata:
            # We only add metadata to the dict if it has a simpleText title.
            if 'title' not in el or 'simpleText' not in el['title']:
                continue
            metadata_title = el['title']['simpleText']

            # A row whose contents are missing or empty has no value to
            # record; fall back to an empty dict instead of raising.
            contents = (el.get('contents') or [{}])[0]
            if 'simpleText' in contents:
                self._metadata[-1][metadata_title] = contents['simpleText']
            elif contents.get('runs'):
                # Only the text of the first run is recorded.
                self._metadata[-1][metadata_title] = contents['runs'][0]['text']

            # Upon reaching a dividing line, create a new grouping
            if el.get('hasDividerLine', False):
                self._metadata.append({})

        # If we happen to create an empty dict at the end, drop it
        if not self._metadata[-1]:
            self._metadata.pop()

    def __iter__(self):
        """Iterate over the grouped metadata dicts."""
        return iter(self._metadata)

    def __str__(self):
        """Return the grouped metadata serialized as a JSON string."""
        return json.dumps(self._metadata)

    @property
    def raw_metadata(self) -> List:
        """The raw metadata rows exactly as passed to the constructor."""
        return self._raw_metadata

    @property
    def metadata(self):
        """The grouped metadata: a list of title -> text dicts."""
        return self._metadata
|
pytube/request.py
CHANGED
|
@@ -16,7 +16,7 @@ default_range_size = 9437184 # 9MB
|
|
| 16 |
|
| 17 |
|
| 18 |
def _execute_request(url, method=None, headers=None):
|
| 19 |
-
base_headers = {"User-Agent": "Mozilla/5.0"}
|
| 20 |
if headers:
|
| 21 |
base_headers.update(headers)
|
| 22 |
if url.lower().startswith("http"):
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def _execute_request(url, method=None, headers=None):
|
| 19 |
+
base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
|
| 20 |
if headers:
|
| 21 |
base_headers.update(headers)
|
| 22 |
if url.lower().startswith("http"):
|
tests/test_extract.py
CHANGED
|
@@ -102,3 +102,13 @@ def test_signature_cipher_does_not_error(stream_dict):
|
|
| 102 |
config_args = extract.get_ytplayer_config(stream_dict)['args']
|
| 103 |
extract.apply_descrambler(config_args, "url_encoded_fmt_stream_map")
|
| 104 |
assert "s" in config_args["url_encoded_fmt_stream_map"][0].keys()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
config_args = extract.get_ytplayer_config(stream_dict)['args']
|
| 103 |
extract.apply_descrambler(config_args, "url_encoded_fmt_stream_map")
|
| 104 |
assert "s" in config_args["url_encoded_fmt_stream_map"][0].keys()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def test_initial_data_missing():
    """An empty page yields the "{}" fallback from initial_data."""
    assert extract.initial_data('') == "{}"
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def test_initial_data(stream_dict):
    """The text extracted from the fixture page mentions 'contents'."""
    raw_initial_data = extract.initial_data(stream_dict)
    assert 'contents' in raw_initial_data
|
tests/test_metadata.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Unit tests for the :module:`metadata <metadata>` module."""
|
| 3 |
+
import json
|
| 4 |
+
from pytube import extract
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_extract_metadata_empty():
    """Empty initial data produces a metadata object with no rows."""
    ytmd = extract.metadata({})
    # Go through the public accessors rather than the private attribute.
    assert ytmd.raw_metadata == []
    assert ytmd.metadata == []
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_metadata_from_initial_data(stream_dict):
    """Metadata built from the fixture page exposes raw rows and a 'Song'."""
    parsed_initial_data = json.loads(extract.initial_data(stream_dict))
    ytmd = extract.metadata(parsed_initial_data)

    raw_rows = ytmd.raw_metadata
    assert len(raw_rows) > 0
    assert 'contents' in raw_rows[0]

    grouped = ytmd.metadata
    assert len(grouped) > 0
    assert 'Song' in grouped[0]
|