Taylor Fox Dahlin committed on
Improvement/parser (#838)
Browse files
* Added a parser for JavaScript objects within HTML
- pytube/__main__.py +1 -3
- pytube/contrib/playlist.py +1 -1
- pytube/extract.py +19 -19
- pytube/parser.py +83 -0
- tests/test_extract.py +1 -1
- tests/test_metadata.py +1 -2
- tests/test_parser.py +58 -0
pytube/__main__.py
CHANGED
|
@@ -80,7 +80,6 @@ class YouTube:
|
|
| 80 |
|
| 81 |
self.fmt_streams: List[Stream] = []
|
| 82 |
|
| 83 |
-
self.initial_data_raw = None
|
| 84 |
self.initial_data = {}
|
| 85 |
self._metadata: Optional[YouTubeMetadata] = None
|
| 86 |
|
|
@@ -190,8 +189,7 @@ class YouTube:
|
|
| 190 |
video_id=self.video_id, watch_url=self.watch_url
|
| 191 |
)
|
| 192 |
|
| 193 |
-
self.
|
| 194 |
-
self.initial_data = json.loads(self.initial_data_raw)
|
| 195 |
|
| 196 |
self.vid_info_raw = request.get(self.vid_info_url)
|
| 197 |
if not self.age_restricted:
|
|
|
|
| 80 |
|
| 81 |
self.fmt_streams: List[Stream] = []
|
| 82 |
|
|
|
|
| 83 |
self.initial_data = {}
|
| 84 |
self._metadata: Optional[YouTubeMetadata] = None
|
| 85 |
|
|
|
|
| 189 |
video_id=self.video_id, watch_url=self.watch_url
|
| 190 |
)
|
| 191 |
|
| 192 |
+
self.initial_data = extract.initial_data(self.watch_html)
|
|
|
|
| 193 |
|
| 194 |
self.vid_info_raw = request.get(self.vid_info_url)
|
| 195 |
if not self.age_restricted:
|
pytube/contrib/playlist.py
CHANGED
|
@@ -51,7 +51,7 @@ class Playlist(Sequence):
|
|
| 51 |
"""
|
| 52 |
req = self.html
|
| 53 |
videos_urls, continuation = self._extract_videos(
|
| 54 |
-
extract.initial_data(self.html)
|
| 55 |
)
|
| 56 |
if until_watch_id:
|
| 57 |
try:
|
|
|
|
| 51 |
"""
|
| 52 |
req = self.html
|
| 53 |
videos_urls, continuation = self._extract_videos(
|
| 54 |
+
json.dumps(extract.initial_data(self.html))
|
| 55 |
)
|
| 56 |
if until_watch_id:
|
| 57 |
try:
|
pytube/extract.py
CHANGED
|
@@ -18,10 +18,12 @@ from urllib.parse import unquote
|
|
| 18 |
from urllib.parse import urlencode
|
| 19 |
|
| 20 |
from pytube.cipher import Cipher
|
|
|
|
| 21 |
from pytube.exceptions import LiveStreamError
|
| 22 |
from pytube.exceptions import RegexMatchError
|
| 23 |
from pytube.helpers import regex_search
|
| 24 |
from pytube.metadata import YouTubeMetadata
|
|
|
|
| 25 |
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
|
@@ -269,31 +271,29 @@ def get_ytplayer_config(html: str) -> Any:
|
|
| 269 |
"""
|
| 270 |
logger.debug("finding initial function name")
|
| 271 |
config_patterns = [
|
| 272 |
-
r"ytplayer\.config\s*=\s*
|
| 273 |
-
r"ytInitialPlayerResponse\s*=\s*
|
| 274 |
]
|
| 275 |
for pattern in config_patterns:
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
return json.loads(yt_player_config)
|
| 282 |
|
| 283 |
# setConfig() needs to be handled a little differently.
|
| 284 |
# We want to parse the entire argument to setConfig()
|
| 285 |
# and use then load that as json to find PLAYER_CONFIG
|
| 286 |
# inside of it.
|
| 287 |
setconfig_patterns = [
|
| 288 |
-
r"yt\.setConfig\(
|
| 289 |
]
|
| 290 |
for pattern in setconfig_patterns:
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
return json.loads(yt_config)['PLAYER_CONFIG']
|
| 297 |
|
| 298 |
raise RegexMatchError(
|
| 299 |
caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
|
|
@@ -431,11 +431,11 @@ def initial_data(watch_html: str) -> str:
|
|
| 431 |
@param watch_html: Html of the watch page
|
| 432 |
@return:
|
| 433 |
"""
|
| 434 |
-
initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*
|
| 435 |
try:
|
| 436 |
-
return
|
| 437 |
-
except
|
| 438 |
-
return
|
| 439 |
|
| 440 |
|
| 441 |
def metadata(initial_data) -> Optional[YouTubeMetadata]:
|
|
|
|
| 18 |
from urllib.parse import urlencode
|
| 19 |
|
| 20 |
from pytube.cipher import Cipher
|
| 21 |
+
from pytube.exceptions import HTMLParseError
|
| 22 |
from pytube.exceptions import LiveStreamError
|
| 23 |
from pytube.exceptions import RegexMatchError
|
| 24 |
from pytube.helpers import regex_search
|
| 25 |
from pytube.metadata import YouTubeMetadata
|
| 26 |
+
from pytube.parser import parse_for_object
|
| 27 |
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
|
|
|
| 271 |
"""
|
| 272 |
logger.debug("finding initial function name")
|
| 273 |
config_patterns = [
|
| 274 |
+
r"ytplayer\.config\s*=\s*",
|
| 275 |
+
r"ytInitialPlayerResponse\s*=\s*"
|
| 276 |
]
|
| 277 |
for pattern in config_patterns:
|
| 278 |
+
# Try each pattern consecutively if they don't find a match
|
| 279 |
+
try:
|
| 280 |
+
return parse_for_object(html, pattern)
|
| 281 |
+
except HTMLParseError:
|
| 282 |
+
continue
|
|
|
|
| 283 |
|
| 284 |
# setConfig() needs to be handled a little differently.
|
| 285 |
# We want to parse the entire argument to setConfig()
|
| 286 |
# and use then load that as json to find PLAYER_CONFIG
|
| 287 |
# inside of it.
|
| 288 |
setconfig_patterns = [
|
| 289 |
+
r"yt\.setConfig\(.*['\"]PLAYER_CONFIG['\"]:\s*"
|
| 290 |
]
|
| 291 |
for pattern in setconfig_patterns:
|
| 292 |
+
# Try each pattern consecutively if they don't find a match
|
| 293 |
+
try:
|
| 294 |
+
return parse_for_object(html, pattern)
|
| 295 |
+
except HTMLParseError:
|
| 296 |
+
continue
|
|
|
|
| 297 |
|
| 298 |
raise RegexMatchError(
|
| 299 |
caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
|
|
|
|
| 431 |
@param watch_html: Html of the watch page
|
| 432 |
@return:
|
| 433 |
"""
|
| 434 |
+
initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*"
|
| 435 |
try:
|
| 436 |
+
return parse_for_object(watch_html, initial_data_pattern)
|
| 437 |
+
except HTMLParseError:
|
| 438 |
+
return {}
|
| 439 |
|
| 440 |
|
| 441 |
def metadata(initial_data) -> Optional[YouTubeMetadata]:
|
pytube/parser.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from pytube.exceptions import HTMLParseError
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def parse_for_object(html, preceding_regex):
    """Locate a JavaScript object inside html and parse it.

    :param str html:
        HTML to be searched for the object.
    :param str preceding_regex:
        Regex matching the text that immediately precedes the object.
    :rtype: dict
    :returns:
        A dict created from parsing the object that starts where the
        regex match ends.
    :raises HTMLParseError:
        If ``preceding_regex`` does not match anywhere in ``html``.
    """
    match = re.search(preceding_regex, html)
    if match is None:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    # The object is expected to begin right after the matched prefix.
    return parse_for_object_from_startpoint(html, match.end())
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def parse_for_object_from_startpoint(html, start_point):
    """Parse the JavaScript object that begins at ``start_point`` in html.

    The text is scanned character by character while a stack of open
    contexts (objects, arrays and strings) is maintained; the object ends
    where the stack becomes empty. The extracted text is decoded with
    ``json.loads`` first and, if that fails, with ``ast.literal_eval``.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype: dict
    :returns:
        A dict created from parsing the object.
    :raises HTMLParseError:
        If the text at ``start_point`` does not begin with an open brace,
        if the object is never closed, or if the extracted text can be
        decoded neither as JSON nor as a Python literal.
    """
    html = html[start_point:]
    # startswith() is also safe when the slice is empty (start point at or
    # past the end of the input), where indexing html[0] would raise
    # IndexError instead of the documented HTMLParseError.
    if not html.startswith('{'):
        raise HTMLParseError('Invalid start point.')

    # Maps every context opener to the character that closes it. Both quote
    # styles are tracked so that braces and brackets inside single-quoted
    # strings are not mistaken for structural characters.
    context_closers = {
        '{': '}',
        '[': ']',
        '"': '"',
        "'": "'",
    }
    string_contexts = ('"', "'")

    # First character MUST be an open brace, so we put that on the stack
    # and start scanning from the second character.
    stack = ['{']
    i = 1
    length = len(html)

    while stack and i < length:
        curr_char = html[i]
        curr_context = stack[-1]

        if curr_char == context_closers[curr_context]:
            # Reached the closer of the innermost open context.
            stack.pop()
        elif curr_context in string_contexts:
            # Strings can contain context openers *and* closers, so nothing
            # is pushed here; a backslash escapes the following character,
            # which therefore has to be skipped as well.
            if curr_char == '\\':
                i += 1
        elif curr_char in context_closers:
            # Outside of strings, any opener starts a nested context.
            stack.append(curr_char)

        i += 1

    if stack:
        # Ran out of input before the outermost brace was closed.
        raise HTMLParseError('Object is not terminated.')

    full_obj = html[:i]
    try:
        return json.loads(full_obj)
    except json.decoder.JSONDecodeError:
        # Not strict JSON (e.g. trailing commas or single-quoted strings);
        # fall through to the more lenient literal parser below.
        pass

    try:
        return ast.literal_eval(full_obj)
    except (ValueError, SyntaxError) as err:
        # Surface decoding failures as the parser's own error type so that
        # callers trying several patterns can move on to the next one.
        raise HTMLParseError('Could not parse object.') from err
|
tests/test_extract.py
CHANGED
|
@@ -106,7 +106,7 @@ def test_signature_cipher_does_not_error(stream_dict):
|
|
| 106 |
|
| 107 |
def test_initial_data_missing():
|
| 108 |
initial_data = extract.initial_data('')
|
| 109 |
-
assert initial_data ==
|
| 110 |
|
| 111 |
|
| 112 |
def test_initial_data(stream_dict):
|
|
|
|
| 106 |
|
| 107 |
def test_initial_data_missing():
|
| 108 |
initial_data = extract.initial_data('')
|
| 109 |
+
assert initial_data == {}
|
| 110 |
|
| 111 |
|
| 112 |
def test_initial_data(stream_dict):
|
tests/test_metadata.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""Unit tests for the :module:`metadata <metadata>` module."""
|
| 3 |
-
import json
|
| 4 |
from pytube import extract
|
| 5 |
|
| 6 |
|
|
@@ -11,7 +10,7 @@ def test_extract_metadata_empty():
|
|
| 11 |
|
| 12 |
def test_metadata_from_initial_data(stream_dict):
|
| 13 |
initial_data = extract.initial_data(stream_dict)
|
| 14 |
-
ytmd = extract.metadata(
|
| 15 |
assert len(ytmd.raw_metadata) > 0
|
| 16 |
assert 'contents' in ytmd.raw_metadata[0]
|
| 17 |
assert len(ytmd.metadata) > 0
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""Unit tests for the :module:`metadata <metadata>` module."""
|
|
|
|
| 3 |
from pytube import extract
|
| 4 |
|
| 5 |
|
|
|
|
| 10 |
|
| 11 |
def test_metadata_from_initial_data(stream_dict):
|
| 12 |
initial_data = extract.initial_data(stream_dict)
|
| 13 |
+
ytmd = extract.metadata(initial_data)
|
| 14 |
assert len(ytmd.raw_metadata) > 0
|
| 15 |
assert 'contents' in ytmd.raw_metadata[0]
|
| 16 |
assert len(ytmd.metadata) > 0
|
tests/test_parser.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pytube.exceptions import HTMLParseError
|
| 5 |
+
from pytube.parser import parse_for_object
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_invalid_start():
    """A preceding regex that never matches must raise HTMLParseError."""
    html = 'test = {}'
    with pytest.raises(HTMLParseError):
        parse_for_object(html, r'invalid_regex')
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_parse_simple_empty_object():
    """An empty object literal parses to an empty dict."""
    assert parse_for_object('test = {}', r'test\s*=\s*') == {}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_parse_longer_empty_object():
    """Newlines between the braces of an empty object are tolerated."""
    test_html = 'test = {\n\n\n}'
    parsed = parse_for_object(test_html, r'test\s*=\s*')
    assert parsed == {}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_parse_empty_object_with_trailing_characters():
    """Characters following the closing brace are not part of the object."""
    parsed = parse_for_object('test = {};', r'test\s*=\s*')
    assert parsed == {}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_parse_simple_object():
    """Nested empty arrays and objects are parsed as values."""
    expected = {'foo': [], 'bar': {}}
    parsed = parse_for_object(
        'test = {"foo": [], "bar": {}};', r'test\s*=\s*'
    )
    assert parsed == expected
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_parse_context_closer_in_string_value():
    """A closing brace inside a string value must not end the object."""
    expected = {'foo': '};'}
    parsed = parse_for_object('test = {"foo": "};"};', r'test\s*=\s*')
    assert parsed == expected
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_parse_object_requiring_ast():
    """A trailing comma is rejected by json, yet the object still parses."""
    invalid_json = '{"foo": "bar",}'

    # Sanity check: the literal really is rejected by the JSON decoder.
    with pytest.raises(json.decoder.JSONDecodeError):
        json.loads(invalid_json)

    parsed = parse_for_object(f'test = {invalid_json}', r'test\s*=\s*')
    assert parsed == {'foo': 'bar'}
|