Improvement/parser (#838)

Browse files

* Added a parser for JavaScript objects embedded in HTML

Files changed (7) hide show

pytube/__main__.py +1 -3
pytube/contrib/playlist.py +1 -1
pytube/extract.py +19 -19
pytube/parser.py +83 -0
tests/test_extract.py +1 -1
tests/test_metadata.py +1 -2
tests/test_parser.py +58 -0

pytube/__main__.py CHANGED Viewed

@@ -80,7 +80,6 @@ class YouTube:
         self.fmt_streams: List[Stream] = []
-        self.initial_data_raw = None
         self.initial_data = {}
         self._metadata: Optional[YouTubeMetadata] = None
@@ -190,8 +189,7 @@ class YouTube:
                 video_id=self.video_id, watch_url=self.watch_url
             )
-        self.initial_data_raw = extract.initial_data(self.watch_html)
-        self.initial_data = json.loads(self.initial_data_raw)
         self.vid_info_raw = request.get(self.vid_info_url)
         if not self.age_restricted:

         self.fmt_streams: List[Stream] = []
         self.initial_data = {}
         self._metadata: Optional[YouTubeMetadata] = None
                 video_id=self.video_id, watch_url=self.watch_url
             )
+        self.initial_data = extract.initial_data(self.watch_html)
         self.vid_info_raw = request.get(self.vid_info_url)
         if not self.age_restricted:

pytube/contrib/playlist.py CHANGED Viewed

@@ -51,7 +51,7 @@ class Playlist(Sequence):
         """
         req = self.html
         videos_urls, continuation = self._extract_videos(
-            extract.initial_data(self.html)
         )
         if until_watch_id:
             try:

         """
         req = self.html
         videos_urls, continuation = self._extract_videos(
+            json.dumps(extract.initial_data(self.html))
         )
         if until_watch_id:
             try:

pytube/extract.py CHANGED Viewed

@@ -18,10 +18,12 @@ from urllib.parse import unquote
 from urllib.parse import urlencode
 from pytube.cipher import Cipher
 from pytube.exceptions import LiveStreamError
 from pytube.exceptions import RegexMatchError
 from pytube.helpers import regex_search
 from pytube.metadata import YouTubeMetadata
 logger = logging.getLogger(__name__)
@@ -269,31 +271,29 @@ def get_ytplayer_config(html: str) -> Any:
     """
     logger.debug("finding initial function name")
     config_patterns = [
-        r"ytplayer\.config\s*=\s*({.+?});ytplayer",
-        r"ytInitialPlayerResponse\s*=\s*({.+?(?<!gdpr)});"
     ]
     for pattern in config_patterns:
-        regex = re.compile(pattern)
-        function_match = regex.search(html)
-        if function_match:
-            logger.debug("finished regex search, matched: %s", pattern)
-            yt_player_config = function_match.group(1)
-            return json.loads(yt_player_config)
     # setConfig() needs to be handled a little differently.
     # We want to parse the entire argument to setConfig()
     #  and use then load that as json to find PLAYER_CONFIG
     #  inside of it.
     setconfig_patterns = [
-        r"yt\.setConfig\((.*['\"]PLAYER_CONFIG['\"]:\s*{.+?})\);"
     ]
     for pattern in setconfig_patterns:
-        regex = re.compile(pattern)
-        function_match = regex.search(html)
-        if function_match:
-            logger.debug("finished regex search, matched: %s", pattern)
-            yt_config = function_match.group(1)
-            return json.loads(yt_config)['PLAYER_CONFIG']
     raise RegexMatchError(
         caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
@@ -431,11 +431,11 @@ def initial_data(watch_html: str) -> str:
     @param watch_html: Html of the watch page
     @return:
     """
-    initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*([^\n]+);"
     try:
-        return regex_search(initial_data_pattern, watch_html, 1)
-    except RegexMatchError:
-        return "{}"
 def metadata(initial_data) -> Optional[YouTubeMetadata]:

 from urllib.parse import urlencode
 from pytube.cipher import Cipher
+from pytube.exceptions import HTMLParseError
 from pytube.exceptions import LiveStreamError
 from pytube.exceptions import RegexMatchError
 from pytube.helpers import regex_search
 from pytube.metadata import YouTubeMetadata
+from pytube.parser import parse_for_object
 logger = logging.getLogger(__name__)
     """
     logger.debug("finding initial function name")
     config_patterns = [
+        r"ytplayer\.config\s*=\s*",
+        r"ytInitialPlayerResponse\s*=\s*"
     ]
     for pattern in config_patterns:
+        # Try each pattern consecutively if they don't find a match
+        try:
+            return parse_for_object(html, pattern)
+        except HTMLParseError:
+            continue
     # setConfig() needs to be handled a little differently.
     # We want to parse the entire argument to setConfig()
     #  and use then load that as json to find PLAYER_CONFIG
     #  inside of it.
     setconfig_patterns = [
+        r"yt\.setConfig\(.*['\"]PLAYER_CONFIG['\"]:\s*"
     ]
     for pattern in setconfig_patterns:
+        # Try each pattern consecutively if they don't find a match
+        try:
+            return parse_for_object(html, pattern)
+        except HTMLParseError:
+            continue
     raise RegexMatchError(
         caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
     @param watch_html: Html of the watch page
     @return:
     """
+    initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*"
     try:
+        return parse_for_object(watch_html, initial_data_pattern)
+    except HTMLParseError:
+        return {}
 def metadata(initial_data) -> Optional[YouTubeMetadata]:

pytube/parser.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import ast
+import json
+import re
+from pytube.exceptions import HTMLParseError
+def parse_for_object(html, preceding_regex):
+    """Locate a JavaScript object inside html and parse it.
+
+    The object is expected to begin immediately after the text matched by
+    ``preceding_regex``.
+
+    :param str html:
+        HTML to be searched for an object.
+    :param str preceding_regex:
+        Regex matching the text that directly precedes the object.
+    :rtype dict:
+    :returns:
+        A dict created from parsing the object.
+    :raises HTMLParseError:
+        If ``preceding_regex`` matches nothing in ``html``.
+    """
+    match = re.search(preceding_regex, html)
+    if match is None:
+        raise HTMLParseError(f'No matches for regex {preceding_regex}')
+    # The object starts exactly where the preceding text ends.
+    return parse_for_object_from_startpoint(html, match.end())
+def parse_for_object_from_startpoint(html, start_point):
+    """Parse the JavaScript object that begins at ``start_point`` in ``html``.
+
+    Scans forward from the opening brace, tracking nested braces, brackets
+    and double-quoted strings on a stack, until the brace that closes the
+    object is found. The extracted text is decoded with ``json.loads``; if
+    it is not strict JSON (for example it has a trailing comma),
+    ``ast.literal_eval`` is tried instead.
+
+    :param str html:
+        HTML to be parsed for an object.
+    :param int start_point:
+        Index of where the object starts.
+    :rtype dict:
+    :returns:
+        A dict created from parsing the object.
+    :raises HTMLParseError:
+        If the text at ``start_point`` does not begin with an open brace,
+        if the object is never closed, or if the extracted text cannot be
+        decoded by either parser.
+    """
+    html = html[start_point:]
+    # startswith() instead of html[0]: an empty remainder (start point at the
+    # end of the input) must be reported as a parse error, not an IndexError.
+    if not html.startswith('{'):
+        raise HTMLParseError('Invalid start point.')
+    # First letter MUST be a open brace, so we put that in the stack,
+    # and skip the first character.
+    stack = ['{']
+    i = 1
+    context_closers = {
+        '{': '}',
+        '[': ']',
+        '"': '"'
+    }
+    # Stop as soon as the outermost brace has been closed (empty stack) or
+    # the input runs out.
+    while stack and i < len(html):
+        curr_char = html[i]
+        curr_context = stack[-1]
+        if curr_char == context_closers[curr_context]:
+            # Reached the closer of the current context.
+            stack.pop()
+        elif curr_context == '"':
+            # Strings can contain context openers *and* closers, so nothing
+            # is pushed here; a backslash escapes the next character, which
+            # is skipped so an escaped quote does not end the string.
+            if curr_char == '\\':
+                i += 1
+        elif curr_char in context_closers:
+            # Non-string contexts are where context openers are tracked.
+            stack.append(curr_char)
+        i += 1
+    if stack:
+        # Input ended before the object was closed; neither parser below
+        # could succeed, so fail with the error type callers handle.
+        raise HTMLParseError('Object is not terminated.')
+    full_obj = html[:i]
+    try:
+        return json.loads(full_obj)
+    except json.decoder.JSONDecodeError:
+        # Not strict JSON; fall back to Python literal syntax.
+        try:
+            return ast.literal_eval(full_obj)
+        except (ValueError, TypeError, SyntaxError) as err:
+            raise HTMLParseError('Could not parse object.') from err

tests/test_extract.py CHANGED Viewed

@@ -106,7 +106,7 @@ def test_signature_cipher_does_not_error(stream_dict):
 def test_initial_data_missing():
     initial_data = extract.initial_data('')
-    assert initial_data == "{}"
 def test_initial_data(stream_dict):

 def test_initial_data_missing():
     initial_data = extract.initial_data('')
+    assert initial_data == {}
 def test_initial_data(stream_dict):

tests/test_metadata.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 """Unit tests for the :module:`metadata <metadata>` module."""
-import json
 from pytube import extract
@@ -11,7 +10,7 @@ def test_extract_metadata_empty():
 def test_metadata_from_initial_data(stream_dict):
     initial_data = extract.initial_data(stream_dict)
-    ytmd = extract.metadata(json.loads(initial_data))
     assert len(ytmd.raw_metadata) > 0
     assert 'contents' in ytmd.raw_metadata[0]
     assert len(ytmd.metadata) > 0

 # -*- coding: utf-8 -*-
 """Unit tests for the :module:`metadata <metadata>` module."""
 from pytube import extract
 def test_metadata_from_initial_data(stream_dict):
     initial_data = extract.initial_data(stream_dict)
+    ytmd = extract.metadata(initial_data)
     assert len(ytmd.raw_metadata) > 0
     assert 'contents' in ytmd.raw_metadata[0]
     assert len(ytmd.metadata) > 0

tests/test_parser.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import json
+import pytest
+from pytube.exceptions import HTMLParseError
+from pytube.parser import parse_for_object
+def test_invalid_start():
+    """A preceding regex with no match in the html raises HTMLParseError."""
+    with pytest.raises(HTMLParseError):
+        parse_for_object('test = {}', r'invalid_regex')
+def test_parse_simple_empty_object():
+    """An empty object on a single line parses to an empty dict."""
+    assert parse_for_object('test = {}', r'test\s*=\s*') == {}
+def test_parse_longer_empty_object():
+    """An empty object whose braces sit on separate lines parses to {}."""
+    multiline_html = """test = {
+    }"""
+    assert parse_for_object(multiline_html, r'test\s*=\s*') == {}
+def test_parse_empty_object_with_trailing_characters():
+    """Characters following the closing brace are left out of the result."""
+    parsed = parse_for_object('test = {};', r'test\s*=\s*')
+    assert parsed == {}
+def test_parse_simple_object():
+    """An object holding an empty list and an empty object is parsed."""
+    expected = {'foo': [], 'bar': {}}
+    parsed = parse_for_object(
+        'test = {"foo": [], "bar": {}};', r'test\s*=\s*'
+    )
+    assert parsed == expected
+def test_parse_context_closer_in_string_value():
+    """A closing brace inside a string value does not end the object."""
+    expected = {'foo': '};'}
+    parsed = parse_for_object('test = {"foo": "};"};', r'test\s*=\s*')
+    assert parsed == expected
+def test_parse_object_requiring_ast():
+    """An object with a trailing comma is rejected by json yet still parsed."""
+    invalid_json = '{"foo": "bar",}'
+    # Guard: confirm the json module really does reject this input.
+    with pytest.raises(json.decoder.JSONDecodeError):
+        json.loads(invalid_json)
+    parsed = parse_for_object(f'test = {invalid_json}', r'test\s*=\s*')
+    assert parsed == {'foo': 'bar'}