Taylor Fox Dahlin committed on
Fix/404 error on adaptive (#799)
Browse files* Fixes slight mistake in implementation of splice, and added unit test.
* Added sequential filestream request support for adaptive downloads, with unit testing.
* Removed some type hints
* Extracted default values to variables
* Made lru_caches limited to default (128)
- pytube/cipher.py +1 -1
- pytube/request.py +109 -16
- pytube/streams.py +22 -6
- tests/test_cipher.py +5 -0
- tests/test_streams.py +83 -0
pytube/cipher.py
CHANGED
|
@@ -247,7 +247,7 @@ def splice(arr: List, b: int):
|
|
| 247 |
>>> splice([1, 2, 3, 4], 2)
|
| 248 |
[1, 2]
|
| 249 |
"""
|
| 250 |
-
return arr[:b]
|
| 251 |
|
| 252 |
|
| 253 |
def swap(arr: List, b: int):
|
|
|
|
| 247 |
>>> splice([1, 2, 3, 4], 2)
|
| 248 |
[1, 2]
|
| 249 |
"""
|
| 250 |
+
return arr[:b]
|
| 251 |
|
| 252 |
|
| 253 |
def swap(arr: List, b: int):
|
pytube/request.py
CHANGED
|
@@ -2,21 +2,20 @@
|
|
| 2 |
"""Implements a simple wrapper around urlopen."""
|
| 3 |
import logging
|
| 4 |
from functools import lru_cache
|
| 5 |
-
|
| 6 |
-
from
|
| 7 |
-
from typing import Iterable
|
| 8 |
-
from typing import Optional
|
| 9 |
from urllib.request import Request
|
| 10 |
from urllib.request import urlopen
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
-
def _execute_request(
|
| 16 |
-
url: str,
|
| 17 |
-
method: Optional[str] = None,
|
| 18 |
-
headers: Optional[Dict[str, str]] = None,
|
| 19 |
-
) -> HTTPResponse:
|
| 20 |
base_headers = {"User-Agent": "Mozilla/5.0"}
|
| 21 |
if headers:
|
| 22 |
base_headers.update(headers)
|
|
@@ -27,7 +26,7 @@ def _execute_request(
|
|
| 27 |
return urlopen(request) # nosec
|
| 28 |
|
| 29 |
|
| 30 |
-
def get(url, extra_headers=None)
|
| 31 |
"""Send an http GET request.
|
| 32 |
|
| 33 |
:param str url:
|
|
@@ -43,9 +42,51 @@ def get(url, extra_headers=None) -> str:
|
|
| 43 |
return _execute_request(url, headers=extra_headers).read().decode("utf-8")
|
| 44 |
|
| 45 |
|
| 46 |
-
def
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
"""Read the response in chunks.
|
| 50 |
:param str url: The URL to perform the GET request for.
|
| 51 |
:param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
|
|
@@ -76,8 +117,8 @@ def stream(
|
|
| 76 |
return # pylint: disable=R1711
|
| 77 |
|
| 78 |
|
| 79 |
-
@lru_cache(
|
| 80 |
-
def filesize(url
|
| 81 |
"""Fetch size in bytes of file at given URL
|
| 82 |
|
| 83 |
:param str url: The URL to get the size of
|
|
@@ -86,7 +127,59 @@ def filesize(url: str) -> int:
|
|
| 86 |
return int(head(url)["content-length"])
|
| 87 |
|
| 88 |
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
"""Fetch headers returned http GET request.
|
| 91 |
|
| 92 |
:param str url:
|
|
|
|
| 2 |
"""Implements a simple wrapper around urlopen."""
|
| 3 |
import logging
|
| 4 |
from functools import lru_cache
|
| 5 |
+
import re
|
| 6 |
+
from urllib import parse
|
|
|
|
|
|
|
| 7 |
from urllib.request import Request
|
| 8 |
from urllib.request import urlopen
|
| 9 |
|
| 10 |
+
from pytube.exceptions import RegexMatchError
|
| 11 |
+
from pytube.helpers import regex_search
|
| 12 |
+
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
+
default_chunk_size = 4096 # 4kb
|
| 15 |
+
default_range_size = 9437184 # 9MB
|
| 16 |
|
| 17 |
|
| 18 |
+
def _execute_request(url, method=None, headers=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
base_headers = {"User-Agent": "Mozilla/5.0"}
|
| 20 |
if headers:
|
| 21 |
base_headers.update(headers)
|
|
|
|
| 26 |
return urlopen(request) # nosec
|
| 27 |
|
| 28 |
|
| 29 |
+
def get(url, extra_headers=None):
|
| 30 |
"""Send an http GET request.
|
| 31 |
|
| 32 |
:param str url:
|
|
|
|
| 42 |
return _execute_request(url, headers=extra_headers).read().decode("utf-8")
|
| 43 |
|
| 44 |
|
| 45 |
+
def seq_stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
|
| 46 |
+
"""Read the response in sequence.
|
| 47 |
+
:param str url: The URL to perform the GET request for.
|
| 48 |
+
:param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
|
| 49 |
+
:param int range_size: The size in bytes of each range request. Defaults
|
| 50 |
+
to 9MB
|
| 51 |
+
:rtype: Iterable[bytes]
|
| 52 |
+
"""
|
| 53 |
+
# YouTube expects a request sequence number as part of the parameters.
|
| 54 |
+
split_url = parse.urlsplit(url)
|
| 55 |
+
base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
|
| 56 |
+
|
| 57 |
+
querys = dict(parse.parse_qsl(split_url.query))
|
| 58 |
+
|
| 59 |
+
# The 0th sequential request provides the file headers, which tell us
|
| 60 |
+
# information about how the file is segmented.
|
| 61 |
+
querys['sq'] = 0
|
| 62 |
+
url = base_url + parse.urlencode(querys)
|
| 63 |
+
|
| 64 |
+
segment_data = b''
|
| 65 |
+
for chunk in stream(url):
|
| 66 |
+
yield chunk
|
| 67 |
+
segment_data += chunk
|
| 68 |
+
|
| 69 |
+
# We can then parse the header to find the number of segments
|
| 70 |
+
stream_info = segment_data.split(b'\r\n')
|
| 71 |
+
segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
|
| 72 |
+
for line in stream_info:
|
| 73 |
+
match = segment_count_pattern.search(line)
|
| 74 |
+
if match:
|
| 75 |
+
segment_count = int(match.group(1).decode('utf-8'))
|
| 76 |
+
|
| 77 |
+
# We request these segments sequentially to build the file.
|
| 78 |
+
seq_num = 1
|
| 79 |
+
while seq_num <= segment_count:
|
| 80 |
+
# Create sequential request URL
|
| 81 |
+
querys['sq'] = seq_num
|
| 82 |
+
url = base_url + parse.urlencode(querys)
|
| 83 |
+
|
| 84 |
+
yield from stream(url)
|
| 85 |
+
seq_num += 1
|
| 86 |
+
return # pylint: disable=R1711
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
|
| 90 |
"""Read the response in chunks.
|
| 91 |
:param str url: The URL to perform the GET request for.
|
| 92 |
:param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
|
|
|
|
| 117 |
return # pylint: disable=R1711
|
| 118 |
|
| 119 |
|
| 120 |
+
@lru_cache()
|
| 121 |
+
def filesize(url):
|
| 122 |
"""Fetch size in bytes of file at given URL
|
| 123 |
|
| 124 |
:param str url: The URL to get the size of
|
|
|
|
| 127 |
return int(head(url)["content-length"])
|
| 128 |
|
| 129 |
|
| 130 |
+
@lru_cache()
|
| 131 |
+
def seq_filesize(url):
|
| 132 |
+
"""Fetch size in bytes of file at given URL from sequential requests
|
| 133 |
+
|
| 134 |
+
:param str url: The URL to get the size of
|
| 135 |
+
:returns: int: size in bytes of remote file
|
| 136 |
+
"""
|
| 137 |
+
total_filesize = 0
|
| 138 |
+
# YouTube expects a request sequence number as part of the parameters.
|
| 139 |
+
split_url = parse.urlsplit(url)
|
| 140 |
+
base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
|
| 141 |
+
querys = dict(parse.parse_qsl(split_url.query))
|
| 142 |
+
|
| 143 |
+
# The 0th sequential request provides the file headers, which tell us
|
| 144 |
+
# information about how the file is segmented.
|
| 145 |
+
querys['sq'] = 0
|
| 146 |
+
url = base_url + parse.urlencode(querys)
|
| 147 |
+
response = _execute_request(
|
| 148 |
+
url, method="GET"
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
response_value = response.read()
|
| 152 |
+
# The file header must be added to the total filesize
|
| 153 |
+
total_filesize += len(response_value)
|
| 154 |
+
|
| 155 |
+
# We can then parse the header to find the number of segments
|
| 156 |
+
segment_count = 0
|
| 157 |
+
stream_info = response_value.split(b'\r\n')
|
| 158 |
+
segment_regex = b'Segment-Count: (\\d+)'
|
| 159 |
+
for line in stream_info:
|
| 160 |
+
# One of the lines should contain the segment count, but we don't know
|
| 161 |
+
# which, so we need to iterate through the lines to find it
|
| 162 |
+
try:
|
| 163 |
+
segment_count = int(regex_search(segment_regex, line, 1))
|
| 164 |
+
except RegexMatchError:
|
| 165 |
+
pass
|
| 166 |
+
|
| 167 |
+
if segment_count == 0:
|
| 168 |
+
raise RegexMatchError('seq_filesize', segment_regex)
|
| 169 |
+
|
| 170 |
+
# We make HEAD requests to the segments sequentially to find the total filesize.
|
| 171 |
+
seq_num = 1
|
| 172 |
+
while seq_num <= segment_count:
|
| 173 |
+
# Create sequential request URL
|
| 174 |
+
querys['sq'] = seq_num
|
| 175 |
+
url = base_url + parse.urlencode(querys)
|
| 176 |
+
|
| 177 |
+
total_filesize += int(head(url)['content-length'])
|
| 178 |
+
seq_num += 1
|
| 179 |
+
return total_filesize
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def head(url):
|
| 183 |
"""Fetch headers returned http GET request.
|
| 184 |
|
| 185 |
:param str url:
|
pytube/streams.py
CHANGED
|
@@ -14,6 +14,7 @@ from typing import BinaryIO
|
|
| 14 |
from typing import Dict
|
| 15 |
from typing import Optional
|
| 16 |
from typing import Tuple
|
|
|
|
| 17 |
from urllib.parse import parse_qs
|
| 18 |
|
| 19 |
from pytube import extract
|
|
@@ -153,7 +154,12 @@ class Stream:
|
|
| 153 |
Filesize (in bytes) of the stream.
|
| 154 |
"""
|
| 155 |
if self._filesize is None:
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
return self._filesize
|
| 158 |
|
| 159 |
@property
|
|
@@ -250,11 +256,21 @@ class Stream:
|
|
| 250 |
)
|
| 251 |
|
| 252 |
with open(file_path, "wb") as fh:
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
self.on_complete(file_path)
|
| 259 |
return file_path
|
| 260 |
|
|
|
|
| 14 |
from typing import Dict
|
| 15 |
from typing import Optional
|
| 16 |
from typing import Tuple
|
| 17 |
+
from urllib.error import HTTPError
|
| 18 |
from urllib.parse import parse_qs
|
| 19 |
|
| 20 |
from pytube import extract
|
|
|
|
| 154 |
Filesize (in bytes) of the stream.
|
| 155 |
"""
|
| 156 |
if self._filesize is None:
|
| 157 |
+
try:
|
| 158 |
+
self._filesize = request.filesize(self.url)
|
| 159 |
+
except HTTPError as e:
|
| 160 |
+
if e.code != 404:
|
| 161 |
+
raise
|
| 162 |
+
self._filesize = request.seq_filesize(self.url)
|
| 163 |
return self._filesize
|
| 164 |
|
| 165 |
@property
|
|
|
|
| 256 |
)
|
| 257 |
|
| 258 |
with open(file_path, "wb") as fh:
|
| 259 |
+
try:
|
| 260 |
+
for chunk in request.stream(self.url):
|
| 261 |
+
# reduce the (bytes) remainder by the length of the chunk.
|
| 262 |
+
bytes_remaining -= len(chunk)
|
| 263 |
+
# send to the on_progress callback.
|
| 264 |
+
self.on_progress(chunk, fh, bytes_remaining)
|
| 265 |
+
except HTTPError as e:
|
| 266 |
+
if e.code != 404:
|
| 267 |
+
raise
|
| 268 |
+
# Some adaptive streams need to be requested with sequence numbers
|
| 269 |
+
for chunk in request.seq_stream(self.url):
|
| 270 |
+
# reduce the (bytes) remainder by the length of the chunk.
|
| 271 |
+
bytes_remaining -= len(chunk)
|
| 272 |
+
# send to the on_progress callback.
|
| 273 |
+
self.on_progress(chunk, fh, bytes_remaining)
|
| 274 |
self.on_complete(file_path)
|
| 275 |
return file_path
|
| 276 |
|
tests/test_cipher.py
CHANGED
|
@@ -23,3 +23,8 @@ def test_get_transform_object_with_no_match_should_error():
|
|
| 23 |
def test_reverse():
|
| 24 |
reversed_array = cipher.reverse([1, 2, 3, 4], None)
|
| 25 |
assert reversed_array == [4, 3, 2, 1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def test_reverse():
|
| 24 |
reversed_array = cipher.reverse([1, 2, 3, 4], None)
|
| 25 |
assert reversed_array == [4, 3, 2, 1]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_splice():
|
| 29 |
+
assert cipher.splice([1, 2, 3, 4], 2) == [1, 2]
|
| 30 |
+
assert cipher.splice([1, 2, 3, 4], 1) == [1]
|
tests/test_streams.py
CHANGED
|
@@ -2,8 +2,10 @@
|
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
from datetime import datetime
|
|
|
|
| 5 |
from unittest import mock
|
| 6 |
from unittest.mock import MagicMock, Mock
|
|
|
|
| 7 |
|
| 8 |
from pytube import request
|
| 9 |
from pytube import Stream
|
|
@@ -306,3 +308,84 @@ def test_repr_for_adaptive_streams(cipher_signature):
|
|
| 306 |
'vcodec="avc1.640028" progressive="False" type="video">'
|
| 307 |
)
|
| 308 |
assert stream == expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
from datetime import datetime
|
| 5 |
+
import pytest
|
| 6 |
from unittest import mock
|
| 7 |
from unittest.mock import MagicMock, Mock
|
| 8 |
+
from urllib.error import HTTPError
|
| 9 |
|
| 10 |
from pytube import request
|
| 11 |
from pytube import Stream
|
|
|
|
| 308 |
'vcodec="avc1.640028" progressive="False" type="video">'
|
| 309 |
)
|
| 310 |
assert stream == expected
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def test_segmented_stream_on_404(cipher_signature):
|
| 314 |
+
stream = cipher_signature.streams.filter(adaptive=True)[0]
|
| 315 |
+
with mock.patch('pytube.request.head') as mock_head:
|
| 316 |
+
with mock.patch('pytube.request.urlopen') as mock_url_open:
|
| 317 |
+
# Mock the responses to YouTube
|
| 318 |
+
mock_url_open_object = mock.Mock()
|
| 319 |
+
|
| 320 |
+
# These are our 4 "segments" of a dash stream
|
| 321 |
+
# The first explains how many pieces there are, and
|
| 322 |
+
# the rest are those pieces
|
| 323 |
+
responses = [
|
| 324 |
+
b'Raw_data\r\nSegment-Count: 3',
|
| 325 |
+
b'a',
|
| 326 |
+
b'b',
|
| 327 |
+
b'c',
|
| 328 |
+
]
|
| 329 |
+
joined_responses = b''.join(responses)
|
| 330 |
+
|
| 331 |
+
# We create response headers to match the segments
|
| 332 |
+
response_headers = [
|
| 333 |
+
{
|
| 334 |
+
'content-length': len(r),
|
| 335 |
+
'Content-Range': '0-%s/%s' % (str(len(r)), str(len(r)))
|
| 336 |
+
}
|
| 337 |
+
for r in responses
|
| 338 |
+
]
|
| 339 |
+
|
| 340 |
+
# Request order for stream:
|
| 341 |
+
# Filesize:
|
| 342 |
+
# 1. head(url) -> 404
|
| 343 |
+
# 2. get(url&sn=0)
|
| 344 |
+
# 3. head(url&sn=[1,2,3])
|
| 345 |
+
# Download:
|
| 346 |
+
# 4. info(url) -> 404
|
| 347 |
+
# 5. get(url&sn=0)
|
| 348 |
+
# 6. get(url&sn=[1,2,3])
|
| 349 |
+
|
| 350 |
+
# Handle filesize requests
|
| 351 |
+
mock_head.side_effect = [
|
| 352 |
+
HTTPError('', 404, 'Not Found', '', ''),
|
| 353 |
+
*response_headers[1:],
|
| 354 |
+
]
|
| 355 |
+
|
| 356 |
+
# Each response must be followed by None, to break iteration
|
| 357 |
+
# in the stream() function
|
| 358 |
+
mock_url_open_object.read.side_effect = [
|
| 359 |
+
responses[0], None,
|
| 360 |
+
responses[0], None,
|
| 361 |
+
responses[1], None,
|
| 362 |
+
responses[2], None,
|
| 363 |
+
responses[3], None,
|
| 364 |
+
]
|
| 365 |
+
|
| 366 |
+
# This handles the HEAD requests to get content-length
|
| 367 |
+
mock_url_open_object.info.side_effect = [
|
| 368 |
+
HTTPError('', 404, 'Not Found', '', ''),
|
| 369 |
+
*response_headers
|
| 370 |
+
]
|
| 371 |
+
|
| 372 |
+
mock_url_open.return_value = mock_url_open_object
|
| 373 |
+
|
| 374 |
+
with mock.patch('builtins.open', new_callable=mock.mock_open) as mock_open:
|
| 375 |
+
file_handle = mock_open.return_value.__enter__.return_value
|
| 376 |
+
fp = stream.download()
|
| 377 |
+
full_content = b''
|
| 378 |
+
for call in file_handle.write.call_args_list:
|
| 379 |
+
args, kwargs = call
|
| 380 |
+
full_content += b''.join(args)
|
| 381 |
+
|
| 382 |
+
assert full_content == joined_responses
|
| 383 |
+
mock_open.assert_called_once_with(fp, 'wb')
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def test_segmented_only_catches_404(cipher_signature):
|
| 387 |
+
stream = cipher_signature.streams.filter(adaptive=True)[0]
|
| 388 |
+
with mock.patch('pytube.request.head') as mock_head:
|
| 389 |
+
mock_head.side_effect = HTTPError('', 403, 'Forbidden', '', '')
|
| 390 |
+
with pytest.raises(HTTPError):
|
| 391 |
+
stream.download()
|