Fuan
committed on
Fix regex to find throttle function name (#1222)
Browse files
* Fix regex to find throttle function name
The JavaScript now stores the throttling function name in an array.
Fix https://github.com/pytube/pytube/issues/1218
* Fix array parsing
Strip whitespace around symbol names for future-proofing.
The variable name might be "b" right now, but it could change in the
future.
- pytube/cipher.py +22 -4
- tests/conftest.py +14 -0
- tests/mocks/base.js.gz +0 -0
- tests/test_cipher.py +11 -0
pytube/cipher.py
CHANGED
|
@@ -263,9 +263,14 @@ def get_throttling_function_name(js: str) -> str:
|
|
| 263 |
"""
|
| 264 |
function_patterns = [
|
| 265 |
# https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
|
| 266 |
-
#
|
| 267 |
-
#
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
]
|
| 270 |
logger.debug('Finding throttling function name')
|
| 271 |
for pattern in function_patterns:
|
|
@@ -273,7 +278,20 @@ def get_throttling_function_name(js: str) -> str:
|
|
| 273 |
function_match = regex.search(js)
|
| 274 |
if function_match:
|
| 275 |
logger.debug("finished regex search, matched: %s", pattern)
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
raise RegexMatchError(
|
| 279 |
caller="get_throttling_function_name", pattern="multiple"
|
|
|
|
| 263 |
"""
|
| 264 |
function_patterns = [
|
| 265 |
# https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
|
| 266 |
+
# https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8
|
| 267 |
+
# var Bpa = [iha];
|
| 268 |
+
# ...
|
| 269 |
+
# a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b),
|
| 270 |
+
# Bpa.length || iha("")) }};
|
| 271 |
+
# In the above case, `iha` is the relevant function name
|
| 272 |
+
r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
|
| 273 |
+
r'\([a-z]\s*=\s*([a-zA-Z0-9$]{3})(\[\d+\])?\([a-z]\)',
|
| 274 |
]
|
| 275 |
logger.debug('Finding throttling function name')
|
| 276 |
for pattern in function_patterns:
|
|
|
|
| 278 |
function_match = regex.search(js)
|
| 279 |
if function_match:
|
| 280 |
logger.debug("finished regex search, matched: %s", pattern)
|
| 281 |
+
if len(function_match.groups()) == 1:
|
| 282 |
+
return function_match.group(1)
|
| 283 |
+
idx = function_match.group(2)
|
| 284 |
+
if idx:
|
| 285 |
+
idx = idx.strip("[]")
|
| 286 |
+
array = re.search(
|
| 287 |
+
r'var {nfunc}\s*=\s*(\[.+?\]);'.format(
|
| 288 |
+
nfunc=function_match.group(1)),
|
| 289 |
+
js
|
| 290 |
+
)
|
| 291 |
+
if array:
|
| 292 |
+
array = array.group(1).strip("[]").split(",")
|
| 293 |
+
array = [x.strip() for x in array]
|
| 294 |
+
return array[int(idx)]
|
| 295 |
|
| 296 |
raise RegexMatchError(
|
| 297 |
caller="get_throttling_function_name", pattern="multiple"
|
tests/conftest.py
CHANGED
|
@@ -146,3 +146,17 @@ def channel_videos_html():
|
|
| 146 |
)
|
| 147 |
with gzip.open(file_path, 'rb') as f:
|
| 148 |
return f.read().decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
)
|
| 147 |
with gzip.open(file_path, 'rb') as f:
|
| 148 |
return f.read().decode('utf-8')
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@pytest.fixture
|
| 152 |
+
def base_js():
|
| 153 |
+
"""Youtube base.js retrieved on 2022-02-04 from
|
| 154 |
+
https://www.youtube.com/watch?v=vmzxpUsN0uA
|
| 155 |
+
"""
|
| 156 |
+
file_path = os.path.join(
|
| 157 |
+
os.path.dirname(os.path.realpath(__file__)),
|
| 158 |
+
"mocks",
|
| 159 |
+
"base.js.gz",
|
| 160 |
+
)
|
| 161 |
+
with gzip.open(file_path, 'rb') as f:
|
| 162 |
+
return f.read().decode('utf-8')
|
tests/mocks/base.js.gz
ADDED
|
Binary file (611 kB). View file
|
|
|
tests/test_cipher.py
CHANGED
|
@@ -77,3 +77,14 @@ def test_js_splice():
|
|
| 77 |
for args, result in mapping.items():
|
| 78 |
a = [1, 2, 3, 4]
|
| 79 |
assert cipher.js_splice(a, *args) == result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
for args, result in mapping.items():
|
| 78 |
a = [1, 2, 3, 4]
|
| 79 |
assert cipher.js_splice(a, *args) == result
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def test_get_throttling_function_name(base_js):
|
| 83 |
+
# Values expected as of 2022/02/04:
|
| 84 |
+
raw_var = r'var Apa=[hha]'
|
| 85 |
+
assert raw_var in base_js
|
| 86 |
+
raw_code = r'a.url="";a.C&&(b=a.get("n"))&&(b=Apa[0](b),a.set("n",b),'\
|
| 87 |
+
r'Apa.length||hha(""))}};'
|
| 88 |
+
assert raw_code in base_js
|
| 89 |
+
func_name = cipher.get_throttling_function_name(base_js)
|
| 90 |
+
assert func_name == "hha"
|