| import re |
| import base64 |
| import logging |
| from typing import Optional |
| from urllib.parse import urljoin |
|
|
logger = logging.getLogger("scraper_utils")


class MediaExtractor:
    """Best-effort helpers for digging direct media URLs out of scraped HTML.

    The entry point is :meth:`extract_direct_url`; :meth:`decode_packed` is
    also usable on its own to unpack a Dean Edwards "p,a,c,k,e,d" script.
    """

    # Digit alphabet of the packer's index encoder: 0-9, a-z, then A-Z.
    # Radices up to 36 behave like JavaScript's Number.toString(radix); the
    # upper-case tail is what the packer emits for digits 36..61 (base 62).
    _PACKER_DIGITS = (
        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    )

    # Captures (payload, radix, count, '|'-joined words) from the call that
    # ends a packed script: ...}('payload',radix,count,'w0|w1|...'.split(...
    _PACKED_ARGS_RE = re.compile(
        r"\}\s*\('(.*?)',\s*(\d+),\s*(\d+),\s*'(.*?)'\.split"
    )
    # One identifier-like token of the packed payload.
    _TOKEN_RE = re.compile(r"\b\w+\b")
    # Ten or more consecutive "\xNN" escapes.
    _HEX_RUN_RE = re.compile(r'(?:\\x[0-9a-fA-F]{2}){10,}')
    # A quoted run of at least 40 base64 characters.
    _BASE64_RE = re.compile(r'["\']([A-Za-z0-9+/]{40,}=*)["\']')
    # A whole packed script. Both common endings are accepted:
    #   ...'.split('|')))          (short "while(c--)" unpacker)
    #   ...'.split('|'),0,{}))     (standard packer output)
    _PACKED_SCRIPT_RE = re.compile(
        r"eval\(function\(p,a,c,k,e,.*?\.split\('\|'\)"
        r"(?:,\s*\d+\s*,\s*\{\s*\})?\)\)"
    )
    # Tried in order: quoted URLs first, then bare URLs after ':' or '='.
    _MEDIA_RES = (
        re.compile(
            r'["\'](https?://[^"\']+\.(?:mp4|m3u8|m4s|ts|webm|mov|mkv)[^"\']*)["\']',
            re.IGNORECASE,
        ),
        re.compile(
            r'[:=]\s*(https?://[^\s"\'<>;]+(?:\.mp4|\.m3u8)[^\s"\'<>;]*)',
            re.IGNORECASE,
        ),
    )
    # NOTE: these are plain substring checks, so 'ads' also rejects URLs
    # containing e.g. "uploads"; such URLs can still be returned by the
    # JSON-style fallback below, which does not apply this blacklist.
    _REJECT_MARKERS = ('.js', '.css', 'ads', 'analytics', 'facebook')
    _ACCEPT_MARKERS = (
        '.mp4', '.m3u8', '.webm', 'googlevideo', 'okcdn', 'mxcontent',
        'vood', 'filelions',
    )
    # "key": "value" lookups used as a last resort, in priority order.
    _JSON_VALUE_RES = tuple(
        re.compile(re.escape(key) + r'\s*:\s*"([^"]+)"')
        for key in ('"file"', '"src"', '"url"', '"embed_url"')
    )

    @staticmethod
    def _encode_index(index: int, radix: int) -> str:
        """Return the packer's textual key for dictionary slot *index*.

        Raises:
            ValueError: if *index* is non-zero and *radix* is outside 2..62.
        """
        if index == 0:
            return "0"
        digits = MediaExtractor._PACKER_DIGITS
        if not 2 <= radix <= len(digits):
            raise ValueError(f"unsupported packer radix: {radix}")
        encoded = []
        while index:
            index, remainder = divmod(index, radix)
            encoded.append(digits[remainder])
        return "".join(reversed(encoded))

    @staticmethod
    def decode_packed(packed_str: str) -> Optional[str]:
        """Decode Dean Edwards packed JavaScript.

        Args:
            packed_str: Text containing the packer's trailing call
                ``}('payload',radix,count,'w0|w1|...'.split('|')...``.

        Returns:
            The payload with every encoded token replaced by its dictionary
            word (tokens whose word is empty are left as they are), or
            ``None`` when the call is not found or decoding fails.
            Radices 2 through 62 are supported.
        """
        try:
            found = MediaExtractor._PACKED_ARGS_RE.search(packed_str)
            if found is None:
                return None

            payload, radix_text, count_text, word_blob = found.groups()
            radix, count = int(radix_text), int(count_text)
            words = word_blob.split('|')

            # Build key -> word once and substitute in a single pass, so a
            # word that was already substituted is never rewritten by
            # another key and the cost is linear in the payload size.
            table = {
                MediaExtractor._encode_index(slot, radix): word
                for slot, word in enumerate(words[:count])
                if word
            }
            return MediaExtractor._TOKEN_RE.sub(
                lambda tok: table.get(tok.group(0), tok.group(0)), payload
            )
        except Exception as exc:
            # Deliberate best-effort boundary: malformed input yields None.
            logger.debug("Unpacking failed: %s", exc)
            return None

    @classmethod
    def _append_decoded_hex(cls, content: str) -> str:
        """Append the decoded text of every long ``\\xNN`` run to *content*."""
        extras = []
        for run in cls._HEX_RUN_RE.findall(content):
            try:
                raw = bytes.fromhex(run.replace('\\x', ''))
            except ValueError:
                continue
            extras.append(raw.decode('utf-8', errors='ignore'))
        return " ".join([content, *extras])

    @classmethod
    def _append_decoded_base64(cls, content: str) -> str:
        """Append decoded base64 literals that contain 'http' to *content*."""
        extras = []
        for blob in cls._BASE64_RE.findall(content):
            # Pages often strip the trailing '='; restore it so such blobs
            # decode instead of being dropped for incorrect padding.
            stripped = blob.rstrip('=')
            padded = stripped + '=' * (-len(stripped) % 4)
            try:
                raw = base64.b64decode(padded)
            except ValueError:  # binascii.Error is a ValueError subclass
                continue
            text = raw.decode('utf-8', errors='ignore')
            if 'http' in text:
                extras.append(text)
        return " ".join([content, *extras])

    @classmethod
    def _append_unpacked_scripts(cls, content: str) -> str:
        """Append the unpacked form of every packed script to *content*."""
        # Cheap substring test before running the regex over the document.
        if 'eval(function(p,a,c,k,e,' not in content:
            return content
        extras = []
        for script in cls._PACKED_SCRIPT_RE.findall(content):
            unpacked = cls.decode_packed(script)
            if unpacked:
                extras.append(unpacked)
        return " ".join([content, *extras])

    @classmethod
    def _find_media_url(cls, content: str) -> Optional[str]:
        """Return the first acceptable media URL in *content*, else None."""
        for media_re in cls._MEDIA_RES:
            for hit in media_re.finditer(content):
                found = hit.group(1).replace('\\', '')
                if 'http' not in found:
                    continue
                if any(marker in found for marker in cls._REJECT_MARKERS):
                    continue
                if any(marker in found for marker in cls._ACCEPT_MARKERS):
                    return found

        # Last resort: JSON-style "key": "value" pairs. Only the first
        # occurrence of each key is inspected.
        for value_re in cls._JSON_VALUE_RES:
            hit = value_re.search(content)
            if hit:
                val = hit.group(1).replace('\\', '')
                if val.startswith('http') and ('.mp4' in val or '.m3u8' in val):
                    return val

        return None

    @classmethod
    async def extract_direct_url(cls, html: str, current_url: str = "") -> Optional[str]:
        """
        Ultra-aggressive extraction of direct video URLs from HTML.
        Handles hex, base64, packed JS, and raw regex.

        The document is expanded in stages, each stage seeing the output of
        the previous one: ``\\/`` is unescaped, long ``\\xNN`` runs are
        decoded, quoted base64 literals are decoded, and packed scripts are
        unpacked. The expanded text is then searched for media URLs.

        Args:
            html: Page source to search.
            current_url: Accepted for interface compatibility; this method
                does not currently use it.

        Returns:
            The first matching absolute URL, or ``None`` if *html* is empty
            or nothing matches.
        """
        if not html:
            return None

        content = html.replace('\\/', '/')
        content = cls._append_decoded_hex(content)
        content = cls._append_decoded_base64(content)
        content = cls._append_unpacked_scripts(content)
        return cls._find_media_url(content)
|
|