import re
import base64
import logging
from typing import Optional
from urllib.parse import urljoin

logger = logging.getLogger("scraper_utils")

class MediaExtractor:
    @staticmethod
    def decode_packed(packed_str: str) -> Optional[str]:
        """Decodes Dean Edwards packed JavaScript."""
        try:
            pattern = r"\}\s*\('(.*?)',\s*(\d+),\s*(\d+),\s*'(.*?)'\.split"
            match = re.search(pattern, packed_str)
            if not match: return None
            
            p, a, c, k = match.groups()
            a, c = int(a), int(c)
            k = k.split('|')
            
            def baseN(num, b):
                return ((num == 0) and "0") or \
                       (baseN(num // b, b).lstrip("0") + "0123456789abcdefghijklmnopqrstuvwxyz"[num % b])

            result = p
            for i in range(c - 1, -1, -1):
                key = baseN(i, a)
                if i < len(k) and k[i]:
                    result = re.sub(r'\b' + re.escape(key) + r'\b', k[i], result)
            return result
        except Exception as e:
            logger.debug(f"Unpacking failed: {e}")
            return None

    @classmethod
    async def extract_direct_url(cls, html: str, current_url: str = "") -> Optional[str]:
        """
        Ultra-aggressive extraction of direct video URLs from HTML.
        Handles hex, base64, packed JS, and raw regex.
        """
        if not html: return None
        
        # 1. Normalization
        content = html.replace('\\/', '/')
        
        # 2. Decode Hex
        hex_matches = re.findall(r'(?:\\x[0-9a-fA-F]{2}){10,}', content)
        for hm in hex_matches:
            try:
                decoded = bytes.fromhex(hm.replace('\\x', '')).decode('utf-8', errors='ignore')
                content += " " + decoded
            except: continue

        # 3. Decode Base64
        b64_matches = re.findall(r'["\']([A-Za-z0-9+/]{40,}=*)["\']', content)
        for bm in b64_matches:
            try:
                decoded = base64.b64decode(bm).decode('utf-8', errors='ignore')
                if 'http' in decoded: content += " " + decoded
            except: continue

        # 4. Unpack JS
        if 'eval(function(p,a,c,k,e,' in content:
            # Find all packed scripts
            packed_scripts = re.findall(r'eval\(function\(p,a,c,k,e,.*?.split\(\'\|\'\)\)\)', content)
            for ps in packed_scripts:
                unpacked = cls.decode_packed(ps)
                if unpacked: content += " " + unpacked

        # 5. Extraction Regex
        media_patterns = [
            r'["\'](https?://[^"\']+\.(?:mp4|m3u8|m4s|ts|webm|mov|mkv)[^"\']*)["\']',
            r'[:=]\s*(https?://[^\s"\'<>;]+(?:\.mp4|\.m3u8)[^\s"\'<>;]*)'
        ]
        
        for pattern in media_patterns:
            for match in re.finditer(pattern, content, re.IGNORECASE):
                found = match.group(1).replace('\\', '')
                if 'http' in found and not any(x in found for x in ['.js', '.css', 'ads', 'analytics', 'facebook']):
                    # Critical Validation: Must be a media file or known provider
                    if any(x in found for x in ['.mp4', '.m3u8', '.webm', 'googlevideo', 'okcdn', 'mxcontent', 'vood', 'filelions']):
                        return found

        # 6. Fallback: JSON keys
        for key in ['"file"', '"src"', '"url"', '"embed_url"']:
            match = re.search(f'{key}\s*:\s*"([^"]+)"', content)
            if match:
                val = match.group(1).replace('\\', '')
                if val.startswith('http') and ('.mp4' in val or '.m3u8' in val):
                    return val

        return None