Update Shapp.py
Shapp.py
CHANGED
@@ -1,3 +1,8 @@
import json
import os
import re
@@ -8,14 +13,18 @@ import zipfile
import tempfile
import chardet
import tarfile
from datetime import datetime
- from typing import List, Dict, Optional, Union, Tuple
from pathlib import Path
from urllib.parse import urlparse, urljoin
import requests
import validators
import gradio as gr
- from diskcache import Cache
from bs4 import BeautifulSoup, NavigableString, Tag
from fake_useragent import UserAgent
from cleantext import clean
@@ -23,175 +32,271 @@ import qrcode
from PIL import Image, ImageDraw, ImageFont
import numpy as np

- #
try:
- from playwright.sync_api import sync_playwright
PLAYWRIGHT_AVAILABLE = True
except ImportError:

- # Setup enhanced logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
handlers=[
logging.StreamHandler(),
- logging.FileHandler('app.log', encoding='utf-8')
]
)
logger = logging.getLogger(__name__)

- #
OUTPUTS_DIR = Path('output')
QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes'
SNAPSHOTS_DIR = OUTPUTS_DIR / 'snapshots'
- MEDIA_DIR = OUTPUTS_DIR / 'media'
TEMP_DIR = OUTPUTS_DIR / 'temp'
for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR, SNAPSHOTS_DIR, MEDIA_DIR]:
directory.mkdir(parents=True, exist_ok=True)
| 81 |
class MediaDownloader:
- """Handles downloading and saving media files."""
self.session = requests.Session()
- self.
try:
- response = self.session.get(url, timeout=
response.raise_for_status()
content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
- # Determine extension based on MIME type or URL suffix
ext = mimetypes.guess_extension(content_type)
if not ext:
with open(local_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
self.downloaded_files[url] = str(local_path)
logger.info(f"Downloaded media: {url} -> {local_path}")
return str(local_path)
except requests.exceptions.RequestException as e:
logger.warning(f"Failed to download media {url}: {e}")
return None
except Exception as e:
- logger.error(f"
return None

| 123 |
class EnhancedURLProcessor:
- """Advanced URL processing with complete content extraction"""
- def __init__(self):
self.session = requests.Session()
- self.timeout =
- self.max_retries =
self.user_agent = UserAgent()
self.session.headers.update({
'User-Agent': self.user_agent.random,
- 'Accept': '*/*',
- 'Accept-Language': 'en-US,en;q=0.
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
- 'DNT': '1'
})
- def validate_url(self, url: str) ->
- """Enhanced URL validation with detailed feedback"""
try:
if not validators.url(url):
- return
parsed = urlparse(url)
if not all([parsed.scheme, parsed.netloc]):
- return
try:
- head_response = self.session.head(
head_response.raise_for_status()
except requests.exceptions.RequestException:
response.raise_for_status()
- return
'content_type': head_response.headers.get('Content-Type', 'unknown'),
'server': head_response.headers.get('Server', 'unknown'),
'size': head_response.headers.get('Content-Length', 'unknown')
}
except Exception as e:
- return
try:
- logger.info(f"Fetching content from
self.session.headers.update({'User-Agent': self.user_agent.random})
response.raise_for_status()
# Encoding detection
encoding = response.encoding
if encoding is None or encoding == 'ISO-8859-1':
encoding = detected['encoding'] or 'utf-8'
try:
raw_content = response.content.decode(encoding, errors='replace')
except (UnicodeDecodeError, LookupError):
raw_content = response.content.decode('utf-8', errors='replace')
encoding = 'utf-8 (fallback)'
metadata = {
'url': url,
'final_url': response.url,
|
@@ -201,34 +306,46 @@ class EnhancedURLProcessor:
'content_length': len(response.content),
'status_code': response.status_code,
'headers': dict(response.headers),
}
content_type = metadata['content_type'].lower()
structured = {}
structured = {
- 'size_bytes': len(response.content)
}
except requests.exceptions.RequestException as e:
if retry_count < self.max_retries - 1:
sleep_time = 2 ** retry_count
time.sleep(sleep_time)
return self.fetch_content(url, retry_count + 1)
else:
|
@@ -237,387 +354,553 @@ class EnhancedURLProcessor:
except Exception as e:
logger.error(f"Unexpected error fetching {url}: {e}")
return None
- def _process_html_content(self, raw_content: str, base_url: str) -> Dict:
soup = BeautifulSoup(raw_content, 'html.parser')
if tag.get(attr) and not urlparse(tag[attr]).scheme:
try:
tag[attr] = urljoin(base_url, tag[attr])
- except Exception:
return structured
- def _create_template_shell(self, raw_content: str, base_url: str) -> Dict:
soup = BeautifulSoup(raw_content, 'html.parser')
PLACEHOLDER_TEXT = "[LOREM IPSUM CONTENT]"
- PLACEHOLDER_IMG = "data:image/svg+xml;charset=UTF-8,%3Csvg%20width%3D%22200%22%20height%3D%22100%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%
for img in soup.find_all('img'):
img['src'] = PLACEHOLDER_IMG
for a in soup.find_all('a'):
- if 'href' in a.attrs
a['href'] = '#'
for script in soup.find_all('script', type='application/ld+json'):
script.decompose()
return {
'template_type': 'html_shell',
'base_url': base_url,
- 'template_html': str(soup)
}
- def _extract_database_data(self, soup: BeautifulSoup) -> Dict:
structured = {
- 'title': soup.title.string.strip() if soup.title else '',
- 'meta_description':
'core_text_content': '',
- 'images':
- 'videos':
- 'audios':
'structured_data': [],
'products': [],
}
for script in soup.find_all('script', type='application/ld+json'):
try:
- ld_data = json.loads(script.
structured['structured_data'].append(ld_data)
for img in soup.find_all('img'):
for video in soup.find_all('video'):
for audio in soup.find_all('audio'):
if not structured['core_text_content']:
return structured

|
| 331 |
class SiteCrawler:
self.processor = processor
self.crawled_urls = set()
- self.
parsed_base = urlparse(base_url)
internal_links = set()
for a in soup.find_all('a', href=True):
href = urljoin(base_url, a['href'])
parsed_href = urlparse(href)
internal_links.add(href)
return list(internal_links)
- def crawl_site(self, start_url: str, mode: str) -> Tuple[List[Dict], List[str]]:
while queue and len(self.crawled_urls) < self.max_pages:
- url = queue.pop(0)
continue
self.crawled_urls.add(url)
content_result = self.processor.fetch_content(url)
- if not content_result
continue
soup = BeautifulSoup(raw_content, 'html.parser')
- filename = f"snapshot_{len(self.crawled_urls)}_{urlparse(base_url).path.replace('/', '_') or 'index'}.png"
- snapshot_path = capture_visual_snapshot(base_url, filename)
- if snapshot_path:
- snapshot_paths.append(snapshot_path)
- new_links = self._get_links(soup, base_url)
- queue.extend([link for link in new_links if link not in self.crawled_urls and urlparse(link).netloc == urlparse(start_url).netloc])
- page_result = {
- 'source': 'crawl',
- 'url': base_url,
- 'metadata': content_result['metadata'],
- 'timestamp': datetime.now().isoformat(),
- 'snapshot_path': snapshot_path if snapshot_path else 'N/A'
- }
if mode == "Extract for Template (Shell)":
elif mode == "Extract for Database (Content Only)":
else:

|
| 396 |
class EnhancedFileProcessor:
self.max_file_size = max_file_size
self.supported_extensions = {
'.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.log',
'.yml', '.yaml', '.ini', '.conf', '.cfg', '.zip', '.tar', '.gz',
'.bz2', '.7z', '.rar', '.pdf', '.doc', '.docx', '.rtf', '.odt',
- '.jpg', '.jpeg', '.png', '.gif', '.bmp'
}
- def process_file(self,
return []
try:
- file_path = file.name
file_size = os.path.getsize(file_path)
if file_size > self.max_file_size:
- logger.warning(f"File size ({file_size}
return []
except Exception as e:
- logger.error(f"Error processing file: {
return []
def _is_archive(self, filepath: str) -> bool:
try:
- file_path = file.name
file_stat = os.stat(file_path)
structured = {}
- if 'image/' in
structured = {
'media_type': 'image',
'filename': os.path.basename(file_path),
}
else:
with open(file_path, 'rb') as f:
raw_bytes = f.read()
encoding = detected['encoding'] or 'utf-8'
try:
except (UnicodeDecodeError, LookupError):
try:
- json_data = json.loads(
structured = json_data
else:
- structured = {'text':
except Exception as e:
- logger.error(f"
return []
- def _process_archive(self, archive_path: str
dataset = []
try:
if zipfile.is_zipfile(archive_path):
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
- zip_ref.extractall(
for file_info in zip_ref.infolist():
- if
- if
elif tarfile.is_tarfile(archive_path):
with tarfile.open(archive_path, 'r') as tar_ref:
- tar_ref.extractall(
for member in tar_ref.getmembers():
if member.isfile():
- if
except Exception as e:
- logger.error(f"
return dataset

- def process_item(item: Dict) -> Dict:
- structured = item.get('structured', {})
- if structured.get('template_type') == 'html_shell':
- return item
- if not structured:
- content = item.get('content', item.get('raw_content', ''))
- if isinstance(content, str):
- structured = {'text': content}
- elif isinstance(content, dict):
- structured = content
- if 'products' not in structured:
- structured['products'] = []
- media = []
- media.extend([{'type': 'image', 'source': src} for src in structured.get('images', [])])
- media.extend([{'type': 'video', 'source': src} for src in structured.get('videos', [])])
- media.extend([{'type': 'audio', 'source': src} for src in structured.get('audios', [])])
- structured['media'] = media
- if structured['products']:
- structured['template'] = {
- 'type': 'product_catalog',
- 'items': structured['products'],
- 'metadata': item.get('metadata', {})
- }
- item['structured'] = structured
- return item
- if isinstance(data, list):
- return [process_item(item) for item in data]
- elif isinstance(data, dict):
- return process_item(data)
- return data

|
| 543 |
class DataChunker:
try:
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
json_str = json.dumps(data, ensure_ascii=False)
- effective_chunk_size = max_size - overhead_bytes
- if effective_chunk_size <= 0:
- raise ValueError(f"Max size ({max_size}) is too small after accounting for metadata overhead ({overhead_bytes})")
- num_chunks = (total_length + effective_chunk_size - 1) // effective_chunk_size
chunks = []
- chunk_str = chunk_bytes.decode('utf-8', errors='replace')
- chunk_hash = hash(chunk_str) & 0xFFFFFFFF
chunk = {
- "chunk_index":
- "total_chunks":
- "total_length":
"chunk_hash": chunk_hash,
- "data": chunk_str
}
chunks.append(chunk)
return chunks
except Exception as e:
logger.error(f"Error chunking data: {e}")
- return []
chunker = DataChunker()
paths = []
if combined:
chunks = chunker.chunk_data(data)
for i, chunk in enumerate(chunks):
- filename = f'
- qr_path = generate_stylish_qr(
data=chunk,
filename=filename,
fill_color="#1a365d",
|
@@ -626,15 +909,16 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
if qr_path:
paths.append(qr_path)
else:
if isinstance(data, list):
for idx, item in enumerate(data):
chunks = chunker.chunk_data(item)
for chunk_idx, chunk in enumerate(chunks):
- filename = f'
- qr_path = generate_stylish_qr(
data=chunk,
filename=filename,
- fill_color="#
back_color="#ffffff"
)
if qr_path:
|
@@ -642,8 +926,8 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
else:
chunks = chunker.chunk_data(data)
for i, chunk in enumerate(chunks):
- filename = f'
- qr_path = generate_stylish_qr(
data=chunk,
filename=filename,
fill_color="#1a365d",
|
@@ -651,68 +935,166 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
)
if qr_path:
paths.append(qr_path)
return paths
- except Exception as e:
- logger.error(f"Error in generate_qr_codes: {e}")
- return []

- #
return None

- downloader = MediaDownloader()
- all_downloaded_files = {}

structured = item.get('structured', {})
- if
- with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
- zf.writestr('data_export.json', json.dumps(updated_results, indent=2, ensure_ascii=False))
- for original_url, local_path in all_downloaded_files.items():
- zf.write(local_path, arcname=f"media/{Path(local_path).name}")

def create_modern_interface():
css = """
:root {
--primary-color: #1a365d;
|
@@ -722,208 +1104,332 @@ def create_modern_interface():
--success-color: #48bb78;
--error-color: #f56565;
--warning-color: #ed8936;
}
.gradio-container {
max-width: 1200px;
- margin: auto;
padding: 2rem;
- background-color: var(--background-color);
- border-radius: 1rem;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.primary-button {
- background
- color: white
- padding: 0.75rem 1.5rem;
- border-radius: 0.375rem;
border: none;
cursor: pointer;
- transition:
}
.primary-button:hover {
}
"""
with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator", theme=gr.themes.Soft()) as interface:
""")
with gr.Row():
extraction_mode = gr.Radio(
- label="Extraction Mode
- choices=[
value="Full Structured Data",
- info="Template/Database mode with
)
with gr.Row():
with gr.Row():
- output_gallery = gr.Gallery(
def load_example():
example = {
"type": "product_catalog",
"items": [
- {"id": "123", "name": "Premium Widget", "price": 299.99},
- {"id": "456", "name": "Basic Widget", "price": 149.99}
],
- "metadata": {
}
return json.dumps(example, indent=2)
- def
- return "", None, "", "Full Structured Data",
- def process_inputs(urls, files, text,
try:
- url_processor = EnhancedURLProcessor()
- file_processor = EnhancedFileProcessor()
- all_media_paths = []
- database_zip_path = None
if text and text.strip():
try:
json_data = json.loads(text)
if isinstance(json_data, list):
else:
- results.append(
except json.JSONDecodeError as e:
- return None, [], f"Invalid JSON
if files:
for file in files:
- file_results = file_processor.process_file(file)
if file_results:
results.extend(file_results)
if urls and urls.strip():
- url_list = [url.strip() for url in
if len(url_list) == 1 and mode != "Full Structured Data":
crawl_results, snapshot_paths = crawler.crawl_site(url_list[0], mode)
results.extend(crawl_results)
all_media_paths.extend(snapshot_paths)
else:
for url in url_list:
validation = url_processor.validate_url(url)
- if validation
content = url_processor.fetch_content(url)
- if content
- snapshot_path =
- if
if mode == "Extract for Template (Shell)":
elif mode == "Extract for Database (Content Only)":
- soup = BeautifulSoup(content
else:
- if results:
- results = break_down_data(results)
if results:
if mode == "Extract for Database (Content Only)":
else:
all_media_paths.extend(qr_paths)
- status_msg = f"Processed {len(results)} items
- return
- results,
- [str(path) for path in all_media_paths],
- status_msg,
- database_zip_path
- )
else:
- return None, [], "No valid content
except Exception as e:
logger.error(f"Processing error: {e}")
- return None, [], f"
example_btn.click(load_example, outputs=[text_input])
- clear_btn.click(
process_btn.click(
process_inputs,
- inputs=[url_input, file_input, text_input,
outputs=[output_json, output_gallery, output_text, output_database_zip]
- ).success(
- fn=lambda zip_path: gr.update(visible=bool(zip_path)),
- inputs=[output_database_zip],
- outputs=[output_database_zip]
)
gr.Markdown("""
""")
return interface

def main():
try:
mimetypes.init()
interface = create_modern_interface()
interface.launch(
share=False,
debug=False,
show_error=True,
- show_api=False
)
except Exception as e:
logger.error(f"Application startup error: {e}")

|
|
+"""
+Advanced Data Processor & QR Generator
+Enhanced version with better error handling, performance improvements, and cleaner architecture.
+"""
+
import json
import os
import re

import tempfile
import chardet
import tarfile
+import copy
+import hashlib
from datetime import datetime
+from typing import List, Dict, Optional, Union, Tuple, Any, Set
from pathlib import Path
from urllib.parse import urlparse, urljoin
+from dataclasses import dataclass, asdict
+from contextlib import contextmanager
+
import requests
import validators
import gradio as gr
from bs4 import BeautifulSoup, NavigableString, Tag
from fake_useragent import UserAgent
from cleantext import clean

from PIL import Image, ImageDraw, ImageFont
import numpy as np

+# Conditional imports with better error handling
+PLAYWRIGHT_AVAILABLE = False
try:
+    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
+    logger = logging.getLogger(__name__)
+    logger.warning("Playwright not installed. Install with: pip install playwright && playwright install")

+# Setup enhanced logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.StreamHandler(),
+        logging.FileHandler('app.log', encoding='utf-8', mode='a')
    ]
)
logger = logging.getLogger(__name__)

+# Constants
OUTPUTS_DIR = Path('output')
QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes'
SNAPSHOTS_DIR = OUTPUTS_DIR / 'snapshots'
+MEDIA_DIR = OUTPUTS_DIR / 'media'
TEMP_DIR = OUTPUTS_DIR / 'temp'
+MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB default max
+DEFAULT_TIMEOUT = 30
+
+# Ensure directories exist
for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR, SNAPSHOTS_DIR, MEDIA_DIR]:
    directory.mkdir(parents=True, exist_ok=True)


+# Data classes for better type safety
+@dataclass
+class URLValidationResult:
+    is_valid: bool
+    message: str
+    details: Dict[str, Any]
+
+
+@dataclass
+class FetchResult:
+    structured: Dict[str, Any]
+    raw_content: str
+    metadata: Dict[str, Any]
+
+
+@dataclass
+class ProcessedItem:
+    source: str
+    url: Optional[str] = None
+    filename: Optional[str] = None
+    structured: Dict[str, Any] = None
+    metadata: Dict[str, Any] = None
+    timestamp: str = None
+    snapshot_path: Optional[str] = None
+
+    def __post_init__(self):
+        if self.timestamp is None:
+            self.timestamp = datetime.now().isoformat()
+        if self.structured is None:
+            self.structured = {}
+        if self.metadata is None:
+            self.metadata = {}
+
+
+# Media Downloader with better caching and error handling
| 104 |
class MediaDownloader:
+    """Handles downloading and saving media files with caching."""
+
+    def __init__(self, cache_dir: Path = TEMP_DIR / 'media_cache'):
        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'Accept': 'image/webp,image/*,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+        })
+        self.cache_dir = cache_dir
+        self.cache_dir.mkdir(exist_ok=True)
+        self.downloaded_files = {}  # {url_hash: local_path}
+
+    def _get_url_hash(self, url: str) -> str:
+        """Generate consistent hash for URL."""
+        return hashlib.md5(url.encode()).hexdigest()
+
+    def download_media(self, url: str, timeout: int = 10) -> Optional[str]:
+        """Download media file with caching."""
+        url_hash = self._get_url_hash(url)
+
+        # Check cache first
+        cache_file = self.cache_dir / f"{url_hash}.cache"
+        if cache_file.exists():
+            try:
+                with open(cache_file, 'r') as f:
+                    cached_path = f.read().strip()
+                    if Path(cached_path).exists():
+                        return cached_path
+            except Exception:
+                pass
+
+        # Download the file
        try:
+            response = self.session.get(url, timeout=timeout, stream=True)
            response.raise_for_status()
+
+            # Determine file extension
            content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
            ext = mimetypes.guess_extension(content_type)
            if not ext:
+                # Try to get extension from URL
+                parsed = urlparse(url)
+                ext = Path(parsed.path).suffix or '.bin'
+
+            # Create safe filename
+            safe_filename = f"{url_hash}{ext}"
+            local_path = MEDIA_DIR / safe_filename
+
+            # Save file
            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+
+            # Update cache
+            with open(cache_file, 'w') as f:
+                f.write(str(local_path))
+
            self.downloaded_files[url] = str(local_path)
            logger.info(f"Downloaded media: {url} -> {local_path}")
            return str(local_path)
+
        except requests.exceptions.RequestException as e:
            logger.warning(f"Failed to download media {url}: {e}")
            return None
        except Exception as e:
+            logger.error(f"Unexpected error downloading {url}: {e}")
            return None
+
+    def batch_download(self, urls: List[str], max_workers: int = 5) -> Dict[str, Optional[str]]:
+        """Download multiple files (could be enhanced with threading)."""
+        results = {}
+        for url in urls:
+            results[url] = self.download_media(url)
+        return results


+# Enhanced URL Processor
|
| 184 |
class EnhancedURLProcessor:
|
| 185 |
+
"""Advanced URL processing with complete content extraction."""
|
| 186 |
+
|
| 187 |
+
def __init__(self, timeout: int = DEFAULT_TIMEOUT, max_retries: int = 3):
|
| 188 |
self.session = requests.Session()
|
| 189 |
+
self.timeout = timeout
|
| 190 |
+
self.max_retries = max_retries
|
| 191 |
self.user_agent = UserAgent()
|
| 192 |
+
|
| 193 |
self.session.headers.update({
|
| 194 |
'User-Agent': self.user_agent.random,
|
| 195 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 196 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 197 |
'Accept-Encoding': 'gzip, deflate, br',
|
| 198 |
'Connection': 'keep-alive',
|
| 199 |
'Upgrade-Insecure-Requests': '1',
|
| 200 |
'Sec-Fetch-Dest': 'document',
|
| 201 |
'Sec-Fetch-Mode': 'navigate',
|
| 202 |
'Sec-Fetch-Site': 'none',
|
| 203 |
+
'DNT': '1',
|
|
|
|
| 204 |
})
|
| 205 |
+
|
| 206 |
+
def validate_url(self, url: str) -> URLValidationResult:
|
| 207 |
+
"""Enhanced URL validation with detailed feedback."""
|
| 208 |
try:
|
| 209 |
+
# Basic URL validation
|
| 210 |
+
if not url or not isinstance(url, str):
|
| 211 |
+
return URLValidationResult(
|
| 212 |
+
is_valid=False,
|
| 213 |
+
message='Invalid URL',
|
| 214 |
+
details={'error': 'URL must be a non-empty string'}
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
# Check if URL starts with http(s)
|
| 218 |
+
if not url.startswith(('http://', 'https://')):
|
| 219 |
+
url = 'https://' + url
|
| 220 |
+
|
| 221 |
+
# Validate with validators
|
| 222 |
if not validators.url(url):
|
| 223 |
+
return URLValidationResult(
|
| 224 |
+
is_valid=False,
|
| 225 |
+
message='Invalid URL format',
|
| 226 |
+
details={'error': 'URL must be properly formatted'}
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
parsed = urlparse(url)
|
| 230 |
if not all([parsed.scheme, parsed.netloc]):
|
| 231 |
+
return URLValidationResult(
|
| 232 |
+
is_valid=False,
|
| 233 |
+
message='Incomplete URL',
|
| 234 |
+
details={'error': 'Missing scheme or domain'}
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
# Try to connect
|
| 238 |
try:
|
| 239 |
+
head_response = self.session.head(
|
| 240 |
+
url,
|
| 241 |
+
timeout=5,
|
| 242 |
+
allow_redirects=True
|
| 243 |
+
)
|
| 244 |
head_response.raise_for_status()
|
| 245 |
except requests.exceptions.RequestException:
|
| 246 |
+
# Try GET if HEAD fails
|
| 247 |
+
response = self.session.get(url, timeout=5, stream=True)
|
| 248 |
response.raise_for_status()
|
| 249 |
+
|
| 250 |
+
return URLValidationResult(
|
| 251 |
+
is_valid=True,
|
| 252 |
+
message='URL is valid and accessible',
|
| 253 |
+
details={
|
| 254 |
+
'final_url': response.url if 'response' in locals() else head_response.url,
|
| 255 |
'content_type': head_response.headers.get('Content-Type', 'unknown'),
|
| 256 |
'server': head_response.headers.get('Server', 'unknown'),
|
| 257 |
'size': head_response.headers.get('Content-Length', 'unknown')
|
| 258 |
}
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
except Exception as e:
|
| 262 |
+
return URLValidationResult(
|
| 263 |
+
is_valid=False,
|
| 264 |
+
message=f'URL validation failed: {str(e)}',
|
| 265 |
+
details={'error': str(e), 'traceback': str(e.__traceback__)}
|
| 266 |
+
)
|
| 267 |
+
|
| 268 |
+
def fetch_content(self, url: str, retry_count: int = 0) -> Optional[FetchResult]:
|
| 269 |
+
"""Enhanced content fetcher with retries and encoding detection."""
|
| 270 |
try:
|
| 271 |
+
logger.info(f"Fetching content from: {url} (Attempt {retry_count + 1}/{self.max_retries})")
|
| 272 |
+
|
| 273 |
+
# Update user agent
|
| 274 |
self.session.headers.update({'User-Agent': self.user_agent.random})
|
| 275 |
+
|
| 276 |
+
response = self.session.get(
|
| 277 |
+
url,
|
| 278 |
+
timeout=self.timeout,
|
| 279 |
+
allow_redirects=True,
|
| 280 |
+
stream=True
|
| 281 |
+
)
|
| 282 |
response.raise_for_status()
|
| 283 |
+
|
| 284 |
# Encoding detection
|
| 285 |
encoding = response.encoding
|
| 286 |
if encoding is None or encoding == 'ISO-8859-1':
|
| 287 |
+
# Sample first 10KB for encoding detection
|
| 288 |
+
sample = response.content[:10240]
|
| 289 |
+
detected = chardet.detect(sample)
|
| 290 |
encoding = detected['encoding'] or 'utf-8'
|
| 291 |
+
|
| 292 |
+
# Decode content
|
| 293 |
try:
|
| 294 |
raw_content = response.content.decode(encoding, errors='replace')
|
| 295 |
except (UnicodeDecodeError, LookupError):
|
| 296 |
raw_content = response.content.decode('utf-8', errors='replace')
|
| 297 |
encoding = 'utf-8 (fallback)'
|
| 298 |
+
|
| 299 |
+
# Prepare metadata
|
| 300 |
metadata = {
|
| 301 |
'url': url,
|
| 302 |
'final_url': response.url,
|
|
|
|
| 306 |
'content_length': len(response.content),
|
| 307 |
'status_code': response.status_code,
|
| 308 |
'headers': dict(response.headers),
|
| 309 |
+
'elapsed': response.elapsed.total_seconds(),
|
| 310 |
}
|
| 311 |
+
|
| 312 |
+
# Process based on content type
|
| 313 |
content_type = metadata['content_type'].lower()
|
| 314 |
structured = {}
|
| 315 |
+
|
| 316 |
+
if 'text/html' in content_type:
|
| 317 |
+
structured = self._process_html_content(raw_content, response.url)
|
| 318 |
+
elif 'application/json' in content_type or url.endswith('.json'):
|
| 319 |
+
try:
|
| 320 |
+
structured = json.loads(raw_content)
|
| 321 |
+
except json.JSONDecodeError as e:
|
| 322 |
structured = {
|
| 323 |
+
'text': raw_content[:100000],
|
| 324 |
+
'parse_error': str(e),
|
| 325 |
+
'json_fragment': raw_content[:1000]
|
|
|
|
| 326 |
}
|
| 327 |
+
elif 'image/' in content_type:
|
| 328 |
+
structured = {
|
| 329 |
+
'media_type': 'image',
|
| 330 |
+
'direct_url': response.url,
|
| 331 |
+
'format': content_type.split('/')[-1],
|
| 332 |
+
'size_bytes': len(response.content),
|
| 333 |
+
'filename': Path(urlparse(url).path).name or 'unknown'
|
| 334 |
+
}
|
| 335 |
+
else:
|
| 336 |
+
# Generic content
|
| 337 |
+
structured = {'text': raw_content[:100000]}
|
| 338 |
+
|
| 339 |
+
return FetchResult(
|
| 340 |
+
structured=structured,
|
| 341 |
+
raw_content=raw_content,
|
| 342 |
+
metadata=metadata
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
except requests.exceptions.RequestException as e:
|
| 346 |
if retry_count < self.max_retries - 1:
|
| 347 |
sleep_time = 2 ** retry_count
|
| 348 |
+
logger.info(f"Retrying {url} after {sleep_time}s...")
|
| 349 |
time.sleep(sleep_time)
|
| 350 |
return self.fetch_content(url, retry_count + 1)
|
| 351 |
else:
|
|
|
|
| 354 |
except Exception as e:
|
| 355 |
logger.error(f"Unexpected error fetching {url}: {e}")
|
| 356 |
return None
|
| 357 |
+
|
| 358 |
+
def _process_html_content(self, raw_content: str, base_url: str) -> Dict[str, Any]:
|
| 359 |
+
"""Process HTML content and extract structured data."""
|
| 360 |
soup = BeautifulSoup(raw_content, 'html.parser')
|
| 361 |
+
|
| 362 |
+
# Fix relative URLs
|
| 363 |
+
for tag in soup.find_all(['a', 'img', 'link', 'script', 'video', 'audio', 'source']):
|
| 364 |
+
for attr in ['href', 'src', 'data-src', 'poster']:
|
| 365 |
if tag.get(attr) and not urlparse(tag[attr]).scheme:
|
| 366 |
try:
|
| 367 |
tag[attr] = urljoin(base_url, tag[attr])
|
| 368 |
+
except Exception as e:
|
| 369 |
+
logger.debug(f"Failed to join URL: {e}")
|
| 370 |
+
|
| 371 |
+
# Extract structured data
|
| 372 |
+
structured = self._extract_database_data(soup, base_url)
|
| 373 |
+
structured['raw_html'] = raw_content[:50000] # Store truncated HTML
|
| 374 |
+
structured['base_url'] = base_url
|
| 375 |
+
|
| 376 |
return structured
|
| 377 |
+
|
| 378 |
+
def _create_template_shell(self, raw_content: str, base_url: str) -> Dict[str, Any]:
|
| 379 |
+
"""Create a template shell from HTML content."""
|
| 380 |
soup = BeautifulSoup(raw_content, 'html.parser')
|
| 381 |
+
|
| 382 |
PLACEHOLDER_TEXT = "[LOREM IPSUM CONTENT]"
|
| 383 |
+
PLACEHOLDER_IMG = "data:image/svg+xml;charset=UTF-8,%3Csvg%20width%3D%22200%22%20height%3D%22100%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%3Crect%20width%3D%22200%22%20height%3D%22100%22%20fill%3D%22%23777%22%3E%3C%2Frect%3E%3Ctext%20x%3D%2270%22%20y%3D%2255%22%3E200x100%3C%2Ftext%3E%3C%2Fsvg%3E"
|
| 384 |
+
|
| 385 |
+
# Replace text content
|
| 386 |
+
text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'td', 'th', 'label', 'title', 'div']
|
| 387 |
+
for tag in soup.find_all(text_tags):
|
| 388 |
+
if tag.string and len(tag.get_text(strip=True)) > 5:
|
| 389 |
+
tag.string.replace_with(PLACEHOLDER_TEXT)
|
| 390 |
+
|
| 391 |
+
# Replace images
|
| 392 |
for img in soup.find_all('img'):
|
| 393 |
img['src'] = PLACEHOLDER_IMG
|
| 394 |
+
if 'srcset' in img.attrs:
|
| 395 |
+
del img['srcset']
|
| 396 |
+
|
| 397 |
+
# Remove external links
|
| 398 |
for a in soup.find_all('a'):
|
| 399 |
+
if 'href' in a.attrs:
|
| 400 |
a['href'] = '#'
|
| 401 |
+
|
| 402 |
+
# Remove sensitive data
|
| 403 |
for script in soup.find_all('script', type='application/ld+json'):
|
| 404 |
script.decompose()
|
| 405 |
+
|
| 406 |
+
# Remove comments
|
| 407 |
+
for comment in soup.find_all(string=lambda text: isinstance(text, NavigableString) and '<!--' in str(text)):
|
| 408 |
+
comment.extract()
|
| 409 |
+
|
| 410 |
return {
|
| 411 |
'template_type': 'html_shell',
|
| 412 |
'base_url': base_url,
|
| 413 |
+
'template_html': str(soup),
|
| 414 |
+
'timestamp': datetime.now().isoformat()
|
| 415 |
}
|
| 416 |
+
|
| 417 |
+
def _extract_database_data(self, soup: BeautifulSoup, base_url: str) -> Dict[str, Any]:
|
| 418 |
+
"""Extract structured data from HTML."""
|
| 419 |
structured = {
|
| 420 |
+
'title': soup.title.string.strip() if soup.title and soup.title.string else '',
|
| 421 |
+
'meta_description': '',
|
| 422 |
'core_text_content': '',
|
| 423 |
+
'images': [],
|
| 424 |
+
'videos': [],
|
| 425 |
+
'audios': [],
|
| 426 |
'structured_data': [],
|
| 427 |
'products': [],
|
| 428 |
+
'links': [],
|
| 429 |
+
'metadata': {}
|
| 430 |
}
|
| 431 |
+
|
| 432 |
+
# Extract meta description
|
| 433 |
+
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
| 434 |
+
if meta_desc:
|
| 435 |
+
structured['meta_description'] = meta_desc.get('content', '')
|
| 436 |
+
|
| 437 |
+
# Extract JSON-LD structured data
|
| 438 |
for script in soup.find_all('script', type='application/ld+json'):
|
| 439 |
try:
|
| 440 |
+
ld_data = json.loads(script.string or '{}')
|
| 441 |
structured['structured_data'].append(ld_data)
|
| 442 |
+
|
| 443 |
+
# Extract products
|
| 444 |
+
if isinstance(ld_data, dict):
|
| 445 |
+
if ld_data.get('@type') == 'Product':
|
| 446 |
+
structured['products'].append(ld_data)
|
| 447 |
+
elif ld_data.get('@graph'):
|
| 448 |
+
for item in ld_data.get('@graph', []):
|
| 449 |
+
if isinstance(item, dict) and item.get('@type') == 'Product':
|
| 450 |
+
structured['products'].append(item)
|
| 451 |
+
except (json.JSONDecodeError, TypeError) as e:
|
| 452 |
+
logger.debug(f"Failed to parse JSON-LD: {e}")
|
| 453 |
+
|
| 454 |
+
# Extract media
|
| 455 |
for img in soup.find_all('img'):
|
| 456 |
+
src = img.get('src') or img.get('data-src')
|
| 457 |
+
if src:
|
| 458 |
+
structured['images'].append(urljoin(base_url, src))
|
| 459 |
+
|
| 460 |
for video in soup.find_all('video'):
|
| 461 |
+
src = video.get('src') or (video.find('source') and video.find('source').get('src'))
|
| 462 |
+
if src:
|
| 463 |
+
structured['videos'].append(urljoin(base_url, src))
|
| 464 |
+
|
| 465 |
for audio in soup.find_all('audio'):
|
| 466 |
+
src = audio.get('src') or (audio.find('source') and audio.find('source').get('src'))
|
| 467 |
+
if src:
|
| 468 |
+
structured['audios'].append(urljoin(base_url, src))
|
| 469 |
+
|
| 470 |
+
# Extract links
|
| 471 |
+
for a in soup.find_all('a', href=True):
|
| 472 |
+
href = a['href']
|
| 473 |
+
if href.startswith(('http://', 'https://')):
|
| 474 |
+
structured['links'].append(href)
|
| 475 |
+
|
| 476 |
+
# Extract main content
|
| 477 |
+
main_content_selectors = [
|
| 478 |
+
'main', 'article', '[role="main"]',
|
| 479 |
+
'.main-content', '.content', '#content',
|
| 480 |
+
'.article', '.post'
|
| 481 |
+
]
|
| 482 |
+
|
| 483 |
+
for selector in main_content_selectors:
|
| 484 |
+
main_tag = soup.select_one(selector)
|
| 485 |
+
if main_tag:
|
| 486 |
+
structured['core_text_content'] = clean(
|
| 487 |
+
main_tag.get_text('\n', strip=True),
|
| 488 |
+
lower=False,
|
| 489 |
+
no_line_breaks=False,
|
| 490 |
+
no_urls=True,
|
| 491 |
+
no_emails=True,
|
| 492 |
+
no_phone_numbers=True
|
| 493 |
+
)[:10000] # Limit size
|
| 494 |
+
break
|
| 495 |
+
|
| 496 |
if not structured['core_text_content']:
|
| 497 |
+
# Fallback: extract all text
|
| 498 |
+
structured['core_text_content'] = clean(
|
| 499 |
+
soup.get_text('\n', strip=True),
|
| 500 |
+
lower=False,
|
| 501 |
+
no_line_breaks=False,
|
| 502 |
+
no_urls=True,
|
| 503 |
+
no_emails=True,
|
| 504 |
+
no_phone_numbers=True
|
| 505 |
+
)[:5000]
|
| 506 |
+
|
| 507 |
+
# Remove duplicates
|
| 508 |
+
structured['images'] = list(dict.fromkeys(structured['images']))[:50] # Limit to 50 images
|
| 509 |
+
structured['videos'] = list(dict.fromkeys(structured['videos']))
|
| 510 |
+
structured['audios'] = list(dict.fromkeys(structured['audios']))
|
| 511 |
+
structured['links'] = list(dict.fromkeys(structured['links']))[:100] # Limit to 100 links
|
| 512 |
+
|
| 513 |
return structured
|
| 514 |
|
| 515 |
|
| 516 |
+
# Site Crawler with improved logic
|
| 517 |
class SiteCrawler:
|
| 518 |
+
"""Crawl website with configurable depth and limits."""
|
| 519 |
+
|
| 520 |
+
def __init__(self, processor: EnhancedURLProcessor, max_pages: int = 10, max_depth: int = 2):
|
| 521 |
self.processor = processor
|
| 522 |
+
self.max_pages = max_pages
|
| 523 |
+
self.max_depth = max_depth
|
| 524 |
self.crawled_urls = set()
|
| 525 |
+
self.results = []
|
| 526 |
+
self.snapshot_paths = []
|
| 527 |
+
|
| 528 |
+
def _normalize_url(self, url: str, base_url: str) -> str:
|
| 529 |
+
"""Normalize URL by removing fragments and query parameters for crawling."""
|
| 530 |
+
parsed = urlparse(url)
|
| 531 |
+
base_parsed = urlparse(base_url)
|
| 532 |
+
|
| 533 |
+
# Ensure same domain
|
| 534 |
+
if parsed.netloc and parsed.netloc != base_parsed.netloc:
|
| 535 |
+
return None
|
| 536 |
+
|
| 537 |
+
# Remove fragments and query params for crawling
|
| 538 |
+
normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
|
| 539 |
+
return normalized.rstrip('/')
|
| 540 |
+
|
| 541 |
+
def _get_internal_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
|
| 542 |
+
"""Extract internal links from page."""
|
| 543 |
parsed_base = urlparse(base_url)
|
| 544 |
internal_links = set()
|
| 545 |
+
|
| 546 |
for a in soup.find_all('a', href=True):
|
| 547 |
href = urljoin(base_url, a['href'])
|
| 548 |
parsed_href = urlparse(href)
|
| 549 |
+
|
| 550 |
+
# Check if same domain
|
| 551 |
+
if parsed_href.netloc == parsed_base.netloc:
|
| 552 |
+
# Filter out non-HTML resources
|
| 553 |
+
if any(href.lower().endswith(ext) for ext in [
|
| 554 |
+
'.pdf', '.zip', '.jpg', '.jpeg', '.png', '.gif',
|
| 555 |
+
'.css', '.js', '.mp4', '.mp3', '.avi', '.mov'
|
| 556 |
+
]):
|
| 557 |
+
continue
|
| 558 |
+
|
| 559 |
+
# Remove fragments
|
| 560 |
+
href = self._normalize_url(href, base_url)
|
| 561 |
+
if href:
|
| 562 |
internal_links.add(href)
|
| 563 |
+
|
| 564 |
return list(internal_links)
|
| 565 |
+
|
| 566 |
+
def crawl_site(self, start_url: str, mode: str = "Full Structured Data") -> Tuple[List[Dict], List[str]]:
|
| 567 |
+
"""Crawl website starting from given URL."""
|
| 568 |
+
logger.info(f"Starting crawl from {start_url} (max pages: {self.max_pages})")
|
| 569 |
+
|
| 570 |
+
queue = [(start_url, 0)] # (url, depth)
|
| 571 |
+
|
| 572 |
while queue and len(self.crawled_urls) < self.max_pages:
|
| 573 |
+
url, depth = queue.pop(0)
|
| 574 |
+
|
| 575 |
+
if url in self.crawled_urls or depth > self.max_depth:
|
| 576 |
continue
|
| 577 |
+
|
| 578 |
+
logger.info(f"Crawling: {url} (depth: {depth})")
|
| 579 |
self.crawled_urls.add(url)
|
| 580 |
+
|
| 581 |
+
# Fetch content
|
| 582 |
content_result = self.processor.fetch_content(url)
|
| 583 |
+
if not content_result:
|
| 584 |
continue
|
| 585 |
+
|
| 586 |
+
# Check if HTML
|
| 587 |
+
content_type = content_result.metadata.get('content_type', '').lower()
|
| 588 |
+
if 'text/html' not in content_type:
|
| 589 |
+
continue
|
| 590 |
+
|
| 591 |
+
# Capture snapshot if Playwright is available
|
| 592 |
+
snapshot_path = None
|
| 593 |
+
if PLAYWRIGHT_AVAILABLE:
|
| 594 |
+
try:
|
| 595 |
+
filename = f"snapshot_{len(self.crawled_urls)}_{hashlib.md5(url.encode()).hexdigest()[:8]}.png"
|
| 596 |
+
snapshot_path = capture_visual_snapshot(url, filename)
|
| 597 |
+
if snapshot_path:
|
| 598 |
+
self.snapshot_paths.append(snapshot_path)
|
| 599 |
+
except Exception as e:
|
| 600 |
+
logger.warning(f"Failed to capture snapshot for {url}: {e}")
|
| 601 |
+
|
| 602 |
+
# Process based on mode
|
| 603 |
+
raw_content = content_result.raw_content
|
| 604 |
+
base_url = content_result.metadata['final_url']
|
| 605 |
soup = BeautifulSoup(raw_content, 'html.parser')
|
| 606 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
if mode == "Extract for Template (Shell)":
|
| 608 |
+
structured = self.processor._create_template_shell(raw_content, base_url)
|
| 609 |
elif mode == "Extract for Database (Content Only)":
|
| 610 |
+
structured = self.processor._extract_database_data(soup, base_url)
|
| 611 |
else:
|
| 612 |
+
structured = self.processor._process_html_content(raw_content, base_url)
|
| 613 |
+
|
| 614 |
+
# Create result item
|
| 615 |
+
result_item = ProcessedItem(
|
| 616 |
+
source='crawl',
|
| 617 |
+
url=base_url,
|
| 618 |
+
structured=structured,
|
| 619 |
+
metadata=content_result.metadata,
|
| 620 |
+
snapshot_path=snapshot_path
|
| 621 |
+
)
|
| 622 |
+
self.results.append(asdict(result_item))
|
| 623 |
+
|
| 624 |
+
# Extract links for next level
|
| 625 |
+
if depth < self.max_depth:
|
| 626 |
+
new_links = self._get_internal_links(soup, base_url)
|
| 627 |
+
for link in new_links:
|
| 628 |
+
if link not in self.crawled_urls and len(self.crawled_urls) < self.max_pages:
|
| 629 |
+
queue.append((link, depth + 1))
|
| 630 |
+
|
| 631 |
+
# Be polite
|
| 632 |
+
time.sleep(0.5)
|
| 633 |
+
|
| 634 |
+
logger.info(f"Crawl completed. Found {len(self.results)} pages.")
|
| 635 |
+
return self.results, self.snapshot_paths


# File Processor with better archive handling
class EnhancedFileProcessor:
    """Process various file types including archives."""

    def __init__(self, max_file_size: int = MAX_FILE_SIZE):
        self.max_file_size = max_file_size
        self.supported_extensions = {
            '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.log',
            '.yml', '.yaml', '.ini', '.conf', '.cfg', '.zip', '.tar', '.gz',
            '.bz2', '.7z', '.rar', '.pdf', '.doc', '.docx', '.rtf', '.odt',
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'
        }

    def process_file(self, file_path: str) -> List[Dict]:
        """Process a single file or archive."""
        if not file_path or not os.path.exists(file_path):
            return []

        try:
            file_size = os.path.getsize(file_path)
            if file_size > self.max_file_size:
                logger.warning(f"File {file_path} exceeds size limit ({file_size} > {self.max_file_size})")
                return []

            if self._is_archive(file_path):
                return self._process_archive(file_path)
            else:
                return self._process_single_file(file_path)
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}")
            return []

    def _is_archive(self, filepath: str) -> bool:
        """Check if file is an archive."""
        archive_extensions = ['.zip', '.tar', '.gz', '.bz2', '.7z', '.rar']
        return any(filepath.lower().endswith(ext) for ext in archive_extensions)

    def _process_single_file(self, file_path: str) -> List[Dict]:
        """Process a single file."""
        try:
            file_stat = os.stat(file_path)
            mime_type, _ = mimetypes.guess_type(file_path)
            mime_type = mime_type or 'application/octet-stream'

            structured = {}

            if 'image/' in mime_type:
                structured = {
                    'media_type': 'image',
                    'filename': os.path.basename(file_path),
                    'mime_type': mime_type,
                    'size_bytes': file_stat.st_size
                }
            else:
                # Read file content
                with open(file_path, 'rb') as f:
                    raw_bytes = f.read()

                # Detect encoding
                detected = chardet.detect(raw_bytes[:10000])
                encoding = detected['encoding'] or 'utf-8'

                try:
                    content = raw_bytes.decode(encoding, errors='replace')
                except (UnicodeDecodeError, LookupError):
                    content = raw_bytes.decode('utf-8', errors='replace')

                # Parse based on file type
                if 'json' in mime_type or file_path.endswith('.json'):
                    try:
                        json_data = json.loads(content)
                        structured = json_data
                    except json.JSONDecodeError as e:
                        structured = {
                            'text': content[:50000],
                            'parse_error': str(e)
                        }
                elif 'html' in mime_type or file_path.endswith(('.html', '.htm')):
                    processor = EnhancedURLProcessor()
                    soup = BeautifulSoup(content, 'html.parser')
                    structured = processor._extract_database_data(soup, f"file://{file_path}")
                else:
                    structured = {'text': content[:100000]}

            result_item = ProcessedItem(
                source='file',
                filename=os.path.basename(file_path),
                structured=structured,
                metadata={
                    'file_size': file_stat.st_size,
                    'mime_type': mime_type,
                    'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                    'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
                    'file_path': file_path
                }
            )

            return [asdict(result_item)]

        except Exception as e:
            logger.error(f"Error processing single file {file_path}: {e}")
            return []

    def _process_archive(self, archive_path: str) -> List[Dict]:
        """Extract and process files from archive."""
        dataset = []
        temp_dir = tempfile.mkdtemp(prefix='archive_extract_')

        try:
            if zipfile.is_zipfile(archive_path):
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    zip_ref.extractall(temp_dir)

                    for file_info in zip_ref.infolist():
                        if not file_info.is_dir():
                            file_path = os.path.join(temp_dir, file_info.filename)
                            if os.path.exists(file_path):
                                dataset.extend(self._process_single_file(file_path))

            elif tarfile.is_tarfile(archive_path):
                with tarfile.open(archive_path, 'r') as tar_ref:
                    tar_ref.extractall(temp_dir)

                    for member in tar_ref.getmembers():
                        if member.isfile():
                            file_path = os.path.join(temp_dir, member.name)
                            if os.path.exists(file_path):
                                dataset.extend(self._process_single_file(file_path))

            else:
                logger.warning(f"Unsupported archive format: {archive_path}")

        except Exception as e:
            logger.error(f"Error processing archive {archive_path}: {e}")
        finally:
            # Cleanup
            try:
                import shutil
                shutil.rmtree(temp_dir, ignore_errors=True)
            except Exception:
                pass

        return dataset
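
# Illustrative sketch (not part of the original commit): processing an uploaded file or
# archive with the class above. The path below is a hypothetical example.
def _demo_process_file(path: str = "output/temp/sample_bundle.zip") -> None:
    processor = EnhancedFileProcessor()
    items = processor.process_file(path)  # archives are expanded and each member processed
    for item in items:
        meta = item.get('metadata', {})
        print(item.get('filename'), meta.get('mime_type'), meta.get('file_size'))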


# Data Chunker with improved chunking logic
class DataChunker:
    """Chunk data for QR code generation."""

    def __init__(self, max_chunk_size: int = 2953):
        self.max_chunk_size = max_chunk_size

    def chunk_data(self, data: Any) -> List[Dict]:
        """Chunk data into smaller pieces for QR encoding."""
        try:
            # Serialize data
            if isinstance(data, dict):
                json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
            elif isinstance(data, list):
                json_str = json.dumps(data, ensure_ascii=False)
            else:
                json_str = str(data)

            # Calculate chunk size
            total_bytes = len(json_str.encode('utf-8'))
            chunk_size = self.max_chunk_size

            # Create chunks
            chunks = []
            for i in range(0, total_bytes, chunk_size):
                chunk_str = json_str.encode('utf-8')[i:i + chunk_size].decode('utf-8', errors='ignore')
                chunk_hash = hashlib.md5(chunk_str.encode()).hexdigest()[:8]

                chunk = {
                    "chunk_index": len(chunks) + 1,
                    "total_chunks": (total_bytes + chunk_size - 1) // chunk_size,
                    "total_length": total_bytes,
                    "chunk_hash": chunk_hash,
                    "data": chunk_str,
                    "timestamp": datetime.now().isoformat()
                }
                chunks.append(chunk)

            return chunks

        except Exception as e:
            logger.error(f"Error chunking data: {e}")
            return [{"error": str(e), "data": str(data)[:100]}]


# QR Code Generator with styling options
class QRCodeGenerator:
    """Generate QR codes with various styling options."""

    def __init__(self, output_dir: Path = QR_CODES_DIR):
        self.output_dir = output_dir
        self.output_dir.mkdir(exist_ok=True)

    def generate_stylish_qr(self, data: Union[str, Dict], filename: str,
                            size: int = 10, border: int = 4,
                            fill_color: str = "#000000",
                            back_color: str = "#FFFFFF",
                            logo_path: Optional[str] = None) -> str:
        """Generate a stylish QR code."""
        try:
            # Prepare data
            if isinstance(data, dict):
                data_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
            else:
                data_str = str(data)

            # Create QR code
            qr = qrcode.QRCode(
                version=None,
                error_correction=qrcode.constants.ERROR_CORRECT_H,  # High error correction
                box_size=size,
                border=border
            )
            qr.add_data(data_str)
            qr.make(fit=True)

            # Create image
            qr_img = qr.make_image(fill_color=fill_color, back_color=back_color)
            qr_img = qr_img.convert('RGBA')

            # Add logo if provided
            if logo_path and os.path.exists(logo_path):
                try:
                    logo = Image.open(logo_path)
                    logo_size = qr_img.size[0] // 5
                    logo = logo.resize((logo_size, logo_size), Image.Resampling.LANCZOS)

                    # Calculate position
                    pos = ((qr_img.size[0] - logo.size[0]) // 2,
                           (qr_img.size[1] - logo.size[1]) // 2)

                    # Paste logo
                    qr_img.paste(logo, pos, logo)
                except Exception as e:
                    logger.warning(f"Failed to add logo: {e}")

            # Save image
            output_path = self.output_dir / filename
            qr_img.save(output_path, 'PNG', quality=95)

            logger.info(f"QR code generated: {output_path}")
            return str(output_path)

        except Exception as e:
            logger.error(f"QR generation error: {e}")
            return ""

    def generate_qr_sequence(self, data: Any, combined: bool = True,
                             prefix: str = "qr") -> List[str]:
        """Generate a sequence of QR codes for data."""
        chunker = DataChunker()
        paths = []
        timestamp = int(time.time())

        if combined:
            # Generate QR codes for combined data
            chunks = chunker.chunk_data(data)
            for i, chunk in enumerate(chunks):
                filename = f'{prefix}_{timestamp}_{i+1}_of_{len(chunks)}.png'
                qr_path = self.generate_stylish_qr(
                    data=chunk,
                    filename=filename,
                    fill_color="#1a365d",
                    back_color="#ffffff"
                )
                if qr_path:
                    paths.append(qr_path)
        else:
            # Generate separate QR codes for each item
            if isinstance(data, list):
                for idx, item in enumerate(data):
                    chunks = chunker.chunk_data(item)
                    for chunk_idx, chunk in enumerate(chunks):
                        filename = f'{prefix}_item{idx+1}_{chunk_idx+1}_of_{len(chunks)}_{timestamp}.png'
                        qr_path = self.generate_stylish_qr(
                            data=chunk,
                            filename=filename,
                            fill_color="#2d3748",
                            back_color="#ffffff"
                        )
                        if qr_path:
                            paths.append(qr_path)
            else:
                chunks = chunker.chunk_data(data)
                for i, chunk in enumerate(chunks):
                    filename = f'{prefix}_single_{i+1}_of_{len(chunks)}_{timestamp}.png'
                    qr_path = self.generate_stylish_qr(
                        data=chunk,
                        filename=filename,
                        fill_color="#1a365d",
                        back_color="#ffffff"
                    )
                    if qr_path:
                        paths.append(qr_path)

        return paths
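
# Illustrative sketch (not part of the original commit): generating a combined QR sequence
# for a small dataset. The sample items and prefix are hypothetical values.
def _demo_generate_qr() -> None:
    data = [{"id": "123", "name": "Premium Widget"}, {"id": "456", "name": "Basic Widget"}]
    generator = QRCodeGenerator()
    paths = generator.generate_qr_sequence(data, combined=True, prefix="demo_qr")
    print(f"Wrote {len(paths)} QR code image(s) to {generator.output_dir}")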


# Main processing functions
def capture_visual_snapshot(url: str, filename: str) -> Optional[str]:
    """Capture webpage screenshot using Playwright."""
    if not PLAYWRIGHT_AVAILABLE:
        logger.warning("Playwright not available for screenshots")
        return None

    output_path = SNAPSHOTS_DIR / filename

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                viewport={'width': 1280, 'height': 720},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            )
            page = context.new_page()

            # Navigate with timeout
            page.goto(url, wait_until='networkidle', timeout=30000)

            # Take full page screenshot
            page.screenshot(path=output_path, full_page=True)

            browser.close()

        logger.info(f"Snapshot captured: {output_path}")
        return str(output_path)

    except Exception as e:
        logger.error(f"Failed to capture snapshot for {url}: {e}")
        return None
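
# Illustrative sketch (not part of the original commit): capturing a snapshot for a single
# URL. The function returns None when Playwright is unavailable, so callers check the result.
def _demo_capture_snapshot(url: str = "https://example.com") -> None:
    path = capture_visual_snapshot(url, "example_snapshot.png")
    print(f"Snapshot saved to {path}" if path else "Snapshot skipped (Playwright unavailable)")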


def break_down_data(data: Union[Dict, List[Dict]]) -> Union[Dict, List[Dict]]:
    """Break down and restructure data for better organization."""

    def process_item(item: Dict) -> Dict:
        structured = item.get('structured', {})

        # Handle template shells
        if structured.get('template_type') == 'html_shell':
            return item

        # Ensure structured data exists
        if not structured:
            content = item.get('content') or item.get('raw_content', '')
            if isinstance(content, str):
                structured = {'text': content}
            elif isinstance(content, dict):
                structured = content

        # Extract media
        media = []
        for img in structured.get('images', []):
            media.append({'type': 'image', 'source': img, 'size': 'unknown'})
        for vid in structured.get('videos', []):
            media.append({'type': 'video', 'source': vid, 'size': 'unknown'})
        for aud in structured.get('audios', []):
            media.append({'type': 'audio', 'source': aud, 'size': 'unknown'})

        structured['media'] = media

        # Extract products
        if 'products' not in structured:
            structured['products'] = []

        # Create template if products exist
        if structured['products']:
            structured['template'] = {
                'type': 'product_catalog',
                'item_count': len(structured['products']),
                'items': structured['products'][:10],  # Limit to 10
                'metadata': item.get('metadata', {})
            }

        item['structured'] = structured
        return item

    if isinstance(data, list):
        return [process_item(item) for item in data]
    elif isinstance(data, dict):
        return process_item(data)

    return data
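
# Illustrative sketch (not part of the original commit): how break_down_data normalizes a
# processed item. The input dict is a minimal hand-built example of the shape produced by
# the processors above.
def _demo_break_down() -> None:
    item = {
        "source": "url",
        "structured": {"images": ["https://example.com/a.png"], "products": []},
        "metadata": {"final_url": "https://example.com"},
    }
    restructured = break_down_data([item])
    print(restructured[0]["structured"]["media"])  # one image entry with type/source/size fields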


def package_database(results: List[Dict]) -> Optional[str]:
    """Package processed data and media into a ZIP file."""
    if not results:
        return None

    try:
        downloader = MediaDownloader()
        updated_results = copy.deepcopy(results)

        # Collect media URLs
        media_urls = set()
        for item in updated_results:
            structured = item.get('structured', {})
            media_urls.update(structured.get('images', []))
            media_urls.update(structured.get('videos', []))
            media_urls.update(structured.get('audios', []))

        # Download media
        media_mapping = downloader.batch_download(list(media_urls))

        # Update results with local paths
        for item in updated_results:
            structured = item.get('structured', {})
            for media_type in ['images', 'videos', 'audios']:
                if media_type in structured:
                    new_paths = []
                    for url in structured[media_type]:
                        if url in media_mapping and media_mapping[url]:
                            local_path = Path(media_mapping[url])
                            new_paths.append(f"media/{local_path.name}")
                        else:
                            new_paths.append(url)
                    structured[media_type] = new_paths

        # Create ZIP file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        zip_filename = OUTPUTS_DIR / f"database_export_{timestamp}.zip"

        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
            # Add data
            zf.writestr(
                'data_export.json',
                json.dumps(updated_results, indent=2, ensure_ascii=False)
            )

            # Add README
            readme = f"""Database Export
Generated: {datetime.now().isoformat()}
Items: {len(updated_results)}
Media Files: {len(media_mapping)}
"""
            zf.writestr('README.txt', readme)

            # Add media files
            for url, local_path in media_mapping.items():
                if local_path and os.path.exists(local_path):
                    zf.write(local_path, arcname=f"media/{Path(local_path).name}")

        logger.info(f"Database package created: {zip_filename}")
        return str(zip_filename)

    except Exception as e:
        logger.error(f"Failed to create database package: {e}")
        return None
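
# Illustrative sketch (not part of the original commit): packaging processed results into a
# ZIP export. The single result item is a minimal, hypothetical example.
def _demo_package_database() -> None:
    results = [{
        "source": "url",
        "structured": {"text": "Example page", "images": [], "videos": [], "audios": []},
        "metadata": {"final_url": "https://example.com"},
    }]
    zip_path = package_database(results)
    print(f"Export written to {zip_path}" if zip_path else "Packaging failed")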


# Gradio Interface
def create_modern_interface():
    """Create modern Gradio interface."""
    css = """
    :root {
        --primary-color: #1a365d;
        --success-color: #48bb78;
        --error-color: #f56565;
        --warning-color: #ed8936;
        --border-radius: 0.5rem;
    }

    .gradio-container {
        max-width: 1200px;
        margin: 2rem auto;
        padding: 2rem;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: var(--border-radius);
        box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
    }

    .container-inner {
        background: white;
        border-radius: var(--border-radius);
        padding: 2rem;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }

    h1 {
        background: linear-gradient(90deg, #667eea, #764ba2);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        margin-bottom: 1rem;
    }

    .primary-button {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border: none;
        padding: 0.75rem 1.5rem;
        border-radius: var(--border-radius);
        font-weight: 600;
        cursor: pointer;
        transition: transform 0.2s, box-shadow 0.2s;
    }

    .primary-button:hover {
        transform: translateY(-2px);
        box-shadow: 0 10px 20px rgba(102, 126, 234, 0.4);
    }

    .warning-box {
        background: linear-gradient(135deg, #f6d365 0%, #fda085 100%);
        padding: 1rem;
        border-radius: var(--border-radius);
        margin-bottom: 1rem;
        border-left: 4px solid #ed8936;
    }

    .tab-nav {
        background: linear-gradient(135deg, #f7fafc 0%, #edf2f7 100%);
        border-radius: var(--border-radius);
        padding: 0.5rem;
        margin-bottom: 1rem;
    }
    """

    with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator", theme=gr.themes.Soft()) as interface:

        gr.Markdown("""
        <div class="container-inner">
        <h1>🚀 Advanced Data Processor & QR Code Generator</h1>
        <p>Process URLs, files, and JSON data. Generate QR codes and export databases.</p>
        </div>
        """)

        # Warning if Playwright not available
        if not PLAYWRIGHT_AVAILABLE:
            gr.Markdown("""
            <div class="warning-box">
            ⚠️ **Playwright not installed** - Screenshots and advanced rendering disabled.<br>
            Install with: `pip install playwright && playwright install`
            </div>
            """)

        with gr.Tabs() as tabs:
            with gr.TabItem("🌐 URL Processing"):
                url_input = gr.Textbox(
                    label="Enter URLs",
                    lines=5,
                    placeholder="Enter one URL per line:\nhttps://example.com\nhttps://example.org",
                    value=""
                )

            with gr.TabItem("📁 File Input"):
                file_input = gr.File(
                    label="Upload Files",
                    file_types=["*"],
                    file_count="multiple"
                )

            with gr.TabItem("📝 JSON Input"):
                text_input = gr.TextArea(
                    label="Direct JSON Input",
                    lines=15,
                    placeholder='{"data": "your json here"} or [{"item": 1}, {"item": 2}]',
                    value=""
                )

        # Options
        with gr.Row():
            extraction_mode = gr.Radio(
                label="Extraction Mode",
                choices=[
                    "Full Structured Data",
                    "Extract for Template (Shell)",
                    "Extract for Database (Content Only)"
                ],
                value="Full Structured Data",
                info="Template/Database mode with single URL triggers site crawl."
            )

            combine_data = gr.Checkbox(
                label="Combine data for sequential QR codes",
                value=True,
                info="Recommended for large datasets"
            )

        # Buttons
        with gr.Row():
            example_btn = gr.Button("📋 Load Example", variant="secondary")
            clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
            process_btn = gr.Button("⚡ Process & Generate", variant="primary", scale=2)

        # Outputs
        output_json = gr.JSON(label="Processed Data", visible=True)

        with gr.Row():
            output_gallery = gr.Gallery(
                label="Generated QR Codes & Snapshots",
                columns=3,
                height=400,
                show_label=True
            )

        output_database_zip = gr.File(
            label="Database Export (.zip)",
            interactive=False
        )

        output_text = gr.Textbox(
            label="Processing Status",
            interactive=False
        )

        # Progress bar
        progress_bar = gr.Progress()

        # Example data
        def load_example():
            example = {
                "name": "Example Product Catalog",
                "type": "product_catalog",
                "items": [
                    {"id": "123", "name": "Premium Widget", "price": 299.99, "category": "Electronics"},
                    {"id": "456", "name": "Basic Widget", "price": 149.99, "category": "Electronics"},
                    {"id": "789", "name": "Deluxe Widget", "price": 499.99, "category": "Electronics"}
                ],
                "metadata": {
                    "timestamp": datetime.now().isoformat(),
                    "source": "example",
                    "version": "1.0"
                }
            }
            return json.dumps(example, indent=2)

        def clear_inputs():
            return "", None, "", "Full Structured Data", True

        def process_inputs(urls, files, text, mode, combine):
            """Main processing function."""
            results = []
            all_media_paths = []
            database_zip_path = None

            try:
                # Process JSON input
                if text and text.strip():
                    try:
                        json_data = json.loads(text)
                        if isinstance(json_data, list):
                            for item in json_data:
                                results.append(ProcessedItem(
                                    source='json',
                                    structured=item
                                ))
                        else:
                            results.append(ProcessedItem(
                                source='json',
                                structured=json_data
                            ))
                    except json.JSONDecodeError as e:
                        return None, [], f"Invalid JSON: {str(e)}", None

                # Process files
                if files:
                    file_processor = EnhancedFileProcessor()
                    for file in files:
                        file_results = file_processor.process_file(file.name)
                        if file_results:
                            results.extend(file_results)

                # Process URLs
                if urls and urls.strip():
                    url_processor = EnhancedURLProcessor()
                    url_list = [url.strip() for url in re.split(r'[,\n]', urls) if url.strip()]

                    if len(url_list) == 1 and mode != "Full Structured Data":
                        # Site crawl
                        crawler = SiteCrawler(url_processor, max_pages=5)
                        crawl_results, snapshot_paths = crawler.crawl_site(url_list[0], mode)
                        results.extend(crawl_results)
                        all_media_paths.extend(snapshot_paths)
                    else:
                        # Single URL processing
                        for url in url_list:
                            validation = url_processor.validate_url(url)
                            if validation.is_valid:
                                content = url_processor.fetch_content(url)
                                if content:
                                    # Capture snapshot
                                    snapshot_path = None
                                    if PLAYWRIGHT_AVAILABLE:
                                        filename = f"snapshot_{hashlib.md5(url.encode()).hexdigest()[:8]}.png"
                                        snapshot_path = capture_visual_snapshot(url, filename)
                                        if snapshot_path:
                                            all_media_paths.append(snapshot_path)

                                    # Process based on mode
                                    if mode == "Extract for Template (Shell)":
                                        structured = url_processor._create_template_shell(
                                            content.raw_content,
                                            content.metadata['final_url']
                                        )
                                    elif mode == "Extract for Database (Content Only)":
                                        soup = BeautifulSoup(content.raw_content, 'html.parser')
                                        structured = url_processor._extract_database_data(
                                            soup,
                                            content.metadata['final_url']
                                        )
                                    else:
                                        structured = url_processor._process_html_content(
                                            content.raw_content,
                                            content.metadata['final_url']
                                        )

                                    results.append(ProcessedItem(
                                        source='url',
                                        url=content.metadata['final_url'],
                                        structured=structured,
                                        metadata=content.metadata,
                                        snapshot_path=snapshot_path
                                    ))

                # Process results
                if results:
                    # File and crawl results are already dicts; only dataclass items need asdict()
                    results_dicts = [r if isinstance(r, dict) else asdict(r) for r in results]
                    processed_results = break_down_data(results_dicts)

                    if mode == "Extract for Database (Content Only)":
                        # Create database package
                        database_zip_path = package_database(processed_results)
                        status_msg = f"✅ Database package created with {len(results)} items"
                    else:
                        # Generate QR codes
                        qr_generator = QRCodeGenerator()
                        qr_paths = qr_generator.generate_qr_sequence(
                            processed_results,
                            combined=combine,
                            prefix="data_qr"
                        )
                        all_media_paths.extend(qr_paths)
                        status_msg = f"✅ Processed {len(results)} items, generated {len(qr_paths)} QR codes"

                    return processed_results, all_media_paths, status_msg, database_zip_path
                else:
                    return None, [], "❌ No valid content found in inputs", None

            except Exception as e:
                logger.error(f"Processing error: {e}")
                return None, [], f"❌ Error: {str(e)}", None

        # Connect events
        example_btn.click(load_example, outputs=[text_input])
        clear_btn.click(clear_inputs, outputs=[url_input, file_input, text_input, extraction_mode, combine_data])

        process_btn.click(
            process_inputs,
            inputs=[url_input, file_input, text_input, extraction_mode, combine_data],
            outputs=[output_json, output_gallery, output_text, output_database_zip]
        )

        # Footer
        gr.Markdown("""
        <div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e2e8f0;">
        <h3>📚 Features</h3>
        <ul>
        <li><strong>URL Processing</strong>: Extract structured data from web pages</li>
        <li><strong>File Support</strong>: Process various file formats including archives</li>
        <li><strong>Site Crawling</strong>: Limited crawl for template/database extraction</li>
        <li><strong>QR Generation</strong>: Create QR codes for data sharing</li>
        <li><strong>Database Export</strong>: Package data and media for deployment</li>
        </ul>
        </div>
        """)

    return interface


def main():
    """Main entry point."""
    try:
        # Initialize mimetypes
        mimetypes.init()

        # Create and launch interface
        interface = create_modern_interface()
        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,
            show_error=True,
            show_api=False,
            favicon_path=None
        )
    except Exception as e:
        logger.error(f"Application startup error: {e}")