acecalisto3 committed
Commit 5ddacea · verified · 1 Parent(s): c9278df

Update Shapp.py

Files changed (1)
  1. Shapp.py +1068 -562
Shapp.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import json
2
  import os
3
  import re
@@ -8,14 +13,18 @@ import zipfile
8
  import tempfile
9
  import chardet
10
  import tarfile
 
 
11
  from datetime import datetime
12
- from typing import List, Dict, Optional, Union, Tuple
13
  from pathlib import Path
14
  from urllib.parse import urlparse, urljoin
 
 
 
15
  import requests
16
  import validators
17
  import gradio as gr
18
- from diskcache import Cache
19
  from bs4 import BeautifulSoup, NavigableString, Tag
20
  from fake_useragent import UserAgent
21
  from cleantext import clean
@@ -23,175 +32,271 @@ import qrcode
23
  from PIL import Image, ImageDraw, ImageFont
24
  import numpy as np
25
 
26
- # --- Playwright Integration ---
 
27
  try:
28
- from playwright.sync_api import sync_playwright
29
  PLAYWRIGHT_AVAILABLE = True
30
  except ImportError:
31
- PLAYWRIGHT_AVAILABLE = False
32
- # -----------------------------
33
 
34
- # Setup enhanced logging with more detailed formatting
35
  logging.basicConfig(
36
  level=logging.INFO,
37
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
38
  handlers=[
39
  logging.StreamHandler(),
40
- logging.FileHandler('app.log', encoding='utf-8')
41
  ]
42
  )
43
  logger = logging.getLogger(__name__)
44
 
45
- # Ensure output directories exist with modern structure
46
  OUTPUTS_DIR = Path('output')
47
  QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes'
48
  SNAPSHOTS_DIR = OUTPUTS_DIR / 'snapshots'
49
- MEDIA_DIR = OUTPUTS_DIR / 'media' # New directory for downloaded media
50
  TEMP_DIR = OUTPUTS_DIR / 'temp'
 
 
 
 
51
  for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR, SNAPSHOTS_DIR, MEDIA_DIR]:
52
  directory.mkdir(parents=True, exist_ok=True)
53
 
54
 
55
- def capture_visual_snapshot(url: str, filename: str) -> Optional[str]:
56
- """Captures a full-page screenshot using Playwright."""
57
- if not PLAYWRIGHT_AVAILABLE:
58
- logger.warning(f"Skipping snapshot for {url}: Playwright dependency missing.")
59
- return None
60
-
61
- output_path = SNAPSHOTS_DIR / filename
62
-
63
- try:
64
- with sync_playwright() as p:
65
- browser = p.chromium.launch(headless=True)
66
- page = browser.new_page()
67
-
68
- page.set_viewport_size({"width": 1280, "height": 1024})
69
- page.goto(url, wait_until="networkidle")
70
-
71
- page.screenshot(path=output_path, full_page=True)
72
- browser.close()
73
- logger.info(f"Captured snapshot for {url} at {output_path}")
74
- return str(output_path)
75
-
76
- except Exception as e:
77
- logger.error(f"Playwright snapshot failed for {url}: {e}")
78
- return None
79
-
80
-
 
 
 
 
 
 
 
 
 
81
  class MediaDownloader:
82
- """Handles downloading and saving media files."""
83
- def __init__(self):
 
84
  self.session = requests.Session()
85
- self.downloaded_files = {} # {original_url: local_path}
86
-
87
- def download_media(self, url: str) -> Optional[str]:
88
- """Downloads a single media file and returns its local path."""
89
- if url in self.downloaded_files:
90
- return self.downloaded_files[url]
91
-
92
  try:
93
- response = self.session.get(url, timeout=10, stream=True)
94
  response.raise_for_status()
95
-
 
96
  content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
97
-
98
- # Determine extension based on MIME type or URL suffix
99
  ext = mimetypes.guess_extension(content_type)
100
  if not ext:
101
- ext = Path(urlparse(url).path).suffix or '.bin'
102
-
103
- # Create a unique filename based on hash and extension
104
- filename = f"{hash(url) & 0xFFFFFFFF}{ext}"
105
- local_path = MEDIA_DIR / filename
106
-
 
 
 
107
  with open(local_path, 'wb') as f:
108
  for chunk in response.iter_content(chunk_size=8192):
109
- f.write(chunk)
110
-
 
 
 
 
 
111
  self.downloaded_files[url] = str(local_path)
112
  logger.info(f"Downloaded media: {url} -> {local_path}")
113
  return str(local_path)
114
-
115
  except requests.exceptions.RequestException as e:
116
  logger.warning(f"Failed to download media {url}: {e}")
117
  return None
118
  except Exception as e:
119
- logger.error(f"Error during media download: {e}")
120
  return None
 
 
 
 
 
 
 
121
 
122
 
 
123
  class EnhancedURLProcessor:
124
- """Advanced URL processing with complete content extraction"""
125
-
126
- def __init__(self):
127
  self.session = requests.Session()
128
- self.timeout = 15
129
- self.max_retries = 3
130
  self.user_agent = UserAgent()
131
-
132
  self.session.headers.update({
133
  'User-Agent': self.user_agent.random,
134
- 'Accept': '*/*',
135
- 'Accept-Language': 'en-US,en;q=0.9',
136
  'Accept-Encoding': 'gzip, deflate, br',
137
  'Connection': 'keep-alive',
138
  'Upgrade-Insecure-Requests': '1',
139
  'Sec-Fetch-Dest': 'document',
140
  'Sec-Fetch-Mode': 'navigate',
141
  'Sec-Fetch-Site': 'none',
142
- 'Sec-Fetch-User': '?1',
143
- 'DNT': '1'
144
  })
145
-
146
- def validate_url(self, url: str) -> Dict:
147
- """Enhanced URL validation with detailed feedback"""
148
  try:
149
  if not validators.url(url):
150
- return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
151
-
 
 
 
 
152
  parsed = urlparse(url)
153
  if not all([parsed.scheme, parsed.netloc]):
154
- return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
155
-
 
 
 
 
 
156
  try:
157
- head_response = self.session.head(url, timeout=5)
 
 
 
 
158
  head_response.raise_for_status()
159
  except requests.exceptions.RequestException:
160
- response = self.session.get(url, timeout=self.timeout)
 
161
  response.raise_for_status()
162
-
163
- return {
164
- 'is_valid': True,
165
- 'message': 'URL is valid and accessible',
166
- 'details': {
 
167
  'content_type': head_response.headers.get('Content-Type', 'unknown'),
168
  'server': head_response.headers.get('Server', 'unknown'),
169
  'size': head_response.headers.get('Content-Length', 'unknown')
170
  }
171
- }
 
172
  except Exception as e:
173
- return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
174
-
175
- def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
176
- """Enhanced content fetcher with retries and encoding detection"""
 
 
 
 
177
  try:
178
- logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
 
 
179
  self.session.headers.update({'User-Agent': self.user_agent.random})
180
- response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
 
 
 
 
 
 
181
  response.raise_for_status()
182
-
183
  # Encoding detection
184
  encoding = response.encoding
185
  if encoding is None or encoding == 'ISO-8859-1':
186
- detected = chardet.detect(response.content)
 
 
187
  encoding = detected['encoding'] or 'utf-8'
188
-
 
189
  try:
190
  raw_content = response.content.decode(encoding, errors='replace')
191
  except (UnicodeDecodeError, LookupError):
192
  raw_content = response.content.decode('utf-8', errors='replace')
193
  encoding = 'utf-8 (fallback)'
194
-
 
195
  metadata = {
196
  'url': url,
197
  'final_url': response.url,
@@ -201,34 +306,46 @@ class EnhancedURLProcessor:
201
  'content_length': len(response.content),
202
  'status_code': response.status_code,
203
  'headers': dict(response.headers),
 
204
  }
205
-
 
206
  content_type = metadata['content_type'].lower()
207
  structured = {}
208
- if 'text/html' not in content_type and content_type != '':
209
- if 'application/json' in content_type or url.endswith('.json'):
210
- try:
211
- structured = json.loads(raw_content)
212
- except json.JSONDecodeError:
213
- structured = {'text': raw_content, 'parse_error': 'Invalid JSON'}
214
- elif 'image/' in content_type:
215
  structured = {
216
- 'media_type': 'image',
217
- 'direct_url': response.url,
218
- 'format': content_type.split('/')[-1],
219
- 'size_bytes': len(response.content)
220
  }
221
- else:
222
- structured = {'text': raw_content[:100_000]}
223
-
224
- return {
225
- 'structured': structured,
226
- 'raw_content': raw_content,
227
- 'metadata': metadata
228
- }
 
 
 
 
 
 
 
 
 
 
229
  except requests.exceptions.RequestException as e:
230
  if retry_count < self.max_retries - 1:
231
  sleep_time = 2 ** retry_count
 
232
  time.sleep(sleep_time)
233
  return self.fetch_content(url, retry_count + 1)
234
  else:
@@ -237,387 +354,553 @@ class EnhancedURLProcessor:
237
  except Exception as e:
238
  logger.error(f"Unexpected error fetching {url}: {e}")
239
  return None
240
-
241
- def _process_html_content(self, raw_content: str, base_url: str) -> Dict:
 
242
  soup = BeautifulSoup(raw_content, 'html.parser')
243
- for tag in soup.find_all(['a', 'img', 'link', 'script', 'video', 'audio']):
244
- for attr in ['href', 'src']:
 
 
245
  if tag.get(attr) and not urlparse(tag[attr]).scheme:
246
  try:
247
  tag[attr] = urljoin(base_url, tag[attr])
248
- except Exception:
249
- pass
250
- structured = self._extract_database_data(soup)
251
- structured['raw_html'] = raw_content
 
 
 
 
252
  return structured
253
-
254
- def _create_template_shell(self, raw_content: str, base_url: str) -> Dict:
 
255
  soup = BeautifulSoup(raw_content, 'html.parser')
 
256
  PLACEHOLDER_TEXT = "[LOREM IPSUM CONTENT]"
257
- PLACEHOLDER_IMG = "data:image/svg+xml;charset=UTF-8,%3Csvg%20width%3D%22200%22%20height%3D%22100%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20viewBox%3D%220%200%20200%20100%22%20preserveAspectRatio%3D%22none%22%3E%3Cdefs%3E%3Cstyle%20type%3D%22text%2Fcss%22%3E%23holder_16a4c%20text%20%7B%20fill%3Argba(255%2C255%2C255%2C.75)%3Bfont-weight%3Anormal%3Bfont-family%3AHelvetica%2C%20monospace%3Bfont-size%3A10pt%20%7D%20%3C%2Fstyle%3E%3C%2Fdefs%3E%3Cg%20id%3D%22holder_16a4c%22%3E%3Crect%20width%3D%22200%22%20height%3D%22100%22%20fill%3D%22%23777%22%3E%3C%2Frect%3E%3Cg%3E%3Ctext%20x%3D%2270%22%20y%3D%2255%22%3E200x100%3C%2Ftext%3E%3C%2Fg%3E%3C%2Fg%3E%3C%2Fsvg%3E"
258
- for tag_name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'td', 'th', 'label', 'title']:
259
- for tag in soup.find_all(tag_name):
260
- if tag.string and len(tag.get_text(strip=True)) > 5:
261
- tag.string.replace_with(PLACEHOLDER_TEXT)
 
 
 
 
262
  for img in soup.find_all('img'):
263
  img['src'] = PLACEHOLDER_IMG
 
 
 
 
264
  for a in soup.find_all('a'):
265
- if 'href' in a.attrs and a['href'].startswith(('http', 'https')):
266
  a['href'] = '#'
 
 
267
  for script in soup.find_all('script', type='application/ld+json'):
268
  script.decompose()
269
- for element in soup.find_all(string=lambda text: isinstance(text, NavigableString) and '<!--' in text):
270
- element.extract()
 
 
 
271
  return {
272
  'template_type': 'html_shell',
273
  'base_url': base_url,
274
- 'template_html': str(soup)
 
275
  }
276
-
277
- def _extract_database_data(self, soup: BeautifulSoup) -> Dict:
 
278
  structured = {
279
- 'title': soup.title.string.strip() if soup.title else '',
280
- 'meta_description': soup.find('meta', attrs={'name': 'description'}).get('content') if soup.find('meta', attrs={'name': 'description'}) else '',
281
  'core_text_content': '',
282
- 'images': set(),
283
- 'videos': set(),
284
- 'audios': set(),
285
  'structured_data': [],
286
  'products': [],
287
- 'branding_links': set()
 
288
  }
289
-
 
 
 
 
 
 
290
  for script in soup.find_all('script', type='application/ld+json'):
291
  try:
292
- ld_data = json.loads(script.text)
293
  structured['structured_data'].append(ld_data)
294
- if isinstance(ld_data, dict) and ld_data.get('@type') == 'Product':
295
- structured['products'].append(ld_data)
296
- except json.JSONDecodeError as e:
297
- logger.warning(f"Failed to decode JSON-LD script during data extraction: {e}")
298
-
 
 
 
 
 
 
 
 
299
  for img in soup.find_all('img'):
300
- if img.get('src'):
301
- structured['images'].add(urljoin(soup.base_url if hasattr(soup, 'base_url') else '', img.get('src')))
 
 
302
  for video in soup.find_all('video'):
303
- if video.get('src'):
304
- structured['videos'].add(urljoin(soup.base_url if hasattr(soup, 'base_url') else '', video.get('src')))
 
 
305
  for audio in soup.find_all('audio'):
306
- if audio.get('src'):
307
- structured['audios'].add(urljoin(soup.base_url if hasattr(soup, 'base_url') else '', audio.get('src')))
308
-
309
- main_content_tags = soup.find_all(
310
- lambda tag: tag.name in ['main', 'article', 'section'] or
311
- 'content' in tag.get('id', '').lower() or
312
- 'main' in tag.get('class', [])
313
- )
314
-
315
- if main_content_tags:
316
- best_tag = max(main_content_tags, key=lambda tag: len(tag.get_text(strip=True)), default=None)
317
- if best_tag:
318
- structured['core_text_content'] = clean(best_tag.get_text('\n', strip=True), lower=False, no_line_breaks=False)
319
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  if not structured['core_text_content']:
321
- structured['core_text_content'] = clean('\n'.join(soup.stripped_strings), lower=False, no_line_breaks=False)
322
-
323
- structured['images'] = list(structured['images'])
324
- structured['videos'] = list(structured['videos'])
325
- structured['audios'] = list(structured['audios'])
326
- structured['branding_links'] = list(structured['branding_links'])
327
-
 
 
 
 
 
 
 
 
 
328
  return structured
329
 
330
 
 
331
  class SiteCrawler:
332
- def __init__(self, processor: EnhancedURLProcessor):
 
 
333
  self.processor = processor
 
 
334
  self.crawled_urls = set()
335
- self.max_pages = 10
336
-
337
- def _get_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  parsed_base = urlparse(base_url)
339
  internal_links = set()
 
340
  for a in soup.find_all('a', href=True):
341
  href = urljoin(base_url, a['href'])
342
  parsed_href = urlparse(href)
343
- if parsed_href.netloc == parsed_base.netloc and parsed_href.scheme in ('http', 'https'):
344
- if not any(href.lower().endswith(ext) for ext in ['.pdf', '.zip', '.jpg', '.png', '.css', '.js', '#']):
 
 
 
 
 
 
 
 
 
 
 
345
  internal_links.add(href)
 
346
  return list(internal_links)
347
-
348
- def crawl_site(self, start_url: str, mode: str) -> Tuple[List[Dict], List[str]]:
349
- logger.info(f"Starting limited crawl from {start_url} in mode: {mode}")
350
- queue = [start_url]
351
- results = []
352
- snapshot_paths = []
353
-
354
  while queue and len(self.crawled_urls) < self.max_pages:
355
- url = queue.pop(0)
356
- if url in self.crawled_urls:
 
357
  continue
 
 
358
  self.crawled_urls.add(url)
359
-
 
360
  content_result = self.processor.fetch_content(url)
361
- if not content_result or 'text/html' not in content_result['metadata']['content_type'].lower():
362
  continue
363
-
364
- raw_content = content_result['raw_content']
365
- base_url = content_result['metadata']['final_url']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  soup = BeautifulSoup(raw_content, 'html.parser')
367
-
368
- filename = f"snapshot_{len(self.crawled_urls)}_{urlparse(base_url).path.replace('/', '_') or 'index'}.png"
369
- snapshot_path = capture_visual_snapshot(base_url, filename)
370
- if snapshot_path:
371
- snapshot_paths.append(snapshot_path)
372
-
373
- new_links = self._get_links(soup, base_url)
374
- queue.extend([link for link in new_links if link not in self.crawled_urls and urlparse(link).netloc == urlparse(start_url).netloc])
375
-
376
- page_result = {
377
- 'source': 'crawl',
378
- 'url': base_url,
379
- 'metadata': content_result['metadata'],
380
- 'timestamp': datetime.now().isoformat(),
381
- 'snapshot_path': snapshot_path if snapshot_path else 'N/A'
382
- }
383
-
384
  if mode == "Extract for Template (Shell)":
385
- page_result['structured'] = self.processor._create_template_shell(raw_content, base_url)
386
  elif mode == "Extract for Database (Content Only)":
387
- page_result['structured'] = self.processor._extract_database_data(soup)
388
  else:
389
- page_result['structured'] = self.processor._process_html_content(raw_content, base_url)
390
-
391
- results.append(page_result)
392
-
393
- return results, snapshot_paths
394
-
395
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  class EnhancedFileProcessor:
397
- def __init__(self, max_file_size: int = 5 * 1024 * 1024):
 
 
398
  self.max_file_size = max_file_size
399
  self.supported_extensions = {
400
  '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.log',
401
  '.yml', '.yaml', '.ini', '.conf', '.cfg', '.zip', '.tar', '.gz',
402
  '.bz2', '.7z', '.rar', '.pdf', '.doc', '.docx', '.rtf', '.odt',
403
- '.jpg', '.jpeg', '.png', '.gif', '.bmp'
404
  }
405
-
406
- def process_file(self, file) -> List[Dict]:
407
- if not file:
 
408
  return []
409
- dataset = []
410
  try:
411
- file_path = file.name
412
  file_size = os.path.getsize(file_path)
413
  if file_size > self.max_file_size:
414
- logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
415
  return []
416
- with tempfile.TemporaryDirectory() as temp_dir:
417
- temp_dir_path = Path(temp_dir)
418
- if self._is_archive(file_path):
419
- dataset.extend(self._process_archive(file_path, temp_dir_path))
420
- else:
421
- dataset.extend(self._process_single_file(file))
422
  except Exception as e:
423
- logger.error(f"Error processing file: {str(e)}")
424
  return []
425
- return dataset
426
-
427
  def _is_archive(self, filepath: str) -> bool:
428
- return any(filepath.lower().endswith(ext) for ext in ['.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'])
429
-
430
- def _process_single_file(self, file) -> List[Dict]:
 
 
 
431
  try:
432
- file_path = file.name
433
  file_stat = os.stat(file_path)
434
- file_size = file_stat.st_size
435
- mime = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'
 
436
  structured = {}
437
-
438
- if 'image/' in mime:
439
  structured = {
440
  'media_type': 'image',
441
  'filename': os.path.basename(file_path),
442
- 'mime': mime
 
443
  }
444
  else:
 
445
  with open(file_path, 'rb') as f:
446
  raw_bytes = f.read()
447
- detected = chardet.detect(raw_bytes)
 
 
448
  encoding = detected['encoding'] or 'utf-8'
 
449
  try:
450
- complete_content = raw_bytes.decode(encoding, errors='replace')
451
  except (UnicodeDecodeError, LookupError):
452
- complete_content = raw_bytes.decode('utf-8', errors='replace')
453
-
454
- if 'json' in mime:
 
455
  try:
456
- json_data = json.loads(complete_content)
457
  structured = json_data
458
- if isinstance(json_data, dict) and 'items' in json_data and isinstance(json_data['items'], list):
459
- structured['products'] = json_data['items']
460
- except json.JSONDecodeError:
461
- structured = {'text': complete_content, 'parse_error': 'Invalid JSON'}
462
- elif 'html' in mime or 'htm' in mime:
463
- url_processor = EnhancedURLProcessor()
464
- soup = BeautifulSoup(complete_content, 'html.parser')
465
- structured = url_processor._extract_database_data(soup)
 
466
  else:
467
- structured = {'text': complete_content}
468
-
469
- return [{
470
- 'source': 'file',
471
- 'filename': os.path.basename(file_path),
472
- 'file_size': file_size,
473
- 'mime_type': mime,
474
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
475
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
476
- 'structured': structured,
477
- 'timestamp': datetime.now().isoformat()
478
- }]
 
 
 
 
 
479
  except Exception as e:
480
- logger.error(f"File processing error: {e}")
481
  return []
482
-
483
- def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
 
484
  dataset = []
 
 
485
  try:
486
  if zipfile.is_zipfile(archive_path):
487
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
488
- zip_ref.extractall(extract_to)
 
489
  for file_info in zip_ref.infolist():
490
- if file_info.file_size > 0 and not file_info.filename.endswith('/'):
491
- extracted_path = extract_to / file_info.filename
492
- if extracted_path.suffix.lower() in self.supported_extensions:
493
- temp_file = type('TempFile', (), {'name': str(extracted_path)})()
494
- dataset.extend(self._process_single_file(temp_file))
495
  elif tarfile.is_tarfile(archive_path):
496
  with tarfile.open(archive_path, 'r') as tar_ref:
497
- tar_ref.extractall(extract_to)
 
498
  for member in tar_ref.getmembers():
499
  if member.isfile():
500
- extracted_path = extract_to / member.name
501
- if extracted_path.suffix.lower() in self.supported_extensions:
502
- temp_file = type('TempFile', (), {'name': str(extracted_path)})()
503
- dataset.extend(self._process_single_file(temp_file))
 
 
 
504
  except Exception as e:
505
- logger.error(f"Archive processing error: {e}")
 
 
 
 
 
 
 
 
506
  return dataset
507
 
508
 
509
- def break_down_data(data: Union[Dict, List[Dict]]) -> Union[Dict, List[Dict]]:
510
- def process_item(item: Dict) -> Dict:
511
- structured = item.get('structured', {})
512
- if structured.get('template_type') == 'html_shell':
513
- return item
514
- if not structured:
515
- content = item.get('content', item.get('raw_content', ''))
516
- if isinstance(content, str):
517
- structured = {'text': content}
518
- elif isinstance(content, dict):
519
- structured = content
520
- if 'products' not in structured:
521
- structured['products'] = []
522
- media = []
523
- media.extend([{'type': 'image', 'source': src} for src in structured.get('images', [])])
524
- media.extend([{'type': 'video', 'source': src} for src in structured.get('videos', [])])
525
- media.extend([{'type': 'audio', 'source': src} for src in structured.get('audios', [])])
526
- structured['media'] = media
527
- if structured['products']:
528
- structured['template'] = {
529
- 'type': 'product_catalog',
530
- 'items': structured['products'],
531
- 'metadata': item.get('metadata', {})
532
- }
533
- item['structured'] = structured
534
- return item
535
-
536
- if isinstance(data, list):
537
- return [process_item(item) for item in data]
538
- elif isinstance(data, dict):
539
- return process_item(data)
540
- return data
541
-
542
-
543
  class DataChunker:
544
- def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
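  # Note: the 2953-byte default for max_size matches the maximum binary payload of a
  # version-40 QR code at error-correction level L (the level generate_stylish_qr uses
  # below), which is presumably why it was chosen here.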
 
 
 
 
 
 
545
  try:
546
- if isinstance(data, dict) and data.get('template_type') == 'html_shell':
 
547
  json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
548
- else:
549
  json_str = json.dumps(data, ensure_ascii=False)
550
- json_bytes = json_str.encode('utf-8')
551
- total_length = len(json_bytes)
552
- metadata_template = {
553
- "chunk_index": 999, "total_chunks": 999, "total_length": total_length,
554
- "chunk_hash": 0xFFFFFFFF, "data": ""
555
- }
556
- overhead_str = json.dumps(metadata_template, ensure_ascii=False).replace('""', '')
557
- overhead_bytes = len(overhead_str.encode('utf-8')) + 50
558
- effective_chunk_size = max_size - overhead_bytes
559
- if effective_chunk_size <= 0:
560
- raise ValueError(f"Max size ({max_size}) is too small after accounting for metadata overhead ({overhead_bytes})")
561
- num_chunks = (total_length + effective_chunk_size - 1) // effective_chunk_size
562
  chunks = []
563
- start = 0
564
- for i in range(num_chunks):
565
- end = min(start + effective_chunk_size, total_length)
566
- chunk_bytes = json_bytes[start:end]
567
- chunk_str = chunk_bytes.decode('utf-8', errors='replace')
568
- chunk_hash = hash(chunk_str) & 0xFFFFFFFF
569
  chunk = {
570
- "chunk_index": i + 1,
571
- "total_chunks": num_chunks,
572
- "total_length": total_length,
573
  "chunk_hash": chunk_hash,
574
- "data": chunk_str
 
575
  }
576
  chunks.append(chunk)
577
- start = end
578
  return chunks
 
579
  except Exception as e:
580
  logger.error(f"Error chunking data: {e}")
581
- return []
582
-
583
-
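  # Minimal reassembly sketch (illustrative, not part of Shapp.py): the chunk dicts built by
  # chunk_data above can be rebuilt into the original JSON payload by ordering on chunk_index
  # and concatenating the data fields. chunk_data slices at byte offsets, so payloads with
  # multi-byte UTF-8 characters may be corrupted at chunk boundaries; this sketch assumes an
  # ASCII-safe payload. chunk_hash comes from Python's salted hash(), so it is only meaningful
  # within the producing process. reassemble_chunks is a hypothetical helper, not an app function.
  def reassemble_chunks(chunks: List[Dict]) -> Union[Dict, List]:
      ordered = sorted(chunks, key=lambda c: c["chunk_index"])
      payload = "".join(c["data"] for c in ordered)  # data fields carry slices of the JSON text
      return json.loads(payload)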
584
- def generate_stylish_qr(data: Union[str, Dict], filename: str, size: int = 10, border: int = 4,
585
- fill_color: str = "#000000", back_color: str = "#FFFFFF") -> str:
586
- try:
587
- qr = qrcode.QRCode(
588
- version=None,
589
- error_correction=qrcode.constants.ERROR_CORRECT_L,
590
- box_size=size,
591
- border=border
592
- )
593
- if isinstance(data, dict):
594
- qr_data_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
595
- qr.add_data(qr_data_str)
596
- else:
597
- qr.add_data(data)
598
- qr.make(fit=True)
599
- qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
600
- qr_image = qr_image.convert('RGBA')
601
- final_image = qr_image
602
- output_path = QR_CODES_DIR / filename
603
- final_image.save(output_path, quality=95)
604
- return str(output_path)
605
- except Exception as e:
606
- logger.error(f"QR generation error: {e}")
607
- return ""
608
-
609
-
610
- def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
611
- try:
612
  chunker = DataChunker()
613
  paths = []
614
- if isinstance(data, dict) and data.get('template_type') == 'html_shell':
615
- combined = True
616
  if combined:
 
617
  chunks = chunker.chunk_data(data)
618
  for i, chunk in enumerate(chunks):
619
- filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
620
- qr_path = generate_stylish_qr(
621
  data=chunk,
622
  filename=filename,
623
  fill_color="#1a365d",
@@ -626,15 +909,16 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
626
  if qr_path:
627
  paths.append(qr_path)
628
  else:
 
629
  if isinstance(data, list):
630
  for idx, item in enumerate(data):
631
  chunks = chunker.chunk_data(item)
632
  for chunk_idx, chunk in enumerate(chunks):
633
- filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
634
- qr_path = generate_stylish_qr(
635
  data=chunk,
636
  filename=filename,
637
- fill_color="#1a365d",
638
  back_color="#ffffff"
639
  )
640
  if qr_path:
@@ -642,8 +926,8 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
642
  else:
643
  chunks = chunker.chunk_data(data)
644
  for i, chunk in enumerate(chunks):
645
- filename = f'single_qr_{i+1}_of_{len(chunks)}_{int(time.time())}.png'
646
- qr_path = generate_stylish_qr(
647
  data=chunk,
648
  filename=filename,
649
  fill_color="#1a365d",
@@ -651,68 +935,166 @@ def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> Li
651
  )
652
  if qr_path:
653
  paths.append(qr_path)
 
654
  return paths
655
- except Exception as e:
656
- logger.error(f"Error in generate_qr_codes: {e}")
657
- return []
658
 
659
 
660
- # ←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←
661
- # THIS WAS THE EXACT LINE THAT WAS FAILING BECAUSE OF AN UNMATCHED `try:` ABOVE
662
- # ←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←←
663
- def package_database(results: List[Dict]) -> Optional[str]:
664
- """
665
- Downloads all media referenced in the results and packages everything into a ZIP file.
666
- """
667
- if not results:
668
  return None
669
 
670
- downloader = MediaDownloader()
671
- all_downloaded_files = {}
672
 
673
- # Collect all unique media URLs
674
- media_urls_to_download = set()
675
- for item in results:
 
676
  structured = item.get('structured', {})
677
- media_urls_to_download.update(structured.get('images', []))
678
- media_urls_to_download.update(structured.get('videos', []))
679
- media_urls_to_download.update(structured.get('audios', []))
680
-
681
- for url in media_urls_to_download:
682
- local_path = downloader.download_media(url)
683
- if local_path:
684
- all_downloaded_files[url] = local_path
685
-
686
- # Update results to use local relative paths
687
- updated_results = []
688
- for item in results:
689
- item_copy = item.copy()
690
- structured = item_copy.get('structured', {})
691
- for media_type in ['images', 'videos', 'audios']:
692
- if media_type in structured:
693
- new_paths = []
694
- for url in structured[media_type]:
695
- if url in all_downloaded_files:
696
- new_paths.append(f"media/{Path(all_downloaded_files[url]).name}")
697
- else:
698
- new_paths.append(url)
699
- structured[media_type] = new_paths
700
- item_copy['structured'] = structured
701
- updated_results.append(item_copy)
702
-
703
- # Create ZIP
704
- zip_filename = OUTPUTS_DIR / f"database_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
706
- with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
707
- zf.writestr('data_export.json', json.dumps(updated_results, indent=2, ensure_ascii=False))
708
- for original_url, local_path in all_downloaded_files.items():
709
- zf.write(local_path, arcname=f"media/{Path(local_path).name}")
710
 
711
- logger.info(f"Database package created at: {zip_filename}")
712
- return str(zip_filename)
713
 
714
 
 
715
  def create_modern_interface():
 
716
  css = """
717
  :root {
718
  --primary-color: #1a365d;
@@ -722,208 +1104,332 @@ def create_modern_interface():
722
  --success-color: #48bb78;
723
  --error-color: #f56565;
724
  --warning-color: #ed8936;
 
725
  }
 
726
  .gradio-container {
727
  max-width: 1200px;
728
- margin: auto;
 
 
 
 
 
 
 
 
 
729
  padding: 2rem;
730
- background-color: var(--background-color);
731
- border-radius: 1rem;
732
  box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
733
  }
 
 
 
 
 
 
 
 
734
  .primary-button {
735
- background-color: var(--primary-color) !important;
736
- color: white !important;
737
- padding: 0.75rem 1.5rem;
738
- border-radius: 0.375rem;
739
  border: none;
 
 
 
740
  cursor: pointer;
741
- transition: all 0.2s;
742
  }
 
743
  .primary-button:hover {
744
- background-color: var(--accent-color) !important;
745
- transform: translateY(-1px);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
746
  }
747
  """
 
748
  with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator", theme=gr.themes.Soft()) as interface:
749
- gr.Markdown(f"""
750
- # Advanced Data Processor & QR Code Generator
751
- ## Site Crawling & Template/Database Extraction
752
-
753
- {"**WARNING: Playwright is not installed.** Visual capture and dynamic rendering are disabled. Install using `pip install playwright` and run `playwright install`." if not PLAYWRIGHT_AVAILABLE else ""}
 
754
  """)
755
-
756
- with gr.Tab("URL Processing"):
757
- url_input = gr.Textbox(label="Enter URLs (single URL for site crawl, multiple for independent pages)", lines=5, placeholder="https://example1.com\nhttps://example2.com", value="")
758
-
759
- with gr.Tab("File Input"):
760
- file_input = gr.File(label="Upload Files", file_types=["*"], file_count="multiple")
761
-
762
- with gr.Tab("JSON Input"):
763
- text_input = gr.TextArea(label="Direct JSON Input", lines=15, placeholder="Paste your JSON data here...", value="")
764
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
765
  with gr.Row():
766
  extraction_mode = gr.Radio(
767
- label="Extraction Mode (Applies to URLs)",
768
- choices=["Full Structured Data", "Extract for Template (Shell)", "Extract for Database (Content Only)"],
 
 
 
 
769
  value="Full Structured Data",
770
- info="Template/Database mode with a single URL triggers a limited site crawl and visual capture."
771
  )
772
-
773
- with gr.Row():
774
- example_btn = gr.Button("Load Example", variant="secondary")
775
- clear_btn = gr.Button("Clear", variant="secondary")
776
-
 
 
 
777
  with gr.Row():
778
- combine_data = gr.Checkbox(label="Combine all data into sequence", value=True, info="Generate sequential QR codes for combined data (recommended for large datasets)")
779
- process_btn = gr.Button("Process & Generate QR / Database", variant="primary", elem_classes="primary-button")
780
-
781
- output_json = gr.JSON(label="Processed & Structured Data")
782
-
 
 
783
  with gr.Row():
784
- output_gallery = gr.Gallery(label="Visual Snapshots & Generated QR Codes", columns=3, height=400, show_label=True)
785
-
786
- output_database_zip = gr.File(label="Database Export (.zip)", interactive=False, file_count="single", visible=False)
787
- output_text = gr.Textbox(label="Processing Status", interactive=False)
788
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  def load_example():
790
  example = {
 
791
  "type": "product_catalog",
792
  "items": [
793
- {"id": "123", "name": "Premium Widget", "price": 299.99},
794
- {"id": "456", "name": "Basic Widget", "price": 149.99}
 
795
  ],
796
- "metadata": {"timestamp": datetime.now().isoformat()}
 
 
 
 
797
  }
798
  return json.dumps(example, indent=2)
799
-
800
- def clear_input():
801
- return "", None, "", "Full Structured Data", [], None
802
-
803
- def process_inputs(urls, files, text, combine, mode):
 
 
 
 
 
804
  try:
805
- results = []
806
- url_processor = EnhancedURLProcessor()
807
- file_processor = EnhancedFileProcessor()
808
- all_media_paths = []
809
- database_zip_path = None
810
-
811
  if text and text.strip():
812
  try:
813
  json_data = json.loads(text)
814
  if isinstance(json_data, list):
815
- results.extend([{'source': 'json', 'structured': item} for item in json_data])
 
 
 
 
816
  else:
817
- results.append({'source': 'json', 'structured': json_data})
 
 
 
818
  except json.JSONDecodeError as e:
819
- return None, [], f"Invalid JSON format: {str(e)}", None
820
-
 
821
  if files:
 
822
  for file in files:
823
- file_results = file_processor.process_file(file)
824
  if file_results:
825
  results.extend(file_results)
826
-
 
827
  if urls and urls.strip():
828
- url_list = re.split(r'[,\n]', urls)
829
- url_list = [url.strip() for url in url_list if url.strip()]
830
-
831
  if len(url_list) == 1 and mode != "Full Structured Data":
832
- crawler = SiteCrawler(url_processor)
 
833
  crawl_results, snapshot_paths = crawler.crawl_site(url_list[0], mode)
834
  results.extend(crawl_results)
835
  all_media_paths.extend(snapshot_paths)
836
  else:
 
837
  for url in url_list:
838
  validation = url_processor.validate_url(url)
839
- if validation['is_valid']:
840
  content = url_processor.fetch_content(url)
841
- if content and 'text/html' in content['metadata']['content_type'].lower():
842
- filename = f"snapshot_{int(time.time())}_{urlparse(url).netloc.replace('.', '_')}.png"
843
- snapshot_path = capture_visual_snapshot(content['metadata']['final_url'], filename)
844
- if snapshot_path:
845
- all_media_paths.append(snapshot_path)
846
-
 
 
 
 
847
  if mode == "Extract for Template (Shell)":
848
- structured_output = url_processor._create_template_shell(content['raw_content'], content['metadata']['final_url'])
 
 
 
849
  elif mode == "Extract for Database (Content Only)":
850
- soup = BeautifulSoup(content['raw_content'], 'html.parser')
851
- structured_output = url_processor._extract_database_data(soup)
 
 
 
852
  else:
853
- structured_output = url_processor._process_html_content(content['raw_content'], content['metadata']['final_url'])
854
-
855
- results.append({
856
- 'source': 'url', 'url': content['metadata']['final_url'],
857
- 'structured': structured_output, 'metadata': content['metadata'],
858
- 'timestamp': datetime.now().isoformat(), 'snapshot_path': snapshot_path if snapshot_path else 'N/A'
859
- })
860
- elif content:
861
- results.append({
862
- 'source': 'url', 'url': content['metadata']['final_url'],
863
- 'structured': content['structured'], 'metadata': content['metadata'],
864
- 'timestamp': datetime.now().isoformat()
865
- })
866
-
867
- if results:
868
- results = break_down_data(results)
869
-
870
  if results:
 
 
 
871
  if mode == "Extract for Database (Content Only)":
872
- database_zip_path = package_database(results)
873
- status_msg = f"Database package created. Processed {len(results)} items."
 
874
  else:
875
- qr_paths = generate_qr_codes(results, combine)
 
 
 
 
 
 
876
  all_media_paths.extend(qr_paths)
877
- status_msg = f"Processed {len(results)} items. Generated {len(all_media_paths)} media files."
878
-
879
- return (
880
- results,
881
- [str(path) for path in all_media_paths],
882
- status_msg,
883
- database_zip_path
884
- )
885
  else:
886
- return None, [], "No valid content to process from inputs.", None
 
887
  except Exception as e:
888
  logger.error(f"Processing error: {e}")
889
- return None, [], f"Critical Error during processing: {str(e)}", None
890
-
 
891
  example_btn.click(load_example, outputs=[text_input])
892
- clear_btn.click(clear_input, outputs=[url_input, file_input, text_input, extraction_mode, output_gallery, output_database_zip])
893
-
894
  process_btn.click(
895
  process_inputs,
896
- inputs=[url_input, file_input, text_input, combine_data, extraction_mode],
897
  outputs=[output_json, output_gallery, output_text, output_database_zip]
898
- ).success(
899
- fn=lambda zip_path: gr.update(visible=bool(zip_path)),
900
- inputs=[output_database_zip],
901
- outputs=[output_database_zip]
902
  )
903
-
 
904
  gr.Markdown("""
905
- ### Database Export (D-Baser)
906
- When **Extract for Database (Content Only)** is selected, the system performs the following:
907
- 1. Isolates entity-specific text, structured data, and media URLs.
908
- 2. Downloads all unique media files (images, videos) to the `output/media` folder.
909
- 3. Creates a final ZIP file containing:
910
- - `data_export.json`: Structured data with media URLs replaced by relative paths (`media/filename.jpg`).
911
- - `media/`: A folder containing all downloaded media files.
912
- This ZIP archive is ready to be deployed as a static content store.
 
 
913
  """)
914
-
915
  return interface
916
 
917
 
918
  def main():
 
919
  try:
 
920
  mimetypes.init()
 
 
921
  interface = create_modern_interface()
922
  interface.launch(
 
 
923
  share=False,
924
  debug=False,
925
  show_error=True,
926
- show_api=False
 
927
  )
928
  except Exception as e:
929
  logger.error(f"Application startup error: {e}")
 
1
+ """
2
+ Advanced Data Processor & QR Generator
3
+ Enhanced version with better error handling, performance improvements, and cleaner architecture.
4
+ """
5
+
6
  import json
7
  import os
8
  import re
 
13
  import tempfile
14
  import chardet
15
  import tarfile
16
+ import copy
17
+ import hashlib
18
  from datetime import datetime
19
+ from typing import List, Dict, Optional, Union, Tuple, Any, Set
20
  from pathlib import Path
21
  from urllib.parse import urlparse, urljoin
22
+ from dataclasses import dataclass, asdict
23
+ from contextlib import contextmanager
24
+
25
  import requests
26
  import validators
27
  import gradio as gr
 
28
  from bs4 import BeautifulSoup, NavigableString, Tag
29
  from fake_useragent import UserAgent
30
  from cleantext import clean
 
32
  from PIL import Image, ImageDraw, ImageFont
33
  import numpy as np
34
 
35
+ # Conditional imports with better error handling
36
+ PLAYWRIGHT_AVAILABLE = False
37
  try:
38
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
39
  PLAYWRIGHT_AVAILABLE = True
40
  except ImportError:
41
+ logger = logging.getLogger(__name__)
42
+ logger.warning("Playwright not installed. Install with: pip install playwright && playwright install")
43
 
44
+ # Setup enhanced logging
45
  logging.basicConfig(
46
  level=logging.INFO,
47
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
48
  handlers=[
49
  logging.StreamHandler(),
50
+ logging.FileHandler('app.log', encoding='utf-8', mode='a')
51
  ]
52
  )
53
  logger = logging.getLogger(__name__)
54
 
55
+ # Constants
56
  OUTPUTS_DIR = Path('output')
57
  QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes'
58
  SNAPSHOTS_DIR = OUTPUTS_DIR / 'snapshots'
59
+ MEDIA_DIR = OUTPUTS_DIR / 'media'
60
  TEMP_DIR = OUTPUTS_DIR / 'temp'
61
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB default max
62
+ DEFAULT_TIMEOUT = 30
63
+
64
+ # Ensure directories exist
65
  for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR, SNAPSHOTS_DIR, MEDIA_DIR]:
66
  directory.mkdir(parents=True, exist_ok=True)
67
 
68
 
69
+ # Data classes for better type safety
70
+ @dataclass
71
+ class URLValidationResult:
72
+ is_valid: bool
73
+ message: str
74
+ details: Dict[str, Any]
75
+
76
+
77
+ @dataclass
78
+ class FetchResult:
79
+ structured: Dict[str, Any]
80
+ raw_content: str
81
+ metadata: Dict[str, Any]
82
+
83
+
84
+ @dataclass
85
+ class ProcessedItem:
86
+ source: str
87
+ url: Optional[str] = None
88
+ filename: Optional[str] = None
89
+ structured: Dict[str, Any] = None
90
+ metadata: Dict[str, Any] = None
91
+ timestamp: str = None
92
+ snapshot_path: Optional[str] = None
93
+
94
+ def __post_init__(self):
95
+ if self.timestamp is None:
96
+ self.timestamp = datetime.now().isoformat()
97
+ if self.structured is None:
98
+ self.structured = {}
99
+ if self.metadata is None:
100
+ self.metadata = {}
101
+
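  # Usage sketch (illustrative, not part of the commit): ProcessedItem fills timestamp and the
  # dict fields in __post_init__, and asdict() (used later by SiteCrawler) turns it into a plain
  # dict suitable for JSON output.
  #   item = ProcessedItem(source='url', url='https://example.com', structured={'title': 'Example'})
  #   record = asdict(item)   # record['timestamp'] is populated automatically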
102
+
103
+ # Media Downloader with better caching and error handling
104
  class MediaDownloader:
105
+ """Handles downloading and saving media files with caching."""
106
+
107
+ def __init__(self, cache_dir: Path = TEMP_DIR / 'media_cache'):
108
  self.session = requests.Session()
109
+ self.session.headers.update({
110
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
111
+ 'Accept': 'image/webp,image/*,*/*;q=0.8',
112
+ 'Accept-Language': 'en-US,en;q=0.5',
113
+ })
114
+ self.cache_dir = cache_dir
115
+ self.cache_dir.mkdir(exist_ok=True)
116
+ self.downloaded_files = {} # {url_hash: local_path}
117
+
118
+ def _get_url_hash(self, url: str) -> str:
119
+ """Generate consistent hash for URL."""
120
+ return hashlib.md5(url.encode()).hexdigest()
121
+
122
+ def download_media(self, url: str, timeout: int = 10) -> Optional[str]:
123
+ """Download media file with caching."""
124
+ url_hash = self._get_url_hash(url)
125
+
126
+ # Check cache first
127
+ cache_file = self.cache_dir / f"{url_hash}.cache"
128
+ if cache_file.exists():
129
+ try:
130
+ with open(cache_file, 'r') as f:
131
+ cached_path = f.read().strip()
132
+ if Path(cached_path).exists():
133
+ return cached_path
134
+ except Exception:
135
+ pass
136
+
137
+ # Download the file
138
  try:
139
+ response = self.session.get(url, timeout=timeout, stream=True)
140
  response.raise_for_status()
141
+
142
+ # Determine file extension
143
  content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
 
 
144
  ext = mimetypes.guess_extension(content_type)
145
  if not ext:
146
+ # Try to get extension from URL
147
+ parsed = urlparse(url)
148
+ ext = Path(parsed.path).suffix or '.bin'
149
+
150
+ # Create safe filename
151
+ safe_filename = f"{url_hash}{ext}"
152
+ local_path = MEDIA_DIR / safe_filename
153
+
154
+ # Save file
155
  with open(local_path, 'wb') as f:
156
  for chunk in response.iter_content(chunk_size=8192):
157
+ if chunk:
158
+ f.write(chunk)
159
+
160
+ # Update cache
161
+ with open(cache_file, 'w') as f:
162
+ f.write(str(local_path))
163
+
164
  self.downloaded_files[url] = str(local_path)
165
  logger.info(f"Downloaded media: {url} -> {local_path}")
166
  return str(local_path)
167
+
168
  except requests.exceptions.RequestException as e:
169
  logger.warning(f"Failed to download media {url}: {e}")
170
  return None
171
  except Exception as e:
172
+ logger.error(f"Unexpected error downloading {url}: {e}")
173
  return None
174
+
175
+ def batch_download(self, urls: List[str], max_workers: int = 5) -> Dict[str, Optional[str]]:
176
+ """Download multiple files (could be enhanced with threading)."""
177
+ results = {}
178
+ for url in urls:
179
+ results[url] = self.download_media(url)
180
+ return results
181
 
182
 
183
+ # Enhanced URL Processor
184
  class EnhancedURLProcessor:
185
+ """Advanced URL processing with complete content extraction."""
186
+
187
+ def __init__(self, timeout: int = DEFAULT_TIMEOUT, max_retries: int = 3):
188
  self.session = requests.Session()
189
+ self.timeout = timeout
190
+ self.max_retries = max_retries
191
  self.user_agent = UserAgent()
192
+
193
  self.session.headers.update({
194
  'User-Agent': self.user_agent.random,
195
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
196
+ 'Accept-Language': 'en-US,en;q=0.5',
197
  'Accept-Encoding': 'gzip, deflate, br',
198
  'Connection': 'keep-alive',
199
  'Upgrade-Insecure-Requests': '1',
200
  'Sec-Fetch-Dest': 'document',
201
  'Sec-Fetch-Mode': 'navigate',
202
  'Sec-Fetch-Site': 'none',
203
+ 'DNT': '1',
 
204
  })
205
+
206
+ def validate_url(self, url: str) -> URLValidationResult:
207
+ """Enhanced URL validation with detailed feedback."""
208
  try:
209
+ # Basic URL validation
210
+ if not url or not isinstance(url, str):
211
+ return URLValidationResult(
212
+ is_valid=False,
213
+ message='Invalid URL',
214
+ details={'error': 'URL must be a non-empty string'}
215
+ )
216
+
217
+ # Check if URL starts with http(s)
218
+ if not url.startswith(('http://', 'https://')):
219
+ url = 'https://' + url
220
+
221
+ # Validate with validators
222
  if not validators.url(url):
223
+ return URLValidationResult(
224
+ is_valid=False,
225
+ message='Invalid URL format',
226
+ details={'error': 'URL must be properly formatted'}
227
+ )
228
+
229
  parsed = urlparse(url)
230
  if not all([parsed.scheme, parsed.netloc]):
231
+ return URLValidationResult(
232
+ is_valid=False,
233
+ message='Incomplete URL',
234
+ details={'error': 'Missing scheme or domain'}
235
+ )
236
+
237
+ # Try to connect
238
  try:
239
+ head_response = self.session.head(
240
+ url,
241
+ timeout=5,
242
+ allow_redirects=True
243
+ )
244
  head_response.raise_for_status()
245
  except requests.exceptions.RequestException:
246
+ # Try GET if HEAD fails
247
+ response = self.session.get(url, timeout=5, stream=True)
248
  response.raise_for_status()
249
+
250
+ return URLValidationResult(
251
+ is_valid=True,
252
+ message='URL is valid and accessible',
253
+ details={
254
+ 'final_url': response.url if 'response' in locals() else head_response.url,
255
  'content_type': head_response.headers.get('Content-Type', 'unknown'),
256
  'server': head_response.headers.get('Server', 'unknown'),
257
  'size': head_response.headers.get('Content-Length', 'unknown')
258
  }
259
+ )
260
+
261
  except Exception as e:
262
+ return URLValidationResult(
263
+ is_valid=False,
264
+ message=f'URL validation failed: {str(e)}',
265
+ details={'error': str(e), 'traceback': str(e.__traceback__)}
266
+ )
267
+
268
+ def fetch_content(self, url: str, retry_count: int = 0) -> Optional[FetchResult]:
269
+ """Enhanced content fetcher with retries and encoding detection."""
270
  try:
271
+ logger.info(f"Fetching content from: {url} (Attempt {retry_count + 1}/{self.max_retries})")
272
+
273
+ # Update user agent
274
  self.session.headers.update({'User-Agent': self.user_agent.random})
275
+
276
+ response = self.session.get(
277
+ url,
278
+ timeout=self.timeout,
279
+ allow_redirects=True,
280
+ stream=True
281
+ )
282
  response.raise_for_status()
283
+
284
  # Encoding detection
285
  encoding = response.encoding
286
  if encoding is None or encoding == 'ISO-8859-1':
287
+ # Sample first 10KB for encoding detection
288
+ sample = response.content[:10240]
289
+ detected = chardet.detect(sample)
290
  encoding = detected['encoding'] or 'utf-8'
291
+
292
+ # Decode content
293
  try:
294
  raw_content = response.content.decode(encoding, errors='replace')
295
  except (UnicodeDecodeError, LookupError):
296
  raw_content = response.content.decode('utf-8', errors='replace')
297
  encoding = 'utf-8 (fallback)'
298
+
299
+ # Prepare metadata
300
  metadata = {
301
  'url': url,
302
  'final_url': response.url,
 
306
  'content_length': len(response.content),
307
  'status_code': response.status_code,
308
  'headers': dict(response.headers),
309
+ 'elapsed': response.elapsed.total_seconds(),
310
  }
311
+
312
+ # Process based on content type
313
  content_type = metadata['content_type'].lower()
314
  structured = {}
315
+
316
+ if 'text/html' in content_type:
317
+ structured = self._process_html_content(raw_content, response.url)
318
+ elif 'application/json' in content_type or url.endswith('.json'):
319
+ try:
320
+ structured = json.loads(raw_content)
321
+ except json.JSONDecodeError as e:
322
  structured = {
323
+ 'text': raw_content[:100000],
324
+ 'parse_error': str(e),
325
+ 'json_fragment': raw_content[:1000]
 
326
  }
327
+ elif 'image/' in content_type:
328
+ structured = {
329
+ 'media_type': 'image',
330
+ 'direct_url': response.url,
331
+ 'format': content_type.split('/')[-1],
332
+ 'size_bytes': len(response.content),
333
+ 'filename': Path(urlparse(url).path).name or 'unknown'
334
+ }
335
+ else:
336
+ # Generic content
337
+ structured = {'text': raw_content[:100000]}
338
+
339
+ return FetchResult(
340
+ structured=structured,
341
+ raw_content=raw_content,
342
+ metadata=metadata
343
+ )
344
+
345
  except requests.exceptions.RequestException as e:
346
  if retry_count < self.max_retries - 1:
347
  sleep_time = 2 ** retry_count
348
+ logger.info(f"Retrying {url} after {sleep_time}s...")
349
  time.sleep(sleep_time)
350
  return self.fetch_content(url, retry_count + 1)
351
  else:
 
354
  except Exception as e:
355
  logger.error(f"Unexpected error fetching {url}: {e}")
356
  return None
357
+
358
+ def _process_html_content(self, raw_content: str, base_url: str) -> Dict[str, Any]:
359
+ """Process HTML content and extract structured data."""
360
  soup = BeautifulSoup(raw_content, 'html.parser')
361
+
362
+ # Fix relative URLs
363
+ for tag in soup.find_all(['a', 'img', 'link', 'script', 'video', 'audio', 'source']):
364
+ for attr in ['href', 'src', 'data-src', 'poster']:
365
  if tag.get(attr) and not urlparse(tag[attr]).scheme:
366
  try:
367
  tag[attr] = urljoin(base_url, tag[attr])
368
+ except Exception as e:
369
+ logger.debug(f"Failed to join URL: {e}")
370
+
371
+ # Extract structured data
372
+ structured = self._extract_database_data(soup, base_url)
373
+ structured['raw_html'] = raw_content[:50000] # Store truncated HTML
374
+ structured['base_url'] = base_url
375
+
376
  return structured
377
+
378
+ def _create_template_shell(self, raw_content: str, base_url: str) -> Dict[str, Any]:
379
+ """Create a template shell from HTML content."""
380
  soup = BeautifulSoup(raw_content, 'html.parser')
381
+
382
  PLACEHOLDER_TEXT = "[LOREM IPSUM CONTENT]"
383
+ PLACEHOLDER_IMG = "data:image/svg+xml;charset=UTF-8,%3Csvg%20width%3D%22200%22%20height%3D%22100%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%3Crect%20width%3D%22200%22%20height%3D%22100%22%20fill%3D%22%23777%22%3E%3C%2Frect%3E%3Ctext%20x%3D%2270%22%20y%3D%2255%22%3E200x100%3C%2Ftext%3E%3C%2Fsvg%3E"
384
+
385
+ # Replace text content
386
+ text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'td', 'th', 'label', 'title', 'div']
387
+ for tag in soup.find_all(text_tags):
388
+ if tag.string and len(tag.get_text(strip=True)) > 5:
389
+ tag.string.replace_with(PLACEHOLDER_TEXT)
390
+
391
+ # Replace images
392
  for img in soup.find_all('img'):
393
  img['src'] = PLACEHOLDER_IMG
394
+ if 'srcset' in img.attrs:
395
+ del img['srcset']
396
+
397
+ # Remove external links
398
  for a in soup.find_all('a'):
399
+ if 'href' in a.attrs:
400
  a['href'] = '#'
401
+
402
+ # Remove sensitive data
403
  for script in soup.find_all('script', type='application/ld+json'):
404
  script.decompose()
405
+
406
+ # Remove comments
407
+ for comment in soup.find_all(string=lambda text: isinstance(text, NavigableString) and '<!--' in str(text)):
408
+ comment.extract()
409
+
410
  return {
411
  'template_type': 'html_shell',
412
  'base_url': base_url,
413
+ 'template_html': str(soup),
414
+ 'timestamp': datetime.now().isoformat()
415
  }
416
+
417
+ def _extract_database_data(self, soup: BeautifulSoup, base_url: str) -> Dict[str, Any]:
418
+ """Extract structured data from HTML."""
419
  structured = {
420
+ 'title': soup.title.string.strip() if soup.title and soup.title.string else '',
421
+ 'meta_description': '',
422
  'core_text_content': '',
423
+ 'images': [],
424
+ 'videos': [],
425
+ 'audios': [],
426
  'structured_data': [],
427
  'products': [],
428
+ 'links': [],
429
+ 'metadata': {}
430
  }
431
+
432
+ # Extract meta description
433
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
434
+ if meta_desc:
435
+ structured['meta_description'] = meta_desc.get('content', '')
436
+
437
+ # Extract JSON-LD structured data
438
  for script in soup.find_all('script', type='application/ld+json'):
439
  try:
440
+ ld_data = json.loads(script.string or '{}')
441
  structured['structured_data'].append(ld_data)
442
+
443
+ # Extract products
444
+ if isinstance(ld_data, dict):
445
+ if ld_data.get('@type') == 'Product':
446
+ structured['products'].append(ld_data)
447
+ elif ld_data.get('@graph'):
448
+ for item in ld_data.get('@graph', []):
449
+ if isinstance(item, dict) and item.get('@type') == 'Product':
450
+ structured['products'].append(item)
451
+ except (json.JSONDecodeError, TypeError) as e:
452
+ logger.debug(f"Failed to parse JSON-LD: {e}")
453
+
454
+ # Extract media
455
  for img in soup.find_all('img'):
456
+ src = img.get('src') or img.get('data-src')
457
+ if src:
458
+ structured['images'].append(urljoin(base_url, src))
459
+
460
  for video in soup.find_all('video'):
461
+ src = video.get('src') or (video.find('source') and video.find('source').get('src'))
462
+ if src:
463
+ structured['videos'].append(urljoin(base_url, src))
464
+
465
  for audio in soup.find_all('audio'):
466
+ src = audio.get('src') or (audio.find('source') and audio.find('source').get('src'))
467
+ if src:
468
+ structured['audios'].append(urljoin(base_url, src))
469
+
470
+ # Extract links
471
+ for a in soup.find_all('a', href=True):
472
+ href = a['href']
473
+ if href.startswith(('http://', 'https://')):
474
+ structured['links'].append(href)
475
+
476
+ # Extract main content
477
+ main_content_selectors = [
478
+ 'main', 'article', '[role="main"]',
479
+ '.main-content', '.content', '#content',
480
+ '.article', '.post'
481
+ ]
482
+
483
+ for selector in main_content_selectors:
484
+ main_tag = soup.select_one(selector)
485
+ if main_tag:
486
+ structured['core_text_content'] = clean(
487
+ main_tag.get_text('\n', strip=True),
488
+ lower=False,
489
+ no_line_breaks=False,
490
+ no_urls=True,
491
+ no_emails=True,
492
+ no_phone_numbers=True
493
+ )[:10000] # Limit size
494
+ break
495
+
496
  if not structured['core_text_content']:
497
+ # Fallback: extract all text
498
+ structured['core_text_content'] = clean(
499
+ soup.get_text('\n', strip=True),
500
+ lower=False,
501
+ no_line_breaks=False,
502
+ no_urls=True,
503
+ no_emails=True,
504
+ no_phone_numbers=True
505
+ )[:5000]
506
+
507
+ # Remove duplicates
508
+ structured['images'] = list(dict.fromkeys(structured['images']))[:50] # Limit to 50 images
509
+ structured['videos'] = list(dict.fromkeys(structured['videos']))
510
+ structured['audios'] = list(dict.fromkeys(structured['audios']))
511
+ structured['links'] = list(dict.fromkeys(structured['links']))[:100] # Limit to 100 links
512
+
513
  return structured
514
 
515
 
516
+ # Site Crawler with improved logic
517
  class SiteCrawler:
518
+ """Crawl website with configurable depth and limits."""
519
+
520
+ def __init__(self, processor: EnhancedURLProcessor, max_pages: int = 10, max_depth: int = 2):
521
  self.processor = processor
522
+ self.max_pages = max_pages
523
+ self.max_depth = max_depth
524
  self.crawled_urls = set()
525
+ self.results = []
526
+ self.snapshot_paths = []
527
+
528
+ def _normalize_url(self, url: str, base_url: str) -> str:
529
+ """Normalize URL by removing fragments and query parameters for crawling."""
530
+ parsed = urlparse(url)
531
+ base_parsed = urlparse(base_url)
532
+
533
+ # Ensure same domain
534
+ if parsed.netloc and parsed.netloc != base_parsed.netloc:
535
+ return None
536
+
537
+ # Remove fragments and query params for crawling
538
+ normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
539
+ return normalized.rstrip('/')
540
+
541
+ def _get_internal_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
542
+ """Extract internal links from page."""
543
  parsed_base = urlparse(base_url)
544
  internal_links = set()
545
+
546
  for a in soup.find_all('a', href=True):
547
  href = urljoin(base_url, a['href'])
548
  parsed_href = urlparse(href)
549
+
550
+ # Check if same domain
551
+ if parsed_href.netloc == parsed_base.netloc:
552
+ # Filter out non-HTML resources
553
+ if any(href.lower().endswith(ext) for ext in [
554
+ '.pdf', '.zip', '.jpg', '.jpeg', '.png', '.gif',
555
+ '.css', '.js', '.mp4', '.mp3', '.avi', '.mov'
556
+ ]):
557
+ continue
558
+
559
+ # Remove fragments
560
+ href = self._normalize_url(href, base_url)
561
+ if href:
562
  internal_links.add(href)
563
+
564
  return list(internal_links)
565
+
566
+ def crawl_site(self, start_url: str, mode: str = "Full Structured Data") -> Tuple[List[Dict], List[str]]:
567
+ """Crawl website starting from given URL."""
568
+ logger.info(f"Starting crawl from {start_url} (max pages: {self.max_pages})")
569
+
570
+ queue = [(start_url, 0)] # (url, depth)
571
+
572
  while queue and len(self.crawled_urls) < self.max_pages:
573
+ url, depth = queue.pop(0)
574
+
575
+ if url in self.crawled_urls or depth > self.max_depth:
576
  continue
577
+
578
+ logger.info(f"Crawling: {url} (depth: {depth})")
579
  self.crawled_urls.add(url)
580
+
581
+ # Fetch content
582
  content_result = self.processor.fetch_content(url)
583
+ if not content_result:
584
  continue
585
+
586
+ # Check if HTML
587
+ content_type = content_result.metadata.get('content_type', '').lower()
588
+ if 'text/html' not in content_type:
589
+ continue
590
+
591
+ # Capture snapshot if Playwright is available
592
+ snapshot_path = None
593
+ if PLAYWRIGHT_AVAILABLE:
594
+ try:
595
+ filename = f"snapshot_{len(self.crawled_urls)}_{hashlib.md5(url.encode()).hexdigest()[:8]}.png"
596
+ snapshot_path = capture_visual_snapshot(url, filename)
597
+ if snapshot_path:
598
+ self.snapshot_paths.append(snapshot_path)
599
+ except Exception as e:
600
+ logger.warning(f"Failed to capture snapshot for {url}: {e}")
601
+
602
+ # Process based on mode
603
+ raw_content = content_result.raw_content
604
+ base_url = content_result.metadata['final_url']
605
  soup = BeautifulSoup(raw_content, 'html.parser')
606
+
607
  if mode == "Extract for Template (Shell)":
608
+ structured = self.processor._create_template_shell(raw_content, base_url)
609
  elif mode == "Extract for Database (Content Only)":
610
+ structured = self.processor._extract_database_data(soup, base_url)
611
  else:
612
+ structured = self.processor._process_html_content(raw_content, base_url)
613
+
614
+ # Create result item
615
+ result_item = ProcessedItem(
616
+ source='crawl',
617
+ url=base_url,
618
+ structured=structured,
619
+ metadata=content_result.metadata,
620
+ snapshot_path=snapshot_path
621
+ )
622
+ self.results.append(asdict(result_item))
623
+
624
+ # Extract links for next level
625
+ if depth < self.max_depth:
626
+ new_links = self._get_internal_links(soup, base_url)
627
+ for link in new_links:
628
+ if link not in self.crawled_urls and len(self.crawled_urls) < self.max_pages:
629
+ queue.append((link, depth + 1))
630
+
631
+ # Be polite
632
+ time.sleep(0.5)
633
+
634
+ logger.info(f"Crawl completed. Found {len(self.results)} pages.")
635
+ return self.results, self.snapshot_paths
636
+
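# Editor's note: illustrative usage sketch, not part of the original commit; the URL is a
# placeholder. crawl_site() returns the crawled page records and any snapshot image paths.
def _example_crawl() -> Tuple[List[Dict], List[str]]:
    processor = EnhancedURLProcessor()
    crawler = SiteCrawler(processor, max_pages=5, max_depth=1)
    return crawler.crawl_site("https://example.com", mode="Full Structured Data")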
637
+
638
+ # File Processor with better archive handling
639
  class EnhancedFileProcessor:
640
+ """Process various file types including archives."""
641
+
642
+ def __init__(self, max_file_size: int = MAX_FILE_SIZE):
643
  self.max_file_size = max_file_size
644
  self.supported_extensions = {
645
  '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm', '.log',
646
  '.yml', '.yaml', '.ini', '.conf', '.cfg', '.zip', '.tar', '.gz',
647
  '.bz2', '.7z', '.rar', '.pdf', '.doc', '.docx', '.rtf', '.odt',
648
+ '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'
649
  }
650
+
651
+ def process_file(self, file_path: str) -> List[Dict]:
652
+ """Process a single file or archive."""
653
+ if not file_path or not os.path.exists(file_path):
654
  return []
655
+
656
  try:
 
657
  file_size = os.path.getsize(file_path)
658
  if file_size > self.max_file_size:
659
+ logger.warning(f"File {file_path} exceeds size limit ({file_size} > {self.max_file_size})")
660
  return []
661
+
662
+ if self._is_archive(file_path):
663
+ return self._process_archive(file_path)
664
+ else:
665
+ return self._process_single_file(file_path)
666
+
667
  except Exception as e:
668
+ logger.error(f"Error processing file {file_path}: {e}")
669
  return []
670
+
 
671
  def _is_archive(self, filepath: str) -> bool:
672
+ """Check if file is an archive."""
673
+ archive_extensions = ['.zip', '.tar', '.gz', '.bz2', '.7z', '.rar']
674
+ return any(filepath.lower().endswith(ext) for ext in archive_extensions)
675
+
676
+ def _process_single_file(self, file_path: str) -> List[Dict]:
677
+ """Process a single file."""
678
  try:
 
679
  file_stat = os.stat(file_path)
680
+ mime_type, _ = mimetypes.guess_type(file_path)
681
+ mime_type = mime_type or 'application/octet-stream'
682
+
683
  structured = {}
684
+
685
+ if 'image/' in mime_type:
686
  structured = {
687
  'media_type': 'image',
688
  'filename': os.path.basename(file_path),
689
+ 'mime_type': mime_type,
690
+ 'size_bytes': file_stat.st_size
691
  }
692
  else:
693
+ # Read file content
694
  with open(file_path, 'rb') as f:
695
  raw_bytes = f.read()
696
+
697
+ # Detect encoding
698
+ detected = chardet.detect(raw_bytes[:10000])
699
  encoding = detected['encoding'] or 'utf-8'
700
+
701
  try:
702
+ content = raw_bytes.decode(encoding, errors='replace')
703
  except (UnicodeDecodeError, LookupError):
704
+ content = raw_bytes.decode('utf-8', errors='replace')
705
+
706
+ # Parse based on file type
707
+ if 'json' in mime_type or file_path.endswith('.json'):
708
  try:
709
+ json_data = json.loads(content)
710
  structured = json_data
711
+ except json.JSONDecodeError as e:
712
+ structured = {
713
+ 'text': content[:50000],
714
+ 'parse_error': str(e)
715
+ }
716
+ elif 'html' in mime_type or file_path.endswith(('.html', '.htm')):
717
+ processor = EnhancedURLProcessor()
718
+ soup = BeautifulSoup(content, 'html.parser')
719
+ structured = processor._extract_database_data(soup, f"file://{file_path}")
720
  else:
721
+ structured = {'text': content[:100000]}
722
+
723
+ result_item = ProcessedItem(
724
+ source='file',
725
+ filename=os.path.basename(file_path),
726
+ structured=structured,
727
+ metadata={
728
+ 'file_size': file_stat.st_size,
729
+ 'mime_type': mime_type,
730
+ 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
731
+ 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
732
+ 'file_path': file_path
733
+ }
734
+ )
735
+
736
+ return [asdict(result_item)]
737
+
738
  except Exception as e:
739
+ logger.error(f"Error processing single file {file_path}: {e}")
740
  return []
741
+
742
+ def _process_archive(self, archive_path: str) -> List[Dict]:
743
+ """Extract and process files from archive."""
744
  dataset = []
745
+ temp_dir = tempfile.mkdtemp(prefix='archive_extract_')
746
+
747
  try:
748
  if zipfile.is_zipfile(archive_path):
749
  with zipfile.ZipFile(archive_path, 'r') as zip_ref:
750
+ zip_ref.extractall(temp_dir)
751
+
752
  for file_info in zip_ref.infolist():
753
+ if not file_info.is_dir():
754
+ file_path = os.path.join(temp_dir, file_info.filename)
755
+ if os.path.exists(file_path):
756
+ dataset.extend(self._process_single_file(file_path))
757
+
758
  elif tarfile.is_tarfile(archive_path):
759
  with tarfile.open(archive_path, 'r') as tar_ref:
760
+ tar_ref.extractall(temp_dir)
761
+
762
  for member in tar_ref.getmembers():
763
  if member.isfile():
764
+ file_path = os.path.join(temp_dir, member.name)
765
+ if os.path.exists(file_path):
766
+ dataset.extend(self._process_single_file(file_path))
767
+
768
+ else:
769
+ logger.warning(f"Unsupported archive format: {archive_path}")
770
+
771
  except Exception as e:
772
+ logger.error(f"Error processing archive {archive_path}: {e}")
773
+ finally:
774
+ # Cleanup
775
+ try:
776
+ import shutil
777
+ shutil.rmtree(temp_dir, ignore_errors=True)
778
+ except Exception:
779
+ pass
780
+
781
  return dataset
782
 
783
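# Editor's note: illustrative usage sketch, not part of the original commit; the file names
# are placeholders. process_file() handles both single files and archives and always
# returns a (possibly empty) list of result dicts.
def _example_process_files(paths: List[str]) -> List[Dict]:
    processor = EnhancedFileProcessor()
    records: List[Dict] = []
    for path in paths:
        records.extend(processor.process_file(path))
    return records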
 
784
+ # Data Chunker with improved chunking logic
785
  class DataChunker:
786
+ """Chunk data for QR code generation."""
787
+
788
+ def __init__(self, max_chunk_size: int = 1000):
+ # 2953 bytes is the byte-mode capacity of a version-40 QR code at error correction L;
+ # QRCodeGenerator uses ERROR_CORRECT_H (roughly 1273 bytes max), so default to a smaller
+ # chunk that also leaves room for the per-chunk metadata added below.
+ self.max_chunk_size = max_chunk_size
790
+
791
+ def chunk_data(self, data: Any) -> List[Dict]:
792
+ """Chunk data into smaller pieces for QR encoding."""
793
  try:
794
+ # Serialize data
795
+ if isinstance(data, dict):
796
  json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
797
+ elif isinstance(data, list):
798
  json_str = json.dumps(data, ensure_ascii=False)
799
+ else:
800
+ json_str = str(data)
801
+
802
+ # Calculate chunk size
803
+ encoded = json_str.encode('utf-8')
+ total_bytes = len(encoded)
+ chunk_size = self.max_chunk_size
805
+
806
+ # Create chunks
807
  chunks = []
808
+ for i in range(0, total_bytes, chunk_size):
809
+ # A boundary may split a multi-byte character; errors='ignore' silently drops the
+ # partial bytes, so non-ASCII payloads can lose characters on reassembly.
+ chunk_str = encoded[i:i + chunk_size].decode('utf-8', errors='ignore')
810
+ chunk_hash = hashlib.md5(chunk_str.encode()).hexdigest()[:8]
811
+
 
 
812
  chunk = {
813
+ "chunk_index": len(chunks) + 1,
814
+ "total_chunks": (total_bytes + chunk_size - 1) // chunk_size,
815
+ "total_length": total_bytes,
816
  "chunk_hash": chunk_hash,
817
+ "data": chunk_str,
818
+ "timestamp": datetime.now().isoformat()
819
  }
820
  chunks.append(chunk)
821
+
822
  return chunks
823
+
824
  except Exception as e:
825
  logger.error(f"Error chunking data: {e}")
826
+ return [{"error": str(e), "data": str(data)[:100]}]
827
+
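# Editor's note: illustrative sketch, not part of the original commit. It shows one way the
# chunks produced above could be reassembled, ordering by the 'chunk_index' field. Because
# chunk boundaries are byte-based, non-ASCII payloads may lose characters at the seams.
def _example_chunk_roundtrip(payload: Dict) -> str:
    chunker = DataChunker()
    chunks = chunker.chunk_data(payload)
    ordered = sorted(chunks, key=lambda c: c.get("chunk_index", 0))
    return "".join(c.get("data", "") for c in ordered)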
828
+
829
+ # QR Code Generator with styling options
830
+ class QRCodeGenerator:
831
+ """Generate QR codes with various styling options."""
832
+
833
+ def __init__(self, output_dir: Path = QR_CODES_DIR):
834
+ self.output_dir = output_dir
835
+ self.output_dir.mkdir(exist_ok=True)
836
+
837
+ def generate_stylish_qr(self, data: Union[str, Dict], filename: str,
838
+ size: int = 10, border: int = 4,
839
+ fill_color: str = "#000000",
840
+ back_color: str = "#FFFFFF",
841
+ logo_path: Optional[str] = None) -> str:
842
+ """Generate a stylish QR code."""
843
+ try:
844
+ # Prepare data
845
+ if isinstance(data, dict):
846
+ data_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
847
+ else:
848
+ data_str = str(data)
849
+
850
+ # Create QR code
851
+ qr = qrcode.QRCode(
852
+ version=None,
853
+ error_correction=qrcode.constants.ERROR_CORRECT_H, # High error correction
854
+ box_size=size,
855
+ border=border
856
+ )
857
+ qr.add_data(data_str)
858
+ qr.make(fit=True)
859
+
860
+ # Create image
861
+ qr_img = qr.make_image(fill_color=fill_color, back_color=back_color)
862
+ qr_img = qr_img.convert('RGBA')
863
+
864
+ # Add logo if provided
865
+ if logo_path and os.path.exists(logo_path):
866
+ try:
867
+ logo = Image.open(logo_path).convert('RGBA')  # ensure an alpha channel so the logo can serve as its own paste mask
+ logo_size = qr_img.size[0] // 5
+ logo = logo.resize((logo_size, logo_size), Image.Resampling.LANCZOS)
870
+
871
+ # Calculate position
872
+ pos = ((qr_img.size[0] - logo.size[0]) // 2,
873
+ (qr_img.size[1] - logo.size[1]) // 2)
874
+
875
+ # Paste logo
876
+ qr_img.paste(logo, pos, logo)
877
+ except Exception as e:
878
+ logger.warning(f"Failed to add logo: {e}")
879
+
880
+ # Save image
881
+ output_path = self.output_dir / filename
882
+ qr_img.save(output_path, 'PNG', quality=95)
883
+
884
+ logger.info(f"QR code generated: {output_path}")
885
+ return str(output_path)
886
+
887
+ except Exception as e:
888
+ logger.error(f"QR generation error: {e}")
889
+ return ""
890
+
891
+ def generate_qr_sequence(self, data: Any, combined: bool = True,
892
+ prefix: str = "qr") -> List[str]:
893
+ """Generate a sequence of QR codes for data."""
894
  chunker = DataChunker()
895
  paths = []
896
+ timestamp = int(time.time())
897
+
898
  if combined:
899
+ # Generate QR codes for combined data
900
  chunks = chunker.chunk_data(data)
901
  for i, chunk in enumerate(chunks):
902
+ filename = f'{prefix}_{timestamp}_{i+1}_of_{len(chunks)}.png'
903
+ qr_path = self.generate_stylish_qr(
904
  data=chunk,
905
  filename=filename,
906
  fill_color="#1a365d",
 
909
  if qr_path:
910
  paths.append(qr_path)
911
  else:
912
+ # Generate separate QR codes for each item
913
  if isinstance(data, list):
914
  for idx, item in enumerate(data):
915
  chunks = chunker.chunk_data(item)
916
  for chunk_idx, chunk in enumerate(chunks):
917
+ filename = f'{prefix}_item{idx+1}_{chunk_idx+1}_of_{len(chunks)}_{timestamp}.png'
918
+ qr_path = self.generate_stylish_qr(
919
  data=chunk,
920
  filename=filename,
921
+ fill_color="#2d3748",
922
  back_color="#ffffff"
923
  )
924
  if qr_path:
 
926
  else:
927
  chunks = chunker.chunk_data(data)
928
  for i, chunk in enumerate(chunks):
929
+ filename = f'{prefix}_single_{i+1}_of_{len(chunks)}_{timestamp}.png'
930
+ qr_path = self.generate_stylish_qr(
931
  data=chunk,
932
  filename=filename,
933
  fill_color="#1a365d",
 
935
  )
936
  if qr_path:
937
  paths.append(qr_path)
938
+
939
  return paths
 
 
 
940
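# Editor's note: illustrative usage sketch, not part of the original commit; the payload and
# filename are placeholders. Generated images land in output/qr_codes by default.
def _example_generate_qrs(payload: Dict) -> List[str]:
    generator = QRCodeGenerator()
    single = generator.generate_stylish_qr(payload, "example_single.png")
    sequence = generator.generate_qr_sequence(payload, combined=True, prefix="example")
    return ([single] if single else []) + sequence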
 
941
 
942
+ # Main processing functions
943
+ def capture_visual_snapshot(url: str, filename: str) -> Optional[str]:
944
+ """Capture webpage screenshot using Playwright."""
945
+ if not PLAYWRIGHT_AVAILABLE:
946
+ logger.warning("Playwright not available for screenshots")
947
+ return None
948
+
949
+ output_path = SNAPSHOTS_DIR / filename
950
+
951
+ try:
952
+ with sync_playwright() as p:
953
+ browser = p.chromium.launch(headless=True)
954
+ context = browser.new_context(
955
+ viewport={'width': 1280, 'height': 720},
956
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
957
+ )
958
+ page = context.new_page()
959
+
960
+ # Navigate with timeout
961
+ page.goto(url, wait_until='networkidle', timeout=30000)
962
+
963
+ # Take full page screenshot
964
+ page.screenshot(path=output_path, full_page=True)
965
+
966
+ browser.close()
967
+
968
+ logger.info(f"Snapshot captured: {output_path}")
969
+ return str(output_path)
970
+
971
+ except Exception as e:
972
+ logger.error(f"Failed to capture snapshot for {url}: {e}")
973
  return None
974
 
 
 
975
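# Editor's note: illustrative usage sketch, not part of the original commit; the URL is a
# placeholder. The call returns None if Playwright is unavailable or the capture fails.
def _example_snapshot() -> Optional[str]:
    return capture_visual_snapshot("https://example.com", "example_snapshot.png")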
 
976
+ def break_down_data(data: Union[Dict, List[Dict]]) -> Union[Dict, List[Dict]]:
977
+ """Break down and restructure data for better organization."""
978
+
979
+ def process_item(item: Dict) -> Dict:
980
  structured = item.get('structured', {})
981
+
982
+ # Handle template shells
983
+ if structured.get('template_type') == 'html_shell':
984
+ return item
985
+
986
+ # Ensure structured data exists
987
+ if not structured:
988
+ content = item.get('content') or item.get('raw_content', '')
989
+ if isinstance(content, str):
990
+ structured = {'text': content}
991
+ elif isinstance(content, dict):
992
+ structured = content
993
+
994
+ # Extract media
995
+ media = []
996
+ for img in structured.get('images', []):
997
+ media.append({'type': 'image', 'source': img, 'size': 'unknown'})
998
+ for vid in structured.get('videos', []):
999
+ media.append({'type': 'video', 'source': vid, 'size': 'unknown'})
1000
+ for aud in structured.get('audios', []):
1001
+ media.append({'type': 'audio', 'source': aud, 'size': 'unknown'})
1002
+
1003
+ structured['media'] = media
1004
+
1005
+ # Extract products
1006
+ if 'products' not in structured:
1007
+ structured['products'] = []
1008
+
1009
+ # Create template if products exist
1010
+ if structured['products']:
1011
+ structured['template'] = {
1012
+ 'type': 'product_catalog',
1013
+ 'item_count': len(structured['products']),
1014
+ 'items': structured['products'][:10], # Limit to 10
1015
+ 'metadata': item.get('metadata', {})
1016
+ }
1017
+
1018
+ item['structured'] = structured
1019
+ return item
1020
+
1021
+ if isinstance(data, list):
1022
+ return [process_item(item) for item in data]
1023
+ elif isinstance(data, dict):
1024
+ return process_item(data)
1025
+
1026
+ return data
1027
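# Editor's note: illustrative sketch, not part of the original commit, showing the effect of
# break_down_data() on a minimal result item; the URL and image are placeholders.
def _example_break_down() -> Dict:
    item = {
        'source': 'url',
        'url': 'https://example.com',
        'structured': {'images': ['https://example.com/a.png'], 'videos': [], 'audios': []},
        'metadata': {},
    }
    enriched = break_down_data(item)
    # enriched['structured'] now also carries 'media' (typed entries for each asset)
    # and an empty 'products' list.
    return enriched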
 
 
 
 
 
1028
 
1029
+ def package_database(results: List[Dict]) -> Optional[str]:
1030
+ """Package processed data and media into a ZIP file."""
1031
+ if not results:
1032
+ return None
1033
+
1034
+ try:
1035
+ downloader = MediaDownloader()
1036
+ updated_results = copy.deepcopy(results)
1037
+
1038
+ # Collect media URLs
1039
+ media_urls = set()
1040
+ for item in updated_results:
1041
+ structured = item.get('structured', {})
1042
+ media_urls.update(structured.get('images', []))
1043
+ media_urls.update(structured.get('videos', []))
1044
+ media_urls.update(structured.get('audios', []))
1045
+
1046
+ # Download media
1047
+ media_mapping = downloader.batch_download(list(media_urls))
1048
+
1049
+ # Update results with local paths
1050
+ for item in updated_results:
1051
+ structured = item.get('structured', {})
1052
+ for media_type in ['images', 'videos', 'audios']:
1053
+ if media_type in structured:
1054
+ new_paths = []
1055
+ for url in structured[media_type]:
1056
+ if url in media_mapping and media_mapping[url]:
1057
+ local_path = Path(media_mapping[url])
1058
+ new_paths.append(f"media/{local_path.name}")
1059
+ else:
1060
+ new_paths.append(url)
1061
+ structured[media_type] = new_paths
1062
+
1063
+ # Create ZIP file
1064
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
1065
+ zip_filename = OUTPUTS_DIR / f"database_export_{timestamp}.zip"
1066
+
1067
+ with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
1068
+ # Add data
1069
+ zf.writestr(
1070
+ 'data_export.json',
1071
+ json.dumps(updated_results, indent=2, ensure_ascii=False)
1072
+ )
1073
+
1074
+ # Add README
1075
+ readme = f"""Database Export
1076
+ Generated: {datetime.now().isoformat()}
1077
+ Items: {len(updated_results)}
1078
+ Media Files: {len(media_mapping)}
1079
+ """
1080
+ zf.writestr('README.txt', readme)
1081
+
1082
+ # Add media files
1083
+ for url, local_path in media_mapping.items():
1084
+ if local_path and os.path.exists(local_path):
1085
+ zf.write(local_path, arcname=f"media/{Path(local_path).name}")
1086
+
1087
+ logger.info(f"Database package created: {zip_filename}")
1088
+ return str(zip_filename)
1089
+
1090
+ except Exception as e:
1091
+ logger.error(f"Failed to create database package: {e}")
1092
+ return None
1093
 
1094
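# Editor's note: illustrative usage sketch, not part of the original commit. The export ZIP
# contains data_export.json, README.txt and a media/ folder with any downloaded assets.
def _example_package(results: List[Dict]) -> Optional[str]:
    zip_path = package_database(results)  # None when results is empty or packaging fails
    if zip_path:
        with zipfile.ZipFile(zip_path) as zf:
            assert 'data_export.json' in zf.namelist()
    return zip_path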
 
1095
+ # Gradio Interface
1096
  def create_modern_interface():
1097
+ """Create modern Gradio interface."""
1098
  css = """
1099
  :root {
1100
  --primary-color: #1a365d;
 
1104
  --success-color: #48bb78;
1105
  --error-color: #f56565;
1106
  --warning-color: #ed8936;
1107
+ --border-radius: 0.5rem;
1108
  }
1109
+
1110
  .gradio-container {
1111
  max-width: 1200px;
1112
+ margin: 2rem auto;
1113
+ padding: 2rem;
1114
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
1115
+ border-radius: var(--border-radius);
1116
+ box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
1117
+ }
1118
+
1119
+ .container-inner {
1120
+ background: white;
1121
+ border-radius: var(--border-radius);
1122
  padding: 2rem;
 
 
1123
  box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
1124
  }
1125
+
1126
+ h1 {
1127
+ background: linear-gradient(90deg, #667eea, #764ba2);
1128
+ -webkit-background-clip: text;
1129
+ -webkit-text-fill-color: transparent;
1130
+ margin-bottom: 1rem;
1131
+ }
1132
+
1133
  .primary-button {
1134
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
1135
+ color: white;
 
 
1136
  border: none;
1137
+ padding: 0.75rem 1.5rem;
1138
+ border-radius: var(--border-radius);
1139
+ font-weight: 600;
1140
  cursor: pointer;
1141
+ transition: transform 0.2s, box-shadow 0.2s;
1142
  }
1143
+
1144
  .primary-button:hover {
1145
+ transform: translateY(-2px);
1146
+ box-shadow: 0 10px 20px rgba(102, 126, 234, 0.4);
1147
+ }
1148
+
1149
+ .warning-box {
1150
+ background: linear-gradient(135deg, #f6d365 0%, #fda085 100%);
1151
+ padding: 1rem;
1152
+ border-radius: var(--border-radius);
1153
+ margin-bottom: 1rem;
1154
+ border-left: 4px solid #ed8936;
1155
+ }
1156
+
1157
+ .tab-nav {
1158
+ background: linear-gradient(135deg, #f7fafc 0%, #edf2f7 100%);
1159
+ border-radius: var(--border-radius);
1160
+ padding: 0.5rem;
1161
+ margin-bottom: 1rem;
1162
  }
1163
  """
1164
+
1165
  with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator", theme=gr.themes.Soft()) as interface:
1166
+
1167
+ gr.Markdown("""
1168
+ <div class="container-inner">
1169
+ <h1>🚀 Advanced Data Processor & QR Code Generator</h1>
1170
+ <p>Process URLs, files, and JSON data. Generate QR codes and export databases.</p>
1171
+ </div>
1172
  """)
1173
+
1174
+ # Warning if Playwright not available
1175
+ if not PLAYWRIGHT_AVAILABLE:
1176
+ gr.Markdown("""
1177
+ <div class="warning-box">
1178
+ ⚠️ **Playwright not installed** - Screenshots and advanced rendering disabled.<br>
1179
+ Install with: `pip install playwright && playwright install`
1180
+ </div>
1181
+ """)
1182
+
1183
+ with gr.Tabs() as tabs:
1184
+ with gr.TabItem("🌐 URL Processing"):
1185
+ url_input = gr.Textbox(
1186
+ label="Enter URLs",
1187
+ lines=5,
1188
+ placeholder="Enter one URL per line:\nhttps://example.com\nhttps://example.org",
1189
+ value=""
1190
+ )
1191
+
1192
+ with gr.TabItem("📁 File Input"):
1193
+ file_input = gr.File(
1194
+ label="Upload Files",
1195
+ file_types=["*"],
1196
+ file_count="multiple"
1197
+ )
1198
+
1199
+ with gr.TabItem("📝 JSON Input"):
1200
+ text_input = gr.TextArea(
1201
+ label="Direct JSON Input",
1202
+ lines=15,
1203
+ placeholder='{"data": "your json here"} or [{"item": 1}, {"item": 2}]',
1204
+ value=""
1205
+ )
1206
+
1207
+ # Options
1208
  with gr.Row():
1209
  extraction_mode = gr.Radio(
1210
+ label="Extraction Mode",
1211
+ choices=[
1212
+ "Full Structured Data",
1213
+ "Extract for Template (Shell)",
1214
+ "Extract for Database (Content Only)"
1215
+ ],
1216
  value="Full Structured Data",
1217
+ info="Template/Database mode with single URL triggers site crawl."
1218
  )
1219
+
1220
+ combine_data = gr.Checkbox(
1221
+ label="Combine data for sequential QR codes",
1222
+ value=True,
1223
+ info="Recommended for large datasets"
1224
+ )
1225
+
1226
+ # Buttons
1227
  with gr.Row():
1228
+ example_btn = gr.Button("📋 Load Example", variant="secondary")
1229
+ clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
1230
+ process_btn = gr.Button("⚡ Process & Generate", variant="primary", scale=2)
1231
+
1232
+ # Outputs
1233
+ output_json = gr.JSON(label="Processed Data", visible=True)
1234
+
1235
  with gr.Row():
1236
+ output_gallery = gr.Gallery(
1237
+ label="Generated QR Codes & Snapshots",
1238
+ columns=3,
1239
+ height=400,
1240
+ show_label=True
1241
+ )
1242
+
1243
+ output_database_zip = gr.File(
1244
+ label="Database Export (.zip)",
1245
+ interactive=False
1246
+ )
1247
+
1248
+ output_text = gr.Textbox(
1249
+ label="Processing Status",
1250
+ interactive=False
1251
+ )
1252
+
1253
+ # Note: gr.Progress is not a layout component; to surface progress it should be accepted
+ # as a handler parameter, e.g. def process_inputs(..., progress=gr.Progress()).
1255
+
1256
+ # Example data
1257
  def load_example():
1258
  example = {
1259
+ "name": "Example Product Catalog",
1260
  "type": "product_catalog",
1261
  "items": [
1262
+ {"id": "123", "name": "Premium Widget", "price": 299.99, "category": "Electronics"},
1263
+ {"id": "456", "name": "Basic Widget", "price": 149.99, "category": "Electronics"},
1264
+ {"id": "789", "name": "Deluxe Widget", "price": 499.99, "category": "Electronics"}
1265
  ],
1266
+ "metadata": {
1267
+ "timestamp": datetime.now().isoformat(),
1268
+ "source": "example",
1269
+ "version": "1.0"
1270
+ }
1271
  }
1272
  return json.dumps(example, indent=2)
1273
+
1274
+ def clear_inputs():
1275
+ return "", None, "", "Full Structured Data", True
1276
+
1277
+ def process_inputs(urls, files, text, mode, combine):
1278
+ """Main processing function."""
1279
+ results = []
1280
+ all_media_paths = []
1281
+ database_zip_path = None
1282
+
1283
  try:
1284
+ # Process JSON input
1285
  if text and text.strip():
1286
  try:
1287
  json_data = json.loads(text)
1288
  if isinstance(json_data, list):
1289
+ for item in json_data:
1290
+ results.append(ProcessedItem(
1291
+ source='json',
1292
+ structured=item
1293
+ ))
1294
  else:
1295
+ results.append(ProcessedItem(
1296
+ source='json',
1297
+ structured=json_data
1298
+ ))
1299
  except json.JSONDecodeError as e:
1300
+ return None, [], f"Invalid JSON: {str(e)}", None
1301
+
1302
+ # Process files
1303
  if files:
1304
+ file_processor = EnhancedFileProcessor()
1305
  for file in files:
1306
+ file_results = file_processor.process_file(file.name)
1307
  if file_results:
1308
  results.extend(file_results)
1309
+
1310
+ # Process URLs
1311
  if urls and urls.strip():
1312
+ url_processor = EnhancedURLProcessor()
1313
+ url_list = [url.strip() for url in re.split(r'[,\n]', urls) if url.strip()]
1314
+
1315
  if len(url_list) == 1 and mode != "Full Structured Data":
1316
+ # Site crawl
1317
+ crawler = SiteCrawler(url_processor, max_pages=5)
1318
  crawl_results, snapshot_paths = crawler.crawl_site(url_list[0], mode)
1319
  results.extend(crawl_results)
1320
  all_media_paths.extend(snapshot_paths)
1321
  else:
1322
+ # Single URL processing
1323
  for url in url_list:
1324
  validation = url_processor.validate_url(url)
1325
+ if validation.is_valid:
1326
  content = url_processor.fetch_content(url)
1327
+ if content:
1328
+ # Capture snapshot
1329
+ snapshot_path = None
1330
+ if PLAYWRIGHT_AVAILABLE:
1331
+ filename = f"snapshot_{hashlib.md5(url.encode()).hexdigest()[:8]}.png"
1332
+ snapshot_path = capture_visual_snapshot(url, filename)
1333
+ if snapshot_path:
1334
+ all_media_paths.append(snapshot_path)
1335
+
1336
+ # Process based on mode
1337
  if mode == "Extract for Template (Shell)":
1338
+ structured = url_processor._create_template_shell(
1339
+ content.raw_content,
1340
+ content.metadata['final_url']
1341
+ )
1342
  elif mode == "Extract for Database (Content Only)":
1343
+ soup = BeautifulSoup(content.raw_content, 'html.parser')
1344
+ structured = url_processor._extract_database_data(
1345
+ soup,
1346
+ content.metadata['final_url']
1347
+ )
1348
  else:
1349
+ structured = url_processor._process_html_content(
1350
+ content.raw_content,
1351
+ content.metadata['final_url']
1352
+ )
1353
+
1354
+ results.append(ProcessedItem(
1355
+ source='url',
1356
+ url=content.metadata['final_url'],
1357
+ structured=structured,
1358
+ metadata=content.metadata,
1359
+ snapshot_path=snapshot_path
1360
+ ))
1361
+
1362
+ # Process results
 
 
 
1363
  if results:
1364
+ # Crawl and file results are already plain dicts; only dataclass instances need asdict()
+ results_dicts = [r if isinstance(r, dict) else asdict(r) for r in results]
1365
+ processed_results = break_down_data(results_dicts)
1366
+
1367
  if mode == "Extract for Database (Content Only)":
1368
+ # Create database package
1369
+ database_zip_path = package_database(processed_results)
1370
+ status_msg = f"✅ Database package created with {len(results)} items"
1371
  else:
1372
+ # Generate QR codes
1373
+ qr_generator = QRCodeGenerator()
1374
+ qr_paths = qr_generator.generate_qr_sequence(
1375
+ processed_results,
1376
+ combined=combine,
1377
+ prefix="data_qr"
1378
+ )
1379
  all_media_paths.extend(qr_paths)
1380
+ status_msg = f"Processed {len(results)} items, generated {len(qr_paths)} QR codes"
1381
+
1382
+ return processed_results, all_media_paths, status_msg, database_zip_path
1383
  else:
1384
+ return None, [], "No valid content found in inputs", None
1385
+
1386
  except Exception as e:
1387
  logger.error(f"Processing error: {e}")
1388
+ return None, [], f" Error: {str(e)}", None
1389
+
1390
+ # Connect events
1391
  example_btn.click(load_example, outputs=[text_input])
1392
+ clear_btn.click(clear_inputs, outputs=[url_input, file_input, text_input, extraction_mode, combine_data])
1393
+
1394
  process_btn.click(
1395
  process_inputs,
1396
+ inputs=[url_input, file_input, text_input, extraction_mode, combine_data],
1397
+ outputs=[output_json, output_gallery, output_text, output_database_zip]
1398
  )
1399
+
1400
+ # Footer
1401
  gr.Markdown("""
1402
+ <div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e2e8f0;">
1403
+ <h3>📚 Features</h3>
1404
+ <ul>
1405
+ <li><strong>URL Processing</strong>: Extract structured data from web pages</li>
1406
+ <li><strong>File Support</strong>: Process various file formats including archives</li>
1407
+ <li><strong>Site Crawling</strong>: Limited crawl for template/database extraction</li>
1408
+ <li><strong>QR Generation</strong>: Create QR codes for data sharing</li>
1409
+ <li><strong>Database Export</strong>: Package data and media for deployment</li>
1410
+ </ul>
1411
+ </div>
1412
  """)
1413
+
1414
  return interface
1415
 
1416
 
1417
  def main():
1418
+ """Main entry point."""
1419
  try:
1420
+ # Initialize mimetypes
1421
  mimetypes.init()
1422
+
1423
+ # Create and launch interface
1424
  interface = create_modern_interface()
1425
  interface.launch(
1426
+ server_name="0.0.0.0",
1427
+ server_port=7860,
1428
  share=False,
1429
  debug=False,
1430
  show_error=True,
1431
+ show_api=False,
1432
+ favicon_path=None
1433
  )
1434
  except Exception as e:
1435
  logger.error(f"Application startup error: {e}")