Spaces:
Sleeping
Sleeping
File size: 23,277 Bytes
045423f 76f26db 045423f 76f26db 045423f 443869b 045423f 443869b b6dcd96 045423f 30ea5f3 045423f 883f8b8 2c0c380 045423f 9c35b0c 8809763 045423f 76f26db 045423f 76f26db 045423f 76f26db 045423f 76f26db 045423f 443869b 045423f 883f8b8 443869b 045423f 883f8b8 045423f 883f8b8 045423f 883f8b8 045423f 883f8b8 045423f 883f8b8 443869b 045423f 443869b 045423f 443869b 045423f 883f8b8 045423f 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 76f26db 883f8b8 443869b 883f8b8 443869b 883f8b8 76f26db 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 76f26db 443869b 59f237a 883f8b8 76f26db 883f8b8 045423f 883f8b8 443869b 883f8b8 443869b 883f8b8 443869b 883f8b8 76f26db 883f8b8 045423f 883f8b8 76f26db 883f8b8 045423f 883f8b8 b6dcd96 045423f 9c35b0c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 |
import os
import json
import requests
import traceback
from flask import Flask, request, jsonify
from bs4 import BeautifulSoup
import logging
import cssutils
import re
import urllib.parse
from PIL import Image
from io import BytesIO
# Flask application object; the route decorators below register on it.
app = Flask(__name__)
# Setup logging: INFO level on the root logger, plus a module-level logger
# used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Raise cssutils' own log threshold to CRITICAL so its parsing warnings
# are not emitted.
cssutils.log.setLevel(logging.CRITICAL) # Suppress CSS parsing warnings
@app.route('/')
def home():
    """Serve the static landing page that points users at /api/convert."""
    landing_page = """
<!DOCTYPE html>
<html>
<head>
<title>Website Converter</title>
</head>
<body>
<h1>Welcome to Website Converter</h1>
<p>Use the /api/convert endpoint to convert websites to structured data.</p>
</body>
</html>
"""
    return landing_page
@app.route('/api/convert', methods=['POST'])
def convert_website():
    """Fetch a web page and return its elements as structured JSON.

    Expects a JSON object body with:
        url (str, required): page to convert; ``https://`` is prepended
            when no http/https scheme is present.
        viewport_width (int, optional): echoed back in the response,
            defaults to 1440.

    Returns:
        200 with ``status``, ``url``, ``viewport_width``,
        ``viewport_height`` (estimated from the extracted elements, never
        below 900) and ``elements``; 400 for invalid input; 500 when the
        page cannot be fetched or processing fails.
    """
    try:
        # silent=True yields None for a missing/invalid JSON body instead of
        # raising, so a bad request is answered with 400 rather than 500.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({"error": "No data provided"}), 400
        if not isinstance(data, dict):
            return jsonify({"error": "JSON body must be an object"}), 400

        url = data.get('url')
        if not url:
            return jsonify({"error": "URL is required"}), 400
        if not isinstance(url, str):
            return jsonify({"error": "URL must be a string"}), 400

        # Add a scheme if not present. Checking the full prefixes avoids
        # treating hosts such as "httpbin.org" as already having one.
        url = url.strip()
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            viewport_width = int(data.get('viewport_width', 1440))
        except (TypeError, ValueError):
            return jsonify({"error": "viewport_width must be an integer"}), 400
        if viewport_width <= 0:
            return jsonify({"error": "viewport_width must be positive"}), 400
        viewport_height = 900  # Default height

        logger.info(f"Converting website: {url} with viewport width: {viewport_width}")

        try:
            # NOTE(review): the URL is caller-controlled and fetched
            # server-side (SSRF risk); consider restricting target hosts.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml',
                'Accept-Language': 'en-US,en;q=0.9',
            }
            page_response = requests.get(url, headers=headers, timeout=20)
            page_response.raise_for_status()  # Raise an exception for HTTP errors

            soup = BeautifulSoup(page_response.text, 'html.parser')
            all_styles = extract_all_css(soup, url)

            # NOTE(review): viewport_width is only echoed back; the element
            # extractor is called without it and lays out at its own width.
            elements = extract_elements_improved(soup, all_styles)

            # Estimated page height: lowest element bottom edge, but never
            # less than the default viewport height.
            lowest_bottom = max(
                (item.get('y', 0) + item.get('height', 0) for item in elements),
                default=0,
            )
            estimated_height = max(viewport_height, lowest_bottom)

            payload = {
                "status": "success",
                "url": url,
                "viewport_width": viewport_width,
                "viewport_height": estimated_height,
                "elements": elements
            }
            return jsonify(payload)
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error: {str(e)}")
            return jsonify({"error": f"Failed to fetch website: {str(e)}"}), 500
    except Exception as e:
        # Top-level boundary: log with traceback and report the failure.
        logger.exception(f"Error: {str(e)}")
        # NOTE(review): returning the traceback exposes internals to the
        # caller; kept only so existing clients see the same response shape.
        return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500
def extract_all_css(soup, base_url):
    """Collect selector -> declarations from style tags and linked sheets.

    Inline ``style`` attributes are deliberately not registered here: they
    belong to a single element, and compute_element_style() already applies
    each element's own inline style last. Registering them under the
    element's class or tag selector made one element's inline style apply
    to every other element sharing that class or tag.

    Args:
        soup: parsed document (BeautifulSoup).
        base_url: URL of the page, used to resolve relative stylesheet hrefs.

    Returns:
        dict mapping selector text to a dict of property -> value, with
        the defaults from add_default_styles() applied at the end.
    """
    all_styles = {}

    # 1. <style> tags, in document order
    for style_tag in soup.find_all('style'):
        if style_tag.string:
            _merge_css(all_styles, parse_css(style_tag.string))

    # 2. Linked stylesheets
    for link in soup.find_all('link', rel='stylesheet'):
        href = link.get('href')
        if not href:
            continue
        # urljoin leaves absolute URLs untouched and resolves relative and
        # protocol-relative ones against the page URL.
        href = urllib.parse.urljoin(base_url, href)
        try:
            css_response = requests.get(href, timeout=10)
            if css_response.ok:
                _merge_css(all_styles, parse_css(css_response.text))
        except Exception as e:
            # Best effort: a stylesheet that cannot be fetched is skipped.
            logger.warning(f"Failed to fetch stylesheet {href}: {e}")

    # 3. Defaults for common elements
    add_default_styles(all_styles)
    return all_styles


def _merge_css(target, new_rules):
    """Merge new_rules into target per selector, later declarations winning.

    A plain ``target.update(new_rules)`` would replace a selector's whole
    declaration dict and drop properties set by earlier sheets.
    """
    for selector, declarations in new_rules.items():
        target.setdefault(selector, {}).update(declarations)
def parse_inline_style(style_text):
    """Parse an inline style string into a dictionary.

    Args:
        style_text: contents of a ``style`` attribute, e.g.
            ``"color: red; font-size: 12px"``. Falsy input gives ``{}``.

    Returns:
        dict mapping lower-cased property names to stripped values.
        Declarations without a colon, name or value are skipped.
    """
    style_dict = {}
    if not style_text:
        return style_dict
    for declaration in _split_declarations(style_text):
        prop, colon, value = declaration.partition(':')
        if not colon:
            continue
        prop = prop.strip().lower()
        value = value.strip()
        if prop and value:
            style_dict[prop] = value
    return style_dict


def _split_declarations(style_text):
    """Split on ';' while ignoring semicolons inside quotes or parentheses.

    A plain ``split(';')`` cuts values such as
    ``url(data:image/png;base64,...)`` in half.
    """
    parts = []
    current = []
    depth = 0      # nesting level of parentheses
    quote = None   # the quote character we are inside, if any
    for ch in style_text:
        if quote:
            if ch == quote:
                quote = None
        elif ch in ('"', "'"):
            quote = ch
        elif ch == '(':
            depth += 1
        elif ch == ')':
            depth = max(0, depth - 1)
        elif ch == ';' and depth == 0:
            parts.append(''.join(current))
            current = []
            continue
        current.append(ch)
    parts.append(''.join(current))
    return parts
def parse_css(css_text):
    """Parse CSS text into a dictionary of selectors and styles.

    Only plain style rules are handled (not @media, etc.). A grouped rule
    such as ``h1, h2 { ... }`` is registered once per individual selector,
    because the style lookup in this file matches exact selector keys and
    would never find the combined ``"h1, h2"`` key.

    Args:
        css_text: stylesheet source.

    Returns:
        dict mapping selector text to a dict of lower-cased property name
        -> value. Parsing errors are logged and whatever was collected so
        far is returned.
    """
    styles = {}
    try:
        sheet = cssutils.parseString(css_text)
        for rule in sheet:
            if rule.type != rule.STYLE_RULE:
                continue
            declarations = {
                prop.name.lower(): prop.value
                for prop in rule.style
                if prop.name and prop.value
            }
            # Each selector gets its own dict so later merges into one
            # selector cannot alter its siblings from the same rule.
            for selector in rule.selectorList:
                styles.setdefault(selector.selectorText, {}).update(declarations)
    except Exception as e:
        # Best effort: malformed CSS must not abort the conversion.
        logger.warning(f"CSS parsing error: {e}")
    return styles
# Fallback declarations for common HTML elements.
_DEFAULT_ELEMENT_STYLES = {
    'body': {
        'margin': '0px',
        'font-family': 'Arial, sans-serif',
        'color': '#000000',
        'font-size': '16px',
    },
    'h1': {'font-size': '32px', 'font-weight': 'bold', 'margin': '21.44px 0'},
    'h2': {'font-size': '24px', 'font-weight': 'bold', 'margin': '19.92px 0'},
    'h3': {'font-size': '18px', 'font-weight': 'bold', 'margin': '18.72px 0'},
    'h4': {'font-size': '16px', 'font-weight': 'bold', 'margin': '21.28px 0'},
    'a': {'color': '#0000EE', 'text-decoration': 'underline'},
    'button': {
        'background-color': '#F0F0F0',
        'border': '1px solid #CCCCCC',
        'padding': '4px 8px',
        'border-radius': '2px',
    },
    'input': {
        'border': '1px solid #CCCCCC',
        'padding': '2px 4px',
    },
}


def add_default_styles(styles):
    """Add default styles for common HTML elements.

    Defaults only fill in properties that ``styles`` does not already
    declare for the selector; declarations already present are never
    overwritten.

    Args:
        styles: dict of selector -> {property: value}; modified in place.

    Returns:
        None.
    """
    for selector, defaults in _DEFAULT_ELEMENT_STYLES.items():
        declared = styles.setdefault(selector, {})
        for prop, value in defaults.items():
            declared.setdefault(prop, value)
def extract_elements_improved(soup, styles, viewport_width=1440):
    """Extract top-level page elements, stacked vertically.

    Args:
        soup: parsed document (BeautifulSoup).
        styles: selector -> declarations mapping used to style elements.
        viewport_width: width given to each top-level block; defaults to
            1440, the value that used to be hard-coded here.

    Returns:
        list of element dicts. Empty when the document has no <body>.
        When no element could be extracted but the body has text, a single
        text element holding that text is returned.
    """
    elements = []
    body = soup.find('body')
    if not body:
        return elements

    # Prefer structural blocks that are direct children of <body>.
    # NOTE(review): when such blocks exist, other direct children of <body>
    # (e.g. a bare <h1> or <p>) are not processed at all.
    main_blocks = body.find_all(
        ['div', 'header', 'main', 'nav', 'footer', 'section'], recursive=False
    )
    if not main_blocks:  # If no main blocks, use all direct children
        main_blocks = body.find_all(recursive=False)

    y_position = 0
    for block in main_blocks:
        block_data = process_element_with_styles(block, 0, y_position, viewport_width, styles)
        if block_data:
            elements.append(block_data)
            y_position += block_data['height'] + 10  # Add spacing between blocks

    # If no elements were found, fall back to the raw body text
    body_text = body.text.strip()
    if not elements and body_text:
        elements.append({
            'type': 'text',
            'tagName': 'p',
            'x': 0,
            'y': 0,
            'width': viewport_width,
            'height': 100,
            'content': body_text,
            'style': {
                'color': '#000000',
                'fontSize': '16px',
                'fontFamily': 'Arial, sans-serif'
            }
        })
    return elements
# Child tags that are laid out inside a container element.
_LAYOUT_CHILD_TAGS = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img',
                      'span', 'a', 'button', 'input', 'form']


def process_element_with_styles(element, x_position, y_position, parent_width, styles, depth=0):
    """Process a single HTML element with its styles.

    Args:
        element: the tag to convert (BeautifulSoup tag).
        x_position, y_position: position assigned to this element.
        parent_width: width available from the parent, used for default
            and percentage-based sizes.
        styles: selector -> declarations mapping.
        depth: current recursion depth; processing stops beyond 10.

    Returns:
        dict describing the element (type, tagName, x, y, width, height,
        style and, for containers, ``children``), or None when the element
        is skipped (too deep, no tag name, or a non-visual tag).
        Children carry coordinates relative to their container: x starts
        at the container's left padding and y at 0.
    """
    if depth > 10:  # Limit recursion depth
        return None

    # Nodes without a tag name (e.g. text nodes) cannot be processed.
    tag_name = getattr(element, 'name', None)
    if not tag_name:
        return None
    tag_name = tag_name.lower()
    if tag_name in ('script', 'style', 'meta', 'link', 'noscript'):
        return None

    elem_classes = element.get('class', [])
    elem_id = element.get('id')
    computed_style = compute_element_style(element, tag_name, elem_id, elem_classes, styles)

    element_data = {
        'type': get_element_type(tag_name),
        'tagName': tag_name,
        'x': x_position,
        'y': y_position,
        'width': calc_element_width(computed_style, parent_width),
        'height': 50,  # Default height, adjusted below where possible
        'style': {}
    }
    if elem_id:
        element_data['id'] = elem_id
    if elem_classes:
        if isinstance(elem_classes, list):
            element_data['className'] = ' '.join(elem_classes)
        else:
            element_data['className'] = elem_classes

    element_type = element_data['type']
    if element_type == 'text':
        text_content = element.get_text().strip()
        element_data['content'] = text_content
        extract_text_styles(element_data, computed_style)
        element_data['height'] = calc_text_height(text_content, computed_style)
    elif element_type == 'image':
        element_data['src'] = element.get('src', '')
        element_data['alt'] = element.get('alt', '')
        element_data['height'] = 200  # Default height
        if 'height' in computed_style:
            try:
                element_data['height'] = parse_dimension(computed_style['height'], parent_width)
            except (TypeError, ValueError):
                element_data['height'] = 200
        extract_background_styles(element_data, computed_style)
    elif element_type in ('div', 'container', 'rectangle'):
        extract_container_styles(element_data, computed_style)
        children = _layout_children(
            element, element_data['width'], parent_width, computed_style, styles, depth
        )
        if children:
            element_data['children'] = children
            # Child y values are relative to this container, so the content
            # height is the lowest child bottom edge; the container's own y
            # must not be subtracted. Using the maximum also covers row
            # layouts, where every child shares y == 0.
            content_bottom = max(child['y'] + child['height'] for child in children)
            element_data['height'] = content_bottom + 10

    # Apply common styles (border, opacity, etc)
    apply_common_styles(element_data, computed_style)

    # If height is unreasonably small, set a minimum
    if element_data['height'] < 10:
        element_data['height'] = 10
    return element_data


def _layout_children(element, container_width, parent_width, computed_style, styles, depth):
    """Convert and position the direct children of a container element."""
    padding_left = parse_dimension(computed_style.get('padding-left', '0'), parent_width)
    padding_right = parse_dimension(computed_style.get('padding-right', '0'), parent_width)
    available_width = container_width - (padding_left + padding_right)

    # Simplified flex handling: only an explicit "flex-direction: row"
    # places children side by side.
    # NOTE(review): CSS defaults flex-direction to row; a flex container
    # without an explicit direction is still stacked vertically here.
    horizontal = (
        computed_style.get('display') == 'flex'
        and computed_style.get('flex-direction') == 'row'
    )

    children = []
    child_x = padding_left
    child_y = 0
    for child in element.find_all(_LAYOUT_CHILD_TAGS, recursive=False):
        child_data = process_element_with_styles(
            child, child_x, child_y, available_width, styles, depth + 1
        )
        if not child_data:
            continue
        children.append(child_data)
        if horizontal:
            child_x += child_data['width'] + 5
        else:
            child_y += child_data['height'] + 5
    return children
def get_element_type(tag_name):
    """Map an HTML tag name to the element type used in the output."""
    text_tags = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'a', 'label')
    if tag_name in text_tags:
        return 'text'
    if tag_name == 'img':
        return 'image'
    # Buttons and inputs are represented as rectangles; structural tags
    # and every tag not listed above fall back to 'div'.
    return 'rectangle' if tag_name in ('button', 'input') else 'div'
def compute_element_style(element, tag_name, elem_id, elem_classes, styles):
    """Build an element's style by layering matching rules.

    Rules are applied in this order, later layers overriding earlier ones:
    tag selector, class selectors (in attribute order), id selector, and
    finally the element's own inline ``style`` attribute.
    """
    # Turn the class attribute (list or single string) into selector keys
    if isinstance(elem_classes, list):
        class_selectors = [f".{name}" for name in elem_classes]
    elif elem_classes:
        class_selectors = [f".{elem_classes}"]
    else:
        class_selectors = []

    lookup_order = [tag_name, *class_selectors]
    if elem_id:
        lookup_order.append(f"#{elem_id}")

    merged = {}
    for selector in lookup_order:
        if selector in styles:
            merged.update(styles[selector])

    # Inline declarations override everything collected so far
    raw_inline = element.get('style')
    if raw_inline:
        merged.update(parse_inline_style(raw_inline))
    return merged
def parse_dimension(value, container_size):
    """Parse a CSS dimension (px, %, em, rem, vh, vw, unitless) to a number.

    Args:
        value: dimension text such as ``"10px"``, ``"50%"`` or ``"auto"``.
        container_size: reference size for percentages and ``auto``.

    Returns:
        The size as a number. em and rem are approximated as 16px, vh
        assumes a 900px viewport height and vw a 1440px viewport width.
        ``auto`` returns ``container_size``. Missing, non-string or
        unparseable input returns 0.
    """
    if not value or not isinstance(value, str):
        return 0
    value = value.strip().lower()

    # Handle auto (use container size)
    if value == 'auto':
        return container_size

    # 'rem' must be tested before 'em': every rem value also ends in 'em',
    # and checking 'em' first made all rem values parse as 0.
    for suffix, pixels_per_unit in (('rem', 16), ('em', 16), ('px', 1)):
        if value.endswith(suffix):
            number = _parse_number(value[:-len(suffix)])
            return 0 if number is None else number * pixels_per_unit

    if value.endswith('%'):
        number = _parse_number(value[:-1])
        if number is None:
            return 0
        try:
            return container_size * (number / 100)
        except TypeError:  # container_size is not numeric
            return 0

    if value.endswith('vh'):
        number = _parse_number(value[:-2])
        return 0 if number is None else (number / 100) * 900

    if value.endswith('vw'):
        number = _parse_number(value[:-2])
        return 0 if number is None else (number / 100) * 1440

    # Unitless numbers, including decimals and signed values
    number = _parse_number(value)
    return 0 if number is None else number


def _parse_number(text):
    """Return text as a finite float, or None when it is not one."""
    try:
        number = float(text)
    except ValueError:
        return None
    # Rejects inf and nan (nan fails both comparisons).
    if not float('-inf') < number < float('inf'):
        return None
    return number
def calc_element_width(style, parent_width):
    """Calculate element width based on its style.

    Args:
        style: computed style dict of the element.
        parent_width: width available from the parent.

    Returns:
        The explicit ``width`` when it resolves to a length, otherwise
        ``parent_width``; in both cases limited by ``max-width`` when that
        resolves to a length. Values that cannot be resolved (for example
        ``max-width: none`` or ``width: fit-content``) are ignored instead
        of collapsing the element to zero width.
    """
    width = _resolve_length(style.get('width'), parent_width)
    if width is None:
        width = parent_width

    max_width = _resolve_length(style.get('max-width'), parent_width)
    if max_width is not None:
        width = min(width, max_width)
    return width


def _resolve_length(value, parent_width):
    """Parse value via parse_dimension; None when missing or unusable.

    parse_dimension reports unparseable input as 0, so a 0 result is only
    trusted when the text itself is a zero literal such as ``0`` or ``0px``.
    """
    if value is None:
        return None
    parsed = parse_dimension(value, parent_width)
    if parsed == 0:
        is_zero_literal = (
            isinstance(value, str)
            and value.strip().lstrip('+-').startswith(('0', '.0'))
        )
        if not is_zero_literal:
            return None
    return parsed
def calc_text_height(text, style):
    """Estimate the rendered height of a text block.

    Args:
        text: the text content.
        style: computed style dict; ``font-size`` and ``line-height`` are
            read when present.

    Returns:
        Height as an int, never below 20. The estimate assumes roughly 70
        characters per line and counts whole lines.
    """
    if not text:
        return 20

    font_size = _font_size_px(style.get('font-size'))
    line_height = _line_height_factor(style.get('line-height'), font_size)

    # A partially filled line still occupies a full line, so round up
    # (integer ceiling division).
    chars_per_line = 70  # Rough estimate
    num_lines = max(1, -(-len(text) // chars_per_line))

    return max(20, int(font_size * line_height * num_lines))


def _finite_float(text):
    """Return text as a finite float, or None when it is not one."""
    try:
        number = float(text)
    except ValueError:
        return None
    if not float('-inf') < number < float('inf'):
        return None
    return number


def _font_size_px(value, default=16):
    """Resolve a font-size value (px, em, rem) to pixels; default otherwise."""
    if not isinstance(value, str):
        return default
    value = value.strip().lower()
    # 'rem' before 'em': a rem value also ends with 'em'. Both use 16px.
    for suffix, pixels_per_unit in (('rem', 16), ('em', 16), ('px', 1)):
        if value.endswith(suffix):
            number = _finite_float(value[:-len(suffix)])
            if number is None or number < 0:
                return default
            return number * pixels_per_unit
    return default


def _line_height_factor(value, font_size, default=1.2):
    """Resolve line-height to a multiple of the font size; default otherwise."""
    if not isinstance(value, str):
        return default
    value = value.strip().lower()

    absolute_px = None
    factor = None
    if value.endswith('px'):
        absolute_px = _finite_float(value[:-2])
    elif value.endswith('rem'):
        rem = _finite_float(value[:-3])
        absolute_px = None if rem is None else rem * 16
    elif value.endswith('em'):
        factor = _finite_float(value[:-2])
    elif value.endswith('%'):
        percent = _finite_float(value[:-1])
        factor = None if percent is None else percent / 100
    else:
        # Unitless multiplier; keywords such as "normal" fall to the default
        factor = _finite_float(value)

    if absolute_px is not None and font_size > 0:
        return absolute_px / font_size
    if factor is not None:
        return factor
    return default
def extract_text_styles(element_data, style):
    """Copy text-related styles from a computed style into element_data.

    Always sets ``color``, ``fontSize`` and ``fontWeight`` (falling back to
    black, a heading-dependent size, and bold for headings / normal
    otherwise); ``fontFamily``, ``textAlign`` and ``textDecoration`` are
    copied only when present.

    Args:
        element_data: element dict whose ``style`` dict is updated in
            place; ``tagName`` selects the heading fallbacks.
        style: computed style dict.
    """
    out = element_data['style']
    heading_level = _heading_level(element_data.get('tagName', ''))

    # Text color
    out['color'] = style['color'] if 'color' in style else '#000000'

    # Font size: h1 falls back to 32px and each further level is 4px smaller
    if 'font-size' in style:
        out['fontSize'] = style['font-size']
    elif heading_level is not None:
        out['fontSize'] = f"{32 - (heading_level - 1) * 4}px"
    else:
        out['fontSize'] = '16px'

    # Font weight
    if 'font-weight' in style:
        out['fontWeight'] = style['font-weight']
    else:
        out['fontWeight'] = 'bold' if heading_level is not None else 'normal'

    # Optional properties, copied only when present
    for css_name, out_key in (
        ('font-family', 'fontFamily'),
        ('text-align', 'textAlign'),
        ('text-decoration', 'textDecoration'),
    ):
        if css_name in style:
            out[out_key] = style[css_name]


def _heading_level(tag_name):
    """Return 1-6 for 'h1'..'h6', None for any other tag name.

    Checking only for a leading 'h' treated tags like 'header' or 'hr' as
    headings and crashed when converting their second character to int.
    """
    if isinstance(tag_name, str) and len(tag_name) == 2 \
            and tag_name[0] == 'h' and tag_name[1] in '123456':
        return int(tag_name[1])
    return None
def extract_container_styles(element_data, style):
    """Copy background colour, display mode and flex settings to the output.

    The flex properties are only copied when ``display`` is ``flex``.
    """
    always_copied = (
        ('background-color', 'backgroundColor'),
        ('display', 'display'),
    )
    flex_only = (
        ('flex-direction', 'flexDirection'),
        ('justify-content', 'justifyContent'),
        ('align-items', 'alignItems'),
    )

    for css_name, out_key in always_copied:
        if css_name in style:
            element_data['style'][out_key] = style[css_name]

    # Nothing more to do unless this is a flex container
    if 'display' not in style or style['display'] != 'flex':
        return

    for css_name, out_key in flex_only:
        if css_name in style:
            element_data['style'][out_key] = style[css_name]
def extract_background_styles(element_data, style):
    """Copy background colour and background image URL to the output.

    ``background-image`` is only copied when it is a single ``url(...)``
    value; the stored value is the bare URL with surrounding whitespace
    and quotes removed. Other values (gradients, non-strings) are ignored.

    Args:
        element_data: element dict whose ``style`` dict is updated in place.
        style: computed style dict.
    """
    if 'background-color' in style:
        element_data['style']['backgroundColor'] = style['background-color']

    bg_image = style.get('background-image')
    if not isinstance(bg_image, str):
        return
    bg_image = bg_image.strip()
    if bg_image.startswith('url(') and bg_image.endswith(')'):
        # Strip whitespace before the quotes: "url( 'a.png' )" is valid
        # CSS, and the quotes are not outermost until the padding is gone.
        image_url = bg_image[4:-1].strip().strip('"\'')
        element_data['style']['backgroundImage'] = image_url
def apply_common_styles(element_data, style):
    """Copy styles shared by every element type into element_data['style'].

    The ``border`` shorthand wins over per-side borders: the four
    ``border-<side>`` values are only copied when no shorthand is present.
    Border radius, opacity, visibility and box shadow are copied whenever
    they are set.
    """
    if 'border' not in style:
        # No shorthand: fall back to whichever individual sides are set
        for side_key in ('border-top', 'border-right', 'border-bottom', 'border-left'):
            if side_key in style:
                element_data['style'][side_key] = style[side_key]
    else:
        element_data['style']['border'] = style['border']

    simple_copies = (
        ('border-radius', 'borderRadius'),
        ('opacity', 'opacity'),
        ('visibility', 'visibility'),
        ('box-shadow', 'boxShadow'),
    )
    for css_name, out_key in simple_copies:
        if css_name in style:
            element_data['style'][out_key] = style[css_name]
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from the PORT environment
    # variable and falls back to 7860 when it is not set.
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)