"""Flask service that converts a web page into a simplified layout description.

The page is fetched with ``requests``, parsed with BeautifulSoup, and its CSS
(style tags, linked stylesheets, inline styles) is approximated with a very
small cascade. No browser is involved, so all geometry is an estimate.
"""

import os
import json
import logging
import re
import traceback
import urllib.parse
from io import BytesIO

import cssutils
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify
from PIL import Image

app = Flask(__name__)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
cssutils.log.setLevel(logging.CRITICAL)  # Suppress CSS parsing warnings

# Viewport assumed whenever the caller does not supply one (also used to
# resolve vw/vh units, which have no other reference in this estimator).
DEFAULT_VIEWPORT_WIDTH = 1440
DEFAULT_VIEWPORT_HEIGHT = 900

# Pixel size assumed for 1em / 1rem.
BASE_FONT_SIZE = 16

# Tags that never produce a visual element.
_SKIPPED_TAGS = frozenset(['script', 'style', 'meta', 'link', 'noscript'])

# Direct children that are considered when walking into a container.
_CHILD_TAGS = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'span',
               'a', 'button', 'input', 'form']

_TEXT_TAGS = frozenset(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'a', 'label'])
_BLOCK_TAGS = frozenset(['div', 'section', 'article', 'header', 'footer', 'main', 'form'])

# "<number><optional unit>"; "rem" is listed before "em" for readability only,
# fullmatch() makes the alternation order irrelevant.
_DIMENSION_RE = re.compile(
    r'([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)\s*(px|rem|em|vh|vw|%)?'
)
_HEADING_RE = re.compile(r'h([1-6])')

# Browser-like fallbacks, applied only where the page itself is silent.
_DEFAULT_STYLES = {
    'body': {
        'margin': '0px',
        'font-family': 'Arial, sans-serif',
        'color': '#000000',
        'font-size': '16px',
    },
    'h1': {'font-size': '32px', 'font-weight': 'bold', 'margin': '21.44px 0'},
    'h2': {'font-size': '24px', 'font-weight': 'bold', 'margin': '19.92px 0'},
    'h3': {'font-size': '18px', 'font-weight': 'bold', 'margin': '18.72px 0'},
    'h4': {'font-size': '16px', 'font-weight': 'bold', 'margin': '21.28px 0'},
    'a': {'color': '#0000EE', 'text-decoration': 'underline'},
    'button': {
        'background-color': '#F0F0F0',
        'border': '1px solid #CCCCCC',
        'padding': '4px 8px',
        'border-radius': '2px',
    },
    'input': {
        'border': '1px solid #CCCCCC',
        'padding': '2px 4px',
    },
}


@app.route('/')
def home():
    """Return a short usage hint for the service root."""
    return """
Use the /api/convert endpoint to convert websites to structured data.
"""


@app.route('/api/convert', methods=['POST'])
def convert_website():
    """Fetch the requested URL and return its estimated element layout.

    Expects a JSON object with ``url`` (required) and ``viewport_width``
    (optional integer, default 1440).

    Returns:
        A JSON response with ``status``, ``url``, ``viewport_width``,
        ``viewport_height`` and ``elements`` on success; ``{"error": ...}``
        with status 400 for invalid input and 500 for fetch/processing
        failures.
    """
    try:
        # silent=True: a missing or non-JSON body is a client error (400),
        # not an exception that ends up as a 500.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            return jsonify({"error": "No data provided"}), 400

        url = data.get('url')
        if not url or not isinstance(url, str):
            return jsonify({"error": "URL is required"}), 400
        url = url.strip()

        # Add a scheme if not present. Checking the full "scheme://" prefix
        # keeps host names that merely begin with "http" from being skipped.
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            viewport_width = int(data.get('viewport_width', DEFAULT_VIEWPORT_WIDTH))
        except (TypeError, ValueError):
            return jsonify({"error": "viewport_width must be an integer"}), 400
        if viewport_width <= 0:
            return jsonify({"error": "viewport_width must be positive"}), 400

        viewport_height = DEFAULT_VIEWPORT_HEIGHT  # Default height

        logger.info("Converting website: %s with viewport width: %s", url, viewport_width)

        try:
            # NOTE(security): the URL is caller-controlled and fetched
            # server-side (SSRF risk). Restrict reachable hosts at the
            # network layer or validate the target before deploying publicly.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml',
                'Accept-Language': 'en-US,en;q=0.9',
            }
            page = requests.get(url, headers=headers, timeout=20)
            page.raise_for_status()  # Raise an exception for HTTP errors

            soup = BeautifulSoup(page.text, 'html.parser')

            all_styles = extract_all_css(soup, url)
            elements = extract_elements_improved(soup, all_styles, viewport_width)

            # Page height is at least the viewport, otherwise the lowest
            # bottom edge among the top-level elements.
            lowest_edge = max(
                (item.get('y', 0) + item.get('height', 0) for item in elements),
                default=0,
            )
            estimated_height = max(viewport_height, lowest_edge)

            payload = {
                "status": "success",
                "url": url,
                "viewport_width": viewport_width,
                "viewport_height": estimated_height,
                "elements": elements,
            }
            return jsonify(payload)

        except requests.exceptions.RequestException as e:
            logger.error("Request error: %s", e)
            return jsonify({"error": f"Failed to fetch website: {str(e)}"}), 500

    except Exception as e:
        logger.error("Error: %s", e)
        logger.error(traceback.format_exc())
        # NOTE(security): the traceback is returned to the client to keep the
        # existing response shape; consider dropping it outside of debugging.
        return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500


def _merge_rules(target, rules):
    """Merge ``rules`` (selector -> declarations) into ``target`` per property."""
    for selector, declarations in rules.items():
        target.setdefault(selector, {}).update(declarations)


def extract_all_css(soup, base_url):
    """Extract all CSS from the page: inline, style tags, and external stylesheets.

    Args:
        soup: Parsed BeautifulSoup document.
        base_url: URL of the page, used to resolve relative stylesheet links.

    Returns:
        dict mapping selector text to a dict of ``property -> value``.
        Later sources override earlier ones property by property.
    """
    all_styles = {}

    # 1. Inline styles. They are only recorded under the element's #id:
    #    compute_element_style() applies each element's own style attribute,
    #    so registering them under a class or tag selector would leak one
    #    element's inline style onto every element sharing that class/tag.
    for element in soup.find_all(style=True):
        element_id = element.get('id')
        if element_id:
            _merge_rules(all_styles, {f"#{element_id}": parse_inline_style(element['style'])})

    # 2. <style> tags
    for style_tag in soup.find_all('style'):
        if style_tag.string:
            _merge_rules(all_styles, parse_css(style_tag.string))

    # 3. Linked stylesheets
    for link in soup.find_all('link', rel='stylesheet'):
        href = link.get('href')
        if not href:
            continue

        # Make absolute URL if relative
        if not href.startswith(('http://', 'https://')):
            href = urllib.parse.urljoin(base_url, href)

        try:
            css_response = requests.get(href, timeout=10)
            if css_response.ok:
                _merge_rules(all_styles, parse_css(css_response.text))
        except Exception as e:
            # Best effort: a stylesheet that cannot be fetched is skipped.
            logger.warning("Failed to fetch stylesheet %s: %s", href, e)

    # 4. Fill in browser-like defaults for common elements
    add_default_styles(all_styles)

    return all_styles


def parse_inline_style(style_text):
    """Parse an inline ``style`` attribute into a ``property -> value`` dict.

    Property names are lower-cased; declarations without a colon, or with an
    empty name or value, are ignored.
    """
    style_dict = {}
    if not style_text:
        return style_dict

    for declaration in style_text.split(';'):
        name, colon, value = declaration.partition(':')
        if not colon:
            continue
        name = name.strip().lower()
        value = value.strip()
        if name and value:
            style_dict[name] = value

    return style_dict


def parse_css(css_text):
    """Parse CSS text into a dictionary of selectors and styles.

    Only plain style rules are considered (``@media`` etc. are skipped).
    A grouped selector such as ``h1, h2`` is stored once per individual
    selector so that single-selector lookups can find it.

    Returns:
        dict mapping selector text to ``property -> value``; empty or partial
        if the stylesheet cannot be parsed (a warning is logged).
    """
    styles = {}
    try:
        sheet = cssutils.parseString(css_text)
        for rule in sheet:
            # Only handle style rules (not @media, etc.)
            if rule.type != rule.STYLE_RULE:
                continue

            declarations = {
                prop.name.lower(): prop.value
                for prop in rule.style
                if prop.name and prop.value
            }

            for selector in rule.selectorText.split(','):
                selector = selector.strip()
                if selector:
                    # Each selector gets its own dict so later merges into one
                    # selector never affect its former group members.
                    styles.setdefault(selector, {}).update(declarations)
    except Exception as e:
        logger.warning("CSS parsing error: %s", e)

    return styles


def add_default_styles(styles):
    """Add default styles for common HTML elements (mutates ``styles``).

    Defaults only fill in properties the page did not define for that
    selector; values coming from the page always win.
    """
    for selector, defaults in _DEFAULT_STYLES.items():
        rule = styles.setdefault(selector, {})
        for prop, value in defaults.items():
            rule.setdefault(prop, value)


def extract_elements_improved(soup, styles, viewport_width=DEFAULT_VIEWPORT_WIDTH):
    """Extract top-level elements from the page body.

    Args:
        soup: Parsed BeautifulSoup document.
        styles: Selector -> declarations mapping from ``extract_all_css``.
        viewport_width: Width in pixels available to top-level blocks.

    Returns:
        list of element dicts stacked vertically; empty if there is no body.
    """
    elements = []

    body = soup.find('body')
    if not body:
        return elements

    x_offset = 0
    y_position = 0

    # Prefer structural blocks; otherwise fall back to every direct child.
    main_blocks = body.find_all(
        ['div', 'header', 'main', 'nav', 'footer', 'section'], recursive=False
    )
    if not main_blocks:
        main_blocks = body.find_all(recursive=False)

    for block in main_blocks:
        block_data = process_element_with_styles(
            block, x_offset, y_position, viewport_width, styles
        )
        if block_data:
            elements.append(block_data)
            y_position += block_data['height'] + 10  # Add spacing between blocks

    # If no elements were found, expose the body text as a single paragraph.
    body_text = body.text.strip()
    if not elements and body_text:
        elements.append({
            'type': 'text',
            'tagName': 'p',
            'x': 0,
            'y': 0,
            'width': viewport_width,
            'height': 100,
            'content': body_text,
            'style': {
                'color': '#000000',
                'fontSize': '16px',
                'fontFamily': 'Arial, sans-serif',
            },
        })

    return elements


def process_element_with_styles(element, x_position, y_position, parent_width, styles, depth=0):
    """Process a single HTML element with its styles.

    Args:
        element: BeautifulSoup tag to convert.
        x_position: X coordinate, relative to the parent element.
        y_position: Y coordinate, relative to the parent element.
        parent_width: Width available from the parent, in pixels.
        styles: Selector -> declarations mapping.
        depth: Current recursion depth (elements deeper than 10 are dropped).

    Returns:
        dict describing the element (with nested ``children`` for
        containers), or None for skipped tags and over-deep nesting.
    """
    if depth > 10:  # Limit recursion depth
        return None

    raw_name = getattr(element, 'name', None)
    tag_name = raw_name.lower() if raw_name else None
    if not tag_name or tag_name in _SKIPPED_TAGS:
        return None

    elem_classes = element.get('class', [])
    elem_id = element.get('id')

    computed_style = compute_element_style(element, tag_name, elem_id, elem_classes, styles)

    element_data = {
        'type': get_element_type(tag_name),
        'tagName': tag_name,
        'x': x_position,
        'y': y_position,
        'width': calc_element_width(computed_style, parent_width),
        'height': 50,  # Default height, adjusted below
        'style': {},
    }

    if elem_id:
        element_data['id'] = elem_id
    if elem_classes:
        if isinstance(elem_classes, list):
            element_data['className'] = ' '.join(elem_classes)
        else:
            element_data['className'] = elem_classes

    element_type = element_data['type']

    if element_type == 'text':
        text_content = element.get_text().strip()
        element_data['content'] = text_content
        extract_text_styles(element_data, computed_style)
        element_data['height'] = calc_text_height(text_content, computed_style)

    elif element_type == 'image':
        element_data['src'] = element.get('src', '')
        element_data['alt'] = element.get('alt', '')

        # Only a concrete length gives a usable height; "auto" and
        # unparseable values fall back to the default image height.
        height_value = computed_style.get('height')
        image_height = None
        if isinstance(height_value, str) and height_value.strip().lower() != 'auto':
            image_height = _parse_dimension_or_none(height_value, parent_width)
        element_data['height'] = 200 if image_height is None else image_height

        extract_background_styles(element_data, computed_style)

    elif element_type in ('div', 'container', 'rectangle'):
        extract_container_styles(element_data, computed_style)

        padding_left = parse_dimension(computed_style.get('padding-left', '0'), parent_width)
        padding_right = parse_dimension(computed_style.get('padding-right', '0'), parent_width)
        available_width = element_data['width'] - (padding_left + padding_right)

        # Child coordinates are relative to this element.
        child_x_position = padding_left
        child_y_position = 0

        # CSS lays flex containers out as a row unless told otherwise.
        is_flex = computed_style.get('display') == 'flex'
        is_row = is_flex and computed_style.get('flex-direction', 'row') in ('row', 'row-reverse')

        children = []
        for child in element.find_all(_CHILD_TAGS, recursive=False):
            child_data = process_element_with_styles(
                child, child_x_position, child_y_position, available_width, styles, depth + 1
            )
            if not child_data:
                continue
            children.append(child_data)
            if is_row:
                child_x_position += child_data['width'] + 5
            else:
                child_y_position += child_data['height'] + 5

        if children:
            element_data['children'] = children
            # Grow to the lowest child edge. Child y values are already
            # parent-relative, so the parent's own y must not be subtracted.
            lowest_child_edge = max(item['y'] + item['height'] for item in children)
            element_data['height'] = lowest_child_edge + 10

    apply_common_styles(element_data, computed_style)

    # If height is unreasonably small, set a minimum
    if element_data['height'] < 10:
        element_data['height'] = 10

    return element_data


def get_element_type(tag_name):
    """Determine the output element type for a tag name.

    Returns ``'text'``, ``'image'``, ``'rectangle'`` (button/input) or
    ``'div'`` (structural tags and anything unrecognised).
    """
    if tag_name in _TEXT_TAGS:
        return 'text'
    if tag_name == 'img':
        return 'image'
    if tag_name in ('button', 'input'):
        return 'rectangle'
    # _BLOCK_TAGS and every unknown tag are treated as generic containers.
    return 'div'


def compute_element_style(element, tag_name, elem_id, elem_classes, styles):
    """Compute the final style for an element by cascading CSS rules.

    Order of application (later wins): tag selector, class selectors in
    attribute order, id selector, inline ``style`` attribute. Only simple
    single-part selectors are matched.
    """
    computed_style = {}

    # 1. Tag-level styles
    computed_style.update(styles.get(tag_name, {}))

    # 2. Class styles
    if isinstance(elem_classes, list):
        class_names = elem_classes
    elif elem_classes:
        class_names = [elem_classes]
    else:
        class_names = []
    for class_name in class_names:
        computed_style.update(styles.get(f".{class_name}", {}))

    # 3. ID styles (highest selector specificity)
    if elem_id:
        computed_style.update(styles.get(f"#{elem_id}", {}))

    # 4. Inline styles (override everything)
    inline_style = element.get('style')
    if inline_style:
        computed_style.update(parse_inline_style(inline_style))

    return computed_style


def _parse_dimension_or_none(value, container_size):
    """Like ``parse_dimension`` but return None when ``value`` is not a length."""
    if not value or not isinstance(value, str):
        return None

    value = value.strip().lower()
    if value == 'auto':
        return container_size

    match = _DIMENSION_RE.fullmatch(value)
    if match is None:
        return None

    number = float(match.group(1))
    unit = match.group(2)

    if unit is None or unit == 'px':
        return number
    if unit == '%':
        return container_size * (number / 100)
    if unit in ('em', 'rem'):
        return number * BASE_FONT_SIZE  # Approximation: 1em = 1rem = 16px
    if unit == 'vh':
        return (number / 100) * DEFAULT_VIEWPORT_HEIGHT
    # unit == 'vw'
    return (number / 100) * DEFAULT_VIEWPORT_WIDTH


def parse_dimension(value, container_size):
    """Parse dimension values (px, %, em, rem, vh, vw, unitless, auto).

    Args:
        value: CSS value string.
        container_size: Reference size in pixels for ``%`` and ``auto``.

    Returns:
        The size in pixels, or 0 when ``value`` is empty, not a string, or
        not a recognised length.
    """
    parsed = _parse_dimension_or_none(value, container_size)
    return 0 if parsed is None else parsed


def calc_element_width(style, parent_width):
    """Calculate element width based on its style.

    An explicit ``width`` wins, then ``max-width`` (clamped to the parent);
    values that are not lengths (``none``, ``calc(...)``, keywords) are
    ignored so the element falls back to the parent width instead of 0.
    """
    if 'width' in style:
        width = _parse_dimension_or_none(style['width'], parent_width)
        if width is not None:
            return width

    if 'max-width' in style:
        max_width = _parse_dimension_or_none(style['max-width'], parent_width)
        if max_width is not None:
            return min(parent_width, max_width)

    return parent_width


def calc_text_height(text, style):
    """Estimate text height in pixels from content length and font metrics.

    Assumes roughly 70 characters per line. Returns at least 20.
    """
    if not text:
        return 20

    # Font size: px, em and rem are understood; anything else keeps 16px.
    font_size = BASE_FONT_SIZE
    font_size_value = style.get('font-size')
    if isinstance(font_size_value, str):
        match = _DIMENSION_RE.fullmatch(font_size_value.strip().lower())
        if match and match.group(2) in ('px', 'em', 'rem'):
            number = float(match.group(1))
            font_size = number if match.group(2) == 'px' else number * BASE_FONT_SIZE

    # Line height as a multiplier of the font size.
    line_height = 1.2
    line_height_value = style.get('line-height')
    if isinstance(line_height_value, str):
        match = _DIMENSION_RE.fullmatch(line_height_value.strip().lower())
        if match:
            number = float(match.group(1))
            unit = match.group(2)
            if unit == 'px':
                if font_size:  # Avoid dividing by a zero font size
                    line_height = number / font_size
            elif unit == '%':
                line_height = number / 100
            elif unit is None:
                line_height = number

    chars_per_line = 70  # Rough estimate
    num_lines = max(1, len(text) / chars_per_line)

    return max(20, int(font_size * line_height * num_lines))


def extract_text_styles(element_data, style):
    """Copy text-related styles from ``style`` into ``element_data['style']``.

    Colour, font size and font weight always get a value (falling back to
    black, a heading-dependent size, and bold for headings); family,
    alignment and decoration are copied only when present.
    """
    target = element_data['style']
    heading = _HEADING_RE.fullmatch(element_data.get('tagName', ''))

    target['color'] = style.get('color', '#000000')

    if 'font-size' in style:
        target['fontSize'] = style['font-size']
    elif heading:
        # Default heading sizes: h1 = 32px, shrinking 4px per level.
        size = 32 - ((int(heading.group(1)) - 1) * 4)
        target['fontSize'] = f"{size}px"
    else:
        target['fontSize'] = '16px'

    if 'font-weight' in style:
        target['fontWeight'] = style['font-weight']
    else:
        target['fontWeight'] = 'bold' if heading else 'normal'

    for css_name, key in (
        ('font-family', 'fontFamily'),
        ('text-align', 'textAlign'),
        ('text-decoration', 'textDecoration'),
    ):
        if css_name in style:
            target[key] = style[css_name]


def extract_container_styles(element_data, style):
    """Copy container-related styles (background, display, flex settings)."""
    target = element_data['style']

    if 'background-color' in style:
        target['backgroundColor'] = style['background-color']

    if 'display' in style:
        target['display'] = style['display']

    # Flex-related properties are only meaningful on flex containers.
    if style.get('display') == 'flex':
        for css_name, key in (
            ('flex-direction', 'flexDirection'),
            ('justify-content', 'justifyContent'),
            ('align-items', 'alignItems'),
        ):
            if css_name in style:
                target[key] = style[css_name]


def extract_background_styles(element_data, style):
    """Copy background colour and a ``url(...)`` background image, if any."""
    target = element_data['style']

    if 'background-color' in style:
        target['backgroundColor'] = style['background-color']

    bg_image = style.get('background-image')
    if bg_image is not None and bg_image.startswith('url(') and bg_image.endswith(')'):
        # Strip the url( ) wrapper and optional surrounding quotes.
        target['backgroundImage'] = bg_image[4:-1].strip('"\'')


def apply_common_styles(element_data, style):
    """Apply styles shared by all element types (border, opacity, shadow...)."""
    target = element_data['style']

    if 'border' in style:
        target['border'] = style['border']
    else:
        # Individual border sides keep their hyphenated CSS names.
        for side in ['top', 'right', 'bottom', 'left']:
            border_key = f'border-{side}'
            if border_key in style:
                target[border_key] = style[border_key]

    for css_name, key in (
        ('border-radius', 'borderRadius'),
        ('opacity', 'opacity'),
        ('visibility', 'visibility'),
        ('box-shadow', 'boxShadow'),
    ):
        if css_name in style:
            target[key] = style[css_name]


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)