Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import requests | |
| import traceback | |
| from flask import Flask, request, jsonify | |
| from bs4 import BeautifulSoup | |
| import logging | |
| import cssutils | |
| import re | |
| import urllib.parse | |
| from PIL import Image | |
| from io import BytesIO | |
# Logging: the root logger runs at INFO and this module logs through its
# own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# cssutils reports parse problems through its own logger; only CRITICAL
# records are let through so CSS parsing warnings stay out of the log.
cssutils.log.setLevel(logging.CRITICAL)

# The Flask application object served by this module.
app = Flask(__name__)
# The view must be registered with the application; without a route Flask
# never dispatches a request to it.
@app.route("/")
def home():
    """Serve the landing page.

    Returns:
        A static HTML document that points visitors at the
        ``/api/convert`` endpoint.
    """
    return """
<!DOCTYPE html>
<html>
<head>
    <title>Website Converter</title>
</head>
<body>
    <h1>Welcome to Website Converter</h1>
    <p>Use the /api/convert endpoint to convert websites to structured data.</p>
</body>
</html>
"""
# The landing page advertises this endpoint as /api/convert; the handler
# reads a JSON request body, so it is registered for POST.
@app.route("/api/convert", methods=["POST"])
def convert_website():
    """Fetch a web page and describe its elements as structured JSON.

    The request body is a JSON object with a required ``url`` and an
    optional ``viewport_width`` (default 1440). ``https://`` is prepended
    when the URL carries no http(s) scheme.

    Returns:
        On success a JSON object with ``status``, ``url``,
        ``viewport_width``, ``viewport_height`` (at least 900, grown to the
        lowest element's bottom edge) and ``elements``.
        On failure ``{"error": ...}`` with status 400 for invalid input or
        500 when fetching or processing the page fails.
    """
    # silent=True yields None for a missing or malformed JSON body, so the
    # caller gets a 400 rather than an HTTP exception swallowed into a 500.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({"error": "No data provided"}), 400
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    url = data.get('url')
    if not isinstance(url, str) or not url.strip():
        return jsonify({"error": "URL is required"}), 400
    url = url.strip()
    # Test for a real scheme: a bare 'http' prefix check would skip hosts
    # whose name merely starts with those letters (e.g. "httpbin.org").
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        viewport_width = int(data.get('viewport_width', 1440))
    except (TypeError, ValueError):
        return jsonify({"error": "viewport_width must be an integer"}), 400
    viewport_height = 900  # Default height

    logger.info("Converting website: %s with viewport width: %s", url, viewport_width)

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        # NOTE(review): the URL is caller-supplied and fetched from the
        # server; consider rejecting private/internal addresses.
        page = requests.get(url, headers=headers, timeout=20)
        page.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(page.text, 'html.parser')
        all_styles = extract_all_css(soup, url)
        # viewport_width is echoed back to the caller; the layout helper is
        # called without it and therefore uses its own default width.
        elements = extract_elements_improved(soup, all_styles)

        # The page is at least one viewport tall and otherwise as tall as
        # the lowest bottom edge among the extracted elements.
        estimated_height = max(
            [viewport_height]
            + [item.get('y', 0) + item.get('height', 0) for item in elements]
        )

        payload = {
            "status": "success",
            "url": url,
            "viewport_width": viewport_width,
            "viewport_height": estimated_height,
            "elements": elements,
        }
        return jsonify(payload)
    except requests.exceptions.RequestException as e:
        logger.error("Request error: %s", e)
        return jsonify({"error": f"Failed to fetch website: {str(e)}"}), 500
    except Exception as e:
        # The stack trace goes to the server log only; returning it to the
        # client would expose internals.
        logger.exception("Error: %s", e)
        return jsonify({"error": str(e)}), 500
def extract_all_css(soup, base_url):
    """Collect the page's CSS into one ``{selector: {property: value}}`` map.

    Sources are merged in this order, a later source overriding an earlier
    one property by property:
      1. inline ``style`` attributes of elements that carry an ``id``
         (stored under ``#id``),
      2. ``<style>`` tags,
      3. linked stylesheets, fetched over HTTP.
    Finally ``add_default_styles`` is applied to the map.

    Args:
        soup: Parsed document.
        base_url: URL of the page, used to resolve stylesheet links.

    Returns:
        dict mapping selector text to a dict of CSS declarations.
    """
    all_styles = {}

    def merge(rules):
        # Merge per property: replacing a selector's whole dict would drop
        # declarations contributed by an earlier source.
        for selector, declarations in rules.items():
            all_styles.setdefault(selector, {}).update(declarations)

    # 1. Inline styles. They belong to a single element, so they are only
    # recorded under that element's id. Recording them under a class or tag
    # selector would apply them to every element sharing that class or tag.
    for element in soup.find_all(style=True):
        element_id = element.get('id')
        if element_id:
            merge({f"#{element_id}": parse_inline_style(element['style'])})

    # 2. <style> tags
    for style_tag in soup.find_all('style'):
        if style_tag.string:
            merge(parse_css(style_tag.string))

    # 3. Linked stylesheets
    for link in soup.find_all('link', rel='stylesheet'):
        href = link.get('href')
        if not href:
            continue
        # urljoin keeps absolute URLs as they are and resolves relative and
        # protocol-relative ("//host/...") references against the page URL.
        stylesheet_url = urllib.parse.urljoin(base_url, href)
        try:
            css_response = requests.get(stylesheet_url, timeout=10)
        except requests.exceptions.RequestException as e:
            # Best effort: a stylesheet that cannot be fetched is skipped.
            logger.warning(f"Failed to fetch stylesheet {stylesheet_url}: {e}")
            continue
        if css_response.ok:
            merge(parse_css(css_response.text))

    # 4. Built-in styles for common elements
    add_default_styles(all_styles)
    return all_styles
def parse_inline_style(style_text):
    """Turn the text of a ``style`` attribute into a property -> value dict.

    Property names are stripped and lower-cased, values are stripped.
    Declarations without a colon, or with an empty name or value, are
    skipped. An empty or missing ``style_text`` gives an empty dict.
    """
    parsed = {}
    if not style_text:
        return parsed

    for declaration in style_text.split(';'):
        # Split on the first colon only, so values may contain colons.
        name, colon, raw_value = declaration.partition(':')
        if not colon:
            continue
        name = name.strip().lower()
        raw_value = raw_value.strip()
        if name and raw_value:
            parsed[name] = raw_value
    return parsed
def parse_css(css_text):
    """Parse stylesheet text into ``{selector: {property: value}}``.

    Only top-level style rules are read; other rule types (``@media`` and
    the like) are skipped. A grouped selector such as ``h1, h2`` is stored
    once per member so that each one can be looked up on its own.
    Declarations of a selector that occurs several times are merged, the
    later value winning.

    Args:
        css_text: Stylesheet source text.

    Returns:
        dict of the rules parsed so far; a parsing error is logged as a
        warning and ends parsing without raising.
    """
    styles = {}
    try:
        sheet = cssutils.parseString(css_text)
        for rule in sheet:
            if rule.type != rule.STYLE_RULE:
                continue
            declarations = {
                prop.name.lower(): prop.value
                for prop in rule.style
                if prop.name and prop.value
            }
            # Each selector gets its own dict (setdefault + update), so
            # group members never share one mutable object.
            for selector in rule.selectorList:
                styles.setdefault(selector.selectorText, {}).update(declarations)
    except Exception as e:
        # Deliberately best effort: broken CSS must not fail the request.
        logger.warning(f"CSS parsing error: {e}")
    return styles
def add_default_styles(styles):
    """Fill in built-in styles for common HTML elements.

    For body, h1-h4, a, button and input, each default declaration is added
    only when the selector does not define that property yet, so rules
    collected from the page keep precedence over the defaults.

    Args:
        styles: Selector -> declarations map; modified in place.
    """
    defaults = {
        'body': {
            'margin': '0px',
            'font-family': 'Arial, sans-serif',
            'color': '#000000',
            'font-size': '16px',
        },
        'h1': {'font-size': '32px', 'font-weight': 'bold', 'margin': '21.44px 0'},
        'h2': {'font-size': '24px', 'font-weight': 'bold', 'margin': '19.92px 0'},
        'h3': {'font-size': '18px', 'font-weight': 'bold', 'margin': '18.72px 0'},
        'h4': {'font-size': '16px', 'font-weight': 'bold', 'margin': '21.28px 0'},
        'a': {'color': '#0000EE', 'text-decoration': 'underline'},
        'button': {
            'background-color': '#F0F0F0',
            'border': '1px solid #CCCCCC',
            'padding': '4px 8px',
            'border-radius': '2px',
        },
        'input': {
            'border': '1px solid #CCCCCC',
            'padding': '2px 4px',
        },
    }
    for selector, declarations in defaults.items():
        target = styles.setdefault(selector, {})
        for prop, value in declarations.items():
            # setdefault, not assignment: a default must never overwrite a
            # value that is already present.
            target.setdefault(prop, value)
def extract_elements_improved(soup, styles, viewport_width=1440):
    """Build the list of top-level layout elements of a page.

    Direct children of ``<body>`` that are div, header, main, nav, footer
    or section are processed; when there are none, every direct child is.
    Blocks are stacked vertically with a 10px gap. If nothing could be
    extracted but the body has text, one text element holding that text is
    returned instead.

    Args:
        soup: Parsed document.
        styles: Selector -> declarations map used to style the elements.
        viewport_width: Width in pixels given to the top-level blocks.

    Returns:
        list of element dicts; empty when the document has no body.
    """
    elements = []

    body = soup.find('body')
    if body is None:
        return elements

    x_offset = 0
    y_position = 0

    main_blocks = body.find_all(['div', 'header', 'main', 'nav', 'footer', 'section'], recursive=False)
    if not main_blocks:  # If no main blocks, use all direct children
        main_blocks = body.find_all(recursive=False)

    for block in main_blocks:
        block_data = process_element_with_styles(block, x_offset, y_position, viewport_width, styles)
        if block_data:
            elements.append(block_data)
            y_position += block_data['height'] + 10  # Add spacing between blocks

    # Fallback: expose the body text as a single paragraph.
    body_text = body.text.strip()
    if not elements and body_text:
        elements.append({
            'type': 'text',
            'tagName': 'p',
            'x': 0,
            'y': 0,
            'width': viewport_width,
            'height': 100,
            'content': body_text,
            'style': {
                'color': '#000000',
                'fontSize': '16px',
                'fontFamily': 'Arial, sans-serif'
            }
        })
    return elements
def process_element_with_styles(element, x_position, y_position, parent_width, styles, depth=0):
    """Convert one HTML element and its supported descendants to layout data.

    Args:
        element: Parsed HTML tag.
        x_position: X coordinate assigned to the element.
        y_position: Y coordinate assigned to the element.
        parent_width: Width available from the parent, in pixels.
        styles: Selector -> declarations map used to compute the style.
        depth: Current nesting level; elements nested deeper than 10 are
            dropped.

    Returns:
        dict with ``type``, ``tagName``, ``x``, ``y``, ``width``,
        ``height`` and ``style`` plus, where applicable, ``id``,
        ``className``, ``content``, ``src``/``alt`` and ``children``.
        Child coordinates are relative to their container. None is
        returned for objects without a tag name, for script, style, meta,
        link and noscript tags, and beyond the depth limit.
    """
    if depth > 10:  # Limit recursion depth
        return None

    raw_name = getattr(element, 'name', None)
    tag_name = raw_name.lower() if isinstance(raw_name, str) else None
    if not tag_name or tag_name in ('script', 'style', 'meta', 'link', 'noscript'):
        return None

    elem_classes = element.get('class', [])
    elem_id = element.get('id')
    computed_style = compute_element_style(element, tag_name, elem_id, elem_classes, styles)

    element_type = get_element_type(tag_name)
    element_data = {
        'type': element_type,
        'tagName': tag_name,
        'x': x_position,
        'y': y_position,
        'width': calc_element_width(computed_style, parent_width),
        'height': 50,  # Default height, adjusted below where possible
        'style': {}
    }

    if elem_id:
        element_data['id'] = elem_id
    if elem_classes:
        if isinstance(elem_classes, list):
            element_data['className'] = ' '.join(elem_classes)
        else:
            element_data['className'] = elem_classes

    if element_type == 'text':
        text_content = element.get_text().strip()
        element_data['content'] = text_content
        extract_text_styles(element_data, computed_style)
        element_data['height'] = calc_text_height(text_content, computed_style)

    elif element_type == 'image':
        element_data['src'] = element.get('src', '')
        element_data['alt'] = element.get('alt', '')
        element_data['height'] = 200  # Default height
        if 'height' in computed_style:
            try:
                element_data['height'] = parse_dimension(computed_style['height'], parent_width)
            except (TypeError, ValueError):
                pass  # keep the default height
        extract_background_styles(element_data, computed_style)

    elif element_type in ('div', 'container', 'rectangle'):
        extract_container_styles(element_data, computed_style)

        is_flex = computed_style.get('display') == 'flex'
        in_row = is_flex and computed_style.get('flex-direction') == 'row'

        # Children are laid out in the container's own coordinate system,
        # starting at its left padding.
        padding_left = parse_dimension(computed_style.get('padding-left', '0'), parent_width)
        padding_right = parse_dimension(computed_style.get('padding-right', '0'), parent_width)
        child_x_position = padding_left
        child_y_position = 0
        available_width = element_data['width'] - (padding_left + padding_right)

        children = []
        child_tags = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img',
                      'span', 'a', 'button', 'input', 'form']
        for child in element.find_all(child_tags, recursive=False):
            child_data = process_element_with_styles(
                child, child_x_position, child_y_position, available_width, styles, depth + 1)
            if not child_data:
                continue
            children.append(child_data)
            if in_row:
                # Simplified flex row: place the next child to the right.
                child_x_position += child_data['width'] + 5
            else:
                # Block layout and every other flex direction: stack.
                child_y_position += child_data['height'] + 5

        if children:
            element_data['children'] = children

        # Only non-flex containers grow to fit their content. The guard
        # needs both conditions: an empty child list has no last element.
        if children and not is_flex:
            last_child = children[-1]
            # Child y is already relative to this container, so the
            # container's own y must not be subtracted.
            element_data['height'] = last_child['y'] + last_child['height'] + 10

    # Apply common styles (border, margin, etc)
    apply_common_styles(element_data, computed_style)

    # If height is unreasonably small, set a minimum
    if element_data['height'] < 10:
        element_data['height'] = 10
    return element_data
def get_element_type(tag_name):
    """Map an HTML tag name to the element type used in the output.

    Text-bearing tags give 'text', ``img`` gives 'image', ``button`` and
    ``input`` give 'rectangle'; every other tag gives 'div'.
    """
    type_groups = (
        (('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'a', 'label'), 'text'),
        (('img',), 'image'),
        # Buttons and inputs are represented as rectangles.
        (('button', 'input'), 'rectangle'),
    )
    for tags, element_type in type_groups:
        if tag_name in tags:
            return element_type
    # Structural tags (div, section, article, ...) and anything unknown.
    return 'div'
def compute_element_style(element, tag_name, elem_id, elem_classes, styles):
    """Cascade the matching rules of ``styles`` into one declaration dict.

    Rules are applied in this order, each overriding the previous ones:
    tag name, every class (``.name``), id (``#id``), and finally the
    element's inline ``style`` attribute.
    """
    # Selectors listed from lowest to highest priority.
    selectors = [tag_name]
    if isinstance(elem_classes, list):
        selectors.extend(f".{cls}" for cls in elem_classes)
    elif elem_classes:
        selectors.append(f".{elem_classes}")
    if elem_id:
        selectors.append(f"#{elem_id}")

    result = {}
    for selector in selectors:
        if selector in styles:
            result.update(styles[selector])

    # Inline declarations override everything collected from the rules.
    inline_text = element.get('style')
    if inline_text:
        result.update(parse_inline_style(inline_text))
    return result
# A plain decimal number followed by an optional unit. "rem" precedes "em"
# in the alternation only for readability; the full match decides.
_DIMENSION_PATTERN = re.compile(r'([+-]?(?:\d+(?:\.\d*)?|\.\d+))\s*(px|%|rem|em|vh|vw)?')


def parse_dimension(value, container_size, *, font_size=16, viewport_width=1440,
                    viewport_height=900):
    """Convert a CSS length string to pixels.

    Supported forms: ``px``, ``%`` (of ``container_size``), ``em`` and
    ``rem`` (both approximated as multiples of ``font_size``), ``vh``,
    ``vw``, unitless numbers and ``auto`` (the container size).

    Args:
        value: CSS value such as ``"12px"``, ``"50%"`` or ``"1.5rem"``.
        container_size: Size that percentages and ``auto`` resolve against.
        font_size: Pixel size assumed for one ``em``/``rem``.
        viewport_width: Pixel width assumed for ``vw``.
        viewport_height: Pixel height assumed for ``vh``.

    Returns:
        The length in pixels, or 0 when ``value`` is empty, not a string,
        or not in one of the supported forms.
    """
    if not value or not isinstance(value, str):
        return 0

    text = value.strip().lower()
    if text == 'auto':
        return container_size

    match = _DIMENSION_PATTERN.fullmatch(text)
    if match is None:
        return 0

    number = float(match.group(1))
    unit = match.group(2)
    if unit is None or unit == 'px':
        return number
    if unit == '%':
        try:
            return container_size * (number / 100)
        except TypeError:
            # No numeric container to resolve the percentage against.
            return 0
    if unit in ('em', 'rem'):
        return number * font_size
    if unit == 'vh':
        return number / 100 * viewport_height
    return number / 100 * viewport_width  # vw
def calc_element_width(style, parent_width):
    """Determine an element's width in pixels from its computed style.

    An explicit ``width`` is used when it can be resolved; otherwise the
    element takes ``parent_width``. A resolvable ``max-width`` caps the
    result. Values that cannot be resolved (keywords such as ``none`` or
    ``fit-content``, ``calc()`` expressions) are ignored instead of
    collapsing the element to zero width.

    Args:
        style: Computed declaration dict of the element.
        parent_width: Width available from the parent, in pixels.

    Returns:
        The width in pixels.
    """
    def resolve(raw):
        pixels = parse_dimension(raw, parent_width)
        if pixels:
            return pixels
        # parse_dimension also answers 0 for values it cannot read; trust
        # a zero only when the declaration itself starts like a number.
        head = raw.strip()[:1] if isinstance(raw, str) else ''
        if head.isdigit() or head in ('.', '+', '-'):
            return pixels
        return None

    width = resolve(style['width']) if 'width' in style else None
    max_width = resolve(style['max-width']) if 'max-width' in style else None

    if width is None:
        width = parent_width
    if max_width is not None:
        width = min(width, max_width)
    return width
# A plain decimal number followed by an optional unit.
_TEXT_LENGTH_PATTERN = re.compile(r'([+-]?(?:\d+(?:\.\d*)?|\.\d+))\s*(px|rem|em|%)?')


def _split_css_length(raw):
    """Return ``(number, unit)`` for a CSS length string, else None."""
    if not isinstance(raw, str):
        return None
    match = _TEXT_LENGTH_PATTERN.fullmatch(raw.strip().lower())
    if match is None:
        return None
    return float(match.group(1)), match.group(2) or ''


def calc_text_height(text, style):
    """Estimate the rendered height of a piece of text in pixels.

    The estimate is ``font size * line height * lines``, assuming roughly
    70 characters per line, and is never less than 20.

    Args:
        text: The text content.
        style: Computed declaration dict. ``font-size`` is read in px, em
            or rem (1em = 1rem = 16px); ``line-height`` in px, rem, %, em
            or as a unitless multiplier. Anything else falls back to 16px
            and a multiplier of 1.2.

    Returns:
        The estimated height as an int; 20 for empty text.
    """
    if not text:
        return 20

    font_size = 16  # Default
    parsed = _split_css_length(style.get('font-size'))
    if parsed:
        number, unit = parsed
        if unit == 'px':
            font_size = number
        elif unit in ('em', 'rem'):
            font_size = number * 16

    line_height = 1.2  # Default
    parsed = _split_css_length(style.get('line-height'))
    if parsed:
        number, unit = parsed
        if unit in ('px', 'rem'):
            # Absolute line heights are converted to a multiplier; a zero
            # or negative font size keeps the default instead of dividing.
            absolute = number if unit == 'px' else number * 16
            if font_size > 0:
                line_height = absolute / font_size
        elif unit == '%':
            line_height = number / 100
        else:
            # Unitless and em values are already font-size multipliers.
            line_height = number

    chars_per_line = 70  # Rough estimate
    num_lines = max(1, len(text) / chars_per_line)
    return max(20, int(font_size * line_height * num_lines))
def extract_text_styles(element_data, style):
    """Copy text-related declarations into ``element_data['style']``.

    ``color``, ``fontSize`` and ``fontWeight`` are always set, falling
    back to black, a size derived from the tag, and bold for headings or
    normal otherwise. ``fontFamily``, ``textAlign`` and ``textDecoration``
    are set only when the computed style declares them.

    Args:
        element_data: Element dict with ``tagName`` and a ``style`` dict;
            modified in place.
        style: Computed declaration dict of the element.
    """
    tag_name = element_data.get('tagName', '')
    # Only h1-h6 are headings. A prefix test alone would also match tags
    # such as "header" or "hr" and then fail to parse a heading level.
    is_heading = (
        isinstance(tag_name, str)
        and len(tag_name) == 2
        and tag_name[0] == 'h'
        and tag_name[1] in '123456'
    )
    text_style = element_data['style']

    # Text color
    text_style['color'] = style.get('color', '#000000')

    # Font size
    if 'font-size' in style:
        text_style['fontSize'] = style['font-size']
    elif is_heading:
        # Default heading sizes: 32px for h1, 4px less per level.
        heading_level = int(tag_name[1])
        text_style['fontSize'] = f"{32 - ((heading_level - 1) * 4)}px"
    else:
        text_style['fontSize'] = '16px'

    # Font weight
    if 'font-weight' in style:
        text_style['fontWeight'] = style['font-weight']
    else:
        text_style['fontWeight'] = 'bold' if is_heading else 'normal'

    # Optional properties
    for css_name, output_name in (
        ('font-family', 'fontFamily'),
        ('text-align', 'textAlign'),
        ('text-decoration', 'textDecoration'),
    ):
        if css_name in style:
            text_style[output_name] = style[css_name]
def extract_container_styles(element_data, style):
    """Copy container-related declarations into ``element_data['style']``.

    ``background-color`` and ``display`` are copied when declared. The
    flex properties (direction, justify-content, align-items) are copied
    only for ``display: flex``.
    """
    wanted = [
        ('background-color', 'backgroundColor'),
        ('display', 'display'),
    ]
    if style.get('display') == 'flex':
        wanted += [
            ('flex-direction', 'flexDirection'),
            ('justify-content', 'justifyContent'),
            ('align-items', 'alignItems'),
        ]

    for css_name, output_name in wanted:
        if css_name in style:
            element_data['style'][output_name] = style[css_name]
def extract_background_styles(element_data, style):
    """Copy background declarations into ``element_data['style']``.

    ``background-color`` is copied as is. A ``background-image`` written
    as ``url(...)`` is stored as the bare URL, without the wrapper and
    surrounding quotes; any other background-image value is skipped.
    """
    if 'background-color' in style:
        element_data['style']['backgroundColor'] = style['background-color']

    if 'background-image' not in style:
        return

    declared = style['background-image']
    is_url_form = declared.startswith('url(') and declared.endswith(')')
    if is_url_form:
        # Cut "url(" from the front and ")" from the back, then the quotes.
        inner = declared[len('url('):-1]
        element_data['style']['backgroundImage'] = inner.strip('"\'')
def apply_common_styles(element_data, style):
    """Copy declarations shared by all element types into the output style.

    A ``border`` shorthand is copied when present; otherwise each declared
    per-side border is copied under its CSS name. ``border-radius``,
    ``opacity``, ``visibility`` and ``box-shadow`` are copied when declared.
    """
    if 'border' in style:
        copies = [('border', 'border')]
    else:
        # The per-side properties keep their CSS names in the output.
        copies = [
            (f'border-{side}', f'border-{side}')
            for side in ('top', 'right', 'bottom', 'left')
        ]
    copies += [
        ('border-radius', 'borderRadius'),
        ('opacity', 'opacity'),
        ('visibility', 'visibility'),
        ('box-shadow', 'boxShadow'),
    ]

    for css_name, output_name in copies:
        if css_name in style:
            element_data['style'][output_name] = style[css_name]
if __name__ == "__main__":
    # Listen on every interface; the port comes from the PORT environment
    # variable and falls back to 7860 when it is not set.
    listen_port = int(os.getenv("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port)