# editableweb / app.py
# AkashKumarave's picture
# Update app.py
# 9c35b0c verified
# raw
# history blame
# 23.3 kB
import os
import json
import requests
import traceback
from flask import Flask, request, jsonify
from bs4 import BeautifulSoup
import logging
import cssutils
import re
import urllib.parse
from PIL import Image
from io import BytesIO
# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)
# Setup logging: INFO-level basic configuration plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# cssutils reports through its own logger; raise its threshold so only
# CRITICAL messages get through.
cssutils.log.setLevel(logging.CRITICAL) # Suppress CSS parsing warnings
@app.route('/')
def home():
    """Serve the static landing page that points callers at /api/convert."""
    # The markup is kept flush-left so the returned text is unchanged.
    landing_page = """
<!DOCTYPE html>
<html>
<head>
<title>Website Converter</title>
</head>
<body>
<h1>Welcome to Website Converter</h1>
<p>Use the /api/convert endpoint to convert websites to structured data.</p>
</body>
</html>
"""
    return landing_page
@app.route('/api/convert', methods=['POST'])
def convert_website():
    """Fetch a web page and return an approximate element layout as JSON.

    Request body (JSON object):
        url: required string. ``https://`` is prepended when the value does
            not already start with ``http://`` or ``https://``.
        viewport_width: optional integer, default 1440. It is echoed back in
            the response.

    Responses:
        200: ``{"status", "url", "viewport_width", "viewport_height",
            "elements"}``. ``viewport_height`` is at least 900 and grows to
            the lowest bottom edge among the top-level elements.
        400: ``{"error": ...}`` for a missing/invalid body, URL or
            viewport_width.
        500: ``{"error": ...}`` when fetching or processing the page fails.
    """
    # silent=True yields None for a missing, non-JSON or malformed body, so
    # those requests get the 400 below instead of surfacing as a 500.
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return jsonify({"error": "No data provided"}), 400

    url = data.get('url')
    if not isinstance(url, str) or not url.strip():
        return jsonify({"error": "URL is required"}), 400
    url = url.strip()
    # Test for a real scheme prefix: a bare 'http' prefix check would wrongly
    # skip hosts such as "httpbin.org".
    if not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        viewport_width = int(data.get('viewport_width', 1440))
    except (TypeError, ValueError, OverflowError):
        return jsonify({"error": "viewport_width must be an integer"}), 400
    viewport_height = 900  # Default height

    logger.info("Converting website: %s with viewport width: %s", url, viewport_width)

    try:
        # NOTE(review): url is caller-controlled, so this endpoint will fetch
        # whatever address it is given (SSRF risk); restrict target hosts if
        # the service can reach private networks.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        page = requests.get(url, headers=headers, timeout=20)
        page.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(page.text, 'html.parser')
        all_styles = extract_all_css(soup, url)
        elements = extract_elements_improved(soup, all_styles)

        # Page height estimate: lowest bottom edge of any top-level element,
        # never smaller than the default viewport height.
        content_bottom = max(
            (el.get('y', 0) + el.get('height', 0) for el in elements),
            default=0,
        )
        payload = {
            "status": "success",
            "url": url,
            "viewport_width": viewport_width,
            "viewport_height": max(viewport_height, content_bottom),
            "elements": elements
        }
        return jsonify(payload)
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error: {str(e)}")
        return jsonify({"error": f"Failed to fetch website: {str(e)}"}), 500
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        logger.error(traceback.format_exc())
        # NOTE(review): the traceback is returned to the client, which exposes
        # server internals; kept so existing consumers of this field keep
        # working, but consider dropping it outside of debugging.
        return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500
def _merge_style_rules(target, new_rules):
    """Merge ``selector -> declarations`` into *target* property by property."""
    for selector, declarations in new_rules.items():
        target.setdefault(selector, {}).update(declarations)


def extract_all_css(soup, base_url):
    """Collect the page's CSS rules into a ``{selector: {property: value}}`` dict.

    Sources, in the order they are merged (later declarations win per
    property): ``<style>`` tags, linked stylesheets, then the defaults from
    ``add_default_styles``.

    Inline ``style`` attributes are not registered here. They belong to one
    element only and ``compute_element_style`` already reads them from that
    element; storing them under the element's class or tag selector would
    apply them to every other element sharing that class or tag.

    Args:
        soup: parsed document exposing ``find_all``.
        base_url: page URL used to resolve relative stylesheet links.

    Returns:
        dict mapping selector text to a dict of declarations.
    """
    all_styles = {}

    # 1. <style> tags
    for style_tag in soup.find_all('style'):
        if style_tag.string:
            _merge_style_rules(all_styles, parse_css(style_tag.string))

    # 2. Linked stylesheets
    for link in soup.find_all('link', rel='stylesheet'):
        href = link.get('href')
        if not href:
            continue
        # urljoin leaves absolute URLs untouched and also resolves
        # protocol-relative ("//cdn...") and relative links.
        href = urllib.parse.urljoin(base_url, href)
        try:
            css_response = requests.get(href, timeout=10)
            if css_response.ok:
                _merge_style_rules(all_styles, parse_css(css_response.text))
        except Exception as e:
            # Best effort: one unreachable stylesheet must not fail the page.
            logger.warning(f"Failed to fetch stylesheet {href}: {e}")

    # 3. Defaults for common elements
    add_default_styles(all_styles)
    return all_styles
def parse_inline_style(style_text):
    """Turn a ``style`` attribute string into a ``{property: value}`` dict.

    Declarations are separated by ``;`` and split at their first ``:``.
    Property names are trimmed and lower-cased, values are trimmed, and any
    declaration with an empty name or value is dropped. Falsy input gives an
    empty dict.
    """
    if not style_text:
        return {}

    parsed = {}
    for declaration in style_text.split(';'):
        name, colon, raw_value = declaration.partition(':')
        if not colon:
            # No colon at all: not a declaration.
            continue
        name = name.strip().lower()
        raw_value = raw_value.strip()
        if name and raw_value:
            parsed[name] = raw_value
    return parsed
def parse_css(css_text):
    """Parse stylesheet text into ``{selector text: {property: value}}``.

    Only plain style rules are read; other rule types (e.g. @media) are
    skipped. Property names are lower-cased, and repeated selectors are
    merged with later declarations winning. Any parsing error is logged as a
    warning and whatever was collected up to that point is returned.
    """
    collected = {}
    try:
        for rule in cssutils.parseString(css_text):
            if rule.type != rule.STYLE_RULE:
                continue
            declarations = {
                decl.name.lower(): decl.value
                for decl in rule.style
                if decl.name and decl.value
            }
            # Merge into any earlier rule that used the same selector text.
            collected.setdefault(rule.selectorText, {}).update(declarations)
    except Exception as e:
        logger.warning(f"CSS parsing error: {e}")
    return collected
def add_default_styles(styles):
    """Fill in default declarations for common HTML elements.

    Mutates *styles* in place and returns None. A default is added only for
    a property the selector does not declare yet, so rules collected from
    the page are never overwritten by these fallbacks.

    Args:
        styles: ``{selector: {property: value}}`` dict to complete.
    """
    defaults = {
        # Body defaults
        'body': {
            'margin': '0px',
            'font-family': 'Arial, sans-serif',
            'color': '#000000',
            'font-size': '16px',
        },
        # Heading defaults
        'h1': {'font-size': '32px', 'font-weight': 'bold', 'margin': '21.44px 0'},
        'h2': {'font-size': '24px', 'font-weight': 'bold', 'margin': '19.92px 0'},
        'h3': {'font-size': '18px', 'font-weight': 'bold', 'margin': '18.72px 0'},
        'h4': {'font-size': '16px', 'font-weight': 'bold', 'margin': '21.28px 0'},
        # Link defaults
        'a': {'color': '#0000EE', 'text-decoration': 'underline'},
        # Button defaults
        'button': {
            'background-color': '#F0F0F0',
            'border': '1px solid #CCCCCC',
            'padding': '4px 8px',
            'border-radius': '2px',
        },
        # Input defaults
        'input': {
            'border': '1px solid #CCCCCC',
            'padding': '2px 4px',
        },
    }
    for selector, declarations in defaults.items():
        rule = styles.setdefault(selector, {})
        for prop, value in declarations.items():
            # setdefault keeps whatever the page already declared.
            rule.setdefault(prop, value)
def extract_elements_improved(soup, styles, viewport_width=1440):
    """Build the list of top-level layout elements for a parsed page.

    Direct children of ``<body>`` that are div/header/main/nav/footer/section
    are processed as stacked blocks; when there are none, every direct child
    is used instead. Blocks are placed at x=0 and stacked vertically with a
    10-unit gap. If nothing could be extracted but the body has text, a
    single fallback text element holding that text is returned.

    Args:
        soup: parsed document exposing ``find``.
        styles: ``{selector: {property: value}}`` dict used for styling.
        viewport_width: width given to top-level blocks (default 1440).

    Returns:
        list of element dicts; empty when the document has no body.
    """
    elements = []

    body = soup.find('body')
    if not body:
        return elements

    # Process main content blocks first
    main_blocks = body.find_all(['div', 'header', 'main', 'nav', 'footer', 'section'], recursive=False)
    if not main_blocks:  # If no main blocks, use all direct children
        main_blocks = body.find_all(recursive=False)

    y_position = 0
    for block in main_blocks:
        block_data = process_element_with_styles(block, 0, y_position, viewport_width, styles)
        if block_data:
            elements.append(block_data)
            y_position += block_data['height'] + 10  # Add spacing between blocks

    # If no elements were found, try to extract text directly
    if not elements and body.text.strip():
        elements.append({
            'type': 'text',
            'tagName': 'p',
            'x': 0,
            'y': 0,
            'width': viewport_width,
            'height': 100,
            'content': body.text.strip(),
            'style': {
                'color': '#000000',
                'fontSize': '16px',
                'fontFamily': 'Arial, sans-serif'
            }
        })
    return elements
def process_element_with_styles(element, x_position, y_position, parent_width, styles, depth=0):
    """Convert one HTML element (and its supported children) to a layout dict.

    Args:
        element: tag-like object exposing ``name``, ``get``, ``get_text`` and
            ``find_all``.
        x_position, y_position: position assigned to this element by its
            caller. Children are positioned relative to their container
            (starting at the container's left padding and y=0).
        parent_width: width available from the parent; also used as the
            reference for percentage dimensions.
        styles: ``{selector: {property: value}}`` dict.
        depth: current recursion depth; elements deeper than 10 are dropped.

    Returns:
        dict with ``type``, ``tagName``, ``x``, ``y``, ``width``, ``height``,
        ``style`` and, where applicable, ``id``, ``className``, ``content``,
        ``src``, ``alt`` and ``children``; or None for skipped elements
        (too deep, nameless, or script/style/meta/link/noscript).
    """
    if depth > 10:  # Limit recursion depth
        return None

    # Nodes without a string name (e.g. text nodes) are not elements.
    raw_name = getattr(element, 'name', None)
    tag_name = raw_name.lower() if isinstance(raw_name, str) else None
    if not tag_name or tag_name in ('script', 'style', 'meta', 'link', 'noscript'):
        return None

    elem_classes = element.get('class', [])
    elem_id = element.get('id')
    computed_style = compute_element_style(element, tag_name, elem_id, elem_classes, styles)

    element_data = {
        'type': get_element_type(tag_name),
        'tagName': tag_name,
        'x': x_position,
        'y': y_position,
        'width': calc_element_width(computed_style, parent_width),
        'height': 50,  # Default height, adjusted below where possible
        'style': {}
    }

    if elem_id:
        element_data['id'] = elem_id
    if elem_classes:
        if isinstance(elem_classes, list):
            element_data['className'] = ' '.join(elem_classes)
        else:
            element_data['className'] = elem_classes

    if element_data['type'] == 'text':
        text_content = element.get_text().strip()
        element_data['content'] = text_content
        extract_text_styles(element_data, computed_style)
        element_data['height'] = calc_text_height(text_content, computed_style)

    elif element_data['type'] == 'image':
        element_data['src'] = element.get('src', '')
        element_data['alt'] = element.get('alt', '')

        # parse_dimension reports failure as 0 rather than raising, and maps
        # 'auto' to the container size, so both cases fall back to the
        # default image height here.
        image_height = 0
        declared_height = computed_style.get('height')
        if isinstance(declared_height, str) and declared_height.strip().lower() != 'auto':
            image_height = parse_dimension(declared_height, parent_width)
        element_data['height'] = image_height if image_height > 0 else 200

        extract_background_styles(element_data, computed_style)

    elif element_data['type'] in ('div', 'container', 'rectangle'):
        extract_container_styles(element_data, computed_style)

        padding_left = parse_dimension(computed_style.get('padding-left', '0'), parent_width)
        padding_right = parse_dimension(computed_style.get('padding-right', '0'), parent_width)
        available_width = element_data['width'] - (padding_left + padding_right)

        is_flex = computed_style.get('display') == 'flex'
        # NOTE(review): only an explicit 'row' lays children out horizontally;
        # flex containers without flex-direction are stacked like blocks.
        lay_out_in_row = is_flex and computed_style.get('flex-direction') == 'row'

        children = []
        child_x_position = padding_left
        child_y_position = 0
        for child in element.find_all(['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'span', 'a', 'button', 'input', 'form'], recursive=False):
            child_data = process_element_with_styles(child, child_x_position, child_y_position, available_width, styles, depth + 1)
            if not child_data:
                continue
            children.append(child_data)
            if lay_out_in_row:
                child_x_position += child_data['width'] + 5
            else:
                child_y_position += child_data['height'] + 5

        if children:
            element_data['children'] = children
            # Grow non-flex containers to enclose their children. Child
            # coordinates are relative to this container, so the container's
            # own y must not be subtracted. The explicit `children` guard
            # also prevents indexing an empty list.
            if not is_flex:
                last_child = children[-1]
                element_data['height'] = last_child['y'] + last_child['height'] + 10

    # Apply common styles (border, opacity, etc)
    apply_common_styles(element_data, computed_style)

    # If height is unreasonably small, set a minimum
    if element_data['height'] < 10:
        element_data['height'] = 10
    return element_data
def get_element_type(tag_name):
    """Map an HTML tag name to the layout element type used in the output.

    Text-bearing tags become 'text', ``img`` becomes 'image', ``button`` and
    ``input`` become 'rectangle', and everything else (including the known
    container tags) becomes 'div'.
    """
    text_tags = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'a', 'label')
    box_tags = ('button', 'input')  # drawn as plain rectangles

    if tag_name in text_tags:
        return 'text'
    if tag_name == 'img':
        return 'image'
    if tag_name in box_tags:
        return 'rectangle'
    # Container tags and any unrecognised tag share the same type.
    return 'div'
def compute_element_style(element, tag_name, elem_id, elem_classes, styles):
    """Cascade the matching rules of *styles* into one style dict.

    Rules are applied from weakest to strongest, each overriding the earlier
    ones property by property: tag selector, class selectors (in order), id
    selector, and finally the element's own inline ``style`` attribute.
    Only exact ``tag``, ``.class`` and ``#id`` selector keys are looked up.
    """
    # Build the class selectors; classes may arrive as a list or one string.
    if isinstance(elem_classes, list):
        class_selectors = [f".{cls}" for cls in elem_classes]
    elif elem_classes:
        class_selectors = [f".{elem_classes}"]
    else:
        class_selectors = []

    cascade = [tag_name, *class_selectors]
    if elem_id:
        cascade.append(f"#{elem_id}")

    merged = {}
    for selector in cascade:
        if selector in styles:
            merged.update(styles[selector])

    # Inline declarations override everything collected from selectors.
    inline_style = element.get('style')
    if inline_style:
        merged.update(parse_inline_style(inline_style))
    return merged
def parse_dimension(value, container_size, viewport_width=1440, viewport_height=900):
    """Convert a CSS length string to a number of pixels.

    Supported forms: ``px``, ``%`` (of *container_size*), ``em`` and ``rem``
    (both approximated as 16px), ``vh``/``vw`` (of the given viewport),
    unitless numbers, and ``auto`` (which yields *container_size*).

    Args:
        value: CSS value, e.g. ``"12px"`` or ``"50%"``.
        container_size: reference size for percentages and ``auto``.
        viewport_width: reference width for ``vw`` (default 1440).
        viewport_height: reference height for ``vh`` (default 900).

    Returns:
        The size as a float (or *container_size* unchanged for ``auto``);
        0 for empty, non-string or unparseable input.
    """
    if not value or not isinstance(value, str):
        return 0
    value = value.strip().lower()

    if value == 'auto':
        return container_size

    def to_number(text):
        # Non-finite results ('nan', 'inf') count as unparseable.
        try:
            number = float(text)
        except ValueError:
            return None
        return number if float('-inf') < number < float('inf') else None

    # Order matters: 'rem' also ends with 'em', so it must be tested first
    # or every rem value would be misparsed as em and rejected.
    unit_converters = (
        ('rem', lambda n: n * 16),  # Assuming 1rem = 16px
        ('px', lambda n: n),
        ('%', lambda n: container_size * (n / 100)),
        ('em', lambda n: n * 16),  # Assuming 1em = 16px
        ('vh', lambda n: (n / 100) * viewport_height),
        ('vw', lambda n: (n / 100) * viewport_width),
    )
    for suffix, to_px in unit_converters:
        if value.endswith(suffix):
            number = to_number(value[:-len(suffix)])
            return to_px(number) if number is not None else 0

    # Unitless values, including decimals and signed numbers.
    number = to_number(value)
    return number if number is not None else 0
def calc_element_width(style, parent_width):
    """Work out an element's width from its style and the parent's width.

    The element starts at the parent's width. An explicit ``width`` replaces
    it, and a ``max-width`` then caps the result, so the common
    ``width: 100%; max-width: ...`` pair is honoured. Values that
    ``parse_dimension`` cannot resolve come back as 0 and are ignored, so
    declarations such as ``max-width: none`` do not collapse the element to
    zero width. As a consequence an explicit zero width also falls back to
    the parent width.

    Args:
        style: computed ``{property: value}`` dict for the element.
        parent_width: width available from the parent.

    Returns:
        The element width as a number.
    """
    width = parent_width

    if 'width' in style:
        explicit = parse_dimension(style['width'], parent_width)
        if explicit > 0:
            width = explicit

    if 'max-width' in style:
        limit = parse_dimension(style['max-width'], parent_width)
        if limit > 0:
            width = min(width, limit)

    return width
def calc_text_height(text, style):
    """Estimate the rendered height of a text element.

    The estimate is ``font size * line height * lines``, where the line
    count assumes roughly 70 characters per line, and is never below 20.

    Font size understands ``px``, ``em`` and ``rem`` (em/rem approximated as
    16px); anything else keeps the 16px default. Line height understands
    ``px``, ``%``, ``em``, ``rem`` and unitless multipliers; anything else
    (e.g. ``normal``) keeps the 1.2 default.

    Args:
        text: the element's text content.
        style: computed ``{property: value}`` dict.

    Returns:
        int height; 20 for empty text.
    """
    if not text:
        return 20

    def to_number(raw):
        # Reject non-finite values so int() below can never fail on them.
        try:
            number = float(raw)
        except ValueError:
            return None
        return number if float('-inf') < number < float('inf') else None

    # Get font size
    font_size = 16  # Default
    raw_size = style.get('font-size')
    if isinstance(raw_size, str):
        raw_size = raw_size.strip().lower()
        # 'rem' is tested before 'em' because it shares the same ending.
        for suffix, scale in (('rem', 16), ('px', 1), ('em', 16)):
            if raw_size.endswith(suffix):
                number = to_number(raw_size[:-len(suffix)])
                if number is not None:
                    font_size = number * scale
                break

    # Get line height as a multiplier of the font size
    line_height = 1.2  # Default
    raw_line_height = style.get('line-height')
    if isinstance(raw_line_height, str):
        raw_line_height = raw_line_height.strip().lower()
        if raw_line_height.endswith('px'):
            number = to_number(raw_line_height[:-2])
            if number is not None and font_size:
                line_height = number / font_size
        elif raw_line_height.endswith('%'):
            number = to_number(raw_line_height[:-1])
            if number is not None:
                line_height = number / 100
        elif raw_line_height.endswith('rem'):
            number = to_number(raw_line_height[:-3])
            if number is not None and font_size:
                line_height = (number * 16) / font_size
        elif raw_line_height.endswith('em'):
            # An em line height is already relative to the font size.
            number = to_number(raw_line_height[:-2])
            if number is not None:
                line_height = number
        else:
            number = to_number(raw_line_height)
            if number is not None:
                line_height = number

    # Estimate number of lines needed
    chars_per_line = 70  # Rough estimate
    num_lines = max(1, len(text) / chars_per_line)

    return max(20, int(font_size * line_height * num_lines))
def extract_text_styles(element_data, style):
    """Copy text-related properties from *style* into ``element_data['style']``.

    Always sets ``color``, ``fontSize`` and ``fontWeight`` (falling back to
    black, a per-tag size, and bold-for-headings). ``fontFamily``,
    ``textAlign`` and ``textDecoration`` are set only when declared.

    Only ``h1``..``h6`` count as headings; other tags that merely start
    with "h" (``header``, ``hr``, ...) get the normal text defaults instead
    of failing while a heading level is parsed from the tag name.

    Args:
        element_data: element dict with a ``style`` dict; mutated in place.
        style: computed ``{property: value}`` dict.
    """
    out = element_data['style']
    tag_name = element_data.get('tagName') or ''

    heading_level = None
    if len(tag_name) == 2 and tag_name[0] == 'h' and tag_name[1] in '123456':
        heading_level = int(tag_name[1])

    # Text color
    out['color'] = style['color'] if 'color' in style else '#000000'

    # Font size
    if 'font-size' in style:
        out['fontSize'] = style['font-size']
    elif heading_level is not None:
        # Default heading sizes: 32px for h1, 4px smaller per level.
        out['fontSize'] = f"{32 - ((heading_level - 1) * 4)}px"
    else:
        out['fontSize'] = '16px'

    # Font weight
    if 'font-weight' in style:
        out['fontWeight'] = style['font-weight']
    else:
        out['fontWeight'] = 'bold' if heading_level is not None else 'normal'

    # Optional properties, copied only when declared
    for css_name, out_name in (
        ('font-family', 'fontFamily'),
        ('text-align', 'textAlign'),
        ('text-decoration', 'textDecoration'),
    ):
        if css_name in style:
            out[out_name] = style[css_name]
def extract_container_styles(element_data, style):
    """Copy container-related properties into ``element_data['style']``.

    ``background-color`` and ``display`` are copied when declared. The flex
    properties (direction, justification, alignment) are copied only when
    ``display`` is exactly ``flex``. Output keys are camelCase.
    """
    always_copied = (
        ('background-color', 'backgroundColor'),
        ('display', 'display'),
    )
    flex_only = (
        ('flex-direction', 'flexDirection'),
        ('justify-content', 'justifyContent'),
        ('align-items', 'alignItems'),
    )

    for css_name, out_name in always_copied:
        if css_name in style:
            element_data['style'][out_name] = style[css_name]

    if style.get('display') != 'flex':
        return
    for css_name, out_name in flex_only:
        if css_name in style:
            element_data['style'][out_name] = style[css_name]
def extract_background_styles(element_data, style):
    """Copy background colour and image URL into ``element_data['style']``.

    ``background-color`` is copied as-is. ``background-image`` is copied
    only when it has the ``url(...)`` form, reduced to the bare URL with
    surrounding quotes removed.
    """
    if 'background-color' in style:
        element_data['style']['backgroundColor'] = style['background-color']

    if 'background-image' not in style:
        return

    declared = style['background-image']
    wrapped_in_url = declared.startswith('url(') and declared.endswith(')')
    if wrapped_in_url:
        # Drop the "url(" prefix and ")" suffix, then any quote characters.
        inner = declared[len('url('):-1]
        element_data['style']['backgroundImage'] = inner.strip('"\'')
def apply_common_styles(element_data, style):
    """Copy properties shared by all element types into ``element_data['style']``.

    A ``border`` shorthand is copied under ``border``; without it, any
    per-side ``border-<side>`` declarations are copied under their original
    names. ``border-radius``, ``opacity``, ``visibility`` and ``box-shadow``
    are copied when declared, under camelCase keys.
    """
    if 'border' in style:
        element_data['style']['border'] = style['border']
    else:
        # Per-side borders are only consulted when no shorthand is present.
        side_keys = [f'border-{side}' for side in ('top', 'right', 'bottom', 'left')]
        for border_key in side_keys:
            if border_key in style:
                element_data['style'][border_key] = style[border_key]

    renamed = (
        ('border-radius', 'borderRadius'),
        ('opacity', 'opacity'),
        ('visibility', 'visibility'),
        ('box-shadow', 'boxShadow'),
    )
    for css_name, out_name in renamed:
        if css_name in style:
            element_data['style'][out_name] = style[css_name]
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))