Spaces:

AkashKumarave
/

editableweb

Sleeping

App Files Files Community

editableweb / app.py

AkashKumarave

Update app.py

9c35b0c verified 7 months ago

raw

history blame

23.3 kB

	import os
	import json
	import requests
	import traceback
	from flask import Flask, request, jsonify
	from bs4 import BeautifulSoup
	import logging
	import cssutils
	import re
	import urllib.parse
	from PIL import Image
	from io import BytesIO

	# WSGI application object; the @app.route decorators in this module register handlers on it.
	app = Flask(__name__)

	# Setup logging
	# The root logger is configured at INFO; this module logs through its own named logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)
	cssutils.log.setLevel(logging.CRITICAL) # Suppress CSS parsing warnings

	@app.route('/')
	def home():
	    """Serve the static landing page that points callers at /api/convert."""
	    # The markup is a fixed literal; nothing from the request is interpolated into it.
	    landing_page = """
	<!DOCTYPE html>
	<html>
	<head>
	<title>Website Converter</title>
	</head>
	<body>
	<h1>Welcome to Website Converter</h1>
	<p>Use the /api/convert endpoint to convert websites to structured data.</p>
	</body>
	</html>
	"""
	    return landing_page

	@app.route('/api/convert', methods=['POST'])
	def convert_website():
	    """Fetch a web page and return an estimated element layout as JSON.

	    Request body (JSON object):
	        url: page address; ``https://`` is prepended when no http(s) scheme is given.
	        viewport_width: optional integer, default 1440.

	    Responses:
	        200: ``status``, ``url``, ``viewport_width``, ``viewport_height`` and ``elements``.
	        400: body is missing or not a JSON object, ``url`` is missing, or
	             ``viewport_width`` is not a positive integer.
	        500: the page could not be fetched, or an unexpected error occurred.
	    """
	    try:
	        # silent=True yields None for a missing or malformed JSON body instead of
	        # raising inside this handler, so the client receives the 400 below.
	        data = request.get_json(silent=True)
	        if not data or not isinstance(data, dict):
	            return jsonify({"error": "No data provided"}), 400

	        url = data.get('url')
	        if not isinstance(url, str) or not url.strip():
	            return jsonify({"error": "URL is required"}), 400
	        url = url.strip()

	        # Add a scheme if not present. Checking the full "http://" / "https://" prefix
	        # keeps host names that merely start with "http" (e.g. httpbin.org) working.
	        if not url.lower().startswith(('http://', 'https://')):
	            url = 'https://' + url

	        try:
	            viewport_width = int(data.get('viewport_width', 1440))
	        except (TypeError, ValueError):
	            return jsonify({"error": "viewport_width must be a positive integer"}), 400
	        if viewport_width <= 0:
	            return jsonify({"error": "viewport_width must be a positive integer"}), 400

	        viewport_height = 900  # Default height

	        logger.info("Converting website: %s with viewport width: %s", url, viewport_width)

	        headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
	            'Accept': 'text/html,application/xhtml+xml,application/xml',
	            'Accept-Language': 'en-US,en;q=0.9',
	        }

	        try:
	            # NOTE(review): this fetches a caller-supplied URL from the server side, which
	            # allows requests to internal addresses (SSRF). Consider restricting targets.
	            page = requests.get(url, headers=headers, timeout=20)
	            page.raise_for_status()  # Raise an exception for HTTP errors
	        except requests.exceptions.RequestException as e:
	            logger.error("Request error: %s", e)
	            return jsonify({"error": f"Failed to fetch website: {str(e)}"}), 500

	        # Parse the HTML content
	        soup = BeautifulSoup(page.text, 'html.parser')

	        # Extract all CSS styles, then the page elements
	        all_styles = extract_all_css(soup, url)
	        # NOTE: viewport_width is only echoed back in the payload; it is not handed to
	        # extract_elements_improved, which lays elements out against its own default width.
	        elements = extract_elements_improved(soup, all_styles)

	        # Estimate page height: the lowest bottom edge of any top-level element,
	        # but never less than the default viewport height.
	        lowest_edge = max(
	            (item.get('y', 0) + item.get('height', 0) for item in elements),
	            default=0,
	        )
	        estimated_height = max(viewport_height, lowest_edge)

	        payload = {
	            "status": "success",
	            "url": url,
	            "viewport_width": viewport_width,
	            "viewport_height": estimated_height,
	            "elements": elements
	        }
	        return jsonify(payload)

	    except Exception as e:
	        # Top-level boundary: log the full traceback server-side, but do not send
	        # internal stack traces back to the client.
	        logger.exception("Error: %s", e)
	        return jsonify({"error": str(e)}), 500

	def extract_all_css(soup, base_url):
	    """Collect the page's CSS into one ``selector -> {property: value}`` mapping.

	    Sources, in the order they are merged (later declarations win per property):
	    inline ``style`` attributes of elements that have an ``id``, ``<style>`` tags,
	    linked stylesheets, and finally the defaults added by ``add_default_styles``.

	    Args:
	        soup: parsed document offering ``find_all``.
	        base_url: URL of the page, used to resolve stylesheet hrefs.

	    Returns:
	        dict mapping selector text to a dict of declarations.
	    """
	    all_styles = {}

	    def merge(rules):
	        # Merge property by property so a later rule for the same selector adds to,
	        # rather than discards, the declarations collected so far.
	        for selector, declarations in rules.items():
	            all_styles.setdefault(selector, {}).update(declarations)

	    # 1. Inline styles. They are recorded only under the element's own "#id" selector:
	    #    filing them under a class or tag selector would apply one element's inline
	    #    style to every other element sharing that class or tag. compute_element_style
	    #    reads each element's style attribute directly, so nothing is lost.
	    for element in soup.find_all(style=True):
	        element_id = element.get('id')
	        if element_id:
	            merge({f"#{element_id}": parse_inline_style(element['style'])})

	    # 2. Style tags
	    for style_tag in soup.find_all('style'):
	        if style_tag.string:
	            merge(parse_css(style_tag.string))

	    # 3. Linked stylesheets
	    for link in soup.find_all('link', rel='stylesheet'):
	        href = link.get('href')
	        if not href:
	            continue

	        # urljoin leaves absolute URLs untouched and resolves relative and
	        # protocol-relative ("//host/x.css") ones against the page URL.
	        href = urllib.parse.urljoin(base_url, href)

	        try:
	            css_response = requests.get(href, timeout=10)
	            if css_response.ok:
	                merge(parse_css(css_response.text))
	        except Exception as e:
	            # Best effort: one unreachable stylesheet must not fail the conversion.
	            logger.warning(f"Failed to fetch stylesheet {href}: {e}")

	    # 4. Add default styles for common elements
	    add_default_styles(all_styles)

	    return all_styles

	def _split_css_declarations(style_text):
	    """Split a declaration list on ';' characters outside parentheses and quotes."""
	    declarations = []
	    current = []
	    paren_depth = 0
	    open_quote = None
	    for ch in style_text:
	        if open_quote:
	            if ch == open_quote:
	                open_quote = None
	        elif ch in ('"', "'"):
	            open_quote = ch
	        elif ch == '(':
	            paren_depth += 1
	        elif ch == ')':
	            paren_depth = max(0, paren_depth - 1)
	        elif ch == ';' and paren_depth == 0:
	            declarations.append(''.join(current))
	            current = []
	            continue
	        current.append(ch)
	    declarations.append(''.join(current))
	    return declarations

	def parse_inline_style(style_text):
	    """Parse an inline ``style`` attribute into a ``{property: value}`` dict.

	    Property names are lower-cased and both sides are stripped; declarations with an
	    empty name or value are dropped. Semicolons inside ``url(...)`` or quoted strings
	    (for example ``url(data:image/png;base64,...)``) do not end a declaration.

	    Args:
	        style_text: the attribute text; empty or non-string input yields ``{}``.

	    Returns:
	        dict of declarations; later duplicates overwrite earlier ones.
	    """
	    style_dict = {}
	    if not style_text or not isinstance(style_text, str):
	        return style_dict

	    for declaration in _split_css_declarations(style_text):
	        # Split on the first ':' only, so values such as URLs keep their colons.
	        prop, separator, value = declaration.partition(':')
	        if not separator:
	            continue
	        prop = prop.strip().lower()
	        value = value.strip()
	        if prop and value:
	            style_dict[prop] = value

	    return style_dict

	def parse_css(css_text):
	    """Parse stylesheet text into a ``selector -> {property: value}`` dict.

	    Only plain style rules are read (not @media, etc.). A grouped rule such as
	    ``h1, h2 { ... }`` is stored once per individual selector, so that lookups by a
	    single selector can find it. Repeated selectors are merged, later declarations
	    winning. Parsing problems are logged and whatever was collected so far is returned.

	    Args:
	        css_text: stylesheet source text.

	    Returns:
	        dict mapping each individual selector to its declarations.
	    """
	    styles = {}

	    try:
	        sheet = cssutils.parseString(css_text)
	        for rule in sheet:
	            # Only handle style rules (not @media, etc.)
	            if rule.type != rule.STYLE_RULE:
	                continue

	            declarations = {
	                prop.name.lower(): prop.value
	                for prop in rule.style
	                if prop.name and prop.value
	            }

	            # One entry per selector in the group; each selector gets its own dict so
	            # later merges for one selector do not leak into its siblings.
	            for selector in rule.selectorList:
	                key = selector.selectorText.strip()
	                styles.setdefault(key, {}).update(declarations)
	    except Exception as e:
	        # Best effort: malformed CSS must not abort the conversion.
	        logger.warning(f"CSS parsing error: {e}")

	    return styles

	def add_default_styles(styles):
	    """Fill in fallback declarations for common HTML elements, in place.

	    A default is added only for a property the selector does not already define, so
	    declarations collected from the page always take precedence over these fallbacks.

	    Args:
	        styles: ``selector -> {property: value}`` mapping; mutated in place.
	    """
	    defaults = {
	        # Body defaults
	        'body': {
	            'margin': '0px',
	            'font-family': 'Arial, sans-serif',
	            'color': '#000000',
	            'font-size': '16px',
	        },
	        # Heading defaults
	        'h1': {'font-size': '32px', 'font-weight': 'bold', 'margin': '21.44px 0'},
	        'h2': {'font-size': '24px', 'font-weight': 'bold', 'margin': '19.92px 0'},
	        'h3': {'font-size': '18px', 'font-weight': 'bold', 'margin': '18.72px 0'},
	        'h4': {'font-size': '16px', 'font-weight': 'bold', 'margin': '21.28px 0'},
	        # Link defaults
	        'a': {'color': '#0000EE', 'text-decoration': 'underline'},
	        # Button defaults
	        'button': {
	            'background-color': '#F0F0F0',
	            'border': '1px solid #CCCCCC',
	            'padding': '4px 8px',
	            'border-radius': '2px',
	        },
	        # Input defaults
	        'input': {
	            'border': '1px solid #CCCCCC',
	            'padding': '2px 4px',
	        },
	    }

	    for selector, declarations in defaults.items():
	        target = styles.setdefault(selector, {})
	        for prop, value in declarations.items():
	            # setdefault, not assignment: never override what the page itself declared.
	            target.setdefault(prop, value)

	def extract_elements_improved(soup, styles, viewport_width=1440):
	    """Build the list of top-level layout elements for the page body.

	    Direct children of ``<body>`` that are div/header/main/nav/footer/section are
	    processed as blocks and stacked vertically with a 10px gap; if the body has none
	    of those, all of its direct children are used instead. When nothing could be
	    extracted but the body contains text, a single text element holding that text is
	    returned.

	    Args:
	        soup: parsed document offering ``find``.
	        styles: ``selector -> {property: value}`` mapping used to style elements.
	        viewport_width: width given to the top-level blocks (default 1440).

	    Returns:
	        list of element dicts; empty when the document has no body.
	    """
	    elements = []

	    # Get the body element
	    body = soup.find('body')
	    if not body:
	        return elements

	    # Process main content blocks first
	    main_blocks = body.find_all(['div', 'header', 'main', 'nav', 'footer', 'section'], recursive=False)

	    if not main_blocks:  # If no main blocks, use all direct children
	        main_blocks = body.find_all(recursive=False)

	    # Blocks start at the left edge and are stacked top to bottom.
	    y_position = 0
	    for block in main_blocks:
	        block_data = process_element_with_styles(block, 0, y_position, viewport_width, styles)
	        if block_data:
	            elements.append(block_data)
	            y_position += block_data['height'] + 10  # Add spacing between blocks

	    # If no elements were found, try to extract text directly
	    body_text = body.text.strip()
	    if not elements and body_text:
	        elements.append({
	            'type': 'text',
	            'tagName': 'p',
	            'x': 0,
	            'y': 0,
	            'width': viewport_width,
	            'height': 100,
	            'content': body_text,
	            'style': {
	                'color': '#000000',
	                'fontSize': '16px',
	                'fontFamily': 'Arial, sans-serif'
	            }
	        })

	    return elements

	def process_element_with_styles(element, x_position, y_position, parent_width, styles, depth=0):
	    """Convert one HTML element (and, for containers, its children) into a layout dict.

	    Args:
	        element: parsed HTML tag; must offer ``name``, ``get``, ``get_text`` and ``find_all``.
	        x_position: value stored as the element's ``x``.
	        y_position: value stored as the element's ``y``.
	        parent_width: width available to the element; basis for percentage sizes.
	        styles: ``selector -> {property: value}`` mapping used to compute the style.
	        depth: current recursion depth; elements nested deeper than 10 are dropped.

	    Returns:
	        dict with ``type``, ``tagName``, ``x``, ``y``, ``width``, ``height``, ``style`` and,
	        where applicable, ``id``, ``className``, ``content``, ``src``, ``alt`` and
	        ``children``; ``None`` for script/style/meta/link/noscript tags or when too deep.
	        Child coordinates are relative to the top-left corner of their container.
	    """
	    if depth > 10:  # Limit recursion depth
	        return None

	    raw_name = getattr(element, 'name', None)
	    tag_name = raw_name.lower() if raw_name else None
	    if not tag_name or tag_name in ('script', 'style', 'meta', 'link', 'noscript'):
	        return None

	    # Get element's classes and ID
	    elem_classes = element.get('class', [])
	    elem_id = element.get('id')

	    # Calculate element's computed style
	    computed_style = compute_element_style(element, tag_name, elem_id, elem_classes, styles)

	    # Create base element data
	    element_data = {
	        'type': get_element_type(tag_name),
	        'tagName': tag_name,
	        'x': x_position,
	        'y': y_position,
	        'width': calc_element_width(computed_style, parent_width),
	        'height': 50,  # Default height, adjusted below where possible
	        'style': {}
	    }

	    # Set element ID and class if present
	    if elem_id:
	        element_data['id'] = elem_id

	    if elem_classes:
	        if isinstance(elem_classes, list):
	            element_data['className'] = ' '.join(elem_classes)
	        else:
	            element_data['className'] = elem_classes

	    # Process specific element types
	    if element_data['type'] == 'text':
	        text_content = element.get_text().strip()
	        element_data['content'] = text_content

	        # Set text styles
	        extract_text_styles(element_data, computed_style)

	        # Calculate height based on text content
	        element_data['height'] = calc_text_height(text_content, computed_style)

	    elif element_data['type'] == 'image':
	        # Set image source if available
	        element_data['src'] = element.get('src', '')
	        element_data['alt'] = element.get('alt', '')

	        # Set height for images; 200 is the fallback when no usable height is declared
	        element_data['height'] = 200
	        if 'height' in computed_style:
	            try:
	                element_data['height'] = parse_dimension(computed_style['height'], parent_width)
	            except (TypeError, ValueError):
	                element_data['height'] = 200

	        # Extract background styles
	        extract_background_styles(element_data, computed_style)

	    elif element_data['type'] in ('div', 'container', 'rectangle'):
	        # Process container elements
	        extract_container_styles(element_data, computed_style)

	        # Children are laid out in the container's own coordinate space,
	        # starting at its top edge and just inside its left padding.
	        padding_left = parse_dimension(computed_style.get('padding-left', '0'), parent_width)
	        padding_right = parse_dimension(computed_style.get('padding-right', '0'), parent_width)
	        child_x_position = padding_left
	        child_y_position = 0
	        available_width = element_data['width'] - (padding_left + padding_right)

	        # Simplified flex handling. The CSS initial value of flex-direction is "row",
	        # so a flex container without an explicit direction flows horizontally.
	        is_flex = computed_style.get('display') == 'flex'
	        flex_direction = str(computed_style.get('flex-direction', 'row')).strip().lower()
	        flows_horizontally = is_flex and flex_direction.startswith('row')

	        children = []
	        child_tags = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'span',
	                      'a', 'button', 'input', 'form']
	        for child in element.find_all(child_tags, recursive=False):
	            child_data = process_element_with_styles(
	                child, child_x_position, child_y_position, available_width, styles, depth + 1
	            )
	            if not child_data:
	                continue
	            children.append(child_data)
	            if flows_horizontally:
	                child_x_position += child_data['width'] + 5
	            else:
	                # Block layout and flex columns both stack vertically
	                child_y_position += child_data['height'] + 5

	        # Only containers that actually have children are resized; childless ones
	        # (empty div, button, input) keep the default height.
	        if children:
	            element_data['children'] = children

	            # Child y values are already relative to this container, so its height is
	            # the lowest child bottom edge plus spacing. The container's own y must
	            # not be subtracted.
	            lowest_edge = max(item['y'] + item['height'] for item in children)
	            element_data['height'] = lowest_edge + 10

	    # Apply common styles (border, opacity, etc)
	    apply_common_styles(element_data, computed_style)

	    # If height is unreasonably small, set a minimum
	    if element_data['height'] < 10:
	        element_data['height'] = 10

	    return element_data

	def get_element_type(tag_name):
	    """Map an HTML tag name to the layout type used in the output.

	    Text-bearing tags give 'text', ``img`` gives 'image', ``button`` and ``input``
	    give 'rectangle', and every other tag is treated as a 'div' container.
	    """
	    text_tags = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'a', 'label')
	    rectangle_tags = ('button', 'input')  # drawn as plain rectangles

	    if tag_name in text_tags:
	        return 'text'
	    if tag_name == 'img':
	        return 'image'
	    if tag_name in rectangle_tags:
	        return 'rectangle'
	    # Structural tags (div, section, article, header, footer, main, form) and any
	    # unrecognised tag share the same container type.
	    return 'div'

	def compute_element_style(element, tag_name, elem_id, elem_classes, styles):
	    """Cascade the known CSS rules for one element into a single style dict.

	    Layers are applied from weakest to strongest, each overriding the previous one
	    property by property: tag selector, class selectors (in the order given), id
	    selector, and finally the element's own inline ``style`` attribute.
	    """
	    # Normalise the classes into ".name" selectors; a bare string counts as one class.
	    if isinstance(elem_classes, list):
	        class_selectors = [f".{name}" for name in elem_classes]
	    elif elem_classes:
	        class_selectors = [f".{elem_classes}"]
	    else:
	        class_selectors = []

	    id_selectors = [f"#{elem_id}"] if elem_id else []

	    merged = {}
	    for selector in [tag_name, *class_selectors, *id_selectors]:
	        if selector in styles:
	            merged.update(styles[selector])

	    # The inline style attribute overrides everything collected from stylesheets.
	    inline_text = element.get('style')
	    if inline_text:
	        merged.update(parse_inline_style(inline_text))

	    return merged

	# A CSS number (optional sign, decimals, exponent) followed by an optional unit.
	# "rem" is listed before "em" so the longer unit is matched first.
	_DIMENSION_PATTERN = re.compile(
	    r'^([+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[+-]?\d+)?)\s*(px|rem|em|vh|vw|%)?$'
	)

	def parse_dimension(value, container_size):
	    """Convert a CSS length string to pixels.

	    Supported forms: ``px``, ``%`` (of ``container_size``), ``em`` and ``rem`` (both
	    approximated as 16px), ``vh`` (of a 900px viewport), ``vw`` (of a 1440px
	    viewport), unitless numbers, and ``auto`` (resolves to ``container_size``).

	    Args:
	        value: CSS value text, e.g. ``"12px"`` or ``"50%"``.
	        container_size: size that percentages and ``auto`` refer to.

	    Returns:
	        The size in pixels; 0 for empty, non-string or unsupported values
	        (``calc(...)``, ``none``, keywords, ...).
	    """
	    if not value or not isinstance(value, str):
	        return 0

	    value = value.strip().lower()

	    # Handle auto (use container size)
	    if value == 'auto':
	        return container_size

	    match = _DIMENSION_PATTERN.match(value)
	    if match is None:
	        return 0

	    number = float(match.group(1))
	    unit = match.group(2)

	    if unit is None or unit == 'px':
	        return number
	    if unit == '%':
	        return container_size * (number / 100)
	    if unit in ('em', 'rem'):
	        return number * 16  # Assuming 1em = 1rem = 16px
	    if unit == 'vh':
	        return (number / 100) * 900  # Assuming viewport height is 900px
	    # Remaining unit is vw
	    return (number / 100) * 1440  # Assuming viewport width is 1440px

	def calc_element_width(style, parent_width):
	    """Work out an element's width in pixels from its computed style.

	    The declared ``width`` is used when it can be resolved, otherwise the parent's
	    width; the result is then capped by ``max-width`` when that can be resolved.
	    Values that cannot be turned into a length (``none``, ``fit-content``,
	    ``calc(...)``, ...) are ignored rather than collapsing the element to zero.

	    Args:
	        style: computed ``{property: value}`` dict for the element.
	        parent_width: width available from the parent, in pixels.

	    Returns:
	        The element width in pixels.
	    """
	    def resolve(raw):
	        # parse_dimension returns 0 both for a genuine zero and for anything it cannot
	        # parse, so a zero is only trusted when the author literally wrote one.
	        size = parse_dimension(raw, parent_width)
	        if size > 0:
	            return size
	        wrote_zero = isinstance(raw, str) and raw.strip().startswith('0')
	        return size if wrote_zero else None

	    width = parent_width  # Default: use parent width

	    if 'width' in style:
	        declared = resolve(style['width'])
	        if declared is not None:
	            width = declared

	    if 'max-width' in style:
	        limit = resolve(style['max-width'])
	        if limit is not None:
	            width = min(width, limit)

	    return width

	def calc_text_height(text, style):
	    """Estimate the rendered height of a text block in pixels.

	    The estimate is ``font size * line height * number of lines``, assuming roughly
	    70 characters per line, and is never less than 20.

	    Args:
	        text: the text content; empty text yields 20.
	        style: computed ``{property: value}`` dict. ``font-size`` is read in px, em
	            or rem (1em = 1rem = 16px); ``line-height`` as a unitless multiplier, a
	            percentage, or a px/em/rem length. Unusable values fall back to 16px and 1.2.

	    Returns:
	        The estimated height as an int.
	    """
	    import math

	    if not text:
	        return 20

	    def leading_number(raw, suffix):
	        # Numeric part of "<number><suffix>", or None when it is not a finite number.
	        try:
	            number = float(raw[:len(raw) - len(suffix)])
	        except ValueError:
	            return None
	        return number if math.isfinite(number) else None

	    # Get font size
	    font_size = 16  # Default
	    font_size_value = style.get('font-size')
	    if isinstance(font_size_value, str):
	        font_size_value = font_size_value.strip().lower()
	        # "rem" must be tested before "em": every rem value also ends with "em".
	        for suffix, pixels_per_unit in (('px', 1), ('rem', 16), ('em', 16)):
	            if font_size_value.endswith(suffix):
	                number = leading_number(font_size_value, suffix)
	                if number is not None:
	                    font_size = number * pixels_per_unit
	                break

	    # Get line height, expressed as a multiple of the font size
	    line_height = 1.2  # Default
	    line_height_value = style.get('line-height')
	    if isinstance(line_height_value, str):
	        line_height_value = line_height_value.strip().lower()
	        if line_height_value.endswith('px'):
	            number = leading_number(line_height_value, 'px')
	            if number is not None and font_size:
	                line_height = number / font_size
	        elif line_height_value.endswith('%'):
	            number = leading_number(line_height_value, '%')
	            if number is not None:
	                line_height = number / 100
	        elif line_height_value.endswith('rem'):
	            number = leading_number(line_height_value, 'rem')
	            if number is not None and font_size:
	                line_height = number * 16 / font_size
	        elif line_height_value.endswith('em'):
	            # em is relative to the element's own font size, i.e. already a multiplier
	            number = leading_number(line_height_value, 'em')
	            if number is not None:
	                line_height = number
	        else:
	            number = leading_number(line_height_value, '')
	            if number is not None:
	                line_height = number

	    # Estimate number of lines needed; a partly filled line still takes a full line
	    chars_per_line = 70  # Rough estimate
	    num_lines = max(1, math.ceil(len(text) / chars_per_line))

	    # Calculate height
	    return max(20, int(font_size * line_height * num_lines))

	def extract_text_styles(element_data, style):
	    """Copy text-related CSS properties into ``element_data['style']``, in place.

	    Always sets ``color``, ``fontSize`` and ``fontWeight``, falling back to black,
	    16px and normal; h1-h6 without an explicit value fall back to 32px minus 4px per
	    heading level, and bold. ``fontFamily``, ``textAlign`` and ``textDecoration`` are
	    set only when the computed style declares them.

	    Args:
	        element_data: element dict holding a ``style`` dict and, optionally, ``tagName``.
	        style: computed ``{property: value}`` dict for the element.
	    """
	    target = element_data['style']

	    # Only real heading tags get heading defaults. Other tags that merely start with
	    # "h" (header, hr, html, ...) have no numeric level and are treated as body text.
	    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
	    tag_name = element_data.get('tagName', '')
	    is_heading = tag_name in heading_tags

	    # Text color
	    target['color'] = style['color'] if 'color' in style else '#000000'  # Default black

	    # Font size
	    if 'font-size' in style:
	        target['fontSize'] = style['font-size']
	    elif is_heading:
	        heading_level = heading_tags.index(tag_name) + 1
	        target['fontSize'] = f"{32 - ((heading_level - 1) * 4)}px"
	    else:
	        target['fontSize'] = '16px'  # Default

	    # Font weight
	    if 'font-weight' in style:
	        target['fontWeight'] = style['font-weight']
	    else:
	        target['fontWeight'] = 'bold' if is_heading else 'normal'

	    # Optional properties: copied only when declared
	    for css_name, output_name in (
	        ('font-family', 'fontFamily'),
	        ('text-align', 'textAlign'),
	        ('text-decoration', 'textDecoration'),
	    ):
	        if css_name in style:
	            target[output_name] = style[css_name]

	def extract_container_styles(element_data, style):
	    """Copy container-related CSS properties into ``element_data['style']``, in place.

	    Background colour and display mode are copied whenever declared; the flex
	    properties (direction, justification, alignment) only when display is 'flex'.
	    """
	    wanted = [
	        ('background-color', 'backgroundColor'),
	        ('display', 'display'),
	    ]

	    # Flex-related properties are meaningful only on flex containers.
	    if style.get('display') == 'flex':
	        wanted.extend([
	            ('flex-direction', 'flexDirection'),
	            ('justify-content', 'justifyContent'),
	            ('align-items', 'alignItems'),
	        ])

	    for css_name, output_name in wanted:
	        if css_name in style:
	            element_data['style'][output_name] = style[css_name]

	def extract_background_styles(element_data, style):
	    """Copy background colour and image URL into ``element_data['style']``, in place.

	    ``background-image`` is copied only when it has the form ``url(...)``; the stored
	    value is the text inside the parentheses with surrounding quotes removed.
	    """
	    if 'background-color' in style:
	        element_data['style']['backgroundColor'] = style['background-color']

	    if 'background-image' not in style:
	        return

	    declared = style['background-image']
	    is_url_form = declared.startswith('url(') and declared.endswith(')')
	    if not is_url_form:
	        return  # gradients and keywords are not image URLs

	    inner = declared[len('url('):-1]
	    element_data['style']['backgroundImage'] = inner.strip('"\'')

	def apply_common_styles(element_data, style):
	    """Copy styles shared by every element type into ``element_data['style']``, in place.

	    The ``border`` shorthand takes precedence: per-side borders are copied (under
	    their CSS names) only when no shorthand is declared. Border radius, opacity,
	    visibility and box shadow are copied whenever present.
	    """
	    output = element_data['style']

	    # Border properties: the shorthand alone, or else whichever sides are declared.
	    if 'border' in style:
	        border_keys = ['border']
	    else:
	        border_keys = [f'border-{side}' for side in ('top', 'right', 'bottom', 'left')]
	    for key in border_keys:
	        if key in style:
	            output[key] = style[key]

	    # Remaining properties, as (CSS name, output name) pairs.
	    passthrough = (
	        ('border-radius', 'borderRadius'),
	        ('opacity', 'opacity'),
	        ('visibility', 'visibility'),
	        ('box-shadow', 'boxShadow'),
	    )
	    for css_name, output_name in passthrough:
	        if css_name in style:
	            output[output_name] = style[css_name]

	if __name__ == "__main__":
	    # Listen on every interface; the port comes from the PORT environment variable
	    # and falls back to 7860 when it is not set.
	    listen_port = int(os.environ.get("PORT", 7860))
	    app.run(host="0.0.0.0", port=listen_port)