AkashKumarave committed on
Commit
443869b
·
verified ·
1 Parent(s): 87c162c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +228 -274
app.py CHANGED
@@ -5,18 +5,12 @@ import requests
5
  import traceback
6
  from flask import Flask, request, jsonify
7
  from bs4 import BeautifulSoup
8
- from selenium import webdriver
9
- from selenium.webdriver.chrome.options import Options
10
- from selenium.webdriver.chrome.service import Service
11
- from webdriver_manager.chrome import ChromeDriverManager
12
- import time
13
- import re
14
- import base64
15
  import logging
16
- from PIL import Image
17
- from io import BytesIO
18
  import cssutils
 
19
  import urllib.parse
 
 
20
 
21
  app = Flask(__name__)
22
 
@@ -24,16 +18,6 @@ app = Flask(__name__)
24
  logging.basicConfig(level=logging.INFO)
25
  logger = logging.getLogger(__name__)
26
 
27
- # Configure Chrome options for headless browsing
28
- def get_chrome_options():
29
- options = Options()
30
- options.add_argument("--headless")
31
- options.add_argument("--no-sandbox")
32
- options.add_argument("--disable-dev-shm-usage")
33
- options.add_argument("--disable-gpu")
34
- options.add_argument("--window-size=1440,900")
35
- return options
36
-
37
  @app.route('/')
38
  def home():
39
  return '''
@@ -123,272 +107,255 @@ def convert_website():
123
 
124
  logger.info(f"Converting website: {url} with viewport width: {viewport_width}")
125
 
126
- # Set up Chrome driver with webdriver_manager
127
- options = get_chrome_options()
128
- options.add_argument(f"--window-size={viewport_width},{viewport_height}")
129
-
130
- # Use ChromeDriverManager to handle driver installation
131
- service = Service(ChromeDriverManager().install())
132
- driver = webdriver.Chrome(service=service, options=options)
133
- driver.set_window_size(viewport_width, viewport_height)
134
-
135
  try:
136
- # Load the page
137
- driver.get(url)
138
-
139
- # Wait for page to load (adjust as needed)
140
- time.sleep(5)
 
141
 
142
- # Get page dimensions
143
- page_dimensions = driver.execute_script("""
144
- return {
145
- width: Math.max(document.body.scrollWidth, document.documentElement.scrollWidth),
146
- height: Math.max(document.body.scrollHeight, document.documentElement.scrollHeight)
147
- };
148
- """)
149
 
150
- viewport_height = page_dimensions['height']
 
151
 
152
- # Parse the page elements
153
- elements = extract_elements(driver)
154
 
155
- # Get all CSS styles
156
- styles = extract_styles(driver)
157
 
158
  # Apply styles to elements
159
  apply_styles_to_elements(elements, styles)
160
 
 
 
 
 
 
 
 
 
 
 
161
  # Prepare response
162
  response = {
163
  "status": "success",
164
  "url": url,
165
  "viewport_width": viewport_width,
166
- "viewport_height": viewport_height,
167
  "elements": elements
168
  }
169
 
170
  return jsonify(response)
171
 
172
- finally:
173
- driver.quit()
 
174
 
175
  except Exception as e:
176
  logger.error(f"Error: {str(e)}")
177
  logger.error(traceback.format_exc())
178
  return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500
179
 
180
- def extract_elements(driver):
181
- """Extract elements from the webpage"""
182
  elements = []
183
 
184
- # Execute JavaScript to extract elements
185
- elements_data = driver.execute_script("""
186
- function getElementInfo(element, depth = 0) {
187
- if (!element || depth > 15) return null;
188
-
189
- // Skip invisible elements
190
- if (element.offsetWidth === 0 || element.offsetHeight === 0) return null;
191
-
192
- // Get element position and size
193
- const rect = element.getBoundingClientRect();
194
- if (rect.width < 1 || rect.height < 1) return null;
195
-
196
- // Create element data
197
- const elementData = {
198
- tagName: element.tagName.toLowerCase(),
199
- id: element.id || null,
200
- className: element.className || null,
201
- x: rect.left,
202
- y: rect.top,
203
- width: rect.width,
204
- height: rect.height
205
- };
206
-
207
- // Handle text elements
208
- if (element.tagName.toLowerCase() === 'p' ||
209
- element.tagName.toLowerCase() === 'h1' ||
210
- element.tagName.toLowerCase() === 'h2' ||
211
- element.tagName.toLowerCase() === 'h3' ||
212
- element.tagName.toLowerCase() === 'h4' ||
213
- element.tagName.toLowerCase() === 'h5' ||
214
- element.tagName.toLowerCase() === 'h6' ||
215
- element.tagName.toLowerCase() === 'span' ||
216
- element.tagName.toLowerCase() === 'a') {
217
-
218
- const textContent = element.textContent.trim();
219
- if (textContent) {
220
- elementData.type = 'text';
221
- elementData.content = textContent;
222
-
223
- // Get computed style
224
- const style = window.getComputedStyle(element);
225
- elementData.style = {
226
- color: style.color,
227
- fontSize: style.fontSize,
228
- fontWeight: style.fontWeight,
229
- lineHeight: style.lineHeight,
230
- textAlign: style.textAlign,
231
- fontFamily: style.fontFamily
232
- };
233
- }
234
- }
235
-
236
- // Handle image elements
237
- else if (element.tagName.toLowerCase() === 'img') {
238
- elementData.type = 'image';
239
- elementData.src = element.src;
240
- elementData.alt = element.alt || '';
241
- }
242
-
243
- // Handle div/container elements
244
- else if (element.tagName.toLowerCase() === 'div' ||
245
- element.tagName.toLowerCase() === 'section' ||
246
- element.tagName.toLowerCase() === 'article' ||
247
- element.tagName.toLowerCase() === 'header' ||
248
- element.tagName.toLowerCase() === 'footer' ||
249
- element.tagName.toLowerCase() === 'main') {
250
-
251
- elementData.type = 'div';
252
-
253
- // Get background color
254
- const style = window.getComputedStyle(element);
255
- elementData.style = {
256
- backgroundColor: style.backgroundColor,
257
- borderRadius: style.borderRadius,
258
- borderWidth: style.borderWidth,
259
- borderColor: style.borderColor,
260
- boxShadow: style.boxShadow
261
- };
262
-
263
- // Check if there are direct text children
264
- if (element.childNodes.length > 0) {
265
- let directText = '';
266
- for (let i = 0; i < element.childNodes.length; i++) {
267
- if (element.childNodes[i].nodeType === Node.TEXT_NODE) {
268
- const text = element.childNodes[i].textContent.trim();
269
- if (text) directText += text + ' ';
270
- }
271
- }
272
-
273
- if (directText.trim()) {
274
- elementData.type = 'text';
275
- elementData.content = directText.trim();
276
- elementData.style.color = style.color;
277
- elementData.style.fontSize = style.fontSize;
278
- elementData.style.fontWeight = style.fontWeight;
279
- }
280
- }
281
-
282
- // Get children
283
- const children = [];
284
- for (let i = 0; i < element.children.length; i++) {
285
- const childData = getElementInfo(element.children[i], depth + 1);
286
- if (childData) children.push(childData);
287
- }
288
-
289
- if (children.length > 0) {
290
- elementData.children = children;
291
- elementData.type = 'container';
292
- }
293
- }
294
-
295
- // Handle button elements
296
- else if (element.tagName.toLowerCase() === 'button' ||
297
- (element.tagName.toLowerCase() === 'a' && window.getComputedStyle(element).display === 'inline-block')) {
298
-
299
- elementData.type = 'rectangle';
300
- elementData.name = 'Button';
301
-
302
- // Get style
303
- const style = window.getComputedStyle(element);
304
- elementData.style = {
305
- backgroundColor: style.backgroundColor,
306
- color: style.color,
307
- borderRadius: style.borderRadius,
308
- borderWidth: style.borderWidth,
309
- borderColor: style.borderColor
310
- };
311
-
312
- // Add text content
313
- const textContent = element.textContent.trim();
314
- if (textContent) {
315
- elementData.content = textContent;
316
- }
317
- }
318
-
319
- else {
320
- // Default to rectangle for other elements
321
- elementData.type = 'rectangle';
322
-
323
- // Get style
324
- const style = window.getComputedStyle(element);
325
- elementData.style = {
326
- backgroundColor: style.backgroundColor
327
- };
328
-
329
- // Get children
330
- const children = [];
331
- for (let i = 0; i < element.children.length; i++) {
332
- const childData = getElementInfo(element.children[i], depth + 1);
333
- if (childData) children.push(childData);
334
- }
335
-
336
- if (children.length > 0) {
337
- elementData.children = children;
338
- }
339
- }
340
-
341
- return elementData;
342
  }
343
 
344
- function getVisibleElements() {
345
- const bodyElement = document.body;
346
- const result = getElementInfo(bodyElement);
347
- return result ? result.children || [] : [];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  }
349
 
350
- return getVisibleElements();
351
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
- return elements_data
354
 
355
- def extract_styles(driver):
356
- """Extract CSS styles from the webpage"""
357
  styles = {}
358
 
359
- # Execute JavaScript to extract styles
360
- css_rules = driver.execute_script("""
361
- const sheets = document.styleSheets;
362
- const rules = [];
363
-
364
- for (let i = 0; i < sheets.length; i++) {
365
- try {
366
- const sheet = sheets[i];
367
- const ruleList = sheet.rules || sheet.cssRules;
368
-
369
- for (let j = 0; j < ruleList.length; j++) {
370
- try {
371
- const rule = ruleList[j];
372
- if (rule.selectorText) {
373
- rules.push({
374
- selector: rule.selectorText,
375
- style: rule.style.cssText
376
- });
377
- }
378
- } catch (e) {
379
- // Skip rule if it can't be accessed
380
- }
381
- }
382
- } catch (e) {
383
- // Skip stylesheet if it can't be accessed
384
- }
385
- }
386
-
387
- return rules;
388
- """)
389
 
390
- for rule in css_rules:
391
- styles[rule['selector']] = rule['style']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
  return styles
394
 
@@ -411,37 +378,24 @@ def apply_styles_to_elements(elements, styles):
411
  if 'children' in element and element['children']:
412
  apply_styles_to_elements(element['children'], styles)
413
 
414
- def apply_style(element, style_text):
415
  """Apply a CSS style to an element"""
416
  if 'style' not in element:
417
  element['style'] = {}
418
 
419
- # Parse CSS text to extract properties
420
- style_dict = parse_css_text(style_text)
421
-
422
  # Apply properties to element style
423
- for key, value in style_dict.items():
424
- if key not in element['style'] or not element['style'][key]:
425
  element['style'][key] = value
426
-
427
- def parse_css_text(css_text):
428
- """Parse CSS text into a dictionary"""
429
- style_dict = {}
430
-
431
- # Basic parsing of CSS text
432
- for item in css_text.split(';'):
433
- if ':' in item:
434
- key, value = item.split(':', 1)
435
- key = key.strip()
436
- value = value.strip()
437
- if key and value:
438
- # Convert to camelCase for JavaScript
439
- key_parts = key.split('-')
440
- if len(key_parts) > 1:
441
- key = key_parts[0] + ''.join(part.capitalize() for part in key_parts[1:])
442
- style_dict[key] = value
443
-
444
- return style_dict
445
 
446
  if __name__ == "__main__":
447
  port = int(os.environ.get("PORT", 7860))
 
5
  import traceback
6
  from flask import Flask, request, jsonify
7
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
8
  import logging
 
 
9
  import cssutils
10
+ import re
11
  import urllib.parse
12
+ from PIL import Image
13
+ from io import BytesIO
14
 
15
  app = Flask(__name__)
16
 
 
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
 
 
 
 
21
  @app.route('/')
22
  def home():
23
  return '''
 
107
 
108
  logger.info(f"Converting website: {url} with viewport width: {viewport_width}")
109
 
 
 
 
 
 
 
 
 
 
110
  try:
111
+ # Use requests to get the webpage
112
+ headers = {
113
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
114
+ 'Accept': 'text/html,application/xhtml+xml,application/xml',
115
+ 'Accept-Language': 'en-US,en;q=0.9',
116
+ }
117
 
118
+ response = requests.get(url, headers=headers, timeout=15)
119
+ response.raise_for_status() # Raise an exception for HTTP errors
 
 
 
 
 
120
 
121
+ # Parse the HTML content
122
+ soup = BeautifulSoup(response.text, 'html.parser')
123
 
124
+ # Extract the page elements using BeautifulSoup
125
+ elements = extract_elements_bs(soup)
126
 
127
+ # Extract CSS styles
128
+ styles = extract_styles_bs(soup, url)
129
 
130
  # Apply styles to elements
131
  apply_styles_to_elements(elements, styles)
132
 
133
+ # Estimate page height based on content (simplified approach)
134
+ estimated_height = viewport_height
135
+ if elements:
136
+ # Find the maximum y-coordinate plus height
137
+ max_y = 0
138
+ for element in elements:
139
+ elem_bottom = element.get('y', 0) + element.get('height', 0)
140
+ max_y = max(max_y, elem_bottom)
141
+ estimated_height = max(viewport_height, max_y)
142
+
143
  # Prepare response
144
  response = {
145
  "status": "success",
146
  "url": url,
147
  "viewport_width": viewport_width,
148
+ "viewport_height": estimated_height,
149
  "elements": elements
150
  }
151
 
152
  return jsonify(response)
153
 
154
+ except requests.exceptions.RequestException as e:
155
+ logger.error(f"Request error: {str(e)}")
156
+ return jsonify({"error": f"Failed to fetch website: {str(e)}"}), 500
157
 
158
  except Exception as e:
159
  logger.error(f"Error: {str(e)}")
160
  logger.error(traceback.format_exc())
161
  return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500
162
 
163
def extract_elements_bs(soup):
    """Extract the top-level page elements from a parsed document.

    No real layout information is available without a browser, so the direct
    children of ``<body>`` are simply stacked vertically: each one is placed
    at ``x = 0`` and at the running ``y`` offset, with a 10px gap after it.

    Args:
        soup: Parsed document exposing ``find('body')``; the body in turn must
            expose ``find_all(tags, recursive=False)``.

    Returns:
        A list of element dictionaries as produced by ``process_element``.
        The list is empty when the document has no ``<body>``.
    """
    elements = []

    body = soup.find('body')
    if not body:
        return elements

    max_width = 1440  # Default layout width
    y_position = 0

    # Accept every tag process_element knows how to describe. Restricting
    # this to a handful of containers silently dropped <article>, headings,
    # paragraphs, images and links placed directly under <body>.
    top_level_tags = [
        'div', 'header', 'main', 'footer', 'section', 'article', 'nav',
        'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'span', 'a',
    ]

    for element in body.find_all(top_level_tags, recursive=False):
        element_data = process_element(element, 0, y_position, max_width)
        if element_data:
            y_position += element_data['height'] + 10  # Add spacing
            elements.append(element_data)

    return elements
202
+
203
def process_element(element, x_position, y_position, max_width, depth=0):
    """Convert one HTML element into a layout dictionary.

    Sizes are rough estimates: nothing is rendered, so heights are derived
    from fixed defaults, text length and the stacked heights of children.

    Args:
        element: Tag-like object providing ``name``, ``get(key, default)``,
            ``get_text(strip=...)`` and ``find_all(tags, recursive=False)``.
        x_position: X coordinate stored on the result.
        y_position: Y coordinate stored on the result.
        max_width: Width stored on the result; children receive
            ``max_width - 10``.
        depth: Current recursion depth; anything deeper than 10 is dropped.

    Returns:
        A dictionary with ``tagName``, ``x``, ``y``, ``width`` and ``height``
        plus, depending on the tag, ``type``, ``content``, ``style``, ``src``,
        ``alt``, ``className``, ``id`` and ``children``. Returns ``None`` for
        ``script``/``style``/``meta``/``link`` tags and when ``depth > 10``.
    """
    if depth > 10:  # Limit recursion depth
        return None

    tag_name = element.name.lower()
    element_height = 50  # Default height for elements

    # Non-visual tags never produce an element
    if tag_name in ['script', 'style', 'meta', 'link']:
        return None

    element_data = {
        'tagName': tag_name,
        'x': x_position,
        'y': y_position,
        'width': max_width,
        'height': element_height,
    }

    # The class attribute may arrive as a list of names or a single string
    element_classes = element.get('class', [])
    if element_classes:
        if isinstance(element_classes, list):
            element_data['className'] = ' '.join(element_classes)
        else:
            element_data['className'] = element_classes

    element_id = element.get('id')
    if element_id:
        element_data['id'] = element_id

    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    if tag_name in ['p', 'span', 'a'] or tag_name in heading_tags:
        element_data['type'] = 'text'
        text_content = element.get_text().strip()
        element_data['content'] = text_content

        # Rough estimate: 50 characters per line, 20px per line, 20px minimum
        num_lines = len(text_content) // 50 + 1
        element_height = max(20, num_lines * 20)

        element_data['style'] = {
            'color': '#000000',
            'fontSize': '16px',
            'fontWeight': 'normal',
            'lineHeight': '1.5',
        }

        if tag_name in heading_tags:
            size = 26 - int(tag_name[1])  # h1 = 25px, h2 = 24px, etc.
            element_data['style']['fontSize'] = f"{size}px"
            element_data['style']['fontWeight'] = 'bold'
            # Scale by the estimated line count so a long heading that wraps
            # is not collapsed to the height of a single line.
            element_height = num_lines * size * 1.5

    elif tag_name == 'img':
        element_data['type'] = 'image'
        element_data['src'] = element.get('src', '')
        element_data['alt'] = element.get('alt', '')
        element_height = 200  # Default height for images

    elif tag_name in ['div', 'section', 'article', 'header', 'footer', 'main', 'nav']:
        # Container elements
        element_data['type'] = 'div'
        element_data['style'] = {
            'backgroundColor': 'transparent',
            'borderRadius': '0px',
        }

        # Children are stacked vertically; their y is relative to this element
        children = []
        child_y_position = 0
        child_tags = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img',
                      'span', 'a', 'section', 'article', 'nav']

        for child in element.find_all(child_tags, recursive=False):
            child_data = process_element(child, 5, child_y_position, max_width - 10, depth + 1)
            if child_data:
                children.append(child_data)
                child_y_position += child_data['height'] + 5  # Add spacing

        if children:
            element_data['children'] = children
            element_data['type'] = 'container'
            element_height = child_y_position + 10  # Total height of children + padding
        else:
            # No recognised children: fall back to the element's own text
            text_content = element.get_text(strip=True)
            if text_content:
                element_data['type'] = 'text'
                element_data['content'] = text_content
                element_data['style'] = {
                    'color': '#000000',
                    'fontSize': '16px',
                }
                element_height = 40  # Default height for text containers

    # Replace the placeholder height with the computed one
    element_data['height'] = element_height

    return element_data
308
 
309
def extract_styles_bs(soup, base_url):
    """Collect CSS styles from inline attributes, <style> tags and linked sheets.

    Args:
        soup: Parsed document exposing ``find_all``.
        base_url: URL of the page, used to resolve stylesheet ``href`` values.

    Returns:
        A dictionary keyed by selector. Values are declaration dictionaries
        for rules parsed by ``parse_css``; for inline ``style`` attributes the
        raw attribute string is stored under a selector built from the
        element's classes (``.a.b``). Elements that carry a ``style``
        attribute but no class are not recorded.
    """
    styles = {}

    def merge(parsed_styles):
        # A selector can recur across <style> tags and linked sheets. Merge
        # declarations per property (later wins) instead of discarding every
        # declaration collected earlier for that selector.
        for selector, declarations in parsed_styles.items():
            existing = styles.get(selector)
            if isinstance(existing, dict) and isinstance(declarations, dict):
                existing.update(declarations)
            else:
                styles[selector] = declarations

    # Inline styles, keyed by the element's combined class selector
    for element in soup.find_all(style=True):
        classes = element.get('class', [])
        if classes:
            class_str = '.'.join(classes)
            styles[f".{class_str}"] = element['style']

    # Embedded <style> tags
    for style_tag in soup.find_all('style'):
        css_text = style_tag.string
        if css_text:
            merge(parse_css(css_text))

    # Linked stylesheets
    for link in soup.find_all('link', rel='stylesheet'):
        href = link.get('href')
        if not href:
            continue

        # urljoin returns absolute URLs unchanged and also resolves relative
        # and protocol-relative ("//host/x.css") references.
        href = urllib.parse.urljoin(base_url, href)

        try:
            css_response = requests.get(href, timeout=5)
            if css_response.ok:
                merge(parse_css(css_response.text))
        except Exception as e:
            # Best effort: one unreachable stylesheet must not fail the page
            logger.warning(f"Failed to fetch stylesheet {href}: {e}")

    return styles
344
+
345
def parse_css(css_text):
    """Parse CSS text into a dictionary of selectors and styles.

    Args:
        css_text: Stylesheet source text.

    Returns:
        ``{selector_text: {property_name: value}}`` for every style rule in
        the sheet. When the same selector appears in several rules their
        declarations are merged, later declarations winning per property.
        Parsing errors are logged as warnings and whatever was collected up
        to that point is returned.
    """
    styles = {}
    try:
        sheet = cssutils.parseString(css_text)
        for rule in sheet:
            # Only plain style rules carry a selector and declarations
            if rule.type != rule.STYLE_RULE:
                continue
            # setdefault keeps declarations from an earlier rule that used
            # the same selector instead of replacing them wholesale.
            declarations = styles.setdefault(rule.selectorText, {})
            for declaration in rule.style:
                declarations[declaration.name] = declaration.value
    except Exception as e:
        logger.warning(f"CSS parsing error: {e}")

    return styles
361
 
 
378
  if 'children' in element and element['children']:
379
  apply_styles_to_elements(element['children'], styles)
380
 
381
def _camel_case_property(name):
    """Convert a CSS property name such as ``font-size`` to ``fontSize``."""
    name = name.strip()
    if name.startswith('--'):
        # Custom properties are case-sensitive identifiers; keep them as-is
        return name
    head, *rest = name.split('-')
    return head + ''.join(part.capitalize() for part in rest)


def apply_style(element, style_dict):
    """Apply a CSS style to an element.

    Args:
        element: Element dictionary. A ``'style'`` dictionary is created on
            it when missing and is updated in place.
        style_dict: Either a mapping of property name to value, or an inline
            style string such as ``"color: red; font-size: 12px"``. Any other
            type leaves the element's style untouched.

    Property names are stored in camelCase so that they land on the same keys
    as the camelCase defaults already present on elements (``fontSize``,
    ``backgroundColor``, ...) rather than next to them under a second,
    kebab-case key. Applied values overwrite existing ones.
    """
    if 'style' not in element:
        element['style'] = {}

    target = element['style']

    if isinstance(style_dict, dict):
        for key, value in style_dict.items():
            if isinstance(key, str):
                key = _camel_case_property(key)
            target[key] = value
    elif isinstance(style_dict, str):
        # Parse inline style string: "prop: value; prop: value"
        for item in style_dict.split(';'):
            if ':' not in item:
                continue
            # Split on the first colon only; values such as URLs contain more
            key, value = item.split(':', 1)
            key = key.strip()
            value = value.strip()
            if key and value:
                target[_camel_case_property(key)] = value
 
 
 
 
 
 
 
 
 
 
399
 
400
  if __name__ == "__main__":
401
  port = int(os.environ.get("PORT", 7860))