diamond-in committed on
Commit
a44d039
·
verified ·
1 Parent(s): 3815cee

Update features/extraction.py

Browse files
Files changed (1) hide show
  1. features/extraction.py +88 -0
features/extraction.py CHANGED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text and HTML extraction features
3
+ """
4
+ import json
5
+ import logging
6
+ import re
7
+ from datetime import datetime
8
+ from selenium.webdriver.support.ui import WebDriverWait
9
+ from browser.driver import get_driver, cleanup_driver
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
def get_html_source(url: str, use_persistent: bool = False) -> str:
    """Return the HTML source of ``url``, or an ``"Error: ..."`` string on failure.

    Args:
        url: Address handed to ``get_driver``.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        ``driver.page_source`` once ``document.readyState`` reports
        ``"complete"``. If anything raises, the exception is logged and the
        string ``"Error: <exception>"`` is returned instead of propagating.
    """
    def _document_is_complete(browser) -> bool:
        # Predicate polled by WebDriverWait until it returns a truthy value.
        return browser.execute_script("return document.readyState") == "complete"

    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Give the document up to 10 seconds to finish loading.
        page_ready = WebDriverWait(driver, 10)
        page_ready.until(_document_is_complete)

        return driver.page_source
    except Exception as exc:
        logger.error(f"Error in get_html_source: {exc}")
        return f"Error: {exc}"
    finally:
        # Runs on both paths; ``driver`` is still None if get_driver raised.
        cleanup_driver(driver, use_persistent)
32
+
33
def save_html_to_file(url: str, filename: str = "", use_persistent: bool = False) -> str:
    """Save the HTML source of ``url`` to a file under ``/tmp``.

    Args:
        url: Address handed to ``get_driver``.
        filename: Optional name for the output file. Only its final path
            component is used, so directory parts (including ``..``) cannot
            redirect the write outside ``/tmp``. When empty, or when nothing
            usable remains after stripping directories, a name is generated
            from the URL and the current local time.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        The path of the written file. If anything raises, the exception is
        logged and the string ``"Error: <exception>"`` is returned instead of
        propagating.
    """
    # Function-scope import so this block does not rely on a file-level import.
    import os

    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Give the document up to 10 seconds to finish loading.
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        html = driver.page_source

        # Drop any directory components from a caller-supplied name; without
        # this, "../x" would be written outside /tmp.
        if filename:
            filename = os.path.basename(filename)

        # "", "." and ".." are not usable file names: fall back to a generated one.
        if filename in ("", ".", ".."):
            safe_url = re.sub(r'[^\w\s-]', '', url.replace('https://', '').replace('http://', ''))[:50]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{safe_url}_{timestamp}.html"

        filepath = f"/tmp/{filename}"
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)

        return filepath
    except Exception as e:
        logger.error(f"Error in save_html_to_file: {e}")
        return f"Error: {e}"
    finally:
        # Runs on both paths; ``driver`` is still None if get_driver raised.
        cleanup_driver(driver, use_persistent)
63
+
64
def get_page_info(url: str, use_persistent: bool = False) -> str:
    """Return a JSON summary of the page at ``url``.

    Args:
        url: Address handed to ``get_driver``.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        A JSON object (2-space indent) with the keys ``title``, ``url``,
        ``page_source_length``, ``cookies_count``, ``viewport``,
        ``scroll_height``, ``ready_state``, ``links_count``, ``images_count``
        and ``forms_count``. If anything raises, the exception is logged and
        the string ``"Error: <exception>"`` is returned instead of propagating.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Values are read in the same order as the keys appear in the output.
        summary = {}
        summary["title"] = driver.title
        summary["url"] = driver.current_url
        summary["page_source_length"] = len(driver.page_source)
        summary["cookies_count"] = len(driver.get_cookies())
        summary["viewport"] = driver.execute_script(
            "return {width: window.innerWidth, height: window.innerHeight};"
        )
        summary["scroll_height"] = driver.execute_script("return document.body.scrollHeight;")
        summary["ready_state"] = driver.execute_script("return document.readyState;")

        # Element tallies: output key -> tag name to count.
        for key, tag in (
            ("links_count", "a"),
            ("images_count", "img"),
            ("forms_count", "form"),
        ):
            summary[key] = len(driver.find_elements("tag name", tag))

        return json.dumps(summary, indent=2)
    except Exception as exc:
        logger.error(f"Error in get_page_info: {exc}")
        return f"Error: {exc}"
    finally:
        # Runs on both paths; ``driver`` is still None if get_driver raised.
        cleanup_driver(driver, use_persistent)