| import requests
|
| from urllib.parse import urlparse, urljoin
|
| from bs4 import BeautifulSoup
|
| import re
|
| from tavily import TavilyClient
|
| import os
|
|
|
# Tavily search client, initialized once at import time.
# Remains None when the API key is missing or initialization fails,
# letting callers degrade gracefully instead of crashing.
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')

if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")
        tavily_client = None
else:
    tavily_client = None
|
|
|
def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
    """Run a Tavily web search and return the results as formatted text.

    Args:
        query: The search query string.
        max_results: Desired number of results; clamped to the 1-20 range.
        include_domains: Optional list of domains to restrict results to.
        exclude_domains: Optional list of domains to filter out.

    Returns:
        A formatted results block, a "no results" notice, or an error
        message string when the client is unavailable or the call fails.
    """
    if not tavily_client:
        return "Web search is not available. Please set the TAVILY_API_KEY environment variable."

    try:
        params = {
            "search_depth": "advanced",
            "max_results": min(max(1, max_results), 20),
        }
        # Only forward domain filters the caller explicitly supplied.
        if include_domains is not None:
            params["include_domains"] = include_domains
        if exclude_domains is not None:
            params["exclude_domains"] = exclude_domains

        response = tavily_client.search(query, **params)

        formatted = [
            "Title: {}\nURL: {}\nContent: {}\n".format(
                item.get('title', 'No title'),
                item.get('url', 'No URL'),
                item.get('content', 'No content'),
            )
            for item in response.get('results', [])
        ]

        if not formatted:
            return "No search results found."
        return "Web Search Results:\n\n" + "\n---\n".join(formatted)
    except Exception as e:
        return f"Search error: {str(e)}"
|
|
|
def enhance_query_with_search(query: str, enable_search: bool) -> str:
    """Optionally prepend live web-search context to a user query.

    Returns the query unchanged when search is disabled or the Tavily
    client was never initialized; otherwise returns the query combined
    with search results and instructions to use them.
    """
    # Guard clause: `not enable_search` is checked first so a disabled
    # search never depends on the Tavily client being configured.
    if not enable_search or not tavily_client:
        return query

    search_results = perform_web_search(query)

    return f"""Original Query: {query}
{search_results}
Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
|
|
|
def extract_website_content(url: str) -> str:
    """Fetch a web page and return its HTML and metadata as a redesign prompt.

    Normalizes the URL (defaulting to https://), retries 403 responses with
    an alternate User-Agent, optionally re-renders JavaScript-heavy pages
    with Playwright, rewrites relative image/background URLs to absolute
    ones, and returns a prompt string containing the cleaned (truncated)
    HTML. On any failure an "Error: ..." string is returned instead of
    raising.

    Args:
        url: Website URL, with or without a scheme.

    Returns:
        A prompt string for redesign, or a human-readable error message.
    """
    try:
        # Normalize the URL: default to https:// when no scheme was given.
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Browser-like headers reduce the chance of tripping simple
        # anti-bot filters.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }

        session = requests.Session()
        session.headers.update(headers)

        # Retry 403 responses with a different User-Agent before giving up;
        # other HTTP errors propagate to the handlers below immediately.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                else:
                    raise

        # Decode using the detected encoding; fall back to lossy UTF-8.
        try:
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            raw_html = response.content.decode('utf-8', errors='ignore')

        # Sanity-check that the payload looks like HTML. The check is
        # case-insensitive (many sites emit lowercase <!doctype html>).
        # If it fails, retry with latin-1 + errors='ignore', which cannot
        # raise and often recovers mis-detected encodings; the previous
        # chained utf-8/cp1252 fallbacks were unreachable dead code.
        if not raw_html.lstrip().lower().startswith(('<!doctype', '<html')):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
            raw_html = response.content.decode('latin-1', errors='ignore')

        soup = BeautifulSoup(raw_html, 'html.parser')

        # JavaScript-heavy pages ship most content via scripts; try a
        # headless-browser render to capture the final DOM. Best-effort:
        # if Playwright is missing or fails, keep the static parse.
        script_tags = soup.find_all('script')
        if len(script_tags) > 10:
            print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
            try:
                from playwright.sync_api import sync_playwright
                with sync_playwright() as p:
                    browser = p.chromium.launch()
                    page = browser.new_page()
                    page.goto(url, timeout=30000)
                    page.wait_for_load_state("networkidle")
                    rendered_html = page.content()
                    browser.close()
                soup = BeautifulSoup(rendered_html, 'html.parser')
            except Exception as e:
                print(f"Playwright rendering failed: {e}")

        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Rewrite relative <img> sources to absolute URLs.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)

        # Rewrite relative background-image URLs in inline style attributes.
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            for match in re.findall(bg_pattern, style_attr, re.IGNORECASE):
                if not match.startswith(('http', '//', 'data:')):
                    style_attr = style_attr.replace(match, urljoin(url, match))
            element['style'] = style_attr

        # Same rewrite inside <style> blocks.
        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                for match in re.findall(bg_pattern, style_content, re.IGNORECASE):
                    if not match.startswith(('http', '//', 'data:')):
                        style_content = style_content.replace(match, urljoin(url, match))
                style.string = style_content

        def test_image_url(img_url):
            # Best-effort HEAD probe; any network error counts as broken.
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except Exception:
                return False

        # Keep at most the first 10 images whose URLs actually resolve.
        # BUG FIX: use .get() — an <img> with no src attribute previously
        # raised KeyError here and aborted the entire extraction.
        working_images = []
        for img in soup.find_all('img')[:10]:
            src = img.get('src')
            if src and test_image_url(src):
                working_images.append(img)

        # Strip comments and collapse whitespace to shrink the payload.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)

        # Cap the HTML so the downstream prompt stays a manageable size.
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        # Too little content usually means dynamic rendering or blocking;
        # return a fallback prompt describing the failure.
        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website."""
            return website_content.strip()

        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================
URL: {url}
Title: {title_text}
Description: {description}
IMAGES FOUND (use these exact URLs in your redesign):
{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
ORIGINAL HTML CODE (use this as the base for redesign):
```html
{cleaned_html}
```
REDESIGN INSTRUCTIONS:
Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""

        return website_content.strip()

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return f"Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"