| import requests
|
| from urllib.parse import urlparse, urljoin
|
| from bs4 import BeautifulSoup
|
| import re
|
| from tavily import TavilyClient
|
| import os
|
|
|
# Tavily search client, initialized once at import time.
# Remains None when the API key is missing or initialization fails,
# letting callers degrade gracefully instead of crashing.
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')

if TAVILY_API_KEY:
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")
        tavily_client = None
else:
    tavily_client = None
|
|
|
def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
    """Run a Tavily web search and return the results as formatted text.

    Args:
        query: The search query string.
        max_results: Desired number of results; clamped to the 1-20 range.
        include_domains: Optional list of domains to restrict results to.
        exclude_domains: Optional list of domains to filter out.

    Returns:
        A formatted results block, a "no results" notice, or an error
        message string when the client is unavailable or the call fails.
    """
    if not tavily_client:
        return "Web search is not available. Please set the TAVILY_API_KEY environment variable."

    try:
        params = {
            "search_depth": "advanced",
            "max_results": min(max(1, max_results), 20),
        }
        # Only forward domain filters the caller explicitly supplied.
        if include_domains is not None:
            params["include_domains"] = include_domains
        if exclude_domains is not None:
            params["exclude_domains"] = exclude_domains

        response = tavily_client.search(query, **params)

        formatted = [
            "Title: {}\nURL: {}\nContent: {}\n".format(
                item.get('title', 'No title'),
                item.get('url', 'No URL'),
                item.get('content', 'No content'),
            )
            for item in response.get('results', [])
        ]

        if not formatted:
            return "No search results found."
        return "Web Search Results:\n\n" + "\n---\n".join(formatted)
    except Exception as e:
        return f"Search error: {str(e)}"
|
|
|
def enhance_query_with_search(query: str, enable_search: bool) -> str:
    """Optionally prepend live web-search context to a user query.

    Returns the query unchanged when search is disabled or the Tavily
    client was never initialized; otherwise returns the query combined
    with search results and instructions to use them.
    """
    # Guard clause: `not enable_search` is checked first so a disabled
    # search never depends on the Tavily client being configured.
    if not enable_search or not tavily_client:
        return query

    search_results = perform_web_search(query)

    return f"""Original Query: {query}
{search_results}
Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
|
|
|
def extract_website_content(url: str) -> str:
    """Fetch a web page and return its HTML and metadata as a redesign prompt.

    Normalizes the URL (defaulting to https://), retries 403 responses with
    an alternate User-Agent, optionally re-renders JavaScript-heavy pages
    with Playwright, rewrites relative image/background URLs to absolute
    ones, and returns a prompt string containing the cleaned (truncated)
    HTML. On any failure an "Error: ..." string is returned instead of
    raising.

    Args:
        url: Website URL, with or without a scheme.

    Returns:
        A prompt string for redesign, or a human-readable error message.
    """
    try:
        # Normalize the URL: default to https:// when no scheme was given.
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)
        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Browser-like headers reduce the chance of tripping simple
        # anti-bot filters.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }

        session = requests.Session()
        session.headers.update(headers)

        # Retry 403 responses with a different User-Agent before giving up;
        # other HTTP errors propagate to the handlers below immediately.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                else:
                    raise

        # Decode using the detected encoding; fall back to lossy UTF-8.
        try:
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except Exception:
            raw_html = response.content.decode('utf-8', errors='ignore')

        # Sanity-check that the payload looks like HTML. The check is
        # case-insensitive (many sites emit lowercase <!doctype html>).
        # If it fails, retry with latin-1 + errors='ignore', which cannot
        # raise and often recovers mis-detected encodings; the previous
        # chained utf-8/cp1252 fallbacks were unreachable dead code.
        if not raw_html.lstrip().lower().startswith(('<!doctype', '<html')):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
            raw_html = response.content.decode('latin-1', errors='ignore')

        soup = BeautifulSoup(raw_html, 'html.parser')

        # JavaScript-heavy pages ship most content via scripts; try a
        # headless-browser render to capture the final DOM. Best-effort:
        # if Playwright is missing or fails, keep the static parse.
        script_tags = soup.find_all('script')
        if len(script_tags) > 10:
            print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
            try:
                from playwright.sync_api import sync_playwright
                with sync_playwright() as p:
                    browser = p.chromium.launch()
                    page = browser.new_page()
                    page.goto(url, timeout=30000)
                    page.wait_for_load_state("networkidle")
                    rendered_html = page.content()
                    browser.close()
                soup = BeautifulSoup(rendered_html, 'html.parser')
            except Exception as e:
                print(f"Playwright rendering failed: {e}")

        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Rewrite relative <img> sources to absolute URLs.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)

        # Rewrite relative background-image URLs in inline style attributes.
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            for match in re.findall(bg_pattern, style_attr, re.IGNORECASE):
                if not match.startswith(('http', '//', 'data:')):
                    style_attr = style_attr.replace(match, urljoin(url, match))
            element['style'] = style_attr

        # Same rewrite inside <style> blocks.
        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                for match in re.findall(bg_pattern, style_content, re.IGNORECASE):
                    if not match.startswith(('http', '//', 'data:')):
                        style_content = style_content.replace(match, urljoin(url, match))
                style.string = style_content

        def test_image_url(img_url):
            # Best-effort HEAD probe; any network error counts as broken.
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except Exception:
                return False

        # Keep at most the first 10 images whose URLs actually resolve.
        # BUG FIX: use .get() — an <img> with no src attribute previously
        # raised KeyError here and aborted the entire extraction.
        working_images = []
        for img in soup.find_all('img')[:10]:
            src = img.get('src')
            if src and test_image_url(src):
                working_images.append(img)

        # Strip comments and collapse whitespace to shrink the payload.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)

        # Cap the HTML so the downstream prompt stays a manageable size.
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        # Too little content usually means dynamic rendering or blocking;
        # return a fallback prompt describing the failure.
        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website."""
            return website_content.strip()

        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================
URL: {url}
Title: {title_text}
Description: {description}
IMAGES FOUND (use these exact URLs in your redesign):
{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
ORIGINAL HTML CODE (use this as the base for redesign):
```html
{cleaned_html}
```
REDESIGN INSTRUCTIONS:
Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""

        return website_content.strip()

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return f"Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"