bparekh99 committed on
Commit
d2e8f75
·
verified ·
1 Parent(s): 3cf32d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -110
app.py CHANGED
@@ -3,12 +3,14 @@ import requests
3
  import socket
4
  import logging
5
  import time
 
6
  from bs4 import BeautifulSoup
7
- from urllib.parse import urlparse
 
8
  from google import genai
9
 
10
  # -------------------------------------------------
11
- # Logging setup (Hugging Face compatible)
12
  # -------------------------------------------------
13
  logging.basicConfig(
14
  level=logging.INFO,
@@ -18,21 +20,87 @@ logger = logging.getLogger(__name__)
18
 
19
  logger.info("AI Website Review Tool starting up")
20
 
 
 
 
 
 
 
 
21
  # -----------------------------
22
- # URL Normalization
23
  # -----------------------------
24
  def normalize_url(url: str) -> str:
 
 
25
  parsed = urlparse(url)
26
  if not parsed.scheme:
27
- return "https://" + url
28
  return url
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # -----------------------------
32
- # Fetch & Parse Website (Hardened)
33
  # -----------------------------
34
- def fetch_website_text(url: str) -> str:
35
- socket.setdefaulttimeout(10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  headers = {
38
  "User-Agent": (
@@ -40,163 +108,273 @@ def fetch_website_text(url: str) -> str:
40
  "AppleWebKit/537.36 (KHTML, like Gecko) "
41
  "Chrome/121.0 Safari/537.36"
42
  ),
43
- "Accept": "text/html,application/xhtml+xml",
44
  "Accept-Language": "en-US,en;q=0.9",
 
 
45
  }
46
 
47
- response = requests.get(
48
- url,
49
- headers=headers,
50
- timeout=10,
51
- allow_redirects=True,
52
- )
53
- response.raise_for_status()
 
 
 
 
 
 
 
54
 
55
  soup = BeautifulSoup(response.text, "html.parser")
56
 
57
  # Remove noisy tags
58
- for tag in soup(["script", "style", "noscript"]):
59
  tag.decompose()
60
 
61
- title = soup.title.string.strip() if soup.title else ""
62
- h1 = soup.find("h1").get_text(strip=True) if soup.find("h1") else ""
63
 
 
64
  body_text = " ".join(soup.stripped_strings)
65
- body_text = body_text[:8000] # token safety
 
 
 
 
 
 
 
 
 
66
 
67
- return f"""
68
- PAGE TITLE:
69
- {title}
70
- PRIMARY H1:
71
- {h1}
72
  VISIBLE CONTENT:
73
  {body_text}
74
  """
 
75
 
76
 
77
- # -----------------------------
78
- # Safe Wrapper (Never Crash)
79
- # -----------------------------
80
- def fetch_website_text_safe(url: str) -> str:
81
  try:
82
  return fetch_website_text(url)
 
 
 
 
 
 
 
 
 
 
 
 
83
  except Exception as e:
84
- return f"""
85
- ⚠️ Unable to fully fetch website content.
86
- Error:
87
- {str(e)}
88
- Fallback:
89
- Analyze based on URL structure, homepage intent, and general best practices.
90
- """
91
 
92
 
93
  # -----------------------------
94
  # Gemini Analysis
95
  # -----------------------------
96
- def analyze_website(api_key, url, industry, goal):
97
- if not api_key:
98
- return "❌ Please enter your Gemini API key."
 
 
 
99
 
100
  if not url:
101
  return "❌ Please enter a website URL."
102
 
 
 
 
 
 
 
103
  try:
104
- url = normalize_url(url)
 
 
 
 
105
 
106
- client = genai.Client(api_key=api_key)
 
 
 
107
 
108
- website_text = fetch_website_text_safe(url)
 
109
 
110
- prompt = f"""
111
- You are an AI consultant helping small businesses improve their websites.
112
- Business context:
113
  - Industry: {industry}
114
- - Primary goal: {goal}
115
- Analyze the website content below and provide recommendations in this structure:
116
- 1. Messaging Clarity (score 1–10)
117
- - Main issue
118
- - 2–3 actionable recommendations
119
- 2. Conversion Effectiveness (score 1–10)
120
- - Main issue
121
- - 2–3 actionable recommendations
122
- 3. Trust & Credibility (score 1–10)
123
- - Main issue
124
- - 2–3 actionable recommendations
125
- 4. User Experience Issues
126
- - Bullet list of issues
127
- 5. AI & Automation Opportunities
128
- - 3 concrete ideas a small business could implement
129
- End with:
130
- - Overall score out of 100
131
- - Top 3 fixes to prioritize this week
132
- Use clear, non-technical business language.
133
- Website content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  {website_text}
135
  """
136
 
 
137
  response = client.models.generate_content(
138
- model="gemini-2.5-flash",
139
  contents=prompt,
140
  )
141
 
142
- return response.text
 
 
 
 
 
143
 
144
  except Exception as e:
145
- return f"❌ Error during analysis: {str(e)}"
 
146
 
147
 
148
  # -----------------------------
149
  # Gradio UI
150
  # -----------------------------
151
- with gr.Blocks(title="AI Website Review Tool") as demo:
152
- gr.Markdown("## πŸ” AI Website Review Tool")
 
 
 
 
 
 
 
153
  gr.Markdown(
154
- "Analyze any website and receive practical, business-focused recommendations."
155
- )
156
-
157
- api_key = gr.Textbox(
158
- label="Gemini API Key",
159
- placeholder="Paste your Gemini API key here",
160
- type="password",
161
  )
162
 
163
- url = gr.Textbox(
164
- label="Website URL",
165
- placeholder="https://example.com",
166
- )
167
-
168
- industry = gr.Dropdown(
169
- label="Industry",
170
- choices=[
171
- "General SMB",
172
- "Law Firm",
173
- "Hospitality",
174
- "Healthcare",
175
- "Real Estate",
176
- ],
177
- value="General SMB",
178
- )
179
-
180
- goal = gr.Dropdown(
181
- label="Primary Website Goal",
182
- choices=[
183
- "Generate leads",
184
- "Sell services",
185
- "Build credibility",
186
- "Educate visitors",
187
- ],
188
- value="Generate leads",
189
- )
190
-
191
- analyze_btn = gr.Button("Analyze Website")
192
-
193
- status = gr.Markdown("")
194
- output = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  analyze_btn.click(
197
  fn=analyze_website,
198
  inputs=[api_key, url, industry, goal],
199
  outputs=output,
200
  )
201
-
202
- demo.launch()
 
 
 
 
 
 
 
 
 
 
3
  import socket
4
  import logging
5
  import time
6
+ import re
7
  from bs4 import BeautifulSoup
8
+ from urllib.parse import urlparse, urljoin
9
+ from typing import Dict, Tuple, Optional
10
  from google import genai
11
 
12
  # -------------------------------------------------
13
+ # Logging setup
14
  # -------------------------------------------------
15
  logging.basicConfig(
16
  level=logging.INFO,
 
20
 
21
  logger.info("AI Website Review Tool starting up")
22
 
23
+ # -------------------------------------------------
24
+ # Constants
25
+ # -------------------------------------------------
26
+ TIMEOUT = 15
27
+ MAX_RETRIES = 2
28
+ CONTENT_LIMIT = 12000
29
+
30
  # -----------------------------
31
+ # URL Validation & Normalization
32
  # -----------------------------
33
  def normalize_url(url: str) -> str:
34
+ """Normalize and validate URL format."""
35
+ url = url.strip()
36
  parsed = urlparse(url)
37
  if not parsed.scheme:
38
+ url = "https://" + url
39
  return url
40
 
41
 
42
+ def validate_url(url: str) -> Tuple[bool, str]:
43
+ """Validate URL format and accessibility."""
44
+ try:
45
+ parsed = urlparse(url)
46
+ if not parsed.netloc:
47
+ return False, "Invalid URL format. Please include domain name."
48
+
49
+ # Check for obviously invalid domains
50
+ if len(parsed.netloc) < 4 or '.' not in parsed.netloc:
51
+ return False, "Invalid domain name."
52
+
53
+ return True, ""
54
+ except Exception as e:
55
+ return False, f"URL validation error: {str(e)}"
56
+
57
+
58
  # -----------------------------
59
+ # Enhanced Content Extraction
60
  # -----------------------------
61
+ def extract_website_info(soup: BeautifulSoup, url: str) -> Dict[str, str]:
62
+ """Extract key website elements for analysis."""
63
+ info = {}
64
+
65
+ # Title
66
+ info['title'] = soup.title.string.strip() if soup.title else ""
67
+
68
+ # Meta description
69
+ meta_desc = soup.find("meta", attrs={"name": "description"})
70
+ info['meta_description'] = meta_desc.get("content", "").strip() if meta_desc else ""
71
+
72
+ # Headings
73
+ info['h1'] = soup.find("h1").get_text(strip=True) if soup.find("h1") else ""
74
+ h2_tags = soup.find_all("h2", limit=5)
75
+ info['h2s'] = " | ".join([h2.get_text(strip=True) for h2 in h2_tags])
76
+
77
+ # CTAs (buttons and prominent links)
78
+ cta_patterns = ['button', 'btn', 'cta', 'call-to-action']
79
+ ctas = []
80
+ for pattern in cta_patterns:
81
+ elements = soup.find_all(class_=re.compile(pattern, re.I))
82
+ ctas.extend([el.get_text(strip=True) for el in elements[:3]])
83
+ info['ctas'] = " | ".join(ctas[:5]) if ctas else "No clear CTAs found"
84
+
85
+ # Contact information
86
+ contact_indicators = soup.find_all(string=re.compile(r'contact|email|phone|call', re.I))
87
+ info['has_contact'] = len(contact_indicators) > 0
88
+
89
+ # Links analysis
90
+ links = soup.find_all('a', href=True)
91
+ info['total_links'] = len(links)
92
+ external_links = [l for l in links if urlparse(l['href']).netloc and urlparse(l['href']).netloc != urlparse(url).netloc]
93
+ info['external_links'] = len(external_links)
94
+
95
+ return info
96
+
97
+
98
+ def fetch_website_text(url: str) -> Tuple[str, bool]:
99
+ """
100
+ Fetch and parse website content.
101
+ Returns (content_string, success_boolean)
102
+ """
103
+ socket.setdefaulttimeout(TIMEOUT)
104
 
105
  headers = {
106
  "User-Agent": (
 
108
  "AppleWebKit/537.36 (KHTML, like Gecko) "
109
  "Chrome/121.0 Safari/537.36"
110
  ),
111
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
112
  "Accept-Language": "en-US,en;q=0.9",
113
+ "Accept-Encoding": "gzip, deflate",
114
+ "DNT": "1",
115
  }
116
 
117
+ for attempt in range(MAX_RETRIES):
118
+ try:
119
+ response = requests.get(
120
+ url,
121
+ headers=headers,
122
+ timeout=TIMEOUT,
123
+ allow_redirects=True,
124
+ )
125
+ response.raise_for_status()
126
+ break
127
+ except requests.exceptions.RequestException as e:
128
+ if attempt == MAX_RETRIES - 1:
129
+ raise
130
+ time.sleep(1)
131
 
132
  soup = BeautifulSoup(response.text, "html.parser")
133
 
134
  # Remove noisy tags
135
+ for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
136
  tag.decompose()
137
 
138
+ # Extract structured info
139
+ info = extract_website_info(soup, url)
140
 
141
+ # Body content
142
  body_text = " ".join(soup.stripped_strings)
143
+ body_text = body_text[:CONTENT_LIMIT]
144
+
145
+ content = f"""
146
+ PAGE TITLE: {info['title']}
147
+ META DESCRIPTION: {info['meta_description']}
148
+ PRIMARY H1: {info['h1']}
149
+ KEY H2 HEADINGS: {info['h2s']}
150
+ CALL-TO-ACTION BUTTONS: {info['ctas']}
151
+ CONTACT INFO PRESENT: {"Yes" if info['has_contact'] else "No"}
152
+ LINK ANALYSIS: {info['total_links']} total links, {info['external_links']} external
153
 
 
 
 
 
 
154
  VISIBLE CONTENT:
155
  {body_text}
156
  """
157
+ return content, True
158
 
159
 
160
+ def fetch_website_text_safe(url: str) -> Tuple[str, bool]:
161
+ """Safe wrapper that never crashes."""
 
 
162
  try:
163
  return fetch_website_text(url)
164
+ except requests.exceptions.Timeout:
165
+ return """⚠️ Website took too long to respond (timeout).
166
+ This might indicate slow server performance.
167
+ Analysis will be based on URL structure and general best practices.""", False
168
+ except requests.exceptions.SSLError:
169
+ return """⚠️ SSL Certificate error detected.
170
+ This is a major trust issue that should be fixed immediately.
171
+ Analysis will include this critical security concern.""", False
172
+ except requests.exceptions.ConnectionError:
173
+ return """⚠️ Could not connect to website.
174
+ Website may be down or have DNS issues.
175
+ Analysis will be based on general best practices.""", False
176
  except Exception as e:
177
+ return f"""⚠️ Unable to fully fetch website content.
178
+ Error: {str(e)}
179
+ Analysis will be based on available information and general best practices.""", False
 
 
 
 
180
 
181
 
182
  # -----------------------------
183
  # Gemini Analysis
184
  # -----------------------------
185
+ def analyze_website(api_key: str, url: str, industry: str, goal: str) -> str:
186
+ """Main analysis function."""
187
+
188
+ # Validate inputs
189
+ if not api_key or len(api_key) < 20:
190
+ return "❌ Please enter a valid Gemini API key. Get one at https://aistudio.google.com/apikey"
191
 
192
  if not url:
193
  return "❌ Please enter a website URL."
194
 
195
+ # Normalize and validate URL
196
+ url = normalize_url(url)
197
+ is_valid, error_msg = validate_url(url)
198
+ if not is_valid:
199
+ return f"❌ {error_msg}"
200
+
201
  try:
202
+ # Initialize client
203
+ try:
204
+ client = genai.Client(api_key=api_key)
205
+ except Exception as e:
206
+ return f"❌ Invalid API key. Please check your Gemini API key.\nError: {str(e)}"
207
 
208
+ # Fetch website content
209
+ website_text, fetch_success = fetch_website_text_safe(url)
210
+
211
+ fetch_status = "βœ… Full content analysis" if fetch_success else "⚠️ Limited analysis"
212
 
213
+ # Build enhanced prompt
214
+ prompt = f"""You are an AI consultant helping small businesses improve their websites.
215
 
216
+ Business Context:
 
 
217
  - Industry: {industry}
218
+ - Primary Goal: {goal}
219
+ - URL: {url}
220
+ - Content Fetch Status: {fetch_status}
221
+
222
+ Analyze the website content below and provide a comprehensive business-focused review.
223
+
224
+ Structure your response with clear sections:
225
+
226
+ ## 1. Messaging Clarity (Score: X/10)
227
+ **Main Issue:** [One sentence summary]
228
+ **Recommendations:**
229
+ - [Specific actionable item]
230
+ - [Specific actionable item]
231
+ - [Specific actionable item]
232
+
233
+ ## 2. Conversion Effectiveness (Score: X/10)
234
+ **Main Issue:** [One sentence summary]
235
+ **Recommendations:**
236
+ - [Specific actionable item]
237
+ - [Specific actionable item]
238
+ - [Specific actionable item]
239
+
240
+ ## 3. Trust & Credibility (Score: X/10)
241
+ **Main Issue:** [One sentence summary]
242
+ **Recommendations:**
243
+ - [Specific actionable item]
244
+ - [Specific actionable item]
245
+ - [Specific actionable item]
246
+
247
+ ## 4. User Experience Issues
248
+ - [Issue 1]
249
+ - [Issue 2]
250
+ - [Issue 3]
251
+
252
+ ## 5. AI & Automation Opportunities
253
+ For a {industry} business with limited tech resources:
254
+ - [Practical AI tool/solution #1]
255
+ - [Practical AI tool/solution #2]
256
+ - [Practical AI tool/solution #3]
257
+
258
+ ## Summary
259
+ **Overall Score:** X/100
260
+ **Top 3 Priority Fixes:**
261
+ 1. [Most urgent fix]
262
+ 2. [Second priority]
263
+ 3. [Third priority]
264
+
265
+ Use clear, non-technical language that a small business owner would understand.
266
+
267
+ Website Content:
268
  {website_text}
269
  """
270
 
271
+ # Generate analysis
272
  response = client.models.generate_content(
273
+ model="gemini-2.0-flash-exp",
274
  contents=prompt,
275
  )
276
 
277
+ result = f"# Analysis for {url}\n\n{response.text}"
278
+
279
+ if not fetch_success:
280
+ result += "\n\n---\n⚠️ **Note:** Analysis was performed with limited content due to website access issues."
281
+
282
+ return result
283
 
284
  except Exception as e:
285
+ logger.error(f"Analysis error: {str(e)}")
286
+ return f"❌ Error during analysis: {str(e)}\n\nPlease check your API key and try again."
287
 
288
 
289
  # -----------------------------
290
  # Gradio UI
291
  # -----------------------------
292
+ with gr.Blocks(
293
+ title="AI Website Review Tool",
294
+ theme=gr.themes.Soft(),
295
+ css="""
296
+ .gradio-container {max-width: 900px !important}
297
+ #output {min-height: 500px}
298
+ """
299
+ ) as demo:
300
+ gr.Markdown("# πŸ” AI Website Review Tool")
301
  gr.Markdown(
302
+ "Get actionable insights to improve your small business website using AI analysis."
 
 
 
 
 
 
303
  )
304
 
305
+ with gr.Row():
306
+ with gr.Column():
307
+ api_key = gr.Textbox(
308
+ label="πŸ”‘ Gemini API Key",
309
+ placeholder="Paste your Gemini API key here",
310
+ type="password",
311
+ info="Get your free API key at https://aistudio.google.com/apikey",
312
+ )
313
+
314
+ url = gr.Textbox(
315
+ label="🌐 Website URL",
316
+ placeholder="example.com or https://example.com",
317
+ info="Enter the homepage or any page you want analyzed",
318
+ )
319
+
320
+ gr.Examples(
321
+ examples=[
322
+ ["https://www.stripe.com"],
323
+ ["https://www.shopify.com"],
324
+ ],
325
+ inputs=url,
326
+ label="Try example websites",
327
+ )
328
+
329
+ with gr.Row():
330
+ industry = gr.Dropdown(
331
+ label="🏒 Industry",
332
+ choices=[
333
+ "General SMB",
334
+ "Law Firm",
335
+ "Hospitality",
336
+ "Healthcare",
337
+ "Real Estate",
338
+ "E-commerce",
339
+ "Consulting",
340
+ "Restaurant",
341
+ "Fitness",
342
+ "Education",
343
+ ],
344
+ value="General SMB",
345
+ )
346
+
347
+ goal = gr.Dropdown(
348
+ label="🎯 Primary Goal",
349
+ choices=[
350
+ "Generate leads",
351
+ "Sell products",
352
+ "Sell services",
353
+ "Build credibility",
354
+ "Educate visitors",
355
+ "Book appointments",
356
+ ],
357
+ value="Generate leads",
358
+ )
359
+
360
+ analyze_btn = gr.Button("πŸš€ Analyze Website", variant="primary", size="lg")
361
+
362
+ with gr.Row():
363
+ output = gr.Markdown(elem_id="output")
364
 
365
  analyze_btn.click(
366
  fn=analyze_website,
367
  inputs=[api_key, url, industry, goal],
368
  outputs=output,
369
  )
370
+
371
+ gr.Markdown("""
372
+ ---
373
+ ### Tips for Best Results:
374
+ - Ensure the website is publicly accessible (not behind a login)
375
+ - Use the homepage URL for overall site analysis
376
+ - Specific landing pages can be analyzed for targeted insights
377
+ - Analysis takes 10-30 seconds depending on website size
378
+ """)
379
+
380
+ demo.launch()