bparekh99 committed on
Commit
d2e8f75
·
verified ·
1 Parent(s): 3cf32d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -110
app.py CHANGED
@@ -3,12 +3,14 @@ import requests
3
  import socket
4
  import logging
5
  import time
 
6
  from bs4 import BeautifulSoup
7
- from urllib.parse import urlparse
 
8
  from google import genai
9
 
10
  # -------------------------------------------------
11
- # Logging setup (Hugging Face compatible)
12
  # -------------------------------------------------
13
  logging.basicConfig(
14
  level=logging.INFO,
@@ -18,21 +20,87 @@ logger = logging.getLogger(__name__)
18
 
19
  logger.info("AI Website Review Tool starting up")
20
 
 
 
 
 
 
 
 
21
  # -----------------------------
22
- # URL Normalization
23
  # -----------------------------
24
  def normalize_url(url: str) -> str:
 
 
25
  parsed = urlparse(url)
26
  if not parsed.scheme:
27
- return "https://" + url
28
  return url
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # -----------------------------
32
- # Fetch & Parse Website (Hardened)
33
  # -----------------------------
34
- def fetch_website_text(url: str) -> str:
35
- socket.setdefaulttimeout(10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  headers = {
38
  "User-Agent": (
@@ -40,163 +108,273 @@ def fetch_website_text(url: str) -> str:
40
  "AppleWebKit/537.36 (KHTML, like Gecko) "
41
  "Chrome/121.0 Safari/537.36"
42
  ),
43
- "Accept": "text/html,application/xhtml+xml",
44
  "Accept-Language": "en-US,en;q=0.9",
 
 
45
  }
46
 
47
- response = requests.get(
48
- url,
49
- headers=headers,
50
- timeout=10,
51
- allow_redirects=True,
52
- )
53
- response.raise_for_status()
 
 
 
 
 
 
 
54
 
55
  soup = BeautifulSoup(response.text, "html.parser")
56
 
57
  # Remove noisy tags
58
- for tag in soup(["script", "style", "noscript"]):
59
  tag.decompose()
60
 
61
- title = soup.title.string.strip() if soup.title else ""
62
- h1 = soup.find("h1").get_text(strip=True) if soup.find("h1") else ""
63
 
 
64
  body_text = " ".join(soup.stripped_strings)
65
- body_text = body_text[:8000] # token safety
 
 
 
 
 
 
 
 
 
66
 
67
- return f"""
68
- PAGE TITLE:
69
- {title}
70
- PRIMARY H1:
71
- {h1}
72
  VISIBLE CONTENT:
73
  {body_text}
74
  """
 
75
 
76
 
77
- # -----------------------------
78
- # Safe Wrapper (Never Crash)
79
- # -----------------------------
80
- def fetch_website_text_safe(url: str) -> str:
81
  try:
82
  return fetch_website_text(url)
 
 
 
 
 
 
 
 
 
 
 
 
83
  except Exception as e:
84
- return f"""
85
- ⚠️ Unable to fully fetch website content.
86
- Error:
87
- {str(e)}
88
- Fallback:
89
- Analyze based on URL structure, homepage intent, and general best practices.
90
- """
91
 
92
 
93
  # -----------------------------
94
  # Gemini Analysis
95
  # -----------------------------
96
- def analyze_website(api_key, url, industry, goal):
97
- if not api_key:
98
- return "❌ Please enter your Gemini API key."
 
 
 
99
 
100
  if not url:
101
  return "❌ Please enter a website URL."
102
 
 
 
 
 
 
 
103
  try:
104
- url = normalize_url(url)
 
 
 
 
105
 
106
- client = genai.Client(api_key=api_key)
 
 
 
107
 
108
- website_text = fetch_website_text_safe(url)
 
109
 
110
- prompt = f"""
111
- You are an AI consultant helping small businesses improve their websites.
112
- Business context:
113
  - Industry: {industry}
114
- - Primary goal: {goal}
115
- Analyze the website content below and provide recommendations in this structure:
116
- 1. Messaging Clarity (score 1–10)
117
- - Main issue
118
- - 2–3 actionable recommendations
119
- 2. Conversion Effectiveness (score 1–10)
120
- - Main issue
121
- - 2–3 actionable recommendations
122
- 3. Trust & Credibility (score 1–10)
123
- - Main issue
124
- - 2–3 actionable recommendations
125
- 4. User Experience Issues
126
- - Bullet list of issues
127
- 5. AI & Automation Opportunities
128
- - 3 concrete ideas a small business could implement
129
- End with:
130
- - Overall score out of 100
131
- - Top 3 fixes to prioritize this week
132
- Use clear, non-technical business language.
133
- Website content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  {website_text}
135
  """
136
 
 
137
  response = client.models.generate_content(
138
- model="gemini-2.5-flash",
139
  contents=prompt,
140
  )
141
 
142
- return response.text
 
 
 
 
 
143
 
144
  except Exception as e:
145
- return f"❌ Error during analysis: {str(e)}"
 
146
 
147
 
148
  # -----------------------------
149
  # Gradio UI
150
  # -----------------------------
151
- with gr.Blocks(title="AI Website Review Tool") as demo:
152
- gr.Markdown("## πŸ” AI Website Review Tool")
 
 
 
 
 
 
 
153
  gr.Markdown(
154
- "Analyze any website and receive practical, business-focused recommendations."
155
- )
156
-
157
- api_key = gr.Textbox(
158
- label="Gemini API Key",
159
- placeholder="Paste your Gemini API key here",
160
- type="password",
161
  )
162
 
163
- url = gr.Textbox(
164
- label="Website URL",
165
- placeholder="https://example.com",
166
- )
167
-
168
- industry = gr.Dropdown(
169
- label="Industry",
170
- choices=[
171
- "General SMB",
172
- "Law Firm",
173
- "Hospitality",
174
- "Healthcare",
175
- "Real Estate",
176
- ],
177
- value="General SMB",
178
- )
179
-
180
- goal = gr.Dropdown(
181
- label="Primary Website Goal",
182
- choices=[
183
- "Generate leads",
184
- "Sell services",
185
- "Build credibility",
186
- "Educate visitors",
187
- ],
188
- value="Generate leads",
189
- )
190
-
191
- analyze_btn = gr.Button("Analyze Website")
192
-
193
- status = gr.Markdown("")
194
- output = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  analyze_btn.click(
197
  fn=analyze_website,
198
  inputs=[api_key, url, industry, goal],
199
  outputs=output,
200
  )
201
-
202
- demo.launch()
 
 
 
 
 
 
 
 
 
 
3
  import socket
4
  import logging
5
  import time
6
+ import re
7
  from bs4 import BeautifulSoup
8
+ from urllib.parse import urlparse, urljoin
9
+ from typing import Dict, Tuple, Optional
10
  from google import genai
11
 
12
  # -------------------------------------------------
13
+ # Logging setup
14
  # -------------------------------------------------
15
  logging.basicConfig(
16
  level=logging.INFO,
 
20
 
21
  logger.info("AI Website Review Tool starting up")
22
 
23
+ # -------------------------------------------------
24
+ # Constants
25
+ # -------------------------------------------------
26
+ TIMEOUT = 15
27
+ MAX_RETRIES = 2
28
+ CONTENT_LIMIT = 12000
29
+
30
  # -----------------------------
31
+ # URL Validation & Normalization
32
  # -----------------------------
33
  def normalize_url(url: str) -> str:
34
+ """Normalize and validate URL format."""
35
+ url = url.strip()
36
  parsed = urlparse(url)
37
  if not parsed.scheme:
38
+ url = "https://" + url
39
  return url
40
 
41
 
42
+ def validate_url(url: str) -> Tuple[bool, str]:
43
+ """Validate URL format and accessibility."""
44
+ try:
45
+ parsed = urlparse(url)
46
+ if not parsed.netloc:
47
+ return False, "Invalid URL format. Please include domain name."
48
+
49
+ # Check for obviously invalid domains
50
+ if len(parsed.netloc) < 4 or '.' not in parsed.netloc:
51
+ return False, "Invalid domain name."
52
+
53
+ return True, ""
54
+ except Exception as e:
55
+ return False, f"URL validation error: {str(e)}"
56
+
57
+
58
  # -----------------------------
59
+ # Enhanced Content Extraction
60
  # -----------------------------
61
+ def extract_website_info(soup: BeautifulSoup, url: str) -> Dict[str, str]:
62
+ """Extract key website elements for analysis."""
63
+ info = {}
64
+
65
+ # Title
66
+ info['title'] = soup.title.string.strip() if soup.title else ""
67
+
68
+ # Meta description
69
+ meta_desc = soup.find("meta", attrs={"name": "description"})
70
+ info['meta_description'] = meta_desc.get("content", "").strip() if meta_desc else ""
71
+
72
+ # Headings
73
+ info['h1'] = soup.find("h1").get_text(strip=True) if soup.find("h1") else ""
74
+ h2_tags = soup.find_all("h2", limit=5)
75
+ info['h2s'] = " | ".join([h2.get_text(strip=True) for h2 in h2_tags])
76
+
77
+ # CTAs (buttons and prominent links)
78
+ cta_patterns = ['button', 'btn', 'cta', 'call-to-action']
79
+ ctas = []
80
+ for pattern in cta_patterns:
81
+ elements = soup.find_all(class_=re.compile(pattern, re.I))
82
+ ctas.extend([el.get_text(strip=True) for el in elements[:3]])
83
+ info['ctas'] = " | ".join(ctas[:5]) if ctas else "No clear CTAs found"
84
+
85
+ # Contact information
86
+ contact_indicators = soup.find_all(string=re.compile(r'contact|email|phone|call', re.I))
87
+ info['has_contact'] = len(contact_indicators) > 0
88
+
89
+ # Links analysis
90
+ links = soup.find_all('a', href=True)
91
+ info['total_links'] = len(links)
92
+ external_links = [l for l in links if urlparse(l['href']).netloc and urlparse(l['href']).netloc != urlparse(url).netloc]
93
+ info['external_links'] = len(external_links)
94
+
95
+ return info
96
+
97
+
98
+ def fetch_website_text(url: str) -> Tuple[str, bool]:
99
+ """
100
+ Fetch and parse website content.
101
+ Returns (content_string, success_boolean)
102
+ """
103
+ socket.setdefaulttimeout(TIMEOUT)
104
 
105
  headers = {
106
  "User-Agent": (
 
108
  "AppleWebKit/537.36 (KHTML, like Gecko) "
109
  "Chrome/121.0 Safari/537.36"
110
  ),
111
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
112
  "Accept-Language": "en-US,en;q=0.9",
113
+ "Accept-Encoding": "gzip, deflate",
114
+ "DNT": "1",
115
  }
116
 
117
+ for attempt in range(MAX_RETRIES):
118
+ try:
119
+ response = requests.get(
120
+ url,
121
+ headers=headers,
122
+ timeout=TIMEOUT,
123
+ allow_redirects=True,
124
+ )
125
+ response.raise_for_status()
126
+ break
127
+ except requests.exceptions.RequestException as e:
128
+ if attempt == MAX_RETRIES - 1:
129
+ raise
130
+ time.sleep(1)
131
 
132
  soup = BeautifulSoup(response.text, "html.parser")
133
 
134
  # Remove noisy tags
135
+ for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
136
  tag.decompose()
137
 
138
+ # Extract structured info
139
+ info = extract_website_info(soup, url)
140
 
141
+ # Body content
142
  body_text = " ".join(soup.stripped_strings)
143
+ body_text = body_text[:CONTENT_LIMIT]
144
+
145
+ content = f"""
146
+ PAGE TITLE: {info['title']}
147
+ META DESCRIPTION: {info['meta_description']}
148
+ PRIMARY H1: {info['h1']}
149
+ KEY H2 HEADINGS: {info['h2s']}
150
+ CALL-TO-ACTION BUTTONS: {info['ctas']}
151
+ CONTACT INFO PRESENT: {"Yes" if info['has_contact'] else "No"}
152
+ LINK ANALYSIS: {info['total_links']} total links, {info['external_links']} external
153
 
 
 
 
 
 
154
  VISIBLE CONTENT:
155
  {body_text}
156
  """
157
+ return content, True
158
 
159
 
160
+ def fetch_website_text_safe(url: str) -> Tuple[str, bool]:
161
+ """Safe wrapper that never crashes."""
 
 
162
  try:
163
  return fetch_website_text(url)
164
+ except requests.exceptions.Timeout:
165
+ return """⚠️ Website took too long to respond (timeout).
166
+ This might indicate slow server performance.
167
+ Analysis will be based on URL structure and general best practices.""", False
168
+ except requests.exceptions.SSLError:
169
+ return """⚠️ SSL Certificate error detected.
170
+ This is a major trust issue that should be fixed immediately.
171
+ Analysis will include this critical security concern.""", False
172
+ except requests.exceptions.ConnectionError:
173
+ return """⚠️ Could not connect to website.
174
+ Website may be down or have DNS issues.
175
+ Analysis will be based on general best practices.""", False
176
  except Exception as e:
177
+ return f"""⚠️ Unable to fully fetch website content.
178
+ Error: {str(e)}
179
+ Analysis will be based on available information and general best practices.""", False
 
 
 
 
180
 
181
 
182
  # -----------------------------
183
  # Gemini Analysis
184
  # -----------------------------
185
+ def analyze_website(api_key: str, url: str, industry: str, goal: str) -> str:
186
+ """Main analysis function."""
187
+
188
+ # Validate inputs
189
+ if not api_key or len(api_key) < 20:
190
+ return "❌ Please enter a valid Gemini API key. Get one at https://aistudio.google.com/apikey"
191
 
192
  if not url:
193
  return "❌ Please enter a website URL."
194
 
195
+ # Normalize and validate URL
196
+ url = normalize_url(url)
197
+ is_valid, error_msg = validate_url(url)
198
+ if not is_valid:
199
+ return f"❌ {error_msg}"
200
+
201
  try:
202
+ # Initialize client
203
+ try:
204
+ client = genai.Client(api_key=api_key)
205
+ except Exception as e:
206
+ return f"❌ Invalid API key. Please check your Gemini API key.\nError: {str(e)}"
207
 
208
+ # Fetch website content
209
+ website_text, fetch_success = fetch_website_text_safe(url)
210
+
211
+ fetch_status = "βœ… Full content analysis" if fetch_success else "⚠️ Limited analysis"
212
 
213
+ # Build enhanced prompt
214
+ prompt = f"""You are an AI consultant helping small businesses improve their websites.
215
 
216
+ Business Context:
 
 
217
  - Industry: {industry}
218
+ - Primary Goal: {goal}
219
+ - URL: {url}
220
+ - Content Fetch Status: {fetch_status}
221
+
222
+ Analyze the website content below and provide a comprehensive business-focused review.
223
+
224
+ Structure your response with clear sections:
225
+
226
+ ## 1. Messaging Clarity (Score: X/10)
227
+ **Main Issue:** [One sentence summary]
228
+ **Recommendations:**
229
+ - [Specific actionable item]
230
+ - [Specific actionable item]
231
+ - [Specific actionable item]
232
+
233
+ ## 2. Conversion Effectiveness (Score: X/10)
234
+ **Main Issue:** [One sentence summary]
235
+ **Recommendations:**
236
+ - [Specific actionable item]
237
+ - [Specific actionable item]
238
+ - [Specific actionable item]
239
+
240
+ ## 3. Trust & Credibility (Score: X/10)
241
+ **Main Issue:** [One sentence summary]
242
+ **Recommendations:**
243
+ - [Specific actionable item]
244
+ - [Specific actionable item]
245
+ - [Specific actionable item]
246
+
247
+ ## 4. User Experience Issues
248
+ - [Issue 1]
249
+ - [Issue 2]
250
+ - [Issue 3]
251
+
252
+ ## 5. AI & Automation Opportunities
253
+ For a {industry} business with limited tech resources:
254
+ - [Practical AI tool/solution #1]
255
+ - [Practical AI tool/solution #2]
256
+ - [Practical AI tool/solution #3]
257
+
258
+ ## Summary
259
+ **Overall Score:** X/100
260
+ **Top 3 Priority Fixes:**
261
+ 1. [Most urgent fix]
262
+ 2. [Second priority]
263
+ 3. [Third priority]
264
+
265
+ Use clear, non-technical language that a small business owner would understand.
266
+
267
+ Website Content:
268
  {website_text}
269
  """
270
 
271
+ # Generate analysis
272
  response = client.models.generate_content(
273
+ model="gemini-2.0-flash-exp",
274
  contents=prompt,
275
  )
276
 
277
+ result = f"# Analysis for {url}\n\n{response.text}"
278
+
279
+ if not fetch_success:
280
+ result += "\n\n---\n⚠️ **Note:** Analysis was performed with limited content due to website access issues."
281
+
282
+ return result
283
 
284
  except Exception as e:
285
+ logger.error(f"Analysis error: {str(e)}")
286
+ return f"❌ Error during analysis: {str(e)}\n\nPlease check your API key and try again."
287
 
288
 
289
  # -----------------------------
290
  # Gradio UI
291
  # -----------------------------
292
+ with gr.Blocks(
293
+ title="AI Website Review Tool",
294
+ theme=gr.themes.Soft(),
295
+ css="""
296
+ .gradio-container {max-width: 900px !important}
297
+ #output {min-height: 500px}
298
+ """
299
+ ) as demo:
300
+ gr.Markdown("# πŸ” AI Website Review Tool")
301
  gr.Markdown(
302
+ "Get actionable insights to improve your small business website using AI analysis."
 
 
 
 
 
 
303
  )
304
 
305
+ with gr.Row():
306
+ with gr.Column():
307
+ api_key = gr.Textbox(
308
+ label="πŸ”‘ Gemini API Key",
309
+ placeholder="Paste your Gemini API key here",
310
+ type="password",
311
+ info="Get your free API key at https://aistudio.google.com/apikey",
312
+ )
313
+
314
+ url = gr.Textbox(
315
+ label="🌐 Website URL",
316
+ placeholder="example.com or https://example.com",
317
+ info="Enter the homepage or any page you want analyzed",
318
+ )
319
+
320
+ gr.Examples(
321
+ examples=[
322
+ ["https://www.stripe.com"],
323
+ ["https://www.shopify.com"],
324
+ ],
325
+ inputs=url,
326
+ label="Try example websites",
327
+ )
328
+
329
+ with gr.Row():
330
+ industry = gr.Dropdown(
331
+ label="🏒 Industry",
332
+ choices=[
333
+ "General SMB",
334
+ "Law Firm",
335
+ "Hospitality",
336
+ "Healthcare",
337
+ "Real Estate",
338
+ "E-commerce",
339
+ "Consulting",
340
+ "Restaurant",
341
+ "Fitness",
342
+ "Education",
343
+ ],
344
+ value="General SMB",
345
+ )
346
+
347
+ goal = gr.Dropdown(
348
+ label="🎯 Primary Goal",
349
+ choices=[
350
+ "Generate leads",
351
+ "Sell products",
352
+ "Sell services",
353
+ "Build credibility",
354
+ "Educate visitors",
355
+ "Book appointments",
356
+ ],
357
+ value="Generate leads",
358
+ )
359
+
360
+ analyze_btn = gr.Button("πŸš€ Analyze Website", variant="primary", size="lg")
361
+
362
+ with gr.Row():
363
+ output = gr.Markdown(elem_id="output")
364
 
365
  analyze_btn.click(
366
  fn=analyze_website,
367
  inputs=[api_key, url, industry, goal],
368
  outputs=output,
369
  )
370
+
371
+ gr.Markdown("""
372
+ ---
373
+ ### Tips for Best Results:
374
+ - Ensure the website is publicly accessible (not behind a login)
375
+ - Use the homepage URL for overall site analysis
376
+ - Specific landing pages can be analyzed for targeted insights
377
+ - Analysis takes 10-30 seconds depending on website size
378
+ """)
379
+
380
+ demo.launch()