ryanshelley committed
Commit 6b25473 · verified · parent: c6d4a83

Create app.py

Files changed (1): app.py (+513, −0)
app.py ADDED
import gradio as gr
import asyncio
import httpx
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List, Dict, Optional
import pandas as pd
from dataclasses import dataclass
import warnings

warnings.filterwarnings('ignore')
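
# Rough dependency list inferred from the imports above (the usual PyPI
# distribution names; versions deliberately unpinned, adjust as needed):
#   pip install gradio httpx beautifulsoup4 langchain langchain-openai \
#       scikit-learn numpy pandas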

@dataclass
class ContentChunk:
    content: str
    url: str
    page_type: str  # 'client' or 'competitor'
    chunk_index: int
    chunk_type: str  # 'header_section', 'paragraph', or 'header_subsection'
    header_info: Optional[Dict] = None  # Stores header level and text
    similarity_score: float = 0.0

@dataclass
class PageAnalysis:
    url: str
    page_type: str
    total_chunks: int
    avg_similarity: float
    max_similarity: float
    top_chunks: List[ContentChunk]

class SEOContentAnalyzer:
    def __init__(self, api_key: str):
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=api_key
        )
        self.llm = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0.3,
            openai_api_key=api_key
        )

        # Header-based splitter (first level)
        self.html_splitter = HTMLHeaderTextSplitter(
            headers_to_split_on=[
                ("h1", "Header 1"),
                ("h2", "Header 2"),
                ("h3", "Header 3"),
                ("h4", "Header 4"),
                ("h5", "Header 5"),
                ("h6", "Header 6"),
            ]
        )

        # Paragraph-based splitter (second level)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=600,
            chunk_overlap=100,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

        self.all_chunks = []
        self.keyword_embedding = None
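
    # Design note: splitting on headers first keeps chunks aligned with the
    # page's own section structure; the character splitter only sub-divides
    # oversized sections or serves as a fallback. chunk_size=600 with
    # chunk_overlap=100 are this script's heuristics, not tuned values.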

    async def fetch_and_clean_html(self, url: str) -> Dict:
        """Fetch and clean HTML content from URL"""
        try:
            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                response = await client.get(url)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove unwanted elements
                for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                    element.decompose()

                # Try to find the main content area
                main_content = (
                    soup.find('main') or
                    soup.find('article') or
                    soup.find(class_=lambda x: x and any(word in x.lower() for word in ['content', 'post', 'article'])) or
                    soup.find('body')
                )

                if main_content:
                    text_content = main_content.get_text(separator='\n', strip=True)
                    text_content = '\n'.join(line.strip() for line in text_content.split('\n') if line.strip())

                    return {
                        'url': url,
                        'title': soup.title.string if soup.title else '',
                        'text': text_content,
                        'html': str(main_content),  # Keep HTML for header splitting
                        'success': True,
                        'word_count': len(text_content.split())
                    }

                # No <main>/<article>/<body> found: report failure explicitly
                # rather than falling through and returning None
                return {'url': url, 'success': False, 'error': 'No content found'}

        except Exception as e:
            return {'url': url, 'success': False, 'error': str(e)}

    async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
        """Crawl client and competitor URLs"""
        all_urls = [client_url] + competitor_urls

        tasks = [self.fetch_and_clean_html(url) for url in all_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results
        crawl_data = {
            'client': None,
            'competitors': [],
            'failed_urls': []
        }

        for i, result in enumerate(results):
            if isinstance(result, Exception):
                crawl_data['failed_urls'].append(all_urls[i])
                continue

            if not result.get('success'):
                crawl_data['failed_urls'].append(result['url'])
                continue

            if i == 0:  # First URL is the client
                crawl_data['client'] = result
            else:
                crawl_data['competitors'].append(result)

        return crawl_data

    def chunk_content(self, crawl_data: Dict) -> List[ContentChunk]:
        """Chunk all content using header-first, then paragraph-level splitting"""
        all_chunks = []

        # Process client content
        if crawl_data['client']:
            client_chunks = self._chunk_single_page(crawl_data['client'], 'client')
            all_chunks.extend(client_chunks)

        # Process competitor content
        for comp_data in crawl_data['competitors']:
            comp_chunks = self._chunk_single_page(comp_data, 'competitor')
            all_chunks.extend(comp_chunks)

        self.all_chunks = all_chunks
        return all_chunks

    def _chunk_single_page(self, page_data: Dict, page_type: str) -> List[ContentChunk]:
        """Chunk a single page using the header + paragraph strategy"""
        chunks = []
        chunk_index = 0

        try:
            # Step 1: Try header-based splitting first
            if 'html' in page_data:
                header_splits = self.html_splitter.split_text(page_data['html'])

                if header_splits and len(header_splits) > 1:
                    # Headers found: process each section
                    for split in header_splits:
                        header_info = split.metadata if hasattr(split, 'metadata') else {}
                        content = split.page_content if hasattr(split, 'page_content') else str(split)

                        # If the header section is large, split it further by paragraphs
                        if len(content) > 800:
                            sub_chunks = self.text_splitter.split_text(content)
                            for sub_chunk in sub_chunks:
                                if len(sub_chunk.strip()) > 50:
                                    chunks.append(ContentChunk(
                                        content=sub_chunk.strip(),
                                        url=page_data['url'],
                                        page_type=page_type,
                                        chunk_index=chunk_index,
                                        chunk_type='header_subsection',
                                        header_info=header_info
                                    ))
                                    chunk_index += 1
                        else:
                            # Small header section, keep as is
                            if len(content.strip()) > 50:
                                chunks.append(ContentChunk(
                                    content=content.strip(),
                                    url=page_data['url'],
                                    page_type=page_type,
                                    chunk_index=chunk_index,
                                    chunk_type='header_section',
                                    header_info=header_info
                                ))
                                chunk_index += 1
                else:
                    # No meaningful headers found, fall back to paragraph splitting
                    self._add_paragraph_chunks(page_data, page_type, chunks, chunk_index)
            else:
                # No HTML available, use text splitting
                self._add_paragraph_chunks(page_data, page_type, chunks, chunk_index)

        except Exception:
            # Splitting failed partway: discard any partial chunks so the
            # fallback does not duplicate content
            chunks.clear()
            self._add_paragraph_chunks(page_data, page_type, chunks, 0)

        return chunks

    def _add_paragraph_chunks(self, page_data: Dict, page_type: str, chunks: List, start_index: int):
        """Add paragraph-level chunks as fallback"""
        text_chunks = self.text_splitter.split_text(page_data['text'])
        chunk_index = start_index

        for chunk_text in text_chunks:
            if len(chunk_text.strip()) > 50:
                chunks.append(ContentChunk(
                    content=chunk_text.strip(),
                    url=page_data['url'],
                    page_type=page_type,
                    chunk_index=chunk_index,
                    chunk_type='paragraph',
                    header_info={}
                ))
                chunk_index += 1

    async def calculate_similarities(self, keyword: str) -> List[ContentChunk]:
        """Calculate cosine similarity between chunks and keyword"""
        if not self.all_chunks:
            raise ValueError("No chunks available. Run chunk_content first.")

        # Create embeddings for keyword
        self.keyword_embedding = await self.embeddings.aembed_query(keyword)

        # Create embeddings for all chunks
        chunk_texts = [chunk.content for chunk in self.all_chunks]
        chunk_embeddings = await self.embeddings.aembed_documents(chunk_texts)

        # Calculate similarities
        similarities = cosine_similarity([self.keyword_embedding], chunk_embeddings)[0]

        # Update chunks with similarity scores
        for i, chunk in enumerate(self.all_chunks):
            chunk.similarity_score = float(similarities[i])

        # Sort by similarity score
        sorted_chunks = sorted(self.all_chunks, key=lambda x: x.similarity_score, reverse=True)

        return sorted_chunks
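
    # Background: cosine similarity is (a · b) / (‖a‖ · ‖b‖). OpenAI
    # embeddings come back unit-normalised, so this is effectively a dot
    # product; relative ranking across chunks matters more than the
    # absolute score.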

    def analyze_pages(self, sorted_chunks: List[ContentChunk]) -> Dict[str, PageAnalysis]:
        """Analyze performance by page"""
        # Group chunks by URL
        url_groups = {}
        for chunk in sorted_chunks:
            if chunk.url not in url_groups:
                url_groups[chunk.url] = []
            url_groups[chunk.url].append(chunk)

        page_analyses = {}
        for url, chunks in url_groups.items():
            page_type = chunks[0].page_type
            similarities = [chunk.similarity_score for chunk in chunks]

            analysis = PageAnalysis(
                url=url,
                page_type=page_type,
                total_chunks=len(chunks),
                avg_similarity=float(np.mean(similarities)),
                max_similarity=float(np.max(similarities)),
                top_chunks=sorted(chunks, key=lambda x: x.similarity_score, reverse=True)[:3]
            )

            page_analyses[url] = analysis

        return page_analyses

    async def generate_report(self, keyword: str, page_analyses: Dict[str, PageAnalysis],
                              sorted_chunks: List[ContentChunk]) -> str:
        """Generate comprehensive SEO report"""
        # Prepare data for the LLM
        client_analysis = next((p for p in page_analyses.values() if p.page_type == 'client'), None)
        competitor_analyses = [p for p in page_analyses.values() if p.page_type == 'competitor']

        # Get top-performing content
        top_chunks = sorted_chunks[:5]
        client_top_chunks = [c for c in sorted_chunks if c.page_type == 'client'][:3]
        competitor_top_chunks = [c for c in sorted_chunks if c.page_type == 'competitor'][:5]

        # Format client analysis data safely
        client_url = client_analysis.url if client_analysis else 'No client data'
        client_chunks = client_analysis.total_chunks if client_analysis else 0
        client_avg = f"{client_analysis.avg_similarity:.4f}" if client_analysis else "0.0000"
        client_max = f"{client_analysis.max_similarity:.4f}" if client_analysis else "0.0000"

        # Create the prompt for the LLM
        prompt = f"""
As an SEO expert, analyze this content relevance data for the keyword "{keyword}" and provide actionable insights.

CLIENT PAGE PERFORMANCE:
URL: {client_url}
Total Chunks: {client_chunks}
Average Similarity: {client_avg}
Max Similarity: {client_max}

TOP CLIENT CONTENT SECTIONS:
{chr(10).join([f"Score {c.similarity_score:.4f}: {c.content[:200]}..." for c in client_top_chunks[:3]])}

COMPETITOR PERFORMANCE:
{chr(10).join([f"URL: {p.url}, Avg: {p.avg_similarity:.4f}, Max: {p.max_similarity:.4f}" for p in competitor_analyses])}

TOP COMPETITOR CONTENT SECTIONS:
{chr(10).join([f"Score {c.similarity_score:.4f} ({c.url}): {c.content[:200]}..." for c in competitor_top_chunks[:3]])}

OVERALL TOP PERFORMING CONTENT:
{chr(10).join([f"Score {c.similarity_score:.4f} ({c.page_type}): {c.content[:150]}..." for c in top_chunks])}

Please provide:
1. Which page/content is strongest for this keyword?
2. What sections are performing best?
3. What is our client page doing well?
4. What is our client page missing compared to competitors?
5. Specific actionable recommendations to improve content relevance

Format as a clear, actionable SEO report.
"""

        response = await self.llm.ainvoke(prompt)
        return response.content

# Gradio Interface Functions
async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competitor_urls_text: str, progress=gr.Progress()):
    """Main function to run SEO analysis"""

    if not api_key:
        return "❌ Please provide your OpenAI API key", None, None

    if not keyword or not client_url:
        return "❌ Please provide both keyword and client URL", None, None

    # Parse competitor URLs (one per line)
    competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()]

    if not competitor_urls:
        return "❌ Please provide at least one competitor URL", None, None

    try:
        progress(0.1, desc="Initializing analyzer...")
        analyzer = SEOContentAnalyzer(api_key)

        progress(0.2, desc="Crawling websites...")
        crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)

        if not crawl_data['client']:
            return "❌ Failed to crawl client URL", None, None

        if not crawl_data['competitors']:
            return "❌ Failed to crawl any competitor URLs", None, None

        progress(0.4, desc="Processing content...")
        analyzer.chunk_content(crawl_data)

        progress(0.6, desc="Calculating similarities...")
        sorted_chunks = await analyzer.calculate_similarities(keyword)

        progress(0.8, desc="Analyzing pages...")
        page_analyses = analyzer.analyze_pages(sorted_chunks)

        progress(0.9, desc="Generating report...")
        report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)

        # Create summary data
        summary_data = []
        for url, analysis in page_analyses.items():
            summary_data.append({
                'URL': url,
                'Type': analysis.page_type.title(),
                'Total Chunks': analysis.total_chunks,
                'Avg Similarity': f"{analysis.avg_similarity:.4f}",
                'Max Similarity': f"{analysis.max_similarity:.4f}"
            })

        summary_df = pd.DataFrame(summary_data)

        # Create top content data
        top_content_data = []
        for i, chunk in enumerate(sorted_chunks[:10], 1):
            top_content_data.append({
                'Rank': i,
                'Type': chunk.page_type.title(),
                'Score': f"{chunk.similarity_score:.4f}",
                'Content Preview': chunk.content[:150] + "..." if len(chunk.content) > 150 else chunk.content,
                'URL': chunk.url
            })

        top_content_df = pd.DataFrame(top_content_data)

        progress(1.0, desc="Complete!")

        return report, summary_df, top_content_df

    except Exception as e:
        return f"❌ Error during analysis: {str(e)}", None, None

def sync_run_seo_analysis(*args):
    """Synchronous wrapper for the async function"""
    return asyncio.run(run_seo_analysis(*args))
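
# Note: Gradio also accepts async callbacks directly, so binding
# run_seo_analysis to the button without this wrapper should work on
# current Gradio versions; the wrapper just keeps the handler synchronous.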

# Create Gradio Interface
def create_interface():
    with gr.Blocks(title="SEO Content Gap Analysis", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔍 SEO Content Gap Analysis Tool

        Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.

        **How it works:**
        1. Crawls your page and competitor pages
        2. Chunks content intelligently (headers + paragraphs)
        3. Uses OpenAI embeddings to measure semantic similarity to your keyword
        4. Generates actionable SEO recommendations
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔑 Configuration")

                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                    info="Your OpenAI API key for embeddings and analysis"
                )

                keyword = gr.Textbox(
                    label="Target Keyword",
                    placeholder="e.g., python web scraping",
                    info="The keyword you want to optimize for"
                )

                client_url = gr.Textbox(
                    label="Your Page URL",
                    placeholder="https://yoursite.com/page",
                    info="The URL of your page to analyze"
                )

                competitor_urls = gr.Textbox(
                    label="Competitor URLs",
                    placeholder="https://competitor1.com/page\nhttps://competitor2.com/page",
                    lines=5,
                    info="One URL per line (2-5 competitors recommended)"
                )

                analyze_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")

            with gr.Column(scale=2):
                gr.Markdown("### 📊 Results")

                with gr.Tabs():
                    with gr.TabItem("📝 SEO Report"):
                        report_output = gr.Markdown(
                            label="SEO Analysis Report",
                            value="Click 'Run Analysis' to generate your SEO report..."
                        )

                    with gr.TabItem("📈 Page Summary"):
                        summary_output = gr.Dataframe(
                            label="Page Performance Summary",
                            headers=["URL", "Type", "Total Chunks", "Avg Similarity", "Max Similarity"]
                        )

                    with gr.TabItem("🎯 Top Content"):
                        top_content_output = gr.Dataframe(
                            label="Top Performing Content Sections",
                            headers=["Rank", "Type", "Score", "Content Preview", "URL"]
                        )

        # Example section
        gr.Markdown("""
        ### 💡 Example Usage

        **Keyword:** `content marketing strategy`
        **Your URL:** `https://yoursite.com/content-marketing-guide`
        **Competitors:**
        ```
        https://hubspot.com/content-marketing
        https://contentmarketinginstitute.com/strategy
        https://neilpatel.com/blog/content-marketing-strategy
        ```
        """)

        # Event handler
        analyze_btn.click(
            fn=sync_run_seo_analysis,
            inputs=[api_key, keyword, client_url, competitor_urls],
            outputs=[report_output, summary_output, top_content_output]
        )

        gr.Markdown("""
        ### ⚠️ Important Notes
        - Analysis may take 2-5 minutes depending on content size
        - Requires an OpenAI API key (costs roughly $0.01-0.10 per analysis)
        - Works best with content-rich pages (blogs, guides, etc.)
        - No robots.txt handling is built in; crawl responsibly and respect site terms
        """)

    return demo

# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
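
# Usage sketch (assumes Gradio defaults): run `python app.py`, then open
# http://127.0.0.1:7860 in a browser.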