Update app.py
Browse filesReplace BS4 with trafilatura
app.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import asyncio
|
| 3 |
-
import httpx
|
| 4 |
-
from bs4 import BeautifulSoup
|
| 5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
|
| 6 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 7 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
@@ -13,6 +11,12 @@ import json
|
|
| 13 |
import time
|
| 14 |
import warnings
|
| 15 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
warnings.filterwarnings('ignore')
|
| 17 |
|
| 18 |
@dataclass
|
|
@@ -69,61 +73,100 @@ class SEOContentAnalyzer:
|
|
| 69 |
self.keyword_embedding = None
|
| 70 |
|
| 71 |
async def fetch_and_clean_html(self, url: str) -> Dict:
|
| 72 |
-
"""Fetch and clean HTML content from URL"""
|
| 73 |
try:
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
if
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
except Exception as e:
|
| 106 |
return {'url': url, 'success': False, 'error': str(e)}
|
| 107 |
|
| 108 |
async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
|
| 109 |
-
"""Crawl client and competitor URLs"""
|
| 110 |
all_urls = [client_url] + competitor_urls
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
# Process results
|
| 116 |
crawl_data = {
|
| 117 |
'client': None,
|
| 118 |
'competitors': [],
|
| 119 |
'failed_urls': []
|
| 120 |
}
|
| 121 |
|
| 122 |
-
for i,
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
continue
|
| 126 |
-
|
| 127 |
if not result.get('success'):
|
| 128 |
crawl_data['failed_urls'].append(result['url'])
|
| 129 |
continue
|
|
@@ -350,28 +393,41 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
|
|
| 350 |
return "β Please provide at least one competitor URL", empty_summary_df, empty_content_df
|
| 351 |
|
| 352 |
try:
|
| 353 |
-
progress(0.1, desc="Initializing analyzer...")
|
| 354 |
analyzer = SEOContentAnalyzer(api_key)
|
| 355 |
|
| 356 |
-
progress(0.2, desc="Crawling websites...")
|
| 357 |
crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)
|
| 358 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
if not crawl_data['client']:
|
| 360 |
return "β Failed to crawl client URL", empty_summary_df, empty_content_df
|
| 361 |
|
| 362 |
if not crawl_data['competitors']:
|
| 363 |
return "β Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
|
| 364 |
|
| 365 |
-
progress(0.4, desc="Processing content...")
|
| 366 |
chunks = analyzer.chunk_content(crawl_data)
|
| 367 |
|
| 368 |
-
|
|
|
|
|
|
|
|
|
|
| 369 |
sorted_chunks = await analyzer.calculate_similarities(keyword)
|
| 370 |
|
| 371 |
-
progress(0.8, desc="Analyzing
|
| 372 |
page_analyses = analyzer.analyze_pages(sorted_chunks)
|
| 373 |
|
| 374 |
-
progress(0.9, desc="Generating report...")
|
| 375 |
report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)
|
| 376 |
|
| 377 |
# Create summary data
|
|
@@ -400,7 +456,7 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
|
|
| 400 |
|
| 401 |
top_content_df = pd.DataFrame(top_content_data)
|
| 402 |
|
| 403 |
-
progress(1.0, desc="
|
| 404 |
|
| 405 |
return report, summary_df, top_content_df
|
| 406 |
|
|
@@ -411,19 +467,23 @@ def sync_run_seo_analysis(*args):
|
|
| 411 |
"""Synchronous wrapper for the async function"""
|
| 412 |
return asyncio.run(run_seo_analysis(*args))
|
| 413 |
|
| 414 |
-
# Create Gradio Interface
|
| 415 |
def create_interface():
|
| 416 |
-
with gr.Blocks(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
gr.Markdown("""
|
| 418 |
# π SEO Content Gap Analysis Tool
|
| 419 |
|
| 420 |
Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
|
| 421 |
|
| 422 |
-
**
|
| 423 |
-
1. Crawls your page and competitor pages
|
| 424 |
-
2. Chunks content intelligently (headers + paragraphs)
|
| 425 |
-
3. Uses OpenAI embeddings to measure semantic similarity to your keyword
|
| 426 |
-
4. Generates actionable SEO recommendations
|
| 427 |
""")
|
| 428 |
|
| 429 |
with gr.Row():
|
|
@@ -464,8 +524,8 @@ def create_interface():
|
|
| 464 |
with gr.Tabs():
|
| 465 |
with gr.TabItem("π SEO Report"):
|
| 466 |
report_output = gr.Markdown(
|
| 467 |
-
label="SEO Analysis Report",
|
| 468 |
-
value="Click 'Run Analysis' to generate your SEO report..."
|
| 469 |
)
|
| 470 |
|
| 471 |
with gr.TabItem("π Page Summary"):
|
|
@@ -482,7 +542,7 @@ def create_interface():
|
|
| 482 |
value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
|
| 483 |
)
|
| 484 |
|
| 485 |
-
#
|
| 486 |
gr.Markdown("""
|
| 487 |
### π‘ Example Usage
|
| 488 |
|
|
@@ -494,6 +554,12 @@ def create_interface():
|
|
| 494 |
https://contentmarketinginstitute.com/strategy
|
| 495 |
https://neilpatel.com/blog/content-marketing-strategy
|
| 496 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
""")
|
| 498 |
|
| 499 |
# Event handler
|
|
@@ -507,8 +573,9 @@ def create_interface():
|
|
| 507 |
### β οΈ Important Notes
|
| 508 |
- Analysis may take 2-5 minutes depending on content size
|
| 509 |
- Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
|
| 510 |
-
-
|
| 511 |
-
-
|
|
|
|
| 512 |
""")
|
| 513 |
|
| 514 |
return demo
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import asyncio
|
|
|
|
|
|
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
|
| 4 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 11 |
import time
|
| 12 |
import warnings
|
| 13 |
import os
|
| 14 |
+
import re
|
| 15 |
+
|
| 16 |
+
# Trafilatura imports
|
| 17 |
+
from trafilatura import fetch_url, extract, bare_extraction
|
| 18 |
+
from trafilatura.downloads import fetch_url as trafilatura_fetch
|
| 19 |
+
|
| 20 |
warnings.filterwarnings('ignore')
|
| 21 |
|
| 22 |
@dataclass
|
|
|
|
| 73 |
self.keyword_embedding = None
|
| 74 |
|
| 75 |
async def fetch_and_clean_html(self, url: str) -> Dict:
|
| 76 |
+
"""Fetch and clean HTML content from URL using Trafilatura"""
|
| 77 |
try:
|
| 78 |
+
# Use trafilatura to fetch the URL with custom settings
|
| 79 |
+
downloaded = trafilatura_fetch(url)
|
| 80 |
+
|
| 81 |
+
if not downloaded:
|
| 82 |
+
return {'url': url, 'success': False, 'error': 'Failed to download'}
|
| 83 |
+
|
| 84 |
+
# Extract text content using trafilatura
|
| 85 |
+
text_content = extract(downloaded, include_comments=False, include_tables=True)
|
| 86 |
+
|
| 87 |
+
if not text_content:
|
| 88 |
+
return {'url': url, 'success': False, 'error': 'No content extracted'}
|
| 89 |
+
|
| 90 |
+
# Extract with metadata to get title and other info
|
| 91 |
+
metadata_result = bare_extraction(downloaded, include_comments=False, include_tables=True)
|
| 92 |
+
|
| 93 |
+
# Handle Document object properly
|
| 94 |
+
title = ''
|
| 95 |
+
if metadata_result:
|
| 96 |
+
if hasattr(metadata_result, 'title') and metadata_result.title:
|
| 97 |
+
title = metadata_result.title
|
| 98 |
+
elif hasattr(metadata_result, 'get'):
|
| 99 |
+
title = metadata_result.get('title', '')
|
| 100 |
+
else:
|
| 101 |
+
# Try to access as attribute
|
| 102 |
+
try:
|
| 103 |
+
title = getattr(metadata_result, 'title', '')
|
| 104 |
+
except:
|
| 105 |
+
title = ''
|
| 106 |
+
|
| 107 |
+
# Extract HTML with formatting for header splitting
|
| 108 |
+
html_content = extract(downloaded, output_format='xml', include_comments=False, include_tables=True)
|
| 109 |
+
|
| 110 |
+
# Convert trafilatura XML to simple HTML for header splitting
|
| 111 |
+
if html_content and len(html_content) > 100:
|
| 112 |
+
# Simple conversion: replace XML tags with HTML equivalents
|
| 113 |
+
html_for_splitting = html_content
|
| 114 |
+
# Convert <head> tags to proper header tags
|
| 115 |
+
html_for_splitting = re.sub(r'<head rend="(h[1-6])"[^>]*>', r'<\1>', html_for_splitting)
|
| 116 |
+
html_for_splitting = re.sub(r'<head rend="h(\d)"[^>]*>', r'<h\1>', html_for_splitting)
|
| 117 |
+
html_for_splitting = re.sub(r'</head>', '</h2>', html_for_splitting)
|
| 118 |
+
html_for_splitting = re.sub(r'<head[^>]*>', '<h2>', html_for_splitting)
|
| 119 |
+
# Wrap in div
|
| 120 |
+
html_for_splitting = f"<div>{html_for_splitting}</div>"
|
| 121 |
+
else:
|
| 122 |
+
# Fallback: create simple HTML structure from text
|
| 123 |
+
# Try to detect headers in plain text
|
| 124 |
+
lines = text_content.split('\n')
|
| 125 |
+
html_lines = []
|
| 126 |
+
for line in lines:
|
| 127 |
+
line = line.strip()
|
| 128 |
+
if line:
|
| 129 |
+
# Simple heuristic: short lines that might be headers
|
| 130 |
+
if len(line) < 100 and len(line) > 5 and not line.endswith('.') and not line.endswith(',') and not line.endswith(';'):
|
| 131 |
+
# Check if it looks like a header (title case, shorter, etc.)
|
| 132 |
+
if line.istitle() or line.isupper() or (len(line.split()) <= 8):
|
| 133 |
+
html_lines.append(f"<h3>{line}</h3>")
|
| 134 |
+
else:
|
| 135 |
+
html_lines.append(f"<p>{line}</p>")
|
| 136 |
+
else:
|
| 137 |
+
html_lines.append(f"<p>{line}</p>")
|
| 138 |
|
| 139 |
+
html_for_splitting = f"<div>{''.join(html_lines)}</div>"
|
| 140 |
+
|
| 141 |
+
word_count = len(text_content.split())
|
| 142 |
+
|
| 143 |
+
return {
|
| 144 |
+
'url': url,
|
| 145 |
+
'title': title,
|
| 146 |
+
'text': text_content,
|
| 147 |
+
'html': html_for_splitting,
|
| 148 |
+
'success': True,
|
| 149 |
+
'word_count': word_count
|
| 150 |
+
}
|
| 151 |
|
| 152 |
except Exception as e:
|
| 153 |
return {'url': url, 'success': False, 'error': str(e)}
|
| 154 |
|
| 155 |
async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
|
| 156 |
+
"""Crawl client and competitor URLs using Trafilatura"""
|
| 157 |
all_urls = [client_url] + competitor_urls
|
| 158 |
|
| 159 |
+
# Since trafilatura is synchronous, we'll run them sequentially
|
| 160 |
+
# but we can still use async structure for consistency
|
|
|
|
|
|
|
| 161 |
crawl_data = {
|
| 162 |
'client': None,
|
| 163 |
'competitors': [],
|
| 164 |
'failed_urls': []
|
| 165 |
}
|
| 166 |
|
| 167 |
+
for i, url in enumerate(all_urls):
|
| 168 |
+
result = await self.fetch_and_clean_html(url)
|
| 169 |
+
|
|
|
|
|
|
|
| 170 |
if not result.get('success'):
|
| 171 |
crawl_data['failed_urls'].append(result['url'])
|
| 172 |
continue
|
|
|
|
| 393 |
return "β Please provide at least one competitor URL", empty_summary_df, empty_content_df
|
| 394 |
|
| 395 |
try:
|
| 396 |
+
progress(0.1, desc="Initializing analyzer with Trafilatura...")
|
| 397 |
analyzer = SEOContentAnalyzer(api_key)
|
| 398 |
|
| 399 |
+
progress(0.2, desc="Crawling websites with enhanced extraction...")
|
| 400 |
crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)
|
| 401 |
|
| 402 |
+
# Check if we have any successful crawls
|
| 403 |
+
total_successful = 0
|
| 404 |
+
if crawl_data['client']:
|
| 405 |
+
total_successful += 1
|
| 406 |
+
total_successful += len(crawl_data['competitors'])
|
| 407 |
+
|
| 408 |
+
if total_successful == 0:
|
| 409 |
+
failed_urls = ', '.join(crawl_data['failed_urls'][:3])
|
| 410 |
+
return f"β No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df
|
| 411 |
+
|
| 412 |
if not crawl_data['client']:
|
| 413 |
return "β Failed to crawl client URL", empty_summary_df, empty_content_df
|
| 414 |
|
| 415 |
if not crawl_data['competitors']:
|
| 416 |
return "β Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
|
| 417 |
|
| 418 |
+
progress(0.4, desc="Processing content with intelligent chunking...")
|
| 419 |
chunks = analyzer.chunk_content(crawl_data)
|
| 420 |
|
| 421 |
+
if not chunks:
|
| 422 |
+
return "β No content chunks were created from the crawled pages", empty_summary_df, empty_content_df
|
| 423 |
+
|
| 424 |
+
progress(0.6, desc="Calculating semantic similarities...")
|
| 425 |
sorted_chunks = await analyzer.calculate_similarities(keyword)
|
| 426 |
|
| 427 |
+
progress(0.8, desc="Analyzing page performance...")
|
| 428 |
page_analyses = analyzer.analyze_pages(sorted_chunks)
|
| 429 |
|
| 430 |
+
progress(0.9, desc="Generating AI-powered SEO report...")
|
| 431 |
report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)
|
| 432 |
|
| 433 |
# Create summary data
|
|
|
|
| 456 |
|
| 457 |
top_content_df = pd.DataFrame(top_content_data)
|
| 458 |
|
| 459 |
+
progress(1.0, desc="Analysis complete!")
|
| 460 |
|
| 461 |
return report, summary_df, top_content_df
|
| 462 |
|
|
|
|
| 467 |
"""Synchronous wrapper for the async function"""
|
| 468 |
return asyncio.run(run_seo_analysis(*args))
|
| 469 |
|
| 470 |
+
# Create Gradio Interface with Glass Theme
|
| 471 |
def create_interface():
|
| 472 |
+
with gr.Blocks(
|
| 473 |
+
title="SEO Content Gap Analysis",
|
| 474 |
+
theme=gr.themes.Glass(
|
| 475 |
+
primary_hue="blue",
|
| 476 |
+
secondary_hue="slate",
|
| 477 |
+
neutral_hue="zinc",
|
| 478 |
+
font="Inter"
|
| 479 |
+
)
|
| 480 |
+
) as demo:
|
| 481 |
gr.Markdown("""
|
| 482 |
# π SEO Content Gap Analysis Tool
|
| 483 |
|
| 484 |
Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
|
| 485 |
|
| 486 |
+
**Enhanced with Trafilatura** for superior content extraction and intelligent header-based chunking.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
""")
|
| 488 |
|
| 489 |
with gr.Row():
|
|
|
|
| 524 |
with gr.Tabs():
|
| 525 |
with gr.TabItem("π SEO Report"):
|
| 526 |
report_output = gr.Markdown(
|
| 527 |
+
label="AI-Generated SEO Analysis Report",
|
| 528 |
+
value="Click 'Run Analysis' to generate your comprehensive SEO report with actionable insights..."
|
| 529 |
)
|
| 530 |
|
| 531 |
with gr.TabItem("π Page Summary"):
|
|
|
|
| 542 |
value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
|
| 543 |
)
|
| 544 |
|
| 545 |
+
# Enhanced example section
|
| 546 |
gr.Markdown("""
|
| 547 |
### π‘ Example Usage
|
| 548 |
|
|
|
|
| 554 |
https://contentmarketinginstitute.com/strategy
|
| 555 |
https://neilpatel.com/blog/content-marketing-strategy
|
| 556 |
```
|
| 557 |
+
|
| 558 |
+
### β¨ What's New
|
| 559 |
+
- **Enhanced Content Extraction**: Uses Trafilatura for better content quality
|
| 560 |
+
- **Intelligent Chunking**: Header-aware splitting for more accurate analysis
|
| 561 |
+
- **Improved Accuracy**: Better handling of complex page structures
|
| 562 |
+
- **Glass Theme**: Modern, sleek interface design
|
| 563 |
""")
|
| 564 |
|
| 565 |
# Event handler
|
|
|
|
| 573 |
### β οΈ Important Notes
|
| 574 |
- Analysis may take 2-5 minutes depending on content size
|
| 575 |
- Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
|
| 576 |
+
- Enhanced extraction works best with any type of web content
|
| 577 |
+
- Trafilatura respects robots.txt and implements smart rate limiting
|
| 578 |
+
- Glass theme provides modern, professional appearance
|
| 579 |
""")
|
| 580 |
|
| 581 |
return demo
|