ryanshelley commited on
Commit
0ba165b
Β·
verified Β·
1 Parent(s): 4022ccc

Update app.py

Browse files

Replace BS4 with trafilatura

Files changed (1) hide show
  1. app.py +128 -61
app.py CHANGED
@@ -1,7 +1,5 @@
1
  import gradio as gr
2
  import asyncio
3
- import httpx
4
- from bs4 import BeautifulSoup
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
6
  from langchain_openai import OpenAIEmbeddings, ChatOpenAI
7
  from sklearn.metrics.pairwise import cosine_similarity
@@ -13,6 +11,12 @@ import json
13
  import time
14
  import warnings
15
  import os
 
 
 
 
 
 
16
  warnings.filterwarnings('ignore')
17
 
18
  @dataclass
@@ -69,61 +73,100 @@ class SEOContentAnalyzer:
69
  self.keyword_embedding = None
70
 
71
  async def fetch_and_clean_html(self, url: str) -> Dict:
72
- """Fetch and clean HTML content from URL"""
73
  try:
74
- async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
75
- response = await client.get(url)
76
- response.raise_for_status()
77
-
78
- soup = BeautifulSoup(response.text, 'html.parser')
79
-
80
- # Remove unwanted elements
81
- for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
82
- element.decompose()
83
-
84
- # Try to find main content area
85
- main_content = (
86
- soup.find('main') or
87
- soup.find('article') or
88
- soup.find(class_=lambda x: x and any(word in x.lower() for word in ['content', 'post', 'article'])) or
89
- soup.find('body')
90
- )
91
-
92
- if main_content:
93
- text_content = main_content.get_text(separator='\n', strip=True)
94
- text_content = '\n'.join(line.strip() for line in text_content.split('\n') if line.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- return {
97
- 'url': url,
98
- 'title': soup.title.string if soup.title else '',
99
- 'text': text_content,
100
- 'html': str(main_content), # Keep HTML for header splitting
101
- 'success': True,
102
- 'word_count': len(text_content.split())
103
- }
 
 
 
 
104
 
105
  except Exception as e:
106
  return {'url': url, 'success': False, 'error': str(e)}
107
 
108
  async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
109
- """Crawl client and competitor URLs"""
110
  all_urls = [client_url] + competitor_urls
111
 
112
- tasks = [self.fetch_and_clean_html(url) for url in all_urls]
113
- results = await asyncio.gather(*tasks, return_exceptions=True)
114
-
115
- # Process results
116
  crawl_data = {
117
  'client': None,
118
  'competitors': [],
119
  'failed_urls': []
120
  }
121
 
122
- for i, result in enumerate(results):
123
- if isinstance(result, Exception):
124
- crawl_data['failed_urls'].append(all_urls[i])
125
- continue
126
-
127
  if not result.get('success'):
128
  crawl_data['failed_urls'].append(result['url'])
129
  continue
@@ -350,28 +393,41 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
350
  return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df
351
 
352
  try:
353
- progress(0.1, desc="Initializing analyzer...")
354
  analyzer = SEOContentAnalyzer(api_key)
355
 
356
- progress(0.2, desc="Crawling websites...")
357
  crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)
358
 
 
 
 
 
 
 
 
 
 
 
359
  if not crawl_data['client']:
360
  return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df
361
 
362
  if not crawl_data['competitors']:
363
  return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
364
 
365
- progress(0.4, desc="Processing content...")
366
  chunks = analyzer.chunk_content(crawl_data)
367
 
368
- progress(0.6, desc="Calculating similarities...")
 
 
 
369
  sorted_chunks = await analyzer.calculate_similarities(keyword)
370
 
371
- progress(0.8, desc="Analyzing pages...")
372
  page_analyses = analyzer.analyze_pages(sorted_chunks)
373
 
374
- progress(0.9, desc="Generating report...")
375
  report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)
376
 
377
  # Create summary data
@@ -400,7 +456,7 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
400
 
401
  top_content_df = pd.DataFrame(top_content_data)
402
 
403
- progress(1.0, desc="Complete!")
404
 
405
  return report, summary_df, top_content_df
406
 
@@ -411,19 +467,23 @@ def sync_run_seo_analysis(*args):
411
  """Synchronous wrapper for the async function"""
412
  return asyncio.run(run_seo_analysis(*args))
413
 
414
- # Create Gradio Interface
415
  def create_interface():
416
- with gr.Blocks(title="SEO Content Gap Analysis", theme=gr.themes.Monochrome()) as demo:
 
 
 
 
 
 
 
 
417
  gr.Markdown("""
418
  # πŸ” SEO Content Gap Analysis Tool
419
 
420
  Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
421
 
422
- **How it works:**
423
- 1. Crawls your page and competitor pages
424
- 2. Chunks content intelligently (headers + paragraphs)
425
- 3. Uses OpenAI embeddings to measure semantic similarity to your keyword
426
- 4. Generates actionable SEO recommendations
427
  """)
428
 
429
  with gr.Row():
@@ -464,8 +524,8 @@ def create_interface():
464
  with gr.Tabs():
465
  with gr.TabItem("πŸ“ SEO Report"):
466
  report_output = gr.Markdown(
467
- label="SEO Analysis Report",
468
- value="Click 'Run Analysis' to generate your SEO report..."
469
  )
470
 
471
  with gr.TabItem("πŸ“ˆ Page Summary"):
@@ -482,7 +542,7 @@ def create_interface():
482
  value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
483
  )
484
 
485
- # Example section
486
  gr.Markdown("""
487
  ### πŸ’‘ Example Usage
488
 
@@ -494,6 +554,12 @@ def create_interface():
494
  https://contentmarketinginstitute.com/strategy
495
  https://neilpatel.com/blog/content-marketing-strategy
496
  ```
 
 
 
 
 
 
497
  """)
498
 
499
  # Event handler
@@ -507,8 +573,9 @@ def create_interface():
507
  ### ⚠️ Important Notes
508
  - Analysis may take 2-5 minutes depending on content size
509
  - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
510
- - Works best with content-rich pages (blogs, guides, etc.)
511
- - Respects robots.txt and rate limits
 
512
  """)
513
 
514
  return demo
 
1
  import gradio as gr
2
  import asyncio
 
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
4
  from langchain_openai import OpenAIEmbeddings, ChatOpenAI
5
  from sklearn.metrics.pairwise import cosine_similarity
 
11
  import time
12
  import warnings
13
  import os
14
+ import re
15
+
16
+ # Trafilatura imports
17
+ from trafilatura import fetch_url, extract, bare_extraction
18
+ from trafilatura.downloads import fetch_url as trafilatura_fetch
19
+
20
  warnings.filterwarnings('ignore')
21
 
22
  @dataclass
 
73
  self.keyword_embedding = None
74
 
75
  async def fetch_and_clean_html(self, url: str) -> Dict:
76
+ """Fetch and clean HTML content from URL using Trafilatura"""
77
  try:
78
+ # Use trafilatura to fetch the URL with custom settings
79
+ downloaded = trafilatura_fetch(url)
80
+
81
+ if not downloaded:
82
+ return {'url': url, 'success': False, 'error': 'Failed to download'}
83
+
84
+ # Extract text content using trafilatura
85
+ text_content = extract(downloaded, include_comments=False, include_tables=True)
86
+
87
+ if not text_content:
88
+ return {'url': url, 'success': False, 'error': 'No content extracted'}
89
+
90
+ # Extract with metadata to get title and other info
91
+ metadata_result = bare_extraction(downloaded, include_comments=False, include_tables=True)
92
+
93
+ # Handle Document object properly
94
+ title = ''
95
+ if metadata_result:
96
+ if hasattr(metadata_result, 'title') and metadata_result.title:
97
+ title = metadata_result.title
98
+ elif hasattr(metadata_result, 'get'):
99
+ title = metadata_result.get('title', '')
100
+ else:
101
+ # Try to access as attribute
102
+ try:
103
+ title = getattr(metadata_result, 'title', '')
104
+ except:
105
+ title = ''
106
+
107
+ # Extract HTML with formatting for header splitting
108
+ html_content = extract(downloaded, output_format='xml', include_comments=False, include_tables=True)
109
+
110
+ # Convert trafilatura XML to simple HTML for header splitting
111
+ if html_content and len(html_content) > 100:
112
+ # Simple conversion: replace XML tags with HTML equivalents
113
+ html_for_splitting = html_content
114
+ # Convert <head> tags to proper header tags
115
+ html_for_splitting = re.sub(r'<head rend="(h[1-6])"[^>]*>', r'<\1>', html_for_splitting)
116
+ html_for_splitting = re.sub(r'<head rend="h(\d)"[^>]*>', r'<h\1>', html_for_splitting)
117
+ html_for_splitting = re.sub(r'</head>', '</h2>', html_for_splitting)
118
+ html_for_splitting = re.sub(r'<head[^>]*>', '<h2>', html_for_splitting)
119
+ # Wrap in div
120
+ html_for_splitting = f"<div>{html_for_splitting}</div>"
121
+ else:
122
+ # Fallback: create simple HTML structure from text
123
+ # Try to detect headers in plain text
124
+ lines = text_content.split('\n')
125
+ html_lines = []
126
+ for line in lines:
127
+ line = line.strip()
128
+ if line:
129
+ # Simple heuristic: short lines that might be headers
130
+ if len(line) < 100 and len(line) > 5 and not line.endswith('.') and not line.endswith(',') and not line.endswith(';'):
131
+ # Check if it looks like a header (title case, shorter, etc.)
132
+ if line.istitle() or line.isupper() or (len(line.split()) <= 8):
133
+ html_lines.append(f"<h3>{line}</h3>")
134
+ else:
135
+ html_lines.append(f"<p>{line}</p>")
136
+ else:
137
+ html_lines.append(f"<p>{line}</p>")
138
 
139
+ html_for_splitting = f"<div>{''.join(html_lines)}</div>"
140
+
141
+ word_count = len(text_content.split())
142
+
143
+ return {
144
+ 'url': url,
145
+ 'title': title,
146
+ 'text': text_content,
147
+ 'html': html_for_splitting,
148
+ 'success': True,
149
+ 'word_count': word_count
150
+ }
151
 
152
  except Exception as e:
153
  return {'url': url, 'success': False, 'error': str(e)}
154
 
155
  async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
156
+ """Crawl client and competitor URLs using Trafilatura"""
157
  all_urls = [client_url] + competitor_urls
158
 
159
+ # Since trafilatura is synchronous, we'll run them sequentially
160
+ # but we can still use async structure for consistency
 
 
161
  crawl_data = {
162
  'client': None,
163
  'competitors': [],
164
  'failed_urls': []
165
  }
166
 
167
+ for i, url in enumerate(all_urls):
168
+ result = await self.fetch_and_clean_html(url)
169
+
 
 
170
  if not result.get('success'):
171
  crawl_data['failed_urls'].append(result['url'])
172
  continue
 
393
  return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df
394
 
395
  try:
396
+ progress(0.1, desc="Initializing analyzer with Trafilatura...")
397
  analyzer = SEOContentAnalyzer(api_key)
398
 
399
+ progress(0.2, desc="Crawling websites with enhanced extraction...")
400
  crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)
401
 
402
+ # Check if we have any successful crawls
403
+ total_successful = 0
404
+ if crawl_data['client']:
405
+ total_successful += 1
406
+ total_successful += len(crawl_data['competitors'])
407
+
408
+ if total_successful == 0:
409
+ failed_urls = ', '.join(crawl_data['failed_urls'][:3])
410
+ return f"❌ No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df
411
+
412
  if not crawl_data['client']:
413
  return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df
414
 
415
  if not crawl_data['competitors']:
416
  return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
417
 
418
+ progress(0.4, desc="Processing content with intelligent chunking...")
419
  chunks = analyzer.chunk_content(crawl_data)
420
 
421
+ if not chunks:
422
+ return "❌ No content chunks were created from the crawled pages", empty_summary_df, empty_content_df
423
+
424
+ progress(0.6, desc="Calculating semantic similarities...")
425
  sorted_chunks = await analyzer.calculate_similarities(keyword)
426
 
427
+ progress(0.8, desc="Analyzing page performance...")
428
  page_analyses = analyzer.analyze_pages(sorted_chunks)
429
 
430
+ progress(0.9, desc="Generating AI-powered SEO report...")
431
  report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)
432
 
433
  # Create summary data
 
456
 
457
  top_content_df = pd.DataFrame(top_content_data)
458
 
459
+ progress(1.0, desc="Analysis complete!")
460
 
461
  return report, summary_df, top_content_df
462
 
 
467
  """Synchronous wrapper for the async function"""
468
  return asyncio.run(run_seo_analysis(*args))
469
 
470
+ # Create Gradio Interface with Glass Theme
471
  def create_interface():
472
+ with gr.Blocks(
473
+ title="SEO Content Gap Analysis",
474
+ theme=gr.themes.Glass(
475
+ primary_hue="blue",
476
+ secondary_hue="slate",
477
+ neutral_hue="zinc",
478
+ font="Inter"
479
+ )
480
+ ) as demo:
481
  gr.Markdown("""
482
  # πŸ” SEO Content Gap Analysis Tool
483
 
484
  Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
485
 
486
+ **Enhanced with Trafilatura** for superior content extraction and intelligent header-based chunking.
 
 
 
 
487
  """)
488
 
489
  with gr.Row():
 
524
  with gr.Tabs():
525
  with gr.TabItem("πŸ“ SEO Report"):
526
  report_output = gr.Markdown(
527
+ label="AI-Generated SEO Analysis Report",
528
+ value="Click 'Run Analysis' to generate your comprehensive SEO report with actionable insights..."
529
  )
530
 
531
  with gr.TabItem("πŸ“ˆ Page Summary"):
 
542
  value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
543
  )
544
 
545
+ # Enhanced example section
546
  gr.Markdown("""
547
  ### πŸ’‘ Example Usage
548
 
 
554
  https://contentmarketinginstitute.com/strategy
555
  https://neilpatel.com/blog/content-marketing-strategy
556
  ```
557
+
558
+ ### ✨ What's New
559
+ - **Enhanced Content Extraction**: Uses Trafilatura for better content quality
560
+ - **Intelligent Chunking**: Header-aware splitting for more accurate analysis
561
+ - **Improved Accuracy**: Better handling of complex page structures
562
+ - **Glass Theme**: Modern, sleek interface design
563
  """)
564
 
565
  # Event handler
 
573
  ### ⚠️ Important Notes
574
  - Analysis may take 2-5 minutes depending on content size
575
  - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
576
+ - Enhanced extraction works best with any type of web content
577
+ - Trafilatura respects robots.txt and implements smart rate limiting
578
+ - Glass theme provides modern, professional appearance
579
  """)
580
 
581
  return demo