Spaces:

ryanshelley
/

vector_based_content_gap_analysis

Sleeping

App Files Files Community

ryanshelley commited on Jul 17, 2025

Commit

0ba165b

verified ·

1 Parent(s): 4022ccc

Update app.py

Browse files

Replace BS4 with trafilatura

Files changed (1) hide show

app.py +128 -61

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import gradio as gr
 import asyncio
-import httpx
-from bs4 import BeautifulSoup
 from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
 from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from sklearn.metrics.pairwise import cosine_similarity
@@ -13,6 +11,12 @@ import json
 import time
 import warnings
 import os
 warnings.filterwarnings('ignore')
 @dataclass
@@ -69,61 +73,100 @@ class SEOContentAnalyzer:
         self.keyword_embedding = None
     async def fetch_and_clean_html(self, url: str) -> Dict:
-        """Fetch and clean HTML content from URL"""
         try:
-            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
-                response = await client.get(url)
-                response.raise_for_status()
-                soup = BeautifulSoup(response.text, 'html.parser')
-                # Remove unwanted elements
-                for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
-                    element.decompose()
-                # Try to find main content area
-                main_content = (
-                    soup.find('main') or
-                    soup.find('article') or
-                    soup.find(class_=lambda x: x and any(word in x.lower() for word in ['content', 'post', 'article'])) or
-                    soup.find('body')
-                )
-                if main_content:
-                    text_content = main_content.get_text(separator='\n', strip=True)
-                    text_content = '\n'.join(line.strip() for line in text_content.split('\n') if line.strip())
-                return {
-                    'url': url,
-                    'title': soup.title.string if soup.title else '',
-                    'text': text_content,
-                    'html': str(main_content),  # Keep HTML for header splitting
-                    'success': True,
-                    'word_count': len(text_content.split())
-                }
         except Exception as e:
             return {'url': url, 'success': False, 'error': str(e)}
     async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
-        """Crawl client and competitor URLs"""
         all_urls = [client_url] + competitor_urls
-        tasks = [self.fetch_and_clean_html(url) for url in all_urls]
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        # Process results
         crawl_data = {
             'client': None,
             'competitors': [],
             'failed_urls': []
         }
-        for i, result in enumerate(results):
-            if isinstance(result, Exception):
-                crawl_data['failed_urls'].append(all_urls[i])
-                continue
             if not result.get('success'):
                 crawl_data['failed_urls'].append(result['url'])
                 continue
@@ -350,28 +393,41 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
         return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df
     try:
-        progress(0.1, desc="Initializing analyzer...")
         analyzer = SEOContentAnalyzer(api_key)
-        progress(0.2, desc="Crawling websites...")
         crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)
         if not crawl_data['client']:
             return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df
         if not crawl_data['competitors']:
             return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
-        progress(0.4, desc="Processing content...")
         chunks = analyzer.chunk_content(crawl_data)
-        progress(0.6, desc="Calculating similarities...")
         sorted_chunks = await analyzer.calculate_similarities(keyword)
-        progress(0.8, desc="Analyzing pages...")
         page_analyses = analyzer.analyze_pages(sorted_chunks)
-        progress(0.9, desc="Generating report...")
         report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)
         # Create summary data
@@ -400,7 +456,7 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
         top_content_df = pd.DataFrame(top_content_data)
-        progress(1.0, desc="Complete!")
         return report, summary_df, top_content_df
@@ -411,19 +467,23 @@ def sync_run_seo_analysis(*args):
     """Synchronous wrapper for the async function"""
     return asyncio.run(run_seo_analysis(*args))
-# Create Gradio Interface
 def create_interface():
-    with gr.Blocks(title="SEO Content Gap Analysis", theme=gr.themes.Monochrome()) as demo:
         gr.Markdown("""
         # 🔍 SEO Content Gap Analysis Tool
         Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
-        **How it works:**
-        1. Crawls your page and competitor pages
-        2. Chunks content intelligently (headers + paragraphs)
-        3. Uses OpenAI embeddings to measure semantic similarity to your keyword
-        4. Generates actionable SEO recommendations
         """)
         with gr.Row():
@@ -464,8 +524,8 @@ def create_interface():
                 with gr.Tabs():
                     with gr.TabItem("📝 SEO Report"):
                         report_output = gr.Markdown(
-                            label="SEO Analysis Report",
-                            value="Click 'Run Analysis' to generate your SEO report..."
                         )
                     with gr.TabItem("📈 Page Summary"):
@@ -482,7 +542,7 @@ def create_interface():
                             value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
                         )
-        # Example section
         gr.Markdown("""
         ### 💡 Example Usage
@@ -494,6 +554,12 @@ def create_interface():
         https://contentmarketinginstitute.com/strategy
         https://neilpatel.com/blog/content-marketing-strategy
         ```
         """)
         # Event handler
@@ -507,8 +573,9 @@ def create_interface():
         ### ⚠️ Important Notes
         - Analysis may take 2-5 minutes depending on content size
         - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
-        - Works best with content-rich pages (blogs, guides, etc.)
-        - Respects robots.txt and rate limits
         """)
     return demo

 import gradio as gr
 import asyncio
 from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
 from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from sklearn.metrics.pairwise import cosine_similarity
 import time
 import warnings
 import os
+import re
+# Trafilatura imports
+from trafilatura import fetch_url, extract, bare_extraction
+from trafilatura.downloads import fetch_url as trafilatura_fetch
 warnings.filterwarnings('ignore')
 @dataclass
         self.keyword_embedding = None
     async def fetch_and_clean_html(self, url: str) -> Dict:
+        """Fetch and clean HTML content from URL using Trafilatura"""
         try:
+            # Use trafilatura to fetch the URL with custom settings
+            downloaded = trafilatura_fetch(url)
+            if not downloaded:
+                return {'url': url, 'success': False, 'error': 'Failed to download'}
+            # Extract text content using trafilatura
+            text_content = extract(downloaded, include_comments=False, include_tables=True)
+            if not text_content:
+                return {'url': url, 'success': False, 'error': 'No content extracted'}
+            # Extract with metadata to get title and other info
+            metadata_result = bare_extraction(downloaded, include_comments=False, include_tables=True)
+            # Handle Document object properly
+            title = ''
+            if metadata_result:
+                if hasattr(metadata_result, 'title') and metadata_result.title:
+                    title = metadata_result.title
+                elif hasattr(metadata_result, 'get'):
+                    title = metadata_result.get('title', '')
+                else:
+                    # Try to access as attribute
+                    try:
+                        title = getattr(metadata_result, 'title', '')
+                    except:
+                        title = ''
+            # Extract HTML with formatting for header splitting
+            html_content = extract(downloaded, output_format='xml', include_comments=False, include_tables=True)
+            # Convert trafilatura XML to simple HTML for header splitting
+            if html_content and len(html_content) > 100:
+                # Simple conversion: replace XML tags with HTML equivalents
+                html_for_splitting = html_content
+                # Convert <head> tags to proper header tags
+                html_for_splitting = re.sub(r'<head rend="(h[1-6])"[^>]*>', r'<\1>', html_for_splitting)
+                html_for_splitting = re.sub(r'<head rend="h(\d)"[^>]*>', r'<h\1>', html_for_splitting)
+                html_for_splitting = re.sub(r'</head>', '</h2>', html_for_splitting)
+                html_for_splitting = re.sub(r'<head[^>]*>', '<h2>', html_for_splitting)
+                # Wrap in div
+                html_for_splitting = f"<div>{html_for_splitting}</div>"
+            else:
+                # Fallback: create simple HTML structure from text
+                # Try to detect headers in plain text
+                lines = text_content.split('\n')
+                html_lines = []
+                for line in lines:
+                    line = line.strip()
+                    if line:
+                        # Simple heuristic: short lines that might be headers
+                        if len(line) < 100 and len(line) > 5 and not line.endswith('.') and not line.endswith(',') and not line.endswith(';'):
+                            # Check if it looks like a header (title case, shorter, etc.)
+                            if line.istitle() or line.isupper() or (len(line.split()) <= 8):
+                                html_lines.append(f"<h3>{line}</h3>")
+                            else:
+                                html_lines.append(f"<p>{line}</p>")
+                        else:
+                            html_lines.append(f"<p>{line}</p>")
+                html_for_splitting = f"<div>{''.join(html_lines)}</div>"
+            word_count = len(text_content.split())
+            return {
+                'url': url,
+                'title': title,
+                'text': text_content,
+                'html': html_for_splitting,
+                'success': True,
+                'word_count': word_count
+            }
         except Exception as e:
             return {'url': url, 'success': False, 'error': str(e)}
     async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict:
+        """Crawl client and competitor URLs using Trafilatura"""
         all_urls = [client_url] + competitor_urls
+        # Since trafilatura is synchronous, we'll run them sequentially
+        # but we can still use async structure for consistency
         crawl_data = {
             'client': None,
             'competitors': [],
             'failed_urls': []
         }
+        for i, url in enumerate(all_urls):
+            result = await self.fetch_and_clean_html(url)
             if not result.get('success'):
                 crawl_data['failed_urls'].append(result['url'])
                 continue
         return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df
     try:
+        progress(0.1, desc="Initializing analyzer with Trafilatura...")
         analyzer = SEOContentAnalyzer(api_key)
+        progress(0.2, desc="Crawling websites with enhanced extraction...")
         crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls)
+        # Check if we have any successful crawls
+        total_successful = 0
+        if crawl_data['client']:
+            total_successful += 1
+        total_successful += len(crawl_data['competitors'])
+        if total_successful == 0:
+            failed_urls = ', '.join(crawl_data['failed_urls'][:3])
+            return f"❌ No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df
         if not crawl_data['client']:
             return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df
         if not crawl_data['competitors']:
             return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
+        progress(0.4, desc="Processing content with intelligent chunking...")
         chunks = analyzer.chunk_content(crawl_data)
+        if not chunks:
+            return "❌ No content chunks were created from the crawled pages", empty_summary_df, empty_content_df
+        progress(0.6, desc="Calculating semantic similarities...")
         sorted_chunks = await analyzer.calculate_similarities(keyword)
+        progress(0.8, desc="Analyzing page performance...")
         page_analyses = analyzer.analyze_pages(sorted_chunks)
+        progress(0.9, desc="Generating AI-powered SEO report...")
         report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks)
         # Create summary data
         top_content_df = pd.DataFrame(top_content_data)
+        progress(1.0, desc="Analysis complete!")
         return report, summary_df, top_content_df
     """Synchronous wrapper for the async function"""
     return asyncio.run(run_seo_analysis(*args))
+# Create Gradio Interface with Glass Theme
 def create_interface():
+    with gr.Blocks(
+        title="SEO Content Gap Analysis",
+        theme=gr.themes.Glass(
+            primary_hue="blue",
+            secondary_hue="slate",
+            neutral_hue="zinc",
+            font="Inter"
+        )
+    ) as demo:
         gr.Markdown("""
         # 🔍 SEO Content Gap Analysis Tool
         Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
+        **Enhanced with Trafilatura** for superior content extraction and intelligent header-based chunking.
         """)
         with gr.Row():
                 with gr.Tabs():
                     with gr.TabItem("📝 SEO Report"):
                         report_output = gr.Markdown(
+                            label="AI-Generated SEO Analysis Report",
+                            value="Click 'Run Analysis' to generate your comprehensive SEO report with actionable insights..."
                         )
                     with gr.TabItem("📈 Page Summary"):
                             value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
                         )
+        # Enhanced example section
         gr.Markdown("""
         ### 💡 Example Usage
         https://contentmarketinginstitute.com/strategy
         https://neilpatel.com/blog/content-marketing-strategy
         ```
+        ### ✨ What's New
+        - **Enhanced Content Extraction**: Uses Trafilatura for better content quality
+        - **Intelligent Chunking**: Header-aware splitting for more accurate analysis
+        - **Improved Accuracy**: Better handling of complex page structures
+        - **Glass Theme**: Modern, sleek interface design
         """)
         # Event handler
         ### ⚠️ Important Notes
         - Analysis may take 2-5 minutes depending on content size
         - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
+        - Enhanced extraction works best with any type of web content
+        - Trafilatura respects robots.txt and implements smart rate limiting
+        - Glass theme provides modern, professional appearance
         """)
     return demo