ryanshelley committed on
Commit
89491b0
·
verified ·
1 Parent(s): 34524ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -15
app.py CHANGED
@@ -381,16 +381,16 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
381
  empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
382
 
383
  if not api_key:
384
- return "❌ Please provide your OpenAI API key", empty_summary_df, empty_content_df
385
 
386
  if not keyword or not client_url:
387
- return "❌ Please provide both keyword and client URL", empty_summary_df, empty_content_df
388
 
389
  # Parse competitor URLs
390
  competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()]
391
 
392
  if not competitor_urls:
393
- return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df
394
 
395
  try:
396
  progress(0.1, desc="Initializing analyzer with Trafilatura...")
@@ -407,19 +407,19 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
407
 
408
  if total_successful == 0:
409
  failed_urls = ', '.join(crawl_data['failed_urls'][:3])
410
- return f"❌ No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df
411
 
412
  if not crawl_data['client']:
413
- return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df
414
 
415
  if not crawl_data['competitors']:
416
- return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
417
 
418
  progress(0.4, desc="Processing content with intelligent chunking...")
419
  chunks = analyzer.chunk_content(crawl_data)
420
 
421
  if not chunks:
422
- return "❌ No content chunks were created from the crawled pages", empty_summary_df, empty_content_df
423
 
424
  progress(0.6, desc="Calculating semantic similarities...")
425
  sorted_chunks = await analyzer.calculate_similarities(keyword)
@@ -456,17 +456,48 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
456
 
457
  top_content_df = pd.DataFrame(top_content_data)
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  progress(1.0, desc="Analysis complete!")
460
 
461
- return report, summary_df, top_content_df
462
 
463
  except Exception as e:
464
- return f"❌ Error during analysis: {str(e)}", empty_summary_df, empty_content_df
465
 
466
  def sync_run_seo_analysis(*args):
467
  """Synchronous wrapper for the async function"""
468
  return asyncio.run(run_seo_analysis(*args))
469
 
 
 
 
 
 
 
 
 
 
 
 
470
  # Create Gradio Interface with Glass Theme
471
  def create_interface():
472
  with gr.Blocks(
@@ -479,7 +510,7 @@ def create_interface():
479
  )
480
  ) as demo:
481
  gr.Markdown("""
482
- # πŸ” SEO Content Gap Analysis Tool
483
 
484
  Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
485
 
@@ -541,6 +572,23 @@ def create_interface():
541
  headers=["Rank", "Type", "Score", "Content Preview", "URL"],
542
  value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
543
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
 
545
  # Enhanced example section
546
  gr.Markdown("""
@@ -559,14 +607,13 @@ def create_interface():
559
  - **Enhanced Content Extraction**: Uses Trafilatura for better content quality
560
  - **Intelligent Chunking**: Header-aware splitting for more accurate analysis
561
  - **Improved Accuracy**: Better handling of complex page structures
562
- - **Glass Theme**: Modern, sleek interface design
563
  """)
564
 
565
- # Event handler
566
  analyze_btn.click(
567
- fn=sync_run_seo_analysis,
568
  inputs=[api_key, keyword, client_url, competitor_urls],
569
- outputs=[report_output, summary_output, top_content_output]
570
  )
571
 
572
  gr.Markdown("""
@@ -575,7 +622,6 @@ def create_interface():
575
  - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
576
  - Enhanced extraction works best with any type of web content
577
  - Trafilatura respects robots.txt and implements smart rate limiting
578
- - Glass theme provides modern, professional appearance
579
  """)
580
 
581
  return demo
 
381
  empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
382
 
383
  if not api_key:
384
+ return "❌ Please provide your OpenAI API key", empty_summary_df, empty_content_df, empty_summary_df
385
 
386
  if not keyword or not client_url:
387
+ return "❌ Please provide both keyword and client URL", empty_summary_df, empty_content_df, empty_summary_df
388
 
389
  # Parse competitor URLs
390
  competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()]
391
 
392
  if not competitor_urls:
393
+ return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df, empty_summary_df
394
 
395
  try:
396
  progress(0.1, desc="Initializing analyzer with Trafilatura...")
 
407
 
408
  if total_successful == 0:
409
  failed_urls = ', '.join(crawl_data['failed_urls'][:3])
410
+ return f"❌ No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df, empty_summary_df
411
 
412
  if not crawl_data['client']:
413
+ return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df, empty_summary_df
414
 
415
  if not crawl_data['competitors']:
416
+ return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df, empty_summary_df
417
 
418
  progress(0.4, desc="Processing content with intelligent chunking...")
419
  chunks = analyzer.chunk_content(crawl_data)
420
 
421
  if not chunks:
422
+ return "❌ No content chunks were created from the crawled pages", empty_summary_df, empty_content_df, empty_summary_df
423
 
424
  progress(0.6, desc="Calculating semantic similarities...")
425
  sorted_chunks = await analyzer.calculate_similarities(keyword)
 
456
 
457
  top_content_df = pd.DataFrame(top_content_data)
458
 
459
+ # Create comprehensive vector data for download (similar to Colab export)
460
+ vector_data = []
461
+ for chunk in sorted_chunks:
462
+ vector_data.append({
463
+ 'url': chunk.url,
464
+ 'page_type': chunk.page_type,
465
+ 'chunk_index': chunk.chunk_index,
466
+ 'chunk_type': chunk.chunk_type,
467
+ 'header_info': str(chunk.header_info) if chunk.header_info else '',
468
+ 'similarity_score': chunk.similarity_score,
469
+ 'content_preview': chunk.content[:100] + '...' if len(chunk.content) > 100 else chunk.content,
470
+ 'content_length': len(chunk.content),
471
+ 'full_content': chunk.content # Include full content for download
472
+ })
473
+
474
+ vector_df = pd.DataFrame(vector_data)
475
+
476
+ # Prepare download file
477
+ download_file_path = prepare_download(vector_df)
478
+
479
  progress(1.0, desc="Analysis complete!")
480
 
481
+ return report, summary_df, top_content_df, vector_df
482
 
483
  except Exception as e:
484
+ return f"❌ Error during analysis: {str(e)}", empty_summary_df, empty_content_df, empty_summary_df
485
 
486
  def sync_run_seo_analysis(*args):
487
  """Synchronous wrapper for the async function"""
488
  return asyncio.run(run_seo_analysis(*args))
489
 
490
+ def handle_analysis_and_download(api_key, keyword, client_url, competitor_urls_text, progress=gr.Progress()):
491
+ """Handle analysis and prepare download file"""
492
+ result = sync_run_seo_analysis(api_key, keyword, client_url, competitor_urls_text, progress)
493
+
494
+ # If analysis was successful (4 outputs), prepare download
495
+ if len(result) == 4 and isinstance(result[3], pd.DataFrame) and not result[3].empty:
496
+ download_file_path = prepare_download(result[3])
497
+ return result[0], result[1], result[2], download_file_path
498
+ else:
499
+ return result[0], result[1], result[2], None
500
+
501
  # Create Gradio Interface with Glass Theme
502
  def create_interface():
503
  with gr.Blocks(
 
510
  )
511
  ) as demo:
512
  gr.Markdown("""
513
+ # πŸ” SEO Content Gap Analysis Using Vector Embeddings
514
 
515
  Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
516
 
 
572
  headers=["Rank", "Type", "Score", "Content Preview", "URL"],
573
  value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
574
  )
575
+
576
+ with gr.TabItem("📊 Vector Data"):
577
+ with gr.Row():
578
+ with gr.Column():
579
+ gr.Markdown("### 📥 Download Complete Analysis Data")
580
+ gr.Markdown("""
581
+ **Contains:**
582
+ - All content chunks with similarity scores
583
+ - Full content text for each chunk
584
+ - Header information and chunk types
585
+ - Perfect for further analysis in Excel/Python
586
+ """)
587
+
588
+ download_file = gr.File(
589
+ label="Vector Data CSV (Generated after analysis)",
590
+ interactive=False
591
+ )
592
 
593
  # Enhanced example section
594
  gr.Markdown("""
 
607
  - **Enhanced Content Extraction**: Uses Trafilatura for better content quality
608
  - **Intelligent Chunking**: Header-aware splitting for more accurate analysis
609
  - **Improved Accuracy**: Better handling of complex page structures
 
610
  """)
611
 
612
+ # Event handlers
613
  analyze_btn.click(
614
+ fn=handle_analysis_and_download,
615
  inputs=[api_key, keyword, client_url, competitor_urls],
616
+ outputs=[report_output, summary_output, top_content_output, download_file]
617
  )
618
 
619
  gr.Markdown("""
 
622
  - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
623
  - Enhanced extraction works best with any type of web content
624
  - Trafilatura respects robots.txt and implements smart rate limiting
 
625
  """)
626
 
627
  return demo