ryanshelley committed on
Commit
89491b0
·
verified ·
1 Parent(s): 34524ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -15
app.py CHANGED
@@ -381,16 +381,16 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
381
  empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
382
 
383
  if not api_key:
384
- return "❌ Please provide your OpenAI API key", empty_summary_df, empty_content_df
385
 
386
  if not keyword or not client_url:
387
- return "❌ Please provide both keyword and client URL", empty_summary_df, empty_content_df
388
 
389
  # Parse competitor URLs
390
  competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()]
391
 
392
  if not competitor_urls:
393
- return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df
394
 
395
  try:
396
  progress(0.1, desc="Initializing analyzer with Trafilatura...")
@@ -407,19 +407,19 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
407
 
408
  if total_successful == 0:
409
  failed_urls = ', '.join(crawl_data['failed_urls'][:3])
410
- return f"❌ No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df
411
 
412
  if not crawl_data['client']:
413
- return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df
414
 
415
  if not crawl_data['competitors']:
416
- return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
417
 
418
  progress(0.4, desc="Processing content with intelligent chunking...")
419
  chunks = analyzer.chunk_content(crawl_data)
420
 
421
  if not chunks:
422
- return "❌ No content chunks were created from the crawled pages", empty_summary_df, empty_content_df
423
 
424
  progress(0.6, desc="Calculating semantic similarities...")
425
  sorted_chunks = await analyzer.calculate_similarities(keyword)
@@ -456,17 +456,48 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
456
 
457
  top_content_df = pd.DataFrame(top_content_data)
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  progress(1.0, desc="Analysis complete!")
460
 
461
- return report, summary_df, top_content_df
462
 
463
  except Exception as e:
464
- return f"❌ Error during analysis: {str(e)}", empty_summary_df, empty_content_df
465
 
466
  def sync_run_seo_analysis(*args):
467
  """Synchronous wrapper for the async function"""
468
  return asyncio.run(run_seo_analysis(*args))
469
 
 
 
 
 
 
 
 
 
 
 
 
470
  # Create Gradio Interface with Glass Theme
471
  def create_interface():
472
  with gr.Blocks(
@@ -479,7 +510,7 @@ def create_interface():
479
  )
480
  ) as demo:
481
  gr.Markdown("""
482
- # πŸ” SEO Content Gap Analysis Tool
483
 
484
  Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
485
 
@@ -541,6 +572,23 @@ def create_interface():
541
  headers=["Rank", "Type", "Score", "Content Preview", "URL"],
542
  value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
543
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
 
545
  # Enhanced example section
546
  gr.Markdown("""
@@ -559,14 +607,13 @@ def create_interface():
559
  - **Enhanced Content Extraction**: Uses Trafilatura for better content quality
560
  - **Intelligent Chunking**: Header-aware splitting for more accurate analysis
561
  - **Improved Accuracy**: Better handling of complex page structures
562
- - **Glass Theme**: Modern, sleek interface design
563
  """)
564
 
565
- # Event handler
566
  analyze_btn.click(
567
- fn=sync_run_seo_analysis,
568
  inputs=[api_key, keyword, client_url, competitor_urls],
569
- outputs=[report_output, summary_output, top_content_output]
570
  )
571
 
572
  gr.Markdown("""
@@ -575,7 +622,6 @@ def create_interface():
575
  - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
576
  - Enhanced extraction works best with any type of web content
577
  - Trafilatura respects robots.txt and implements smart rate limiting
578
- - Glass theme provides modern, professional appearance
579
  """)
580
 
581
  return demo
 
381
  empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
382
 
383
  if not api_key:
384
+ return "❌ Please provide your OpenAI API key", empty_summary_df, empty_content_df, empty_summary_df
385
 
386
  if not keyword or not client_url:
387
+ return "❌ Please provide both keyword and client URL", empty_summary_df, empty_content_df, empty_summary_df
388
 
389
  # Parse competitor URLs
390
  competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()]
391
 
392
  if not competitor_urls:
393
+ return "❌ Please provide at least one competitor URL", empty_summary_df, empty_content_df, empty_summary_df
394
 
395
  try:
396
  progress(0.1, desc="Initializing analyzer with Trafilatura...")
 
407
 
408
  if total_successful == 0:
409
  failed_urls = ', '.join(crawl_data['failed_urls'][:3])
410
+ return f"❌ No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df, empty_summary_df
411
 
412
  if not crawl_data['client']:
413
+ return "❌ Failed to crawl client URL", empty_summary_df, empty_content_df, empty_summary_df
414
 
415
  if not crawl_data['competitors']:
416
+ return "❌ Failed to crawl any competitor URLs", empty_summary_df, empty_content_df, empty_summary_df
417
 
418
  progress(0.4, desc="Processing content with intelligent chunking...")
419
  chunks = analyzer.chunk_content(crawl_data)
420
 
421
  if not chunks:
422
+ return "❌ No content chunks were created from the crawled pages", empty_summary_df, empty_content_df, empty_summary_df
423
 
424
  progress(0.6, desc="Calculating semantic similarities...")
425
  sorted_chunks = await analyzer.calculate_similarities(keyword)
 
456
 
457
  top_content_df = pd.DataFrame(top_content_data)
458
 
459
+ # Create comprehensive vector data for download (similar to Colab export)
460
+ vector_data = []
461
+ for chunk in sorted_chunks:
462
+ vector_data.append({
463
+ 'url': chunk.url,
464
+ 'page_type': chunk.page_type,
465
+ 'chunk_index': chunk.chunk_index,
466
+ 'chunk_type': chunk.chunk_type,
467
+ 'header_info': str(chunk.header_info) if chunk.header_info else '',
468
+ 'similarity_score': chunk.similarity_score,
469
+ 'content_preview': chunk.content[:100] + '...' if len(chunk.content) > 100 else chunk.content,
470
+ 'content_length': len(chunk.content),
471
+ 'full_content': chunk.content # Include full content for download
472
+ })
473
+
474
+ vector_df = pd.DataFrame(vector_data)
475
+
476
+ # Prepare download file
477
+ download_file_path = prepare_download(vector_df)
478
+
479
  progress(1.0, desc="Analysis complete!")
480
 
481
+ return report, summary_df, top_content_df, vector_df
482
 
483
  except Exception as e:
484
+ return f"❌ Error during analysis: {str(e)}", empty_summary_df, empty_content_df, empty_summary_df
485
 
486
  def sync_run_seo_analysis(*args):
487
  """Synchronous wrapper for the async function"""
488
  return asyncio.run(run_seo_analysis(*args))
489
 
490
+ def handle_analysis_and_download(api_key, keyword, client_url, competitor_urls_text, progress=gr.Progress()):
491
+ """Handle analysis and prepare download file"""
492
+ result = sync_run_seo_analysis(api_key, keyword, client_url, competitor_urls_text, progress)
493
+
494
+ # If analysis was successful (4 outputs), prepare download
495
+ if len(result) == 4 and isinstance(result[3], pd.DataFrame) and not result[3].empty:
496
+ download_file_path = prepare_download(result[3])
497
+ return result[0], result[1], result[2], download_file_path
498
+ else:
499
+ return result[0], result[1], result[2], None
500
+
501
  # Create Gradio Interface with Glass Theme
502
  def create_interface():
503
  with gr.Blocks(
 
510
  )
511
  ) as demo:
512
  gr.Markdown("""
513
+ # πŸ” SEO Content Gap Analysis Using Vector Embeddings
514
 
515
  Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
516
 
 
572
  headers=["Rank", "Type", "Score", "Content Preview", "URL"],
573
  value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
574
  )
575
+
576
+ with gr.TabItem("📊 Vector Data"):
577
+ with gr.Row():
578
+ with gr.Column():
579
+ gr.Markdown("### 📥 Download Complete Analysis Data")
580
+ gr.Markdown("""
581
+ **Contains:**
582
+ - All content chunks with similarity scores
583
+ - Full content text for each chunk
584
+ - Header information and chunk types
585
+ - Perfect for further analysis in Excel/Python
586
+ """)
587
+
588
+ download_file = gr.File(
589
+ label="Vector Data CSV (Generated after analysis)",
590
+ interactive=False
591
+ )
592
 
593
  # Enhanced example section
594
  gr.Markdown("""
 
607
  - **Enhanced Content Extraction**: Uses Trafilatura for better content quality
608
  - **Intelligent Chunking**: Header-aware splitting for more accurate analysis
609
  - **Improved Accuracy**: Better handling of complex page structures
 
610
  """)
611
 
612
+ # Event handlers
613
  analyze_btn.click(
614
+ fn=handle_analysis_and_download,
615
  inputs=[api_key, keyword, client_url, competitor_urls],
616
+ outputs=[report_output, summary_output, top_content_output, download_file]
617
  )
618
 
619
  gr.Markdown("""
 
622
  - Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
623
  - Enhanced extraction works best with any type of web content
624
  - Trafilatura respects robots.txt and implements smart rate limiting
 
625
  """)
626
 
627
  return demo