Update app.py
Browse files
app.py
CHANGED
|
@@ -381,16 +381,16 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
|
|
| 381 |
empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
|
| 382 |
|
| 383 |
if not api_key:
|
| 384 |
-
return "β Please provide your OpenAI API key", empty_summary_df, empty_content_df
|
| 385 |
|
| 386 |
if not keyword or not client_url:
|
| 387 |
-
return "β Please provide both keyword and client URL", empty_summary_df, empty_content_df
|
| 388 |
|
| 389 |
# Parse competitor URLs
|
| 390 |
competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()]
|
| 391 |
|
| 392 |
if not competitor_urls:
|
| 393 |
-
return "β Please provide at least one competitor URL", empty_summary_df, empty_content_df
|
| 394 |
|
| 395 |
try:
|
| 396 |
progress(0.1, desc="Initializing analyzer with Trafilatura...")
|
|
@@ -407,19 +407,19 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
|
|
| 407 |
|
| 408 |
if total_successful == 0:
|
| 409 |
failed_urls = ', '.join(crawl_data['failed_urls'][:3])
|
| 410 |
-
return f"β No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df
|
| 411 |
|
| 412 |
if not crawl_data['client']:
|
| 413 |
-
return "β Failed to crawl client URL", empty_summary_df, empty_content_df
|
| 414 |
|
| 415 |
if not crawl_data['competitors']:
|
| 416 |
-
return "β Failed to crawl any competitor URLs", empty_summary_df, empty_content_df
|
| 417 |
|
| 418 |
progress(0.4, desc="Processing content with intelligent chunking...")
|
| 419 |
chunks = analyzer.chunk_content(crawl_data)
|
| 420 |
|
| 421 |
if not chunks:
|
| 422 |
-
return "β No content chunks were created from the crawled pages", empty_summary_df, empty_content_df
|
| 423 |
|
| 424 |
progress(0.6, desc="Calculating semantic similarities...")
|
| 425 |
sorted_chunks = await analyzer.calculate_similarities(keyword)
|
|
@@ -456,17 +456,48 @@ async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competit
|
|
| 456 |
|
| 457 |
top_content_df = pd.DataFrame(top_content_data)
|
| 458 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
progress(1.0, desc="Analysis complete!")
|
| 460 |
|
| 461 |
-
return report, summary_df, top_content_df
|
| 462 |
|
| 463 |
except Exception as e:
|
| 464 |
-
return f"β Error during analysis: {str(e)}", empty_summary_df, empty_content_df
|
| 465 |
|
| 466 |
def sync_run_seo_analysis(*args):
    """Synchronous wrapper for the async function.

    Bridges Gradio's synchronous callback interface to the async
    ``run_seo_analysis`` coroutine by driving it to completion on a
    fresh event loop via ``asyncio.run``.

    ``*args`` is forwarded unchanged; callers pass
    (api_key, keyword, client_url, competitor_urls_text, progress).
    Returns whatever tuple ``run_seo_analysis`` returns.
    """
    return asyncio.run(run_seo_analysis(*args))
|
| 469 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
# Create Gradio Interface with Glass Theme
|
| 471 |
def create_interface():
|
| 472 |
with gr.Blocks(
|
|
@@ -479,7 +510,7 @@ def create_interface():
|
|
| 479 |
)
|
| 480 |
) as demo:
|
| 481 |
gr.Markdown("""
|
| 482 |
-
# π SEO Content Gap Analysis
|
| 483 |
|
| 484 |
Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
|
| 485 |
|
|
@@ -541,6 +572,23 @@ def create_interface():
|
|
| 541 |
headers=["Rank", "Type", "Score", "Content Preview", "URL"],
|
| 542 |
value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
|
| 543 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
|
| 545 |
# Enhanced example section
|
| 546 |
gr.Markdown("""
|
|
@@ -559,14 +607,13 @@ def create_interface():
|
|
| 559 |
- **Enhanced Content Extraction**: Uses Trafilatura for better content quality
|
| 560 |
- **Intelligent Chunking**: Header-aware splitting for more accurate analysis
|
| 561 |
- **Improved Accuracy**: Better handling of complex page structures
|
| 562 |
-
- **Glass Theme**: Modern, sleek interface design
|
| 563 |
""")
|
| 564 |
|
| 565 |
-
# Event
|
| 566 |
analyze_btn.click(
|
| 567 |
-
fn=
|
| 568 |
inputs=[api_key, keyword, client_url, competitor_urls],
|
| 569 |
-
outputs=[report_output, summary_output, top_content_output]
|
| 570 |
)
|
| 571 |
|
| 572 |
gr.Markdown("""
|
|
@@ -575,7 +622,6 @@ def create_interface():
|
|
| 575 |
- Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
|
| 576 |
- Enhanced extraction works best with any type of web content
|
| 577 |
- Trafilatura respects robots.txt and implements smart rate limiting
|
| 578 |
-
- Glass theme provides modern, professional appearance
|
| 579 |
""")
|
| 580 |
|
| 581 |
return demo
|
|
|
|
| 381 |
empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
|
| 382 |
|
| 383 |
if not api_key:
|
| 384 |
+
return "β Please provide your OpenAI API key", empty_summary_df, empty_content_df, empty_summary_df
|
| 385 |
|
| 386 |
if not keyword or not client_url:
|
| 387 |
+
return "β Please provide both keyword and client URL", empty_summary_df, empty_content_df, empty_summary_df
|
| 388 |
|
| 389 |
# Parse competitor URLs
|
| 390 |
competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()]
|
| 391 |
|
| 392 |
if not competitor_urls:
|
| 393 |
+
return "β Please provide at least one competitor URL", empty_summary_df, empty_content_df, empty_summary_df
|
| 394 |
|
| 395 |
try:
|
| 396 |
progress(0.1, desc="Initializing analyzer with Trafilatura...")
|
|
|
|
| 407 |
|
| 408 |
if total_successful == 0:
|
| 409 |
failed_urls = ', '.join(crawl_data['failed_urls'][:3])
|
| 410 |
+
return f"β No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df, empty_summary_df
|
| 411 |
|
| 412 |
if not crawl_data['client']:
|
| 413 |
+
return "β Failed to crawl client URL", empty_summary_df, empty_content_df, empty_summary_df
|
| 414 |
|
| 415 |
if not crawl_data['competitors']:
|
| 416 |
+
return "β Failed to crawl any competitor URLs", empty_summary_df, empty_content_df, empty_summary_df
|
| 417 |
|
| 418 |
progress(0.4, desc="Processing content with intelligent chunking...")
|
| 419 |
chunks = analyzer.chunk_content(crawl_data)
|
| 420 |
|
| 421 |
if not chunks:
|
| 422 |
+
return "β No content chunks were created from the crawled pages", empty_summary_df, empty_content_df, empty_summary_df
|
| 423 |
|
| 424 |
progress(0.6, desc="Calculating semantic similarities...")
|
| 425 |
sorted_chunks = await analyzer.calculate_similarities(keyword)
|
|
|
|
| 456 |
|
| 457 |
top_content_df = pd.DataFrame(top_content_data)
|
| 458 |
|
| 459 |
+
# Create comprehensive vector data for download (similar to Colab export)
|
| 460 |
+
vector_data = []
|
| 461 |
+
for chunk in sorted_chunks:
|
| 462 |
+
vector_data.append({
|
| 463 |
+
'url': chunk.url,
|
| 464 |
+
'page_type': chunk.page_type,
|
| 465 |
+
'chunk_index': chunk.chunk_index,
|
| 466 |
+
'chunk_type': chunk.chunk_type,
|
| 467 |
+
'header_info': str(chunk.header_info) if chunk.header_info else '',
|
| 468 |
+
'similarity_score': chunk.similarity_score,
|
| 469 |
+
'content_preview': chunk.content[:100] + '...' if len(chunk.content) > 100 else chunk.content,
|
| 470 |
+
'content_length': len(chunk.content),
|
| 471 |
+
'full_content': chunk.content # Include full content for download
|
| 472 |
+
})
|
| 473 |
+
|
| 474 |
+
vector_df = pd.DataFrame(vector_data)
|
| 475 |
+
|
| 476 |
+
# Prepare download file
|
| 477 |
+
download_file_path = prepare_download(vector_df)
|
| 478 |
+
|
| 479 |
progress(1.0, desc="Analysis complete!")
|
| 480 |
|
| 481 |
+
return report, summary_df, top_content_df, vector_df
|
| 482 |
|
| 483 |
except Exception as e:
|
| 484 |
+
return f"β Error during analysis: {str(e)}", empty_summary_df, empty_content_df, empty_summary_df
|
| 485 |
|
| 486 |
def sync_run_seo_analysis(*args):
    """Blocking facade over the async analysis pipeline.

    Creates the ``run_seo_analysis`` coroutine and runs it to completion
    on a new event loop, so synchronous callers (Gradio event handlers)
    can use the async implementation directly. All positional arguments
    are forwarded untouched and the coroutine's result is returned as-is.
    """
    analysis_coro = run_seo_analysis(*args)
    return asyncio.run(analysis_coro)
|
| 489 |
|
| 490 |
+
def handle_analysis_and_download(api_key, keyword, client_url, competitor_urls_text, progress=gr.Progress()):
    """Run the SEO analysis and attach a downloadable vector-data file.

    Invokes the synchronous analysis wrapper, then — when the run produced
    a non-empty vector DataFrame as its 4th element — writes it out via
    ``prepare_download`` and returns the resulting file path for the
    ``gr.File`` output. On any early-exit/error result the file slot is
    ``None``.

    Returns a 4-tuple: (report, summary_df, top_content_df, file_path_or_None).
    """
    outcome = sync_run_seo_analysis(api_key, keyword, client_url, competitor_urls_text, progress)

    report, summary_df, content_df = outcome[0], outcome[1], outcome[2]

    # Only a successful analysis yields a populated vector DataFrame worth
    # exporting; error paths return an empty placeholder in that slot.
    download_path = None
    if len(outcome) == 4:
        vectors = outcome[3]
        if isinstance(vectors, pd.DataFrame) and not vectors.empty:
            download_path = prepare_download(vectors)

    return report, summary_df, content_df, download_path
|
| 500 |
+
|
| 501 |
# Create Gradio Interface with Glass Theme
|
| 502 |
def create_interface():
|
| 503 |
with gr.Blocks(
|
|
|
|
| 510 |
)
|
| 511 |
) as demo:
|
| 512 |
gr.Markdown("""
|
| 513 |
+
# π SEO Content Gap Analysis Using Vector Embeddings
|
| 514 |
|
| 515 |
Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity.
|
| 516 |
|
|
|
|
| 572 |
headers=["Rank", "Type", "Score", "Content Preview", "URL"],
|
| 573 |
value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"])
|
| 574 |
)
|
| 575 |
+
|
| 576 |
+
with gr.TabItem("π Vector Data"):
|
| 577 |
+
with gr.Row():
|
| 578 |
+
with gr.Column():
|
| 579 |
+
gr.Markdown("### π₯ Download Complete Analysis Data")
|
| 580 |
+
gr.Markdown("""
|
| 581 |
+
**Contains:**
|
| 582 |
+
- All content chunks with similarity scores
|
| 583 |
+
- Full content text for each chunk
|
| 584 |
+
- Header information and chunk types
|
| 585 |
+
- Perfect for further analysis in Excel/Python
|
| 586 |
+
""")
|
| 587 |
+
|
| 588 |
+
download_file = gr.File(
|
| 589 |
+
label="Vector Data CSV (Generated after analysis)",
|
| 590 |
+
interactive=False
|
| 591 |
+
)
|
| 592 |
|
| 593 |
# Enhanced example section
|
| 594 |
gr.Markdown("""
|
|
|
|
| 607 |
- **Enhanced Content Extraction**: Uses Trafilatura for better content quality
|
| 608 |
- **Intelligent Chunking**: Header-aware splitting for more accurate analysis
|
| 609 |
- **Improved Accuracy**: Better handling of complex page structures
|
|
|
|
| 610 |
""")
|
| 611 |
|
| 612 |
+
# Event handlers
|
| 613 |
analyze_btn.click(
|
| 614 |
+
fn=handle_analysis_and_download,
|
| 615 |
inputs=[api_key, keyword, client_url, competitor_urls],
|
| 616 |
+
outputs=[report_output, summary_output, top_content_output, download_file]
|
| 617 |
)
|
| 618 |
|
| 619 |
gr.Markdown("""
|
|
|
|
| 622 |
- Requires OpenAI API key (costs ~$0.01-0.10 per analysis)
|
| 623 |
- Enhanced extraction works best with any type of web content
|
| 624 |
- Trafilatura respects robots.txt and implements smart rate limiting
|
|
|
|
| 625 |
""")
|
| 626 |
|
| 627 |
return demo
|