|
|
import gradio as gr |
|
|
import asyncio |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter |
|
|
from langchain_openai import OpenAIEmbeddings, ChatOpenAI |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
import numpy as np |
|
|
from typing import List, Dict, Optional
|
|
import pandas as pd |
|
|
from dataclasses import dataclass |
|
|
import json |
|
|
import time |
|
|
import warnings |
|
|
import os |
|
|
import re |
|
|
import tempfile |
|
|
|
|
|
|
|
|
from trafilatura import extract, bare_extraction
|
|
from trafilatura.downloads import fetch_url as trafilatura_fetch |
|
|
|
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
|
|
|
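# Path of the most recently exported vector-data CSV, shared between the
# analysis run and the Gradio download component.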
latest_vector_data = None |
|
|
|
|
|
def prepare_download(vector_df): |
|
|
"""Prepare the vector data for download""" |
|
|
global latest_vector_data |
|
|
if vector_df is not None and not vector_df.empty: |
|
|
|
|
|
        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, newline='', encoding='utf-8')
        temp_file.close()  # release the handle so to_csv can reopen the path (required on Windows)
        vector_df.to_csv(temp_file.name, index=False)
        latest_vector_data = temp_file.name
|
|
return temp_file.name |
|
|
return None |
|
|
|
|
|
def download_vector_data(): |
|
|
"""Return the prepared vector data file""" |
|
|
global latest_vector_data |
|
|
if latest_vector_data: |
|
|
return latest_vector_data |
|
|
return None |
|
|
|
|
|
@dataclass |
|
|
class ContentChunk: |
|
|
content: str |
|
|
url: str |
|
|
page_type: str |
|
|
chunk_index: int |
|
|
chunk_type: str |
|
|
    header_info: Optional[Dict] = None
|
|
similarity_score: float = 0.0 |
|
|
|
|
|
@dataclass |
|
|
class PageAnalysis: |
|
|
url: str |
|
|
page_type: str |
|
|
total_chunks: int |
|
|
avg_similarity: float |
|
|
max_similarity: float |
|
|
top_chunks: List[ContentChunk] |
|
|
|
|
|
class SEOContentAnalyzer: |
|
|
def __init__(self, api_key: str): |
|
|
self.embeddings = OpenAIEmbeddings( |
|
|
model="text-embedding-3-small", |
|
|
openai_api_key=api_key |
|
|
) |
|
|
self.llm = ChatOpenAI( |
|
|
model="gpt-4o-mini", |
|
|
temperature=0.3, |
|
|
openai_api_key=api_key |
|
|
) |
|
|
|
|
|
|
|
|
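        # Header-aware splitter: breaks the cleaned HTML on h1-h6 tags so each chunk
        # keeps the headings it sits under as metadata, e.g. roughly
        # {"Header 1": "Page title", "Header 2": "Section name"}.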
self.html_splitter = HTMLHeaderTextSplitter( |
|
|
headers_to_split_on=[ |
|
|
("h1", "Header 1"), |
|
|
("h2", "Header 2"), |
|
|
("h3", "Header 3"), |
|
|
("h4", "Header 4"), |
|
|
("h5", "Header 5"), |
|
|
("h6", "Header 6"), |
|
|
] |
|
|
) |
|
|
|
|
|
|
|
|
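        # Character-level splitter used as a second pass: long header sections and
        # pages without usable headings are cut into ~600-character chunks with
        # 100 characters of overlap before embedding.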
self.text_splitter = RecursiveCharacterTextSplitter( |
|
|
chunk_size=600, |
|
|
chunk_overlap=100, |
|
|
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""] |
|
|
) |
|
|
|
|
|
self.all_chunks = [] |
|
|
self.keyword_embedding = None |
|
|
|
|
|
async def fetch_and_clean_html(self, url: str) -> Dict: |
|
|
"""Fetch and clean HTML content from URL using Trafilatura""" |
|
|
try: |
|
|
|
|
|
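            # trafilatura's fetch_url is a blocking call, so this coroutine does not
            # yield while downloading; wrapping it in asyncio.to_thread would be one
            # way to keep the event loop responsive if that ever matters here.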
downloaded = trafilatura_fetch(url) |
|
|
|
|
|
if not downloaded: |
|
|
return {'url': url, 'success': False, 'error': 'Failed to download'} |
|
|
|
|
|
|
|
|
text_content = extract(downloaded, include_comments=False, include_tables=True) |
|
|
|
|
|
if not text_content: |
|
|
return {'url': url, 'success': False, 'error': 'No content extracted'} |
|
|
|
|
|
|
|
|
metadata_result = bare_extraction(downloaded, include_comments=False, include_tables=True) |
|
|
|
|
|
|
|
|
title = '' |
|
|
if metadata_result: |
|
|
if hasattr(metadata_result, 'title') and metadata_result.title: |
|
|
title = metadata_result.title |
|
|
elif hasattr(metadata_result, 'get'): |
|
|
title = metadata_result.get('title', '') |
|
|
else: |
|
|
|
|
|
try: |
|
|
title = getattr(metadata_result, 'title', '') |
|
|
                    except Exception:
|
|
title = '' |
|
|
|
|
|
|
|
|
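            # Re-extract the page as XML: trafilatura marks headings as <head rend="hN">
            # elements, which the regexes below rewrite into plain <h1>-<h6>/<h2> tags
            # that HTMLHeaderTextSplitter can split on.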
html_content = extract(downloaded, output_format='xml', include_comments=False, include_tables=True) |
|
|
|
|
|
|
|
|
if html_content and len(html_content) > 100: |
|
|
|
|
|
html_for_splitting = html_content |
|
|
|
|
|
html_for_splitting = re.sub(r'<head rend="(h[1-6])"[^>]*>', r'<\1>', html_for_splitting) |
|
|
html_for_splitting = re.sub(r'<head rend="h(\d)"[^>]*>', r'<h\1>', html_for_splitting) |
|
|
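            # Note: every closing </head> becomes </h2> regardless of the level that was
            # opened, so h1/h3-h6 sections end up with mismatched closing tags; HTML
            # parsing is generally forgiving about this, but heading metadata for those
            # sections may be slightly off.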
html_for_splitting = re.sub(r'</head>', '</h2>', html_for_splitting) |
|
|
html_for_splitting = re.sub(r'<head[^>]*>', '<h2>', html_for_splitting) |
|
|
|
|
|
html_for_splitting = f"<div>{html_for_splitting}</div>" |
|
|
else: |
|
|
|
|
|
|
|
|
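            # Fallback when XML extraction yields little: rebuild minimal HTML from the
            # plain text, treating short lines without sentence-ending punctuation
            # (title-case, upper-case, or only a few words) as <h3> headings so the
            # header splitter still has some structure to work with.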
lines = text_content.split('\n') |
|
|
html_lines = [] |
|
|
for line in lines: |
|
|
line = line.strip() |
|
|
if line: |
|
|
|
|
|
if len(line) < 100 and len(line) > 5 and not line.endswith('.') and not line.endswith(',') and not line.endswith(';'): |
|
|
|
|
|
if line.istitle() or line.isupper() or (len(line.split()) <= 8): |
|
|
html_lines.append(f"<h3>{line}</h3>") |
|
|
else: |
|
|
html_lines.append(f"<p>{line}</p>") |
|
|
else: |
|
|
html_lines.append(f"<p>{line}</p>") |
|
|
|
|
|
html_for_splitting = f"<div>{''.join(html_lines)}</div>" |
|
|
|
|
|
word_count = len(text_content.split()) |
|
|
|
|
|
return { |
|
|
'url': url, |
|
|
'title': title, |
|
|
'text': text_content, |
|
|
'html': html_for_splitting, |
|
|
'success': True, |
|
|
'word_count': word_count |
|
|
} |
|
|
|
|
|
except Exception as e: |
|
|
return {'url': url, 'success': False, 'error': str(e)} |
|
|
|
|
|
async def crawl_all_urls(self, client_url: str, competitor_urls: List[str]) -> Dict: |
|
|
"""Crawl client and competitor URLs using Trafilatura""" |
|
|
all_urls = [client_url] + competitor_urls |
|
|
|
|
|
|
|
|
|
|
|
crawl_data = { |
|
|
'client': None, |
|
|
'competitors': [], |
|
|
'failed_urls': [] |
|
|
} |
|
|
|
|
|
for i, url in enumerate(all_urls): |
|
|
result = await self.fetch_and_clean_html(url) |
|
|
|
|
|
if not result.get('success'): |
|
|
crawl_data['failed_urls'].append(result['url']) |
|
|
continue |
|
|
|
|
|
if i == 0: |
|
|
crawl_data['client'] = result |
|
|
else: |
|
|
crawl_data['competitors'].append(result) |
|
|
|
|
|
return crawl_data |
|
|
|
|
|
def chunk_content(self, crawl_data: Dict) -> List[ContentChunk]: |
|
|
"""Chunk all content using header-first, then paragraph-level splitting""" |
|
|
all_chunks = [] |
|
|
|
|
|
|
|
|
if crawl_data['client']: |
|
|
client_chunks = self._chunk_single_page( |
|
|
crawl_data['client'], 'client' |
|
|
) |
|
|
all_chunks.extend(client_chunks) |
|
|
|
|
|
|
|
|
for comp_data in crawl_data['competitors']: |
|
|
comp_chunks = self._chunk_single_page(comp_data, 'competitor') |
|
|
all_chunks.extend(comp_chunks) |
|
|
|
|
|
self.all_chunks = all_chunks |
|
|
return all_chunks |
|
|
|
|
|
def _chunk_single_page(self, page_data: Dict, page_type: str) -> List[ContentChunk]: |
|
|
"""Chunk a single page using header + paragraph strategy""" |
|
|
chunks = [] |
|
|
chunk_index = 0 |
|
|
|
|
|
try: |
|
|
|
|
|
if 'html' in page_data: |
|
|
header_splits = self.html_splitter.split_text(page_data['html']) |
|
|
|
|
|
if header_splits and len(header_splits) > 1: |
|
|
|
|
|
for split in header_splits: |
|
|
header_info = split.metadata if hasattr(split, 'metadata') else {} |
|
|
content = split.page_content if hasattr(split, 'page_content') else str(split) |
|
|
|
|
|
|
|
|
if len(content) > 800: |
|
|
sub_chunks = self.text_splitter.split_text(content) |
|
|
for i, sub_chunk in enumerate(sub_chunks): |
|
|
if len(sub_chunk.strip()) > 50: |
|
|
chunks.append(ContentChunk( |
|
|
content=sub_chunk.strip(), |
|
|
url=page_data['url'], |
|
|
page_type=page_type, |
|
|
chunk_index=chunk_index, |
|
|
chunk_type='header_subsection', |
|
|
header_info=header_info |
|
|
)) |
|
|
chunk_index += 1 |
|
|
else: |
|
|
|
|
|
if len(content.strip()) > 50: |
|
|
chunks.append(ContentChunk( |
|
|
content=content.strip(), |
|
|
url=page_data['url'], |
|
|
page_type=page_type, |
|
|
chunk_index=chunk_index, |
|
|
chunk_type='header_section', |
|
|
header_info=header_info |
|
|
)) |
|
|
chunk_index += 1 |
|
|
else: |
|
|
|
|
|
self._add_paragraph_chunks(page_data, page_type, chunks, chunk_index) |
|
|
else: |
|
|
|
|
|
self._add_paragraph_chunks(page_data, page_type, chunks, chunk_index) |
|
|
|
|
|
except Exception as e: |
|
|
self._add_paragraph_chunks(page_data, page_type, chunks, chunk_index) |
|
|
|
|
|
return chunks |
|
|
|
|
|
def _add_paragraph_chunks(self, page_data: Dict, page_type: str, chunks: List, start_index: int): |
|
|
"""Add paragraph-level chunks as fallback""" |
|
|
text_chunks = self.text_splitter.split_text(page_data['text']) |
|
|
chunk_index = start_index |
|
|
|
|
|
for chunk_text in text_chunks: |
|
|
if len(chunk_text.strip()) > 50: |
|
|
chunks.append(ContentChunk( |
|
|
content=chunk_text.strip(), |
|
|
url=page_data['url'], |
|
|
page_type=page_type, |
|
|
chunk_index=chunk_index, |
|
|
chunk_type='paragraph', |
|
|
header_info={} |
|
|
)) |
|
|
chunk_index += 1 |
|
|
|
|
|
async def calculate_similarities(self, keyword: str) -> List[ContentChunk]: |
|
|
"""Calculate cosine similarity between chunks and keyword""" |
|
|
if not self.all_chunks: |
|
|
raise ValueError("No chunks available. Run chunk_content first.") |
|
|
|
|
|
|
|
|
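        # Embed the keyword once, then embed every chunk with the same model so the
        # cosine scores are comparable across pages.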
self.keyword_embedding = await self.embeddings.aembed_query(keyword) |
|
|
|
|
|
|
|
|
chunk_texts = [chunk.content for chunk in self.all_chunks] |
|
|
chunk_embeddings = await self.embeddings.aembed_documents(chunk_texts) |
|
|
|
|
|
|
|
|
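        # cosine_similarity expects 2-D inputs, so the keyword vector is wrapped in a
        # list; the result is a (1, n_chunks) matrix and [0] picks the row of scores.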
similarities = cosine_similarity([self.keyword_embedding], chunk_embeddings)[0] |
|
|
|
|
|
|
|
|
for i, chunk in enumerate(self.all_chunks): |
|
|
chunk.similarity_score = float(similarities[i]) |
|
|
|
|
|
|
|
|
sorted_chunks = sorted(self.all_chunks, key=lambda x: x.similarity_score, reverse=True) |
|
|
|
|
|
return sorted_chunks |
|
|
|
|
|
def analyze_pages(self, sorted_chunks: List[ContentChunk]) -> Dict[str, PageAnalysis]: |
|
|
"""Analyze performance by page""" |
|
|
|
|
|
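        # Group chunks by source URL so per-page statistics (chunk count, mean and max
        # similarity, top chunks) can be reported for the client and each competitor.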
url_groups = {} |
|
|
for chunk in sorted_chunks: |
|
|
if chunk.url not in url_groups: |
|
|
url_groups[chunk.url] = [] |
|
|
url_groups[chunk.url].append(chunk) |
|
|
|
|
|
page_analyses = {} |
|
|
for url, chunks in url_groups.items(): |
|
|
page_type = chunks[0].page_type |
|
|
similarities = [chunk.similarity_score for chunk in chunks] |
|
|
|
|
|
analysis = PageAnalysis( |
|
|
url=url, |
|
|
page_type=page_type, |
|
|
total_chunks=len(chunks), |
|
|
avg_similarity=np.mean(similarities), |
|
|
max_similarity=np.max(similarities), |
|
|
top_chunks=sorted(chunks, key=lambda x: x.similarity_score, reverse=True)[:3] |
|
|
) |
|
|
|
|
|
page_analyses[url] = analysis |
|
|
|
|
|
return page_analyses |
|
|
|
|
|
async def generate_report(self, keyword: str, page_analyses: Dict[str, PageAnalysis], |
|
|
sorted_chunks: List[ContentChunk]) -> str: |
|
|
"""Generate comprehensive SEO report""" |
|
|
|
|
|
client_analysis = next((p for p in page_analyses.values() if p.page_type == 'client'), None) |
|
|
competitor_analyses = [p for p in page_analyses.values() if p.page_type == 'competitor'] |
|
|
|
|
|
|
|
|
top_chunks = sorted_chunks[:5] |
|
|
client_top_chunks = [c for c in sorted_chunks if c.page_type == 'client'][:3] |
|
|
competitor_top_chunks = [c for c in sorted_chunks if c.page_type == 'competitor'][:5] |
|
|
|
|
|
|
|
|
client_url = client_analysis.url if client_analysis else 'No client data' |
|
|
client_chunks = client_analysis.total_chunks if client_analysis else 0 |
|
|
client_avg = f"{client_analysis.avg_similarity:.4f}" if client_analysis else "0.0000" |
|
|
client_max = f"{client_analysis.max_similarity:.4f}" if client_analysis else "0.0000" |
|
|
|
|
|
|
|
|
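        # The prompt only embeds truncated previews (150-200 characters per chunk) to
        # keep token usage down while still grounding the report in real excerpts.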
prompt = f""" |
|
|
As an SEO expert, analyze this content relevance data for the keyword "{keyword}" and provide actionable insights. |
|
|
|
|
|
CLIENT PAGE PERFORMANCE: |
|
|
URL: {client_url} |
|
|
Total Chunks: {client_chunks} |
|
|
Average Similarity: {client_avg} |
|
|
Max Similarity: {client_max} |
|
|
|
|
|
TOP CLIENT CONTENT SECTIONS: |
|
|
{chr(10).join([f"Score {c.similarity_score:.4f}: {c.content[:200]}..." for c in client_top_chunks[:3]])} |
|
|
|
|
|
COMPETITOR PERFORMANCE: |
|
|
{chr(10).join([f"URL: {p.url}, Avg: {p.avg_similarity:.4f}, Max: {p.max_similarity:.4f}" for p in competitor_analyses])} |
|
|
|
|
|
TOP COMPETITOR CONTENT SECTIONS: |
|
|
{chr(10).join([f"Score {c.similarity_score:.4f} ({c.url}): {c.content[:200]}..." for c in competitor_top_chunks[:3]])} |
|
|
|
|
|
OVERALL TOP PERFORMING CONTENT: |
|
|
{chr(10).join([f"Score {c.similarity_score:.4f} ({c.page_type}): {c.content[:150]}..." for c in top_chunks])} |
|
|
|
|
|
        1. Top-performing page for this keyword: Identify the strongest-ranking page (ours or a competitor's), including its URL and why it performs well.
        2. Best-performing sections of content: Highlight the specific sections or content chunks (with text snippets and scores) that perform best for the keyword.
        3. What our client's page does well: Summarize the client page's strengths compared to competitors.
        4. What our client's page is missing: Identify gaps or underdeveloped areas in the client's content compared to competitors.
        5. Specific, actionable recommendations:
           Break this section into clearly labeled subcategories, such as:
           • Content Expansion: Missing sections, new topics, or deeper explanations.
           • Content Enhancement: Improvements to clarity, examples, visuals, or formatting.

        For each recommendation, include:
           • A clear title.
           • A brief explanation of why it matters.
           • A reference to the competitor content that demonstrates the point, including:
              • URL
              • Score
              • Content chunk or snippet

        Output format:
           • Use clear section headings and bullet points for readability.
           • Include competitor references (URL, score, snippet) wherever applicable to support recommendations.
           • Focus only on content-related improvements, not general SEO optimizations or monitoring advice.

        The goal is to help the client improve content relevance, depth, and authority for the target keyword, grounded in the analysis of vector embeddings and competitive content.
|
|
|
|
|
""" |
|
|
|
|
|
response = await self.llm.ainvoke(prompt) |
|
|
return response.content |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def run_seo_analysis(api_key: str, keyword: str, client_url: str, competitor_urls_text: str, progress=gr.Progress()): |
|
|
"""Main function to run SEO analysis""" |
|
|
|
|
|
|
|
|
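    # Empty frames returned on every error path so the Gradio Dataframe outputs
    # still receive tables with the expected columns.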
empty_summary_df = pd.DataFrame(columns=["URL", "Type", "Total Chunks", "Avg Similarity", "Max Similarity"]) |
|
|
empty_content_df = pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"]) |
|
|
|
|
|
if not api_key: |
|
|
return "β Please provide your OpenAI API key", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
if not keyword or not client_url: |
|
|
return "β Please provide both keyword and client URL", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
|
|
|
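    # One competitor URL per line; blank lines are ignored.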
competitor_urls = [url.strip() for url in competitor_urls_text.split('\n') if url.strip()] |
|
|
|
|
|
if not competitor_urls: |
|
|
return "β Please provide at least one competitor URL", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
try: |
|
|
progress(0.1, desc="Initializing analyzer with Trafilatura...") |
|
|
analyzer = SEOContentAnalyzer(api_key) |
|
|
|
|
|
progress(0.2, desc="Crawling websites with enhanced extraction...") |
|
|
crawl_data = await analyzer.crawl_all_urls(client_url, competitor_urls) |
|
|
|
|
|
|
|
|
total_successful = 0 |
|
|
if crawl_data['client']: |
|
|
total_successful += 1 |
|
|
total_successful += len(crawl_data['competitors']) |
|
|
|
|
|
if total_successful == 0: |
|
|
failed_urls = ', '.join(crawl_data['failed_urls'][:3]) |
|
|
return f"β No URLs were successfully crawled. Failed URLs: {failed_urls}...", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
if not crawl_data['client']: |
|
|
return "β Failed to crawl client URL", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
if not crawl_data['competitors']: |
|
|
return "β Failed to crawl any competitor URLs", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
progress(0.4, desc="Processing content with intelligent chunking...") |
|
|
chunks = analyzer.chunk_content(crawl_data) |
|
|
|
|
|
if not chunks: |
|
|
return "β No content chunks were created from the crawled pages", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
progress(0.6, desc="Calculating semantic similarities...") |
|
|
sorted_chunks = await analyzer.calculate_similarities(keyword) |
|
|
|
|
|
progress(0.8, desc="Analyzing page performance...") |
|
|
page_analyses = analyzer.analyze_pages(sorted_chunks) |
|
|
|
|
|
progress(0.9, desc="Generating AI-powered SEO report...") |
|
|
report = await analyzer.generate_report(keyword, page_analyses, sorted_chunks) |
|
|
|
|
|
|
|
|
summary_data = [] |
|
|
for url, analysis in page_analyses.items(): |
|
|
summary_data.append({ |
|
|
'URL': url, |
|
|
'Type': analysis.page_type.title(), |
|
|
'Total Chunks': analysis.total_chunks, |
|
|
'Avg Similarity': f"{analysis.avg_similarity:.4f}", |
|
|
'Max Similarity': f"{analysis.max_similarity:.4f}" |
|
|
}) |
|
|
|
|
|
summary_df = pd.DataFrame(summary_data) |
|
|
|
|
|
|
|
|
top_content_data = [] |
|
|
for i, chunk in enumerate(sorted_chunks[:10], 1): |
|
|
top_content_data.append({ |
|
|
'Rank': i, |
|
|
'Type': chunk.page_type.title(), |
|
|
'Score': f"{chunk.similarity_score:.4f}", |
|
|
'Content Preview': chunk.content[:150] + "..." if len(chunk.content) > 150 else chunk.content, |
|
|
'URL': chunk.url |
|
|
}) |
|
|
|
|
|
top_content_df = pd.DataFrame(top_content_data) |
|
|
|
|
|
|
|
|
vector_data = [] |
|
|
for chunk in sorted_chunks: |
|
|
vector_data.append({ |
|
|
'url': chunk.url, |
|
|
'page_type': chunk.page_type, |
|
|
'chunk_index': chunk.chunk_index, |
|
|
'chunk_type': chunk.chunk_type, |
|
|
'header_info': str(chunk.header_info) if chunk.header_info else '', |
|
|
'similarity_score': chunk.similarity_score, |
|
|
'content_preview': chunk.content[:100] + '...' if len(chunk.content) > 100 else chunk.content, |
|
|
'content_length': len(chunk.content), |
|
|
'full_content': chunk.content |
|
|
}) |
|
|
|
|
|
vector_df = pd.DataFrame(vector_data) |
|
|
|
|
|
|
|
|
download_file_path = prepare_download(vector_df) |
|
|
|
|
|
progress(1.0, desc="Analysis complete!") |
|
|
|
|
|
return report, summary_df, top_content_df, vector_df |
|
|
|
|
|
except Exception as e: |
|
|
return f"β Error during analysis: {str(e)}", empty_summary_df, empty_content_df, empty_summary_df |
|
|
|
|
|
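# asyncio.run starts a fresh event loop for every click; it is the simplest way to
# drive the async pipeline from Gradio's synchronous callback, at the cost of not
# reusing a loop across requests.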
def sync_run_seo_analysis(*args): |
|
|
"""Synchronous wrapper for the async function""" |
|
|
return asyncio.run(run_seo_analysis(*args)) |
|
|
|
|
|
def handle_analysis_and_download(api_key, keyword, client_url, competitor_urls_text, progress=gr.Progress()): |
|
|
"""Handle analysis and prepare download file""" |
|
|
result = sync_run_seo_analysis(api_key, keyword, client_url, competitor_urls_text, progress) |
|
|
|
|
|
|
|
|
if len(result) == 4 and isinstance(result[3], pd.DataFrame) and not result[3].empty: |
|
|
download_file_path = prepare_download(result[3]) |
|
|
return result[0], result[1], result[2], download_file_path |
|
|
else: |
|
|
return result[0], result[1], result[2], None |
|
|
|
|
|
|
|
|
def create_interface(): |
|
|
with gr.Blocks( |
|
|
title="SEO Content Gap Analysis", |
|
|
theme=gr.themes.Glass( |
|
|
primary_hue="blue", |
|
|
secondary_hue="slate", |
|
|
neutral_hue="zinc", |
|
|
font="Inter" |
|
|
) |
|
|
) as demo: |
|
|
gr.Markdown(""" |
|
|
        # SEO Content Relevance Analysis
|
|
|
|
|
Analyze how well your content matches a target keyword compared to competitors using AI-powered semantic similarity. |
|
|
|
|
|
**Enhanced with Trafilatura** for superior content extraction and intelligent header-based chunking. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=1): |
|
|
gr.Markdown("### π Configuration") |
|
|
|
|
|
api_key = gr.Textbox( |
|
|
label="OpenAI API Key", |
|
|
placeholder="sk-...", |
|
|
type="password", |
|
|
info="Your OpenAI API key for embeddings and analysis" |
|
|
) |
|
|
|
|
|
keyword = gr.Textbox( |
|
|
label="Target Keyword", |
|
|
placeholder="e.g., python web scraping", |
|
|
info="The keyword you want to optimize for" |
|
|
) |
|
|
|
|
|
client_url = gr.Textbox( |
|
|
label="Your Page URL", |
|
|
placeholder="https://yoursite.com/page", |
|
|
info="The URL of your page to analyze" |
|
|
) |
|
|
|
|
|
competitor_urls = gr.Textbox( |
|
|
label="Competitor URLs", |
|
|
placeholder="https://competitor1.com/page\nhttps://competitor2.com/page", |
|
|
lines=5, |
|
|
info="One URL per line (2-5 competitors recommended)" |
|
|
) |
|
|
|
|
|
                analyze_btn = gr.Button("Run Analysis", variant="primary", size="lg")
|
|
|
|
|
with gr.Column(scale=2): |
|
|
gr.Markdown("### π Results") |
|
|
|
|
|
with gr.Tabs(): |
|
|
with gr.TabItem("π SEO Report"): |
|
|
report_output = gr.Markdown( |
|
|
label="AI-Generated SEO Analysis Report", |
|
|
value="Click 'Run Analysis' to generate your comprehensive SEO report with actionable insights..." |
|
|
) |
|
|
|
|
|
with gr.TabItem("π Page Summary"): |
|
|
summary_output = gr.Dataframe( |
|
|
label="Page Performance Summary", |
|
|
headers=["URL", "Type", "Total Chunks", "Avg Similarity", "Max Similarity"], |
|
|
value=pd.DataFrame(columns=["URL", "Type", "Total Chunks", "Avg Similarity", "Max Similarity"]) |
|
|
) |
|
|
|
|
|
with gr.TabItem("π― Top Content"): |
|
|
top_content_output = gr.Dataframe( |
|
|
label="Top Performing Content Sections", |
|
|
headers=["Rank", "Type", "Score", "Content Preview", "URL"], |
|
|
value=pd.DataFrame(columns=["Rank", "Type", "Score", "Content Preview", "URL"]) |
|
|
) |
|
|
|
|
|
with gr.TabItem("π Vector Data"): |
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
gr.Markdown("### π₯ Download Complete Analysis Data") |
|
|
gr.Markdown(""" |
|
|
**Contains:** |
|
|
- All content chunks with similarity scores |
|
|
- Full content text for each chunk |
|
|
- Header information and chunk types |
|
|
- Perfect for further analysis in Excel/Python |
|
|
""") |
|
|
|
|
|
download_file = gr.File( |
|
|
label="Vector Data CSV (Generated after analysis)", |
|
|
interactive=False |
|
|
) |
|
|
|
|
|
|
|
|
gr.Markdown(""" |
|
|
        ### 💡 Example Usage
|
|
|
|
|
**Keyword:** `content marketing strategy` |
|
|
**Your URL:** `https://yoursite.com/content-marketing-guide` |
|
|
**Competitors:** |
|
|
``` |
|
|
https://hubspot.com/content-marketing |
|
|
https://contentmarketinginstitute.com/strategy |
|
|
https://neilpatel.com/blog/content-marketing-strategy |
|
|
``` |
|
|
|
|
|
        ### ✨ What's New
|
|
- **Enhanced Content Extraction**: Uses Trafilatura for better content quality |
|
|
- **Intelligent Chunking**: Header-aware splitting for more accurate analysis |
|
|
- **Improved Accuracy**: Better handling of complex page structures |
|
|
- **Glass Theme**: Modern, sleek interface design |
|
|
""") |
|
|
|
|
|
|
|
|
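        # Wire the button to the sync wrapper; the four outputs map to the report
        # markdown, the two result tables, and the downloadable CSV path.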
analyze_btn.click( |
|
|
fn=handle_analysis_and_download, |
|
|
inputs=[api_key, keyword, client_url, competitor_urls], |
|
|
outputs=[report_output, summary_output, top_content_output, download_file] |
|
|
) |
|
|
|
|
|
gr.Markdown(""" |
|
|
        ### ⚠️ Important Notes

        - Analysis may take 2-5 minutes depending on content size
        - Requires an OpenAI API key (costs ~$0.01-0.10 per analysis)
        - Extraction handles most page layouts, but works best on article-style content
        - Pages are fetched with Trafilatura's downloader; only the URLs you enter are requested, so be mindful of each site's crawling policies
        - The Glass theme provides a modern, professional appearance
|
|
""") |
|
|
|
|
|
return demo |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo = create_interface() |
|
|
demo.launch() |