#Import libraries
import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import pandas as pd
# --- INITIALIZATION ---
# Load a pre-trained Sentence Transformer model
# 'all-MiniLM-L6-v2' is a good, fast model for semantic similarity.
# NOTE(review): the first run downloads the model weights from the Hugging Face
# hub, so this line needs network access once — subsequent runs use the cache.
print("Loading embedding model... (This might take a moment on first run)")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")
# --- HELPER FUNCTIONS ---
def get_text_from_url(url):
    """Fetch a URL and return its visible text content, or None if empty.

    Strips <script>/<style> tags and common layout containers (header,
    footer, nav, aside) so only the main body text remains.

    Args:
        url: The page URL to fetch.

    Returns:
        The cleaned page text as a newline-joined string, or None when no
        text could be extracted.

    Raises:
        gr.Error: If the HTTP request fails (network error or bad status).
    """
    # Keep the try-block minimal: only the request/status lines can raise
    # requests.exceptions.RequestException.
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")
    soup = BeautifulSoup(response.text, 'html.parser')
    # Drop non-content elements so navigation/boilerplate doesn't pollute the analysis.
    for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
        script_or_style.decompose()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # BUG FIX: split on runs of double spaces ("  "), not single spaces.
    # Splitting on a single space put every individual word on its own line,
    # destroying sentence structure and degrading the downstream chunking
    # and embedding quality.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text or None
def get_chunks(text):
    """Split raw page text into overlapping chunks suitable for embedding.

    Chunks are at most 500 characters with a 50-character overlap so that
    sentences cut at a boundary still appear intact in a neighboring chunk.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return splitter.split_text(text)
# --- CORE ANALYSIS LOGIC ---
def _top_k_mean(scores, k=5):
    """Mean of the k highest values in a 1-D similarity tensor (fewer if shorter)."""
    k = min(k, len(scores))
    return float(torch.topk(scores, k=k).values.mean())


def run_analysis(keyword, my_url, competitor_url):
    """Fetch, embed, and compare two pages against a target keyword.

    Args:
        keyword: The target search keyword/topic.
        my_url: URL of the user's own page.
        competitor_url: URL of the competitor's page.

    Returns:
        A 4-tuple of:
        - my_alignment_score: mean of the top-5 chunk-to-keyword similarities
          for the user's page (0..1, higher is better).
        - competitor_alignment_score: same metric for the competitor page.
        - similar_pairs_df: DataFrame of up to 5 most similar chunk pairs.
        - content_gaps_df: DataFrame of up to 5 keyword-relevant competitor
          chunks the user's page doesn't cover.

    Raises:
        gr.Error: If either page yields no usable text or no chunks.
    """
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)
    if not my_content or not competitor_content:
        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")
    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)
    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")

    print("Creating embeddings...")
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    my_embeddings = model.encode(my_chunks, convert_to_tensor=True)
    competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True)

    # --- Keyword Alignment: average of each page's top-5 chunk similarities ---
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]
    my_alignment_score = _top_k_mean(my_keyword_scores)
    competitor_alignment_score = _top_k_mean(competitor_keyword_scores)

    # --- Similarities: best competitor match for each of our chunks ---
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
    similar_pairs = []
    for i in range(len(my_chunks)):
        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
        score = best_match_score.item()
        if score > 0.70:
            # Keep the raw float alongside the display row so we can sort numerically.
            similar_pairs.append((score, {
                "Your Content Snippet": my_chunks[i],
                "Competitor Content Snippet": competitor_chunks[best_match_idx],
                "Similarity Score": f"{score:.2f}",
            }))
    # BUG FIX: sort on the numeric score, not the formatted string. String
    # comparison of formatted numbers is lexicographic and only happens to
    # work for same-width positive values — sorting the raw float is correct
    # unconditionally.
    similar_pairs.sort(key=lambda pair: pair[0], reverse=True)
    similar_pairs_df = pd.DataFrame([row for _, row in similar_pairs[:5]])

    # --- Gaps: competitor chunks relevant to the keyword that we barely cover ---
    content_gaps = []
    for j in range(len(competitor_chunks)):
        relevance = competitor_keyword_scores[j].item()
        if relevance > 0.5:
            my_best_coverage_score, _ = torch.max(similarity_matrix[:, j], dim=0)
            coverage = my_best_coverage_score.item()
            if coverage < 0.6:
                content_gaps.append((relevance, {
                    "Potential Content Gap (from Competitor)": competitor_chunks[j],
                    "Relevance to Keyword": f"{relevance:.2f}",
                    "Your Max Coverage": f"{coverage:.2f}",
                }))
    # Same numeric-sort fix as above.
    content_gaps.sort(key=lambda pair: pair[0], reverse=True)
    content_gaps_df = pd.DataFrame([row for _, row in content_gaps[:5]])

    print("Analysis complete.")
    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df
# --- GRADIO INTERFACE ---
def gradio_interface(keyword, my_url, competitor_url):
    """Validate inputs, run the analysis, and format the results for the UI.

    Returns a (markdown summary, similarities DataFrame, gaps DataFrame)
    tuple matching the three Gradio output components.

    Raises:
        gr.Error: If any of the three inputs is empty.
    """
    if not (keyword and my_url and competitor_url):
        raise gr.Error("Please fill in all three fields.")
    results = run_analysis(keyword, my_url, competitor_url)
    my_score, comp_score, similarities_df, gaps_df = results
    # Markdown summary rendered above the result tabs.
    report_summary = f"""
## Overall Keyword Alignment
*This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
- **Your Page Score:** {my_score:.2f}
- **Competitor Page Score:** {comp_score:.2f}
"""
    return report_summary, similarities_df, gaps_df
# Example data to make testing easier
example_keyword = "benefits of serverless computing"
example_my_url = "https://www.ibm.com/topics/serverless"
example_comp_url = "https://aws.amazon.com/serverless/"
# Build the Gradio app
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")
    # Inputs: keyword on its own row, the two URLs side by side below it.
    with gr.Row():
        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
    with gr.Row():
        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)
    submit_btn = gr.Button("Analyze Content", variant="primary")
    gr.Markdown("---")
    gr.Markdown("## Analysis Report")
    # Outputs
    summary_output = gr.Markdown(label="Alignment Summary")
    with gr.Tab("Content Similarities"):
        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
        similarities_output = gr.DataFrame(label="Similar Content Sections")
    with gr.Tab("Content Gaps"):
        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
        gaps_output = gr.DataFrame(label="Potential Content Gaps")
    # Wire the button to the analysis wrapper: three text inputs in,
    # one markdown summary and two dataframes out.
    submit_btn.click(
        fn=gradio_interface,
        inputs=[keyword_input, my_url_input, competitor_url_input],
        outputs=[summary_output, similarities_output, gaps_output]
    )
# Launch the app with a shareable link
# NOTE(review): share=True exposes the app via a public Gradio tunnel URL —
# confirm that is intended before deploying.
demo.launch(debug=True, share=True)