import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import pandas as pd

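# all-MiniLM-L6-v2 is a small sentence-transformers model that maps text to
# 384-dimensional vectors; it is downloaded from the Hugging Face Hub and
# cached locally the first time the script runs.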
print("Loading embedding model... (This might take a moment on first run)") |
|
|
model = SentenceTransformer('all-MiniLM-L6-v2') |
|
|
print("Model loaded successfully.") |
|
|
|
|
|
|
|
|
|
|
|
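# Plain requests + BeautifulSoup scraping: boilerplate tags (script, style,
# header, footer, nav, aside) are removed before the text is extracted.
# Pages that render their content with JavaScript will yield little or no text here.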
def get_text_from_url(url):
    """Fetches and extracts clean text from a URL."""
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            script_or_style.decompose()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        if not text:
            return None
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")

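# RecursiveCharacterTextSplitter prefers paragraph and line breaks and only falls
# back to splitting mid-sentence, so chunks stay readable. Chunks of ~500 characters
# with a 50-character overlap keep each piece well within the embedding model's
# input limit while preserving some context across boundaries.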
def get_chunks(text):
    """Splits text into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len
    )
    return text_splitter.split_text(text)

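# Full pipeline: fetch both pages, chunk them, embed everything, then compute
# (1) a keyword-alignment score per page, (2) the most similar chunk pairs across
# the two pages, and (3) keyword-relevant competitor chunks your page doesn't cover.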
def run_analysis(keyword, my_url, competitor_url):
    """The main function to perform the analysis and return structured results."""
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)

    if not my_content or not competitor_content:
        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")

    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)

    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")

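    # Encode the keyword and every chunk into dense vectors (returned as torch
    # tensors) so cosine similarities can be computed in bulk below.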
print("Creating embeddings...") |
|
|
keyword_embedding = model.encode(keyword, convert_to_tensor=True) |
|
|
my_embeddings = model.encode(my_chunks, convert_to_tensor=True) |
|
|
competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True) |
|
|
|
|
|
|
|
|
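    # Keyword alignment: cosine similarity between the keyword and each chunk,
    # averaged over each page's top 5 chunks, presumably so navigation or
    # off-topic sections don't drag the page score down.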
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]

    top_k = min(5, len(my_keyword_scores))
    my_alignment_score = np.mean(torch.topk(my_keyword_scores, k=top_k).values.cpu().numpy())

    top_k = min(5, len(competitor_keyword_scores))
    competitor_alignment_score = np.mean(torch.topk(competitor_keyword_scores, k=top_k).values.cpu().numpy())

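    # Chunk-to-chunk similarity matrix: rows are your chunks, columns are the
    # competitor's. For each of your chunks, its best-matching competitor chunk is
    # reported when the cosine similarity clears 0.70 (a hand-picked threshold).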
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
    similar_pairs = []
    for i in range(len(my_chunks)):
        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
        if best_match_score > 0.70:
            similar_pairs.append({
                "Your Content Snippet": my_chunks[i],
                "Competitor Content Snippet": competitor_chunks[best_match_idx],
                "Similarity Score": f"{best_match_score.item():.2f}"
            })
    similar_pairs_df = pd.DataFrame(sorted(similar_pairs, key=lambda x: x['Similarity Score'], reverse=True)[:5])

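    # Content gaps: a competitor chunk is flagged when it is relevant to the
    # keyword (similarity > 0.5) but no chunk on your page covers it well (your
    # best similarity to it is below 0.6). Both cutoffs are heuristics.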
    content_gaps = []
    for i in range(len(competitor_chunks)):
        competitor_keyword_relevance = competitor_keyword_scores[i]
        if competitor_keyword_relevance > 0.5:
            my_best_coverage_score, _ = torch.max(similarity_matrix[:, i], dim=0)
            if my_best_coverage_score < 0.6:
                content_gaps.append({
                    "Potential Content Gap (from Competitor)": competitor_chunks[i],
                    "Relevance to Keyword": f"{competitor_keyword_relevance.item():.2f}",
                    "Your Max Coverage": f"{my_best_coverage_score.item():.2f}"
                })
    content_gaps_df = pd.DataFrame(sorted(content_gaps, key=lambda x: x['Relevance to Keyword'], reverse=True)[:5])

    print("Analysis complete.")
    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df

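# Thin wrapper around run_analysis: validates the inputs and turns the raw
# scores into a Markdown summary for the UI.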
def gradio_interface(keyword, my_url, competitor_url):
    """Wrapper function to format results for the Gradio UI."""
    if not all([keyword, my_url, competitor_url]):
        raise gr.Error("Please fill in all three fields.")

    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)

    report_summary = f"""
## Overall Keyword Alignment
*This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
- **Your Page Score:** <span style="font-size: 1.5em; color: green;">{my_score:.2f}</span>
- **Competitor Page Score:** <span style="font-size: 1.5em; color: red;">{comp_score:.2f}</span>
"""

    return report_summary, similarities_df, gaps_df

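# Example inputs that prefill the UI so the demo can be run with a single click.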
example_keyword = "benefits of serverless computing"
example_my_url = "https://www.ibm.com/topics/serverless"
example_comp_url = "https://aws.amazon.com/serverless/"

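# UI layout: a keyword box, two URL boxes, an Analyze button, a Markdown summary,
# and two tabs holding the similarity and content-gap tables.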
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")

    with gr.Row():
        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
    with gr.Row():
        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)

    submit_btn = gr.Button("Analyze Content", variant="primary")

    gr.Markdown("---")
    gr.Markdown("## Analysis Report")

    summary_output = gr.Markdown(label="Alignment Summary")

    with gr.Tab("Content Similarities"):
        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
        similarities_output = gr.DataFrame(label="Similar Content Sections")

    with gr.Tab("Content Gaps"):
        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
        gaps_output = gr.DataFrame(label="Potential Content Gaps")

    submit_btn.click(
        fn=gradio_interface,
        inputs=[keyword_input, my_url_input, competitor_url_input],
        outputs=[summary_output, similarities_output, gaps_output]
    )

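# debug=True keeps launch() blocking and prints errors to the console (mainly
# useful in notebooks); share=True also exposes the app through a temporary
# public gradio.live link.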
demo.launch(debug=True, share=True)