# Import libraries
import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import pandas as pd
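# Assumed dependencies: pip install gradio requests beautifulsoup4 langchain
# sentence-transformers pandas (sentence-transformers pulls in torch and numpy;
# in newer LangChain releases the splitter lives in langchain-text-splitters).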
# --- INITIALIZATION ---
# Load a pre-trained Sentence Transformer model
# 'all-MiniLM-L6-v2' is a good, fast model for semantic similarity.
print("Loading embedding model... (This might take a moment on first run)")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")
# --- HELPER FUNCTIONS ---
def get_text_from_url(url):
    """Fetches and extracts clean text from a URL."""
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop non-content elements before extracting text.
        for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            script_or_style.decompose()
        text = soup.get_text()
        # Collapse whitespace: strip each line, break lines on runs of double
        # spaces, and drop empty fragments.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        if not text:
            return None
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")
def get_chunks(text):
    """Splits text into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len
    )
    return text_splitter.split_text(text)
# --- CORE ANALYSIS LOGIC ---
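# run_analysis returns four values: one keyword-alignment score per page, a
# table of the most similar chunk pairs across the two pages, and a table of
# competitor chunks that look like gaps in your page's coverage.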
def run_analysis(keyword, my_url, competitor_url):
    """The main function to perform the analysis and return structured results."""
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)
    if not my_content or not competitor_content:
        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")
    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)
    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")
print("Creating embeddings...")
keyword_embedding = model.encode(keyword, convert_to_tensor=True)
my_embeddings = model.encode(my_chunks, convert_to_tensor=True)
competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True)
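    # Passing a list to model.encode batches every chunk through the model in
    # one call, which is much faster than encoding the chunks one at a time.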
    # --- Keyword Alignment ---
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]
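    # Average only the top-5 chunk scores per page, so a long page with lots of
    # off-topic boilerplate is not unfairly penalized for its weakest chunks.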
    top_k = min(5, len(my_keyword_scores))
    my_alignment_score = np.mean(torch.topk(my_keyword_scores, k=top_k).values.cpu().numpy())
    top_k = min(5, len(competitor_keyword_scores))
    competitor_alignment_score = np.mean(torch.topk(competitor_keyword_scores, k=top_k).values.cpu().numpy())
    # --- Similarities ---
    # similarity_matrix has shape (len(my_chunks), len(competitor_chunks));
    # entry [i][j] compares my chunk i against competitor chunk j.
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
    similar_pairs = []
    for i in range(len(my_chunks)):
        # For each of my chunks, find the single closest competitor chunk.
        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
        if best_match_score > 0.70:
            similar_pairs.append({
                "Your Content Snippet": my_chunks[i],
                "Competitor Content Snippet": competitor_chunks[best_match_idx],
                "Similarity Score": f"{best_match_score.item():.2f}"
            })
    # Sort numerically (scores are stored as formatted strings) and keep the top 5.
    similar_pairs_df = pd.DataFrame(sorted(similar_pairs, key=lambda x: float(x['Similarity Score']), reverse=True)[:5])
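    # The 0.70 similarity cutoff above (and the 0.5 / 0.6 thresholds in the gap
    # check below) are rules of thumb for MiniLM cosine scores, not calibrated
    # values; tune them against pages you know well.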
    # --- Gaps ---
    # A potential gap: a competitor chunk that is relevant to the keyword but
    # poorly covered by anything on your page.
    content_gaps = []
    for i in range(len(competitor_chunks)):
        competitor_keyword_relevance = competitor_keyword_scores[i]
        if competitor_keyword_relevance > 0.5:
            # Best coverage of this competitor chunk by ANY chunk on your page.
            my_best_coverage_score, _ = torch.max(similarity_matrix[:, i], dim=0)
            if my_best_coverage_score < 0.6:
                content_gaps.append({
                    "Potential Content Gap (from Competitor)": competitor_chunks[i],
                    "Relevance to Keyword": f"{competitor_keyword_relevance.item():.2f}",
                    "Your Max Coverage": f"{my_best_coverage_score.item():.2f}"
                })
    content_gaps_df = pd.DataFrame(sorted(content_gaps, key=lambda x: float(x['Relevance to Keyword']), reverse=True)[:5])
print("Analysis complete.")
return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df
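# A minimal sketch of calling the analysis directly, bypassing the UI entirely
# (the URLs are placeholders, not tested endpoints):
#
#   my_score, comp_score, sims_df, gaps_df = run_analysis(
#       "benefits of serverless computing",
#       "https://example.com/my-page",
#       "https://example.com/competitor-page",
#   )
#   print(f"Alignment -- you: {my_score:.2f}, competitor: {comp_score:.2f}")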
# --- GRADIO INTERFACE ---
def gradio_interface(keyword, my_url, competitor_url):
    """Wrapper function to format results for the Gradio UI."""
    if not all([keyword, my_url, competitor_url]):
        raise gr.Error("Please fill in all three fields.")
    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)
    # Create a summary report in Markdown
    report_summary = f"""
## Overall Keyword Alignment
*This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
- **Your Page Score:** <span style="font-size: 1.5em; color: green;">{my_score:.2f}</span>
- **Competitor Page Score:** <span style="font-size: 1.5em; color: red;">{comp_score:.2f}</span>
"""
    return report_summary, similarities_df, gaps_df
# Example data to make testing easier
example_keyword = "benefits of serverless computing"
example_my_url = "https://www.ibm.com/topics/serverless"
example_comp_url = "https://aws.amazon.com/serverless/"
# Build the Gradio app
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")
    with gr.Row():
        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
    with gr.Row():
        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)
    submit_btn = gr.Button("Analyze Content", variant="primary")
    gr.Markdown("---")
    gr.Markdown("## Analysis Report")
    # Outputs
    summary_output = gr.Markdown(label="Alignment Summary")
    with gr.Tab("Content Similarities"):
        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
        similarities_output = gr.DataFrame(label="Similar Content Sections")
    with gr.Tab("Content Gaps"):
        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
        gaps_output = gr.DataFrame(label="Potential Content Gaps")
    submit_btn.click(
        fn=gradio_interface,
        inputs=[keyword_input, my_url_input, competitor_url_input],
        outputs=[summary_output, similarities_output, gaps_output]
    )
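# share=True asks Gradio to create a temporary public *.gradio.live link in
# addition to the local server; set it to False to keep the app local-only.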
# Launch the app with a shareable link
demo.launch(debug=True, share=True)