import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import pandas as pd

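# all-MiniLM-L6-v2 is a small sentence-transformers model that maps text to
# 384-dimensional vectors; it is downloaded from the Hugging Face Hub and
# cached locally the first time the script runs.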
print("Loading embedding model... (This might take a moment on first run)") |
|
|
model = SentenceTransformer('all-MiniLM-L6-v2') |
|
|
print("Model loaded successfully.") |
|
|
|
|
|
|
|
|
|
|
|
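# Plain requests + BeautifulSoup scraping: boilerplate tags (script, style,
# header, footer, nav, aside) are removed before the text is extracted.
# Pages that render their content with JavaScript will yield little or no text here.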
def get_text_from_url(url):
    """Fetches and extracts clean text from a URL."""
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            script_or_style.decompose()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        if not text:
            return None
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")

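# RecursiveCharacterTextSplitter prefers paragraph and line breaks and only falls
# back to splitting mid-sentence, so chunks stay readable. Chunks of ~500 characters
# with a 50-character overlap keep each piece well within the embedding model's
# input limit while preserving some context across boundaries.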
def get_chunks(text):
    """Splits text into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len
    )
    return text_splitter.split_text(text)

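# Full pipeline: fetch both pages, chunk them, embed everything, then compute
# (1) a keyword-alignment score per page, (2) the most similar chunk pairs across
# the two pages, and (3) keyword-relevant competitor chunks your page doesn't cover.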
def run_analysis(keyword, my_url, competitor_url):
    """The main function to perform the analysis and return structured results."""
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)

    if not my_content or not competitor_content:
        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")

    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)

    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")

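    # Encode the keyword and every chunk into dense vectors (returned as torch
    # tensors) so cosine similarities can be computed in bulk below.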
print("Creating embeddings...") |
|
|
keyword_embedding = model.encode(keyword, convert_to_tensor=True) |
|
|
my_embeddings = model.encode(my_chunks, convert_to_tensor=True) |
|
|
competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True) |
|
|
|
|
|
|
|
|
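    # Keyword alignment: cosine similarity between the keyword and each chunk,
    # averaged over each page's top 5 chunks, presumably so navigation or
    # off-topic sections don't drag the page score down.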
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]

    top_k = min(5, len(my_keyword_scores))
    my_alignment_score = np.mean(torch.topk(my_keyword_scores, k=top_k).values.cpu().numpy())

    top_k = min(5, len(competitor_keyword_scores))
    competitor_alignment_score = np.mean(torch.topk(competitor_keyword_scores, k=top_k).values.cpu().numpy())

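    # Chunk-to-chunk similarity matrix: rows are your chunks, columns are the
    # competitor's. For each of your chunks, its best-matching competitor chunk is
    # reported when the cosine similarity clears 0.70 (a hand-picked threshold).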
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
    similar_pairs = []
    for i in range(len(my_chunks)):
        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
        if best_match_score > 0.70:
            similar_pairs.append({
                "Your Content Snippet": my_chunks[i],
                "Competitor Content Snippet": competitor_chunks[best_match_idx],
                "Similarity Score": f"{best_match_score.item():.2f}"
            })
    similar_pairs_df = pd.DataFrame(sorted(similar_pairs, key=lambda x: x['Similarity Score'], reverse=True)[:5])

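    # Content gaps: a competitor chunk is flagged when it is relevant to the
    # keyword (similarity > 0.5) but no chunk on your page covers it well (your
    # best similarity to it is below 0.6). Both cutoffs are heuristics.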
    content_gaps = []
    for i in range(len(competitor_chunks)):
        competitor_keyword_relevance = competitor_keyword_scores[i]
        if competitor_keyword_relevance > 0.5:
            my_best_coverage_score, _ = torch.max(similarity_matrix[:, i], dim=0)
            if my_best_coverage_score < 0.6:
                content_gaps.append({
                    "Potential Content Gap (from Competitor)": competitor_chunks[i],
                    "Relevance to Keyword": f"{competitor_keyword_relevance.item():.2f}",
                    "Your Max Coverage": f"{my_best_coverage_score.item():.2f}"
                })
    content_gaps_df = pd.DataFrame(sorted(content_gaps, key=lambda x: x['Relevance to Keyword'], reverse=True)[:5])

    print("Analysis complete.")
    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df

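# Thin wrapper around run_analysis: validates the inputs and turns the raw
# scores into a Markdown summary for the UI.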
def gradio_interface(keyword, my_url, competitor_url):
    """Wrapper function to format results for the Gradio UI."""
    if not all([keyword, my_url, competitor_url]):
        raise gr.Error("Please fill in all three fields.")

    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)

    report_summary = f"""
## Overall Keyword Alignment
*This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
- **Your Page Score:** <span style="font-size: 1.5em; color: green;">{my_score:.2f}</span>
- **Competitor Page Score:** <span style="font-size: 1.5em; color: red;">{comp_score:.2f}</span>
"""

    return report_summary, similarities_df, gaps_df

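# Example inputs that prefill the UI so the demo can be run with a single click.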
example_keyword = "benefits of serverless computing"
example_my_url = "https://www.ibm.com/topics/serverless"
example_comp_url = "https://aws.amazon.com/serverless/"

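# UI layout: a keyword box, two URL boxes, an Analyze button, a Markdown summary,
# and two tabs holding the similarity and content-gap tables.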
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")

    with gr.Row():
        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
    with gr.Row():
        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)

    submit_btn = gr.Button("Analyze Content", variant="primary")

    gr.Markdown("---")
    gr.Markdown("## Analysis Report")

    summary_output = gr.Markdown(label="Alignment Summary")

    with gr.Tab("Content Similarities"):
        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
        similarities_output = gr.DataFrame(label="Similar Content Sections")

    with gr.Tab("Content Gaps"):
        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
        gaps_output = gr.DataFrame(label="Potential Content Gaps")

    submit_btn.click(
        fn=gradio_interface,
        inputs=[keyword_input, my_url_input, competitor_url_input],
        outputs=[summary_output, similarities_output, gaps_output]
    )

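# debug=True keeps launch() blocking and prints errors to the console (mainly
# useful in notebooks); share=True also exposes the app through a temporary
# public gradio.live link.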
demo.launch(debug=True, share=True)