# Import libraries
import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import pandas as pd

# --- INITIALIZATION ---
# Load a pre-trained Sentence Transformer model.
# 'all-MiniLM-L6-v2' is a good, fast model for semantic similarity.
print("Loading embedding model... (This might take a moment on first run)")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

# Analysis thresholds (hoisted from magic numbers so they are tunable in one place).
TOP_K_ALIGNMENT = 5            # how many best-matching chunks feed the alignment score
SIMILARITY_THRESHOLD = 0.70    # min cosine similarity to call two chunks "similar"
GAP_RELEVANCE_THRESHOLD = 0.5  # competitor chunk must be at least this relevant to the keyword
GAP_COVERAGE_THRESHOLD = 0.6   # ...while our best coverage of it stays below this
MAX_TABLE_ROWS = 5             # rows shown in each result table


# --- HELPER FUNCTIONS ---
def get_text_from_url(url):
    """Fetch *url* and return its visible text, or None if the page has no text.

    Boilerplate elements (scripts, styles, nav, header/footer, asides) are
    stripped before extraction so they do not pollute the semantic analysis.

    Raises:
        gr.Error: if the HTTP request fails (network error, timeout, bad status).
    """
    try:
        # Keep the try body minimal: only the request itself can raise the
        # exception type we catch.
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")

    soup = BeautifulSoup(response.text, 'html.parser')
    for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
        script_or_style.decompose()

    text = soup.get_text()
    # Normalise whitespace: strip each line, break lines on double-space runs
    # (column separators in rendered text), and drop empty fragments.
    # FIX: the separator must be a double space ("  ") — splitting on a single
    # space would put every word on its own line and ruin downstream chunking.
    lines = (line.strip() for line in text.splitlines())
    phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(phrase for phrase in phrases if phrase)
    # Return None (not "") for empty pages so callers can use a simple truthiness check.
    return text or None


def get_chunks(text):
    """Split *text* into overlapping ~500-character chunks suitable for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return text_splitter.split_text(text)


def _top_k_mean(scores, k=TOP_K_ALIGNMENT):
    """Mean of the *k* highest values in a 1-D score tensor (k capped at its length)."""
    k = min(k, len(scores))
    return float(np.mean(torch.topk(scores, k=k).values.cpu().numpy()))


# --- CORE ANALYSIS LOGIC ---
def run_analysis(keyword, my_url, competitor_url):
    """Perform the keyword/content comparison between two pages.

    Args:
        keyword: target search phrase to measure alignment against.
        my_url: URL of the page being optimised.
        competitor_url: URL of the competing page.

    Returns:
        tuple: (my_alignment_score, competitor_alignment_score,
                similar_pairs_df, content_gaps_df)

    Raises:
        gr.Error: if either page yields no usable text or cannot be chunked.
    """
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)
    if not my_content or not competitor_content:
        raise gr.Error(
            "Could not retrieve enough text content from one or both URLs. "
            "The pages might be heavily JavaScript-based or have very little text."
        )

    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)
    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")

    print("Creating embeddings...")
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    my_embeddings = model.encode(my_chunks, convert_to_tensor=True)
    competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True)

    # --- Keyword alignment: mean similarity of each page's top-k chunks ---
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]
    my_alignment_score = _top_k_mean(my_keyword_scores)
    competitor_alignment_score = _top_k_mean(competitor_keyword_scores)

    # --- Similarities: for each of my chunks, the competitor's best match ---
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
    similar_pairs = []
    for i, my_chunk in enumerate(my_chunks):
        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
        score = best_match_score.item()
        if score > SIMILARITY_THRESHOLD:
            similar_pairs.append({
                "Your Content Snippet": my_chunk,
                # FIX: index the list with a plain int, not a 0-dim tensor.
                "Competitor Content Snippet": competitor_chunks[best_match_idx.item()],
                "Similarity Score": f"{score:.2f}",
            })
    # FIX: sort on the numeric score, not the formatted string — lexicographic
    # order breaks for negative similarities (e.g. "-0.10" > "0.90" as strings).
    similar_pairs.sort(key=lambda row: float(row["Similarity Score"]), reverse=True)
    similar_pairs_df = pd.DataFrame(similar_pairs[:MAX_TABLE_ROWS])

    # --- Gaps: competitor chunks relevant to the keyword that my page misses ---
    content_gaps = []
    for j, competitor_chunk in enumerate(competitor_chunks):
        relevance = competitor_keyword_scores[j].item()
        if relevance > GAP_RELEVANCE_THRESHOLD:
            my_best_coverage_score, _ = torch.max(similarity_matrix[:, j], dim=0)
            coverage = my_best_coverage_score.item()
            if coverage < GAP_COVERAGE_THRESHOLD:
                content_gaps.append({
                    "Potential Content Gap (from Competitor)": competitor_chunk,
                    "Relevance to Keyword": f"{relevance:.2f}",
                    "Your Max Coverage": f"{coverage:.2f}",
                })
    # FIX: same numeric-sort correction as above.
    content_gaps.sort(key=lambda row: float(row["Relevance to Keyword"]), reverse=True)
    content_gaps_df = pd.DataFrame(content_gaps[:MAX_TABLE_ROWS])

    print("Analysis complete.")
    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df


# --- GRADIO INTERFACE ---
def gradio_interface(keyword, my_url, competitor_url):
    """Wrapper function to format results for the Gradio UI.

    Raises:
        gr.Error: if any input field is empty.
    """
    if not all([keyword, my_url, competitor_url]):
        raise gr.Error("Please fill in all three fields.")

    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)

    # Create a summary report in Markdown.
    report_summary = f"""
## Overall Keyword Alignment
*This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*

- **Your Page Score:** {my_score:.2f}
- **Competitor Page Score:** {comp_score:.2f}
"""
    return report_summary, similarities_df, gaps_df


# Example data to make testing easier.
example_keyword = "benefits of serverless computing"
example_my_url = "https://www.ibm.com/topics/serverless"
example_comp_url = "https://aws.amazon.com/serverless/"

# Build the Gradio app.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")

    with gr.Row():
        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
    with gr.Row():
        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)

    submit_btn = gr.Button("Analyze Content", variant="primary")

    gr.Markdown("---")
    gr.Markdown("## Analysis Report")

    # Outputs
    summary_output = gr.Markdown(label="Alignment Summary")
    with gr.Tab("Content Similarities"):
        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
        similarities_output = gr.DataFrame(label="Similar Content Sections")
    with gr.Tab("Content Gaps"):
        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
        gaps_output = gr.DataFrame(label="Potential Content Gaps")

    submit_btn.click(
        fn=gradio_interface,
        inputs=[keyword_input, my_url_input, competitor_url_input],
        outputs=[summary_output, similarities_output, gaps_output],
    )

# Launch the app with a shareable link.
demo.launch(debug=True, share=True)