Spaces:

ryanshelley
/

Simple_Content_Analysis_with_Vector_Embeddings

Build error

App Files Files Community

ryanshelley committed on Jul 10, 2025

Commit

71ad5c3

verified ·

1 Parent(s): c5fd059

Create app.py

Browse files

Files changed (1) hide show

app.py +173 -0

app.py ADDED Viewed

	@@ -0,0 +1,173 @@

+#Import libraries
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer, util
+import numpy as np
+import torch
+import pandas as pd
+# --- INITIALIZATION ---
+# Load a pre-trained Sentence Transformer model
+# 'all-MiniLM-L6-v2' is a good, fast model for semantic similarity.
+# The model is created once, at module import, and kept in the module-level
+# name `model`; run_analysis() reads that global for every encode() call.
+print("Loading embedding model... (This might take a moment on first run)")
+model = SentenceTransformer('all-MiniLM-L6-v2')
+print("Model loaded successfully.")
+# --- HELPER FUNCTIONS ---
+def get_text_from_url(url):
+    """Fetches and extracts clean text from a URL."""
+    try:
+        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
+            script_or_style.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = '\n'.join(chunk for chunk in chunks if chunk)
+        if not text:
+            return None
+        return text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")
+def get_chunks(text):
+    """Splits text into smaller chunks."""
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+        length_function=len
+    )
+    return text_splitter.split_text(text)
+# --- CORE ANALYSIS LOGIC ---
+def run_analysis(keyword, my_url, competitor_url):
+    """The main function to perform the analysis and return structured results."""
+    print("Fetching and chunking content...")
+    my_content = get_text_from_url(my_url)
+    competitor_content = get_text_from_url(competitor_url)
+    if not my_content or not competitor_content:
+        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")
+    my_chunks = get_chunks(my_content)
+    competitor_chunks = get_chunks(competitor_content)
+    if not my_chunks or not competitor_chunks:
+        raise gr.Error("Could not chunk the content. Pages might be too short.")
+    print("Creating embeddings...")
+    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
+    my_embeddings = model.encode(my_chunks, convert_to_tensor=True)
+    competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True)
+    # --- Keyword Alignment ---
+    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
+    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]
+    top_k = min(5, len(my_keyword_scores))
+    my_alignment_score = np.mean(torch.topk(my_keyword_scores, k=top_k).values.cpu().numpy())
+    top_k = min(5, len(competitor_keyword_scores))
+    competitor_alignment_score = np.mean(torch.topk(competitor_keyword_scores, k=top_k).values.cpu().numpy())
+    # --- Similarities ---
+    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
+    similar_pairs = []
+    for i in range(len(my_chunks)):
+        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
+        if best_match_score > 0.70:
+            similar_pairs.append({
+                "Your Content Snippet": my_chunks[i],
+                "Competitor Content Snippet": competitor_chunks[best_match_idx],
+                "Similarity Score": f"{best_match_score.item():.2f}"
+            })
+    similar_pairs_df = pd.DataFrame(sorted(similar_pairs, key=lambda x: x['Similarity Score'], reverse=True)[:5])
+    # --- Gaps ---
+    content_gaps = []
+    for i in range(len(competitor_chunks)):
+        competitor_keyword_relevance = competitor_keyword_scores[i]
+        if competitor_keyword_relevance > 0.5:
+            my_best_coverage_score, _ = torch.max(similarity_matrix[:, i], dim=0)
+            if my_best_coverage_score < 0.6:
+                content_gaps.append({
+                    "Potential Content Gap (from Competitor)": competitor_chunks[i],
+                    "Relevance to Keyword": f"{competitor_keyword_relevance.item():.2f}",
+                    "Your Max Coverage": f"{my_best_coverage_score.item():.2f}"
+                })
+    content_gaps_df = pd.DataFrame(sorted(content_gaps, key=lambda x: x['Relevance to Keyword'], reverse=True)[:5])
+    print("Analysis complete.")
+    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df
+# --- GRADIO INTERFACE ---
+def gradio_interface(keyword, my_url, competitor_url):
+    """Wrapper function to format results for the Gradio UI."""
+    if not all([keyword, my_url, competitor_url]):
+        raise gr.Error("Please fill in all three fields.")
+    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)
+    # Create a summary report in Markdown
+    report_summary = f"""
+    ## Overall Keyword Alignment
+    *This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
+    - **Your Page Score:** <span style="font-size: 1.5em; color: green;">{my_score:.2f}</span>
+    - **Competitor Page Score:** <span style="font-size: 1.5em; color: red;">{comp_score:.2f}</span>
+    """
+    return report_summary, similarities_df, gaps_df
+# Example data to make testing easier
+# These values pre-fill the three input textboxes below.
+example_keyword = "benefits of serverless computing"
+example_my_url = "https://www.ibm.com/topics/serverless"
+example_comp_url = "https://aws.amazon.com/serverless/"
+# Build the Gradio app
+# Components are created in display order inside the Blocks context:
+# title, inputs, submit button, then the report area with two result tabs.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
+    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")
+    # Inputs: the keyword on its own row, both URLs side by side on the next.
+    with gr.Row():
+        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
+    with gr.Row():
+        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
+        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)
+    submit_btn = gr.Button("Analyze Content", variant="primary")
+    gr.Markdown("---")
+    gr.Markdown("## Analysis Report")
+    # Outputs
+    # summary_output receives the Markdown report; each tab holds one table.
+    summary_output = gr.Markdown(label="Alignment Summary")
+    with gr.Tab("Content Similarities"):
+        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
+        similarities_output = gr.DataFrame(label="Similar Content Sections")
+    with gr.Tab("Content Gaps"):
+        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
+        gaps_output = gr.DataFrame(label="Potential Content Gaps")
+    # The three values returned by gradio_interface map, in order, onto the
+    # three output components listed here.
+    submit_btn.click(
+        fn=gradio_interface,
+        inputs=[keyword_input, my_url_input, competitor_url_input],
+        outputs=[summary_output, similarities_output, gaps_output]
+    )
+# Launch the app with a shareable link
+# NOTE(review): this runs at import time (no __main__ guard). share=True is
+# presumably unnecessary when hosted on Hugging Face Spaces — confirm against
+# the Gradio launch() documentation before changing.
+demo.launch(debug=True, share=True)