# Import libraries
import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import pandas as pd

# --- INITIALIZATION ---

# Load a pre-trained Sentence Transformer model
# 'all-MiniLM-L6-v2' is a good, fast model for semantic similarity.
print("Loading embedding model... (This might take a moment on first run)")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

# --- HELPER FUNCTIONS ---

def get_text_from_url(url):
    """Fetches and extracts clean text from a URL."""
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            script_or_style.decompose()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        if not text:
            return None
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")


def get_chunks(text):
    """Splits text into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len
    )
    return text_splitter.split_text(text)

# --- CORE ANALYSIS LOGIC ---

def run_analysis(keyword, my_url, competitor_url):
    """The main function to perform the analysis and return structured results."""
    
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)

    if not my_content or not competitor_content:
        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")

    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)
    
    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")

    print("Creating embeddings...")
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    my_embeddings = model.encode(my_chunks, convert_to_tensor=True)
    competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True)

    # --- Keyword Alignment ---
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]

    top_k = min(5, len(my_keyword_scores))
    my_alignment_score = np.mean(torch.topk(my_keyword_scores, k=top_k).values.cpu().numpy())
    
    top_k = min(5, len(competitor_keyword_scores))
    competitor_alignment_score = np.mean(torch.topk(competitor_keyword_scores, k=top_k).values.cpu().numpy())
    
    # --- Similarities ---
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
    similar_pairs = []
    for i in range(len(my_chunks)):
        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
        if best_match_score > 0.70:
            similar_pairs.append({
                "Your Content Snippet": my_chunks[i],
                "Competitor Content Snippet": competitor_chunks[best_match_idx],
                "Similarity Score": f"{best_match_score.item():.2f}"
            })
    similar_pairs_df = pd.DataFrame(sorted(similar_pairs, key=lambda x: x['Similarity Score'], reverse=True)[:5])

    # --- Gaps ---
    content_gaps = []
    for i in range(len(competitor_chunks)):
        competitor_keyword_relevance = competitor_keyword_scores[i]
        if competitor_keyword_relevance > 0.5:
            my_best_coverage_score, _ = torch.max(similarity_matrix[:, i], dim=0)
            if my_best_coverage_score < 0.6:
                content_gaps.append({
                    "Potential Content Gap (from Competitor)": competitor_chunks[i],
                    "Relevance to Keyword": f"{competitor_keyword_relevance.item():.2f}",
                    "Your Max Coverage": f"{my_best_coverage_score.item():.2f}"
                })
    content_gaps_df = pd.DataFrame(sorted(content_gaps, key=lambda x: x['Relevance to Keyword'], reverse=True)[:5])
    
    print("Analysis complete.")
    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df

# --- GRADIO INTERFACE ---

def gradio_interface(keyword, my_url, competitor_url):
    """Wrapper function to format results for the Gradio UI."""
    if not all([keyword, my_url, competitor_url]):
        raise gr.Error("Please fill in all three fields.")
        
    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)

    # Create a summary report in Markdown
    report_summary = f"""
    ## Overall Keyword Alignment
    *This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
    - **Your Page Score:** <span style="font-size: 1.5em; color: green;">{my_score:.2f}</span>
    - **Competitor Page Score:** <span style="font-size: 1.5em; color: red;">{comp_score:.2f}</span>
    """
    
    return report_summary, similarities_df, gaps_df


# Example data to make testing easier
example_keyword = "benefits of serverless computing"
example_my_url = "https://www.ibm.com/topics/serverless"
example_comp_url = "https://aws.amazon.com/serverless/"


# Build the Gradio app
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")
    
    with gr.Row():
        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
    with gr.Row():
        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)
    
    submit_btn = gr.Button("Analyze Content", variant="primary")
    
    gr.Markdown("---")
    gr.Markdown("## Analysis Report")
    
    # Outputs
    summary_output = gr.Markdown(label="Alignment Summary")
    
    with gr.Tab("Content Similarities"):
        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
        similarities_output = gr.DataFrame(label="Similar Content Sections")
        
    with gr.Tab("Content Gaps"):
        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
        gaps_output = gr.DataFrame(label="Potential Content Gaps")

    submit_btn.click(
        fn=gradio_interface,
        inputs=[keyword_input, my_url_input, competitor_url_input],
        outputs=[summary_output, similarities_output, gaps_output]
    )

# Launch the app with a shareable link
demo.launch(debug=True, share=True)