# Import libraries
import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import pandas as pd

# --- INITIALIZATION ---

# Load a pre-trained Sentence Transformer model
# 'all-MiniLM-L6-v2' is a good, fast model for semantic similarity.
print("Loading embedding model... (This might take a moment on first run)")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

# --- HELPER FUNCTIONS ---

def get_text_from_url(url):
    """Fetches and extracts clean text from a URL."""
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
            script_or_style.decompose()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        if not text:
            return None
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(f"Failed to fetch content from {url}. Please check the URL and try again.")


def get_chunks(text):
    """Splits text into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len
    )
    return text_splitter.split_text(text)

# --- CORE ANALYSIS LOGIC ---

def run_analysis(keyword, my_url, competitor_url):
    """The main function to perform the analysis and return structured results."""
    
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)

    if not my_content or not competitor_content:
        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")

    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)
    
    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")

    print("Creating embeddings...")
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    my_embeddings = model.encode(my_chunks, convert_to_tensor=True)
    competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True)

    # --- Keyword Alignment ---
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]

    top_k = min(5, len(my_keyword_scores))
    my_alignment_score = np.mean(torch.topk(my_keyword_scores, k=top_k).values.cpu().numpy())
    
    top_k = min(5, len(competitor_keyword_scores))
    competitor_alignment_score = np.mean(torch.topk(competitor_keyword_scores, k=top_k).values.cpu().numpy())
    
    # --- Similarities ---
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)
    similar_pairs = []
    for i in range(len(my_chunks)):
        best_match_score, best_match_idx = torch.max(similarity_matrix[i], dim=0)
        if best_match_score > 0.70:
            similar_pairs.append({
                "Your Content Snippet": my_chunks[i],
                "Competitor Content Snippet": competitor_chunks[best_match_idx],
                "Similarity Score": f"{best_match_score.item():.2f}"
            })
    similar_pairs_df = pd.DataFrame(sorted(similar_pairs, key=lambda x: x['Similarity Score'], reverse=True)[:5])

    # --- Gaps ---
    content_gaps = []
    for i in range(len(competitor_chunks)):
        competitor_keyword_relevance = competitor_keyword_scores[i]
        if competitor_keyword_relevance > 0.5:
            my_best_coverage_score, _ = torch.max(similarity_matrix[:, i], dim=0)
            if my_best_coverage_score < 0.6:
                content_gaps.append({
                    "Potential Content Gap (from Competitor)": competitor_chunks[i],
                    "Relevance to Keyword": f"{competitor_keyword_relevance.item():.2f}",
                    "Your Max Coverage": f"{my_best_coverage_score.item():.2f}"
                })
    content_gaps_df = pd.DataFrame(sorted(content_gaps, key=lambda x: x['Relevance to Keyword'], reverse=True)[:5])
    
    print("Analysis complete.")
    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df

# --- GRADIO INTERFACE ---

def gradio_interface(keyword, my_url, competitor_url):
    """Wrapper function to format results for the Gradio UI."""
    if not all([keyword, my_url, competitor_url]):
        raise gr.Error("Please fill in all three fields.")
        
    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)

    # Create a summary report in Markdown
    report_summary = f"""
    ## Overall Keyword Alignment
    *This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
    - **Your Page Score:** <span style="font-size: 1.5em; color: green;">{my_score:.2f}</span>
    - **Competitor Page Score:** <span style="font-size: 1.5em; color: red;">{comp_score:.2f}</span>
    """
    
    return report_summary, similarities_df, gaps_df


# Example data to make testing easier
example_keyword = "benefits of serverless computing"
example_my_url = "https://www.ibm.com/topics/serverless"
example_comp_url = "https://aws.amazon.com/serverless/"


# Build the Gradio app
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown("Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps.")
    
    with gr.Row():
        keyword_input = gr.Textbox(label="Keyword", placeholder="e.g., benefits of serverless computing", value=example_keyword)
    with gr.Row():
        my_url_input = gr.Textbox(label="Your URL", placeholder="https://your-blog.com/your-article", value=example_my_url)
        competitor_url_input = gr.Textbox(label="Competitor's URL", placeholder="https://competitor.com/their-article", value=example_comp_url)
    
    submit_btn = gr.Button("Analyze Content", variant="primary")
    
    gr.Markdown("---")
    gr.Markdown("## Analysis Report")
    
    # Outputs
    summary_output = gr.Markdown(label="Alignment Summary")
    
    with gr.Tab("Content Similarities"):
        gr.Markdown("### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*")
        similarities_output = gr.DataFrame(label="Similar Content Sections")
        
    with gr.Tab("Content Gaps"):
        gr.Markdown("### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*")
        gaps_output = gr.DataFrame(label="Potential Content Gaps")

    submit_btn.click(
        fn=gradio_interface,
        inputs=[keyword_input, my_url_input, competitor_url_input],
        outputs=[summary_output, similarities_output, gaps_output]
    )

# Launch the app with a shareable link
demo.launch(debug=True, share=True)