ryanshelley committed on
Commit
71ad5c3
·
verified ·
1 Parent(s): c5fd059

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -0
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Import libraries
2
+ import gradio as gr
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import numpy as np
8
+ import torch
9
+ import pandas as pd
10
+
11
+ # --- INITIALIZATION ---
12
+
13
# Sentence Transformer checkpoint used for every embedding in this app.
# 'all-MiniLM-L6-v2' is a good, fast model for semantic similarity.
_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# The model is loaded once at import time and shared by all requests.
print("Loading embedding model... (This might take a moment on first run)")
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
print("Model loaded successfully.")
18
+
19
+ # --- HELPER FUNCTIONS ---
20
+
21
def get_text_from_url(url):
    """Fetch a web page and return its text content, one phrase per line.

    Sends a GET request with a browser-like User-Agent and a 15-second
    timeout, parses the HTML with BeautifulSoup's ``html.parser``, removes
    ``script``, ``style``, ``header``, ``footer``, ``nav`` and ``aside``
    elements, and joins the remaining non-empty phrases with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned text, or ``None`` when no text is left after cleaning.

    Raises:
        gr.Error: If the request fails or the server answers with an HTTP
            error status.
    """
    # Only the network call can raise RequestException, so keep the try
    # body limited to it.
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        raise gr.Error(
            f"Failed to fetch content from {url}. Please check the URL and try again."
        ) from e

    soup = BeautifulSoup(response.text, 'html.parser')
    # Drop non-content elements before extracting text.
    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
        element.decompose()

    stripped_lines = (line.strip() for line in soup.get_text().splitlines())
    # Break a line only on double spaces so ordinary sentences stay on one
    # line; splitting on a single space would put every word on its own line.
    phrases = (
        phrase.strip()
        for line in stripped_lines
        for phrase in line.split("  ")
    )
    text = '\n'.join(phrase for phrase in phrases if phrase)

    # Callers treat None as "page had no usable text".
    return text or None
39
+
40
+
41
def get_chunks(text):
    """Split ``text`` into overlapping chunks and return them as a list.

    Uses a ``RecursiveCharacterTextSplitter`` measuring length with ``len``,
    with a chunk size of 500 and an overlap of 50 between chunks.
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 50,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_text(text)
    return pieces
49
+
50
+ # --- CORE ANALYSIS LOGIC ---
51
+
52
# Thresholds and limits used by run_analysis.
_SIMILARITY_THRESHOLD = 0.70   # minimum chunk-to-chunk score to report a pair
_GAP_RELEVANCE_THRESHOLD = 0.5  # competitor chunk must be this close to the keyword
_GAP_COVERAGE_THRESHOLD = 0.6   # ...while our best matching chunk stays below this
_TOP_K = 5                      # chunks averaged for alignment / rows per table

_SIMILARITY_COLUMNS = [
    "Your Content Snippet",
    "Competitor Content Snippet",
    "Similarity Score",
]
_GAP_COLUMNS = [
    "Potential Content Gap (from Competitor)",
    "Relevance to Keyword",
    "Your Max Coverage",
]


def _mean_top_scores(scores, k=_TOP_K):
    """Return the mean of the ``k`` highest values of a 1-D score tensor."""
    top_values = torch.topk(scores, k=min(k, len(scores))).values
    return np.mean(top_values.cpu().numpy())


def _top_rows_frame(scored_rows, columns, limit=_TOP_K):
    """Build a DataFrame from the ``limit`` highest-scoring rows.

    ``scored_rows`` is a list of ``(numeric_score, row_dict)`` pairs. Sorting
    uses the numeric score rather than the formatted string shown in the
    table. Passing ``columns`` keeps the headers even when there are no rows.
    """
    ranked = sorted(scored_rows, key=lambda pair: pair[0], reverse=True)[:limit]
    return pd.DataFrame([row for _, row in ranked], columns=columns)


def run_analysis(keyword, my_url, competitor_url):
    """Compare two pages against a keyword using sentence embeddings.

    Downloads both pages, splits each into chunks, embeds the keyword and
    all chunks with the module-level ``model``, and derives:

    * an alignment score per page: the mean cosine similarity between the
      keyword and that page's (up to) five best-matching chunks;
    * similar pairs: for each of your chunks, its best competitor match,
      kept when the cosine similarity exceeds 0.70;
    * content gaps: competitor chunks whose keyword similarity exceeds 0.5
      while none of your chunks reaches a similarity of 0.6 with them.

    Args:
        keyword: Target keyword or phrase.
        my_url: URL of your page.
        competitor_url: URL of the competitor's page.

    Returns:
        A tuple ``(my_alignment_score, competitor_alignment_score,
        similar_pairs_df, content_gaps_df)``. Each DataFrame holds at most
        five rows, ordered by descending score, with scores formatted to two
        decimals.

    Raises:
        gr.Error: If either page yields no text or no chunks (fetch failures
            are raised by ``get_text_from_url``).
    """
    print("Fetching and chunking content...")
    my_content = get_text_from_url(my_url)
    competitor_content = get_text_from_url(competitor_url)

    if not my_content or not competitor_content:
        raise gr.Error("Could not retrieve enough text content from one or both URLs. The pages might be heavily JavaScript-based or have very little text.")

    my_chunks = get_chunks(my_content)
    competitor_chunks = get_chunks(competitor_content)

    if not my_chunks or not competitor_chunks:
        raise gr.Error("Could not chunk the content. Pages might be too short.")

    print("Creating embeddings...")
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    my_embeddings = model.encode(my_chunks, convert_to_tensor=True)
    competitor_embeddings = model.encode(competitor_chunks, convert_to_tensor=True)

    # --- Keyword Alignment ---
    my_keyword_scores = util.pytorch_cos_sim(keyword_embedding, my_embeddings)[0]
    competitor_keyword_scores = util.pytorch_cos_sim(keyword_embedding, competitor_embeddings)[0]

    my_alignment_score = _mean_top_scores(my_keyword_scores)
    competitor_alignment_score = _mean_top_scores(competitor_keyword_scores)

    # Rows index your chunks, columns index competitor chunks.
    similarity_matrix = util.pytorch_cos_sim(my_embeddings, competitor_embeddings)

    # --- Similarities ---
    # Best competitor match for every one of your chunks, in a single call.
    best_scores, best_indices = torch.max(similarity_matrix, dim=1)
    similar_pairs = []
    for my_chunk, score, match_idx in zip(
        my_chunks, best_scores.tolist(), best_indices.tolist()
    ):
        if score > _SIMILARITY_THRESHOLD:
            similar_pairs.append((score, {
                "Your Content Snippet": my_chunk,
                "Competitor Content Snippet": competitor_chunks[match_idx],
                "Similarity Score": f"{score:.2f}",
            }))
    similar_pairs_df = _top_rows_frame(similar_pairs, _SIMILARITY_COLUMNS)

    # --- Gaps ---
    # Highest similarity any of your chunks reaches for each competitor chunk.
    coverage_scores = torch.max(similarity_matrix, dim=0).values
    content_gaps = []
    for competitor_chunk, relevance, coverage in zip(
        competitor_chunks, competitor_keyword_scores.tolist(), coverage_scores.tolist()
    ):
        if relevance > _GAP_RELEVANCE_THRESHOLD and coverage < _GAP_COVERAGE_THRESHOLD:
            content_gaps.append((relevance, {
                "Potential Content Gap (from Competitor)": competitor_chunk,
                "Relevance to Keyword": f"{relevance:.2f}",
                "Your Max Coverage": f"{coverage:.2f}",
            }))
    content_gaps_df = _top_rows_frame(content_gaps, _GAP_COLUMNS)

    print("Analysis complete.")
    return my_alignment_score, competitor_alignment_score, similar_pairs_df, content_gaps_df
112
+
113
+ # --- GRADIO INTERFACE ---
114
+
115
def gradio_interface(keyword, my_url, competitor_url):
    """Validate the form inputs, run the analysis and format it for the UI.

    Args:
        keyword: Target keyword entered by the user.
        my_url: URL of the user's page.
        competitor_url: URL of the competitor's page.

    Returns:
        A tuple ``(report_summary, similarities_df, gaps_df)``: a Markdown
        summary of both alignment scores plus the two DataFrames returned by
        ``run_analysis``.

    Raises:
        gr.Error: If any field is empty or contains only whitespace.
    """
    # Trim surrounding whitespace so blank-looking fields are rejected and
    # pasted URLs with stray spaces still work.
    keyword, my_url, competitor_url = (
        (value or "").strip() for value in (keyword, my_url, competitor_url)
    )
    if not (keyword and my_url and competitor_url):
        raise gr.Error("Please fill in all three fields.")

    my_score, comp_score, similarities_df, gaps_df = run_analysis(keyword, my_url, competitor_url)

    # Create a summary report in Markdown. The text is kept flush-left so it
    # can never be interpreted as an indented Markdown code block.
    report_summary = f"""
## Overall Keyword Alignment
*This score (0 to 1) shows how semantically aligned the page is to your keyword. Higher is better.*
- **Your Page Score:** <span style="font-size: 1.5em; color: green;">{my_score:.2f}</span>
- **Competitor Page Score:** <span style="font-size: 1.5em; color: red;">{comp_score:.2f}</span>
"""

    return report_summary, similarities_df, gaps_df
131
+
132
+
133
# Default values pre-filled in the form so the app can be tried immediately.
example_keyword, example_my_url, example_comp_url = (
    "benefits of serverless computing",
    "https://www.ibm.com/topics/serverless",
    "https://aws.amazon.com/serverless/",
)

# Static copy shown in the UI, hoisted out of the layout for readability.
_INTRO_TEXT = "Enter a keyword and two URLs to compare how well their content aligns with the keyword, find similarities, and identify content gaps."
_SIMILARITIES_HELP = "### Where Your Content is Similar\n*These are the top content chunks from both pages that are most semantically similar.*"
_GAPS_HELP = "### Content Gaps on Your Page\n*These are topics the competitor covers that are relevant to the keyword, but your page seems to be missing.*"

# Assemble the Gradio app: inputs, trigger button, then the report area.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simple Content Analysis with Vector Embeddings")
    gr.Markdown(_INTRO_TEXT)

    # Input form: the keyword on its own row, both URLs side by side.
    with gr.Row():
        keyword_input = gr.Textbox(
            label="Keyword",
            placeholder="e.g., benefits of serverless computing",
            value=example_keyword,
        )
    with gr.Row():
        my_url_input = gr.Textbox(
            label="Your URL",
            placeholder="https://your-blog.com/your-article",
            value=example_my_url,
        )
        competitor_url_input = gr.Textbox(
            label="Competitor's URL",
            placeholder="https://competitor.com/their-article",
            value=example_comp_url,
        )

    submit_btn = gr.Button("Analyze Content", variant="primary")

    gr.Markdown("---")
    gr.Markdown("## Analysis Report")

    # Output components, in the order gradio_interface returns its values.
    summary_output = gr.Markdown(label="Alignment Summary")

    with gr.Tab("Content Similarities"):
        gr.Markdown(_SIMILARITIES_HELP)
        similarities_output = gr.DataFrame(label="Similar Content Sections")

    with gr.Tab("Content Gaps"):
        gr.Markdown(_GAPS_HELP)
        gaps_output = gr.DataFrame(label="Potential Content Gaps")

    # Wire the button to the analysis wrapper.
    form_inputs = [keyword_input, my_url_input, competitor_url_input]
    report_outputs = [summary_output, similarities_output, gaps_output]
    submit_btn.click(fn=gradio_interface, inputs=form_inputs, outputs=report_outputs)

# Start the server in debug mode and request a shareable link.
demo.launch(debug=True, share=True)