vijaykumaredstellar committed on
Commit
35c63dd
·
verified ·
1 Parent(s): 98f6c84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -70
app.py CHANGED
@@ -26,11 +26,13 @@ class KnowledgeBase:
26
  def load_from_huggingface(self, repo_id, hf_token=None):
27
  """Load knowledge base from Hugging Face"""
28
  try:
 
 
29
  kb_path = hf_hub_download(
30
  repo_id=repo_id,
31
  filename='knowledge_base.pkl',
32
  repo_type='dataset',
33
- token=hf_token if hf_token else None
34
  )
35
 
36
  with open(kb_path, 'rb') as f:
@@ -108,41 +110,31 @@ class OrphanPageAnalyzer:
108
  self.kb = kb
109
  self.client = client
110
 
111
- def get_orphan_metadata(self, orphan_url):
112
- """Extract metadata for orphan page from knowledge base"""
113
- matches = [p for p in self.kb.knowledge_base if p['url'] == orphan_url]
114
- if matches:
115
- return {
116
- 'title': matches[0]['title'],
117
- 'keyword': matches[0]['keyword'],
118
- 'category': matches[0]['category']
119
- }
120
- return None
121
-
122
- def analyze(self, orphan_url, num_sources=3):
123
  """
124
- Complete analysis: Find sources, placements, and generate report
 
125
  """
126
 
127
- # Get orphan page metadata
128
- orphan_meta = self.get_orphan_metadata(orphan_url)
129
-
130
- if not orphan_meta:
131
- return "❌ Orphan page not found in knowledge base. Please check the URL.", None
132
 
133
- orphan_title = orphan_meta['title']
134
- orphan_keyword = orphan_meta['keyword']
135
- orphan_category = orphan_meta['category']
136
 
137
- # Step 1: Find relevant source pages
138
- search_query = f"{orphan_title} {orphan_keyword} {orphan_category}"
139
  query_embedding = self.client.get_embedding(search_query)
 
 
140
  candidates = self.kb.search(query_embedding, top_k=50)
141
 
142
- # Group by URL and score
 
 
143
  url_scores = {}
144
  for item in candidates:
145
  url = item['url']
 
 
146
  if url == orphan_url:
147
  continue
148
 
@@ -161,17 +153,15 @@ class OrphanPageAnalyzer:
161
  'similarity': item['similarity_score']
162
  })
163
 
164
- # Rank sources
 
 
165
  ranked_sources = []
166
  for url, data in url_scores.items():
167
  avg_sim = np.mean([p['similarity'] for p in data['paragraphs']])
168
  max_sim = max([p['similarity'] for p in data['paragraphs']])
169
 
170
- score = (
171
- avg_sim * 0.4 +
172
- max_sim * 0.4 +
173
- (1 if data['category'] == orphan_category else 0) * 0.2
174
- )
175
 
176
  ranked_sources.append({
177
  **data,
@@ -181,36 +171,42 @@ class OrphanPageAnalyzer:
181
  ranked_sources.sort(key=lambda x: x['score'], reverse=True)
182
  top_sources = ranked_sources[:num_sources]
183
 
184
- # Step 2: Find best placements and generate modifications
 
 
185
  results = []
186
 
187
- for source in top_sources:
188
- # Get best paragraph
 
 
189
  best_para = max(source['paragraphs'], key=lambda x: x['similarity'])
190
 
191
- # Generate anchor text using LLM
192
  anchor_prompt = f"""Generate a natural 2-4 word anchor text to link to this page:
193
 
194
- Target: {orphan_title}
195
- Keyword: {orphan_keyword}
196
 
197
- Context: {best_para['text'][:200]}...
 
198
 
199
- Provide ONLY the anchor text."""
200
 
201
  anchor_text = self.client.chat([
202
  {"role": "user", "content": anchor_prompt}
203
  ]).strip().strip('"').strip("'")
204
 
205
- # Generate modified sentence using LLM
206
  modify_prompt = f"""Modify this sentence to naturally include an internal link.
207
 
208
  Current sentence:
209
  {best_para['text']}
210
 
211
- Link details:
212
  - Anchor text: "{anchor_text}"
213
- - Target: {orphan_title}
 
214
 
215
  Provide ONLY the modified sentence with the anchor text naturally integrated."""
216
 
@@ -250,7 +246,7 @@ Provide ONLY the modified sentence with the anchor text naturally integrated."""
250
  report = f"# πŸ”— Internal Linking Report\n\n"
251
  report += f"**Orphan Page:** {orphan_title}\n"
252
  report += f"**Target URL:** `{orphan_url}`\n"
253
- report += f"**Links Found:** {len(results)}\n\n"
254
  report += "---\n\n"
255
 
256
  for i, result in enumerate(results, 1):
@@ -295,21 +291,17 @@ def setup(api_key, hf_token):
295
  """Setup API and load knowledge base"""
296
  global analyzer
297
 
298
- status = []
299
-
300
- # Setup API
301
  if not api_key or not api_key.strip():
302
  return "❌ Please enter your OpenRouter API key", None
303
 
304
  try:
305
  client = OpenRouterClient(api_key)
306
- status.append("βœ… API key configured")
307
  except Exception as e:
308
  return f"❌ API Error: {str(e)}", None
309
 
310
  # Load knowledge base
311
- token = hf_token.strip() if hf_token else None
312
- success, message = kb.load_from_huggingface(HF_DATASET_REPO, token)
313
 
314
  if not success:
315
  return f"βœ… API key configured\n{message}", None
@@ -322,7 +314,7 @@ def setup(api_key, hf_token):
322
 
323
  return "\n".join(status), None
324
 
325
- def analyze_orphan(orphan_url, num_sources):
326
  """Analyze orphan page and generate report"""
327
 
328
  if not analyzer:
@@ -331,11 +323,21 @@ def analyze_orphan(orphan_url, num_sources):
331
  if not orphan_url or not orphan_url.strip():
332
  return "❌ Please enter an orphan page URL", None
333
 
 
 
 
334
  try:
335
- report, table = analyzer.analyze(orphan_url, num_sources)
 
 
 
 
 
336
  return report, table
337
  except Exception as e:
338
- return f"❌ Error: {str(e)}", None
 
 
339
 
340
  # ============================================
341
  # INTERFACE
@@ -343,11 +345,11 @@ def analyze_orphan(orphan_url, num_sources):
343
  with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft()) as app:
344
 
345
  gr.Markdown("# πŸ”— Edstellar Internal Linking Tool")
346
- gr.Markdown("Enter an orphan page URL to get instant internal linking recommendations")
347
 
348
  # Setup Section
349
- with gr.Accordion("βš™οΈ Setup (Click to expand - Do this once)", open=True):
350
- gr.Markdown("### Step 1: Configure API Keys")
351
 
352
  with gr.Row():
353
  api_key = gr.Textbox(
@@ -357,7 +359,7 @@ with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft())
357
  scale=2
358
  )
359
  hf_token = gr.Textbox(
360
- label="Hugging Face Token (optional)",
361
  placeholder="hf_...",
362
  type="password",
363
  scale=2
@@ -370,21 +372,35 @@ with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft())
370
 
371
  # Analysis Section
372
  gr.Markdown("### πŸ“Š Analyze Orphan Page")
 
373
 
374
  with gr.Row():
375
- orphan_url_input = gr.Textbox(
376
- label="Orphan Page URL",
377
- placeholder="https://edstellar.com/blog/your-orphan-page",
378
- scale=3
379
- )
380
- num_sources_input = gr.Slider(
381
- label="Number of Sources",
382
- minimum=3,
383
- maximum=5,
384
- value=3,
385
- step=1,
386
- scale=1
387
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
388
 
389
  analyze_btn = gr.Button("πŸ” Analyze & Generate Report", variant="primary", size="lg")
390
 
@@ -411,7 +427,7 @@ with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft())
411
 
412
  analyze_btn.click(
413
  analyze_orphan,
414
- inputs=[orphan_url_input, num_sources_input],
415
  outputs=[report_output, table_output]
416
  )
417
 
 
26
  def load_from_huggingface(self, repo_id, hf_token=None):
27
  """Load knowledge base from Hugging Face"""
28
  try:
29
+ token = hf_token.strip() if hf_token and hf_token.strip() else None
30
+
31
  kb_path = hf_hub_download(
32
  repo_id=repo_id,
33
  filename='knowledge_base.pkl',
34
  repo_type='dataset',
35
+ token=token
36
  )
37
 
38
  with open(kb_path, 'rb') as f:
 
110
  self.kb = kb
111
  self.client = client
112
 
113
+ def analyze(self, orphan_url, orphan_title, orphan_keyword, num_sources=3):
 
 
 
 
 
 
 
 
 
 
 
114
  """
115
+ Find pages in knowledge base that should link TO the orphan page
116
+ Orphan page does NOT need to be in the knowledge base
117
  """
118
 
119
+ # Create search query from orphan page info
120
+ search_query = f"{orphan_title} {orphan_keyword}"
 
 
 
121
 
122
+ print(f"πŸ” Searching for pages related to: {search_query}")
 
 
123
 
124
+ # Get embedding for the orphan page topic
 
125
  query_embedding = self.client.get_embedding(search_query)
126
+
127
+ # Search knowledge base for relevant paragraphs
128
  candidates = self.kb.search(query_embedding, top_k=50)
129
 
130
+ print(f"πŸ“Š Found {len(candidates)} candidate paragraphs")
131
+
132
+ # Group by URL (to find source pages)
133
  url_scores = {}
134
  for item in candidates:
135
  url = item['url']
136
+
137
+ # Skip if somehow the orphan URL is in KB
138
  if url == orphan_url:
139
  continue
140
 
 
153
  'similarity': item['similarity_score']
154
  })
155
 
156
+ print(f"πŸ“„ Found {len(url_scores)} unique source pages")
157
+
158
+ # Rank source pages
159
  ranked_sources = []
160
  for url, data in url_scores.items():
161
  avg_sim = np.mean([p['similarity'] for p in data['paragraphs']])
162
  max_sim = max([p['similarity'] for p in data['paragraphs']])
163
 
164
+ score = (avg_sim * 0.5 + max_sim * 0.5)
 
 
 
 
165
 
166
  ranked_sources.append({
167
  **data,
 
171
  ranked_sources.sort(key=lambda x: x['score'], reverse=True)
172
  top_sources = ranked_sources[:num_sources]
173
 
174
+ print(f"⭐ Selected top {len(top_sources)} sources")
175
+
176
+ # Generate linking recommendations for each source
177
  results = []
178
 
179
+ for idx, source in enumerate(top_sources, 1):
180
+ print(f"πŸ”— Processing source {idx}/{len(top_sources)}: {source['title']}")
181
+
182
+ # Get best paragraph in this source
183
  best_para = max(source['paragraphs'], key=lambda x: x['similarity'])
184
 
185
+ # Generate anchor text
186
  anchor_prompt = f"""Generate a natural 2-4 word anchor text to link to this page:
187
 
188
+ Target Page Title: {orphan_title}
189
+ Target Keyword: {orphan_keyword}
190
 
191
+ Context where link will be placed:
192
+ {best_para['text'][:200]}...
193
 
194
+ Provide ONLY the anchor text, no quotes or explanation."""
195
 
196
  anchor_text = self.client.chat([
197
  {"role": "user", "content": anchor_prompt}
198
  ]).strip().strip('"').strip("'")
199
 
200
+ # Generate modified sentence
201
  modify_prompt = f"""Modify this sentence to naturally include an internal link.
202
 
203
  Current sentence:
204
  {best_para['text']}
205
 
206
+ Add this internal link:
207
  - Anchor text: "{anchor_text}"
208
+ - Target page: {orphan_title}
209
+ - Target URL: {orphan_url}
210
 
211
  Provide ONLY the modified sentence with the anchor text naturally integrated."""
212
 
 
246
  report = f"# πŸ”— Internal Linking Report\n\n"
247
  report += f"**Orphan Page:** {orphan_title}\n"
248
  report += f"**Target URL:** `{orphan_url}`\n"
249
+ report += f"**Links Generated:** {len(results)}\n\n"
250
  report += "---\n\n"
251
 
252
  for i, result in enumerate(results, 1):
 
291
  """Setup API and load knowledge base"""
292
  global analyzer
293
 
 
 
 
294
  if not api_key or not api_key.strip():
295
  return "❌ Please enter your OpenRouter API key", None
296
 
297
  try:
298
  client = OpenRouterClient(api_key)
299
+ status = ["βœ… API key configured"]
300
  except Exception as e:
301
  return f"❌ API Error: {str(e)}", None
302
 
303
  # Load knowledge base
304
+ success, message = kb.load_from_huggingface(HF_DATASET_REPO, hf_token)
 
305
 
306
  if not success:
307
  return f"βœ… API key configured\n{message}", None
 
314
 
315
  return "\n".join(status), None
316
 
317
+ def analyze_orphan(orphan_url, orphan_title, orphan_keyword, num_sources):
318
  """Analyze orphan page and generate report"""
319
 
320
  if not analyzer:
 
323
  if not orphan_url or not orphan_url.strip():
324
  return "❌ Please enter an orphan page URL", None
325
 
326
+ if not orphan_title or not orphan_title.strip():
327
+ return "❌ Please enter the orphan page title", None
328
+
329
  try:
330
+ report, table = analyzer.analyze(
331
+ orphan_url.strip(),
332
+ orphan_title.strip(),
333
+ orphan_keyword.strip() if orphan_keyword else orphan_title.strip(),
334
+ num_sources
335
+ )
336
  return report, table
337
  except Exception as e:
338
+ import traceback
339
+ error_detail = traceback.format_exc()
340
+ return f"❌ Error: {str(e)}\n\nDetails:\n{error_detail}", None
341
 
342
  # ============================================
343
  # INTERFACE
 
345
  with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft()) as app:
346
 
347
  gr.Markdown("# πŸ”— Edstellar Internal Linking Tool")
348
+ gr.Markdown("Find the best existing blog posts to link to your orphan page")
349
 
350
  # Setup Section
351
+ with gr.Accordion("βš™οΈ Setup (Do this once)", open=True):
352
+ gr.Markdown("### Configure API Keys")
353
 
354
  with gr.Row():
355
  api_key = gr.Textbox(
 
359
  scale=2
360
  )
361
  hf_token = gr.Textbox(
362
+ label="Hugging Face Token",
363
  placeholder="hf_...",
364
  type="password",
365
  scale=2
 
372
 
373
  # Analysis Section
374
  gr.Markdown("### πŸ“Š Analyze Orphan Page")
375
+ gr.Markdown("Enter details about the orphan page you want to get links FOR")
376
 
377
  with gr.Row():
378
+ with gr.Column(scale=3):
379
+ orphan_url_input = gr.Textbox(
380
+ label="Orphan Page URL",
381
+ placeholder="https://edstellar.com/blog/your-orphan-page",
382
+ info="The page that needs backlinks"
383
+ )
384
+ orphan_title_input = gr.Textbox(
385
+ label="Orphan Page Title",
386
+ placeholder="Business Development Manager Roles",
387
+ info="The title/topic of your orphan page"
388
+ )
389
+ orphan_keyword_input = gr.Textbox(
390
+ label="Primary Keyword (Optional)",
391
+ placeholder="business development",
392
+ info="Main keyword for anchor text generation"
393
+ )
394
+
395
+ with gr.Column(scale=1):
396
+ num_sources_input = gr.Slider(
397
+ label="Number of Sources",
398
+ minimum=3,
399
+ maximum=5,
400
+ value=3,
401
+ step=1,
402
+ info="How many source pages to find"
403
+ )
404
 
405
  analyze_btn = gr.Button("πŸ” Analyze & Generate Report", variant="primary", size="lg")
406
 
 
427
 
428
  analyze_btn.click(
429
  analyze_orphan,
430
+ inputs=[orphan_url_input, orphan_title_input, orphan_keyword_input, num_sources_input],
431
  outputs=[report_output, table_output]
432
  )
433