vijaykumaredstellar commited on
Commit
cb5e2da
·
verified ·
1 Parent(s): 8feb880

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -519
app.py CHANGED
@@ -3,10 +3,11 @@ import pandas as pd
3
  import numpy as np
4
  from openai import OpenAI
5
  import pickle
6
- import json
7
  from huggingface_hub import hf_hub_download
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  import httpx
 
 
10
 
11
  # ============================================
12
  # CONFIGURATION
@@ -14,24 +15,9 @@ import httpx
14
  HF_DATASET_REPO = "vijaykumaredstellar/edstellar-internal-linking-kb"
15
  EMBEDDING_MODEL = "openai/text-embedding-3-small"
16
  CHAT_MODEL = "deepseek/deepseek-chat"
17
- TOP_K_CANDIDATES = 15
18
- TOP_N_SOURCES = 3
19
 
20
  # ============================================
21
- # GLOBAL STATE FOR DATA PASSING
22
- # ============================================
23
- class SessionState:
24
- def __init__(self):
25
- self.stage1_results = None
26
- self.stage2_results = None
27
- self.current_orphan_url = None
28
- self.current_orphan_title = None
29
- self.current_orphan_keyword = None
30
-
31
- session = SessionState()
32
-
33
- # ============================================
34
- # KNOWLEDGE BASE LOADER
35
  # ============================================
36
  class KnowledgeBase:
37
  def __init__(self):
@@ -42,8 +28,6 @@ class KnowledgeBase:
42
  def load_from_huggingface(self, repo_id, hf_token=None):
43
  """Load knowledge base from Hugging Face"""
44
  try:
45
- print(f"📥 Downloading knowledge base from {repo_id}...")
46
-
47
  kb_path = hf_hub_download(
48
  repo_id=repo_id,
49
  filename='knowledge_base.pkl',
@@ -58,20 +42,18 @@ class KnowledgeBase:
58
  self.embeddings = data['embeddings']
59
  self.loaded = True
60
 
61
- print(f"✅ Loaded {len(self.knowledge_base)} paragraphs")
62
- return True, f"✅ Successfully loaded {len(self.knowledge_base)} searchable paragraphs"
63
 
64
  except Exception as e:
65
- return False, f"❌ Error loading knowledge base: {str(e)}"
66
 
67
- def search(self, query_embedding, top_k=15):
68
  """Find most similar paragraphs"""
69
  if not self.loaded:
70
  return []
71
 
72
  query_embedding = np.array(query_embedding).reshape(1, -1)
73
  similarities = cosine_similarity(query_embedding, self.embeddings)[0]
74
-
75
  top_indices = np.argsort(similarities)[-top_k:][::-1]
76
 
77
  results = []
@@ -88,7 +70,6 @@ class KnowledgeBase:
88
  # ============================================
89
  class OpenRouterClient:
90
  def __init__(self, api_key):
91
- # Create custom HTTP client with headers
92
  http_client = httpx.Client(
93
  headers={
94
  "HTTP-Referer": "https://edstellar.com",
@@ -121,30 +102,50 @@ class OpenRouterClient:
121
  return response.choices[0].message.content
122
 
123
  # ============================================
124
- # STAGE 1: SOURCE PAGE DISCOVERY
125
  # ============================================
126
- class Stage1Discovery:
127
  def __init__(self, kb, client):
128
  self.kb = kb
129
  self.client = client
130
 
131
- def analyze(self, orphan_url, orphan_title, orphan_keyword, orphan_category):
132
- """Find top 15 candidate pages, recommend top 3"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Create search query
135
- search_query = f"{orphan_title} {orphan_keyword} {orphan_category}"
136
 
137
- # Get embedding
138
- query_embedding = self.client.get_embedding(search_query)
 
 
 
 
139
 
140
- # Search knowledge base
141
- candidates = self.kb.search(query_embedding, top_k=TOP_K_CANDIDATES * 3)
 
 
142
 
143
- # Group by URL and calculate scores
144
  url_scores = {}
145
  for item in candidates:
146
  url = item['url']
147
- if url == orphan_url: # Skip self-references
148
  continue
149
 
150
  if url not in url_scores:
@@ -153,528 +154,118 @@ class Stage1Discovery:
153
  'title': item['title'],
154
  'category': item['category'],
155
  'keyword': item['keyword'],
156
- 'similarity_scores': [],
157
- 'opportunities': 0
158
  }
159
 
160
- url_scores[url]['similarity_scores'].append(item['similarity_score'])
161
- url_scores[url]['opportunities'] += 1
 
 
 
162
 
163
- # Calculate final scores
164
- results = []
165
  for url, data in url_scores.items():
166
- avg_similarity = np.mean(data['similarity_scores'])
167
- max_similarity = max(data['similarity_scores'])
168
 
169
- # Scoring formula
170
  score = (
171
- avg_similarity * 0.4 +
172
- max_similarity * 0.3 +
173
- (1 if data['category'] == orphan_category else 0) * 0.2 +
174
- min(data['opportunities'] / 10, 1) * 0.1
175
  )
176
 
177
- results.append({
178
  **data,
179
- 'score': int(score * 100),
180
- 'similarity': int(avg_similarity * 100)
181
  })
182
 
183
- # Sort by score
184
- results.sort(key=lambda x: x['score'], reverse=True)
185
-
186
- return results[:TOP_K_CANDIDATES], results[:TOP_N_SOURCES]
187
-
188
- # ============================================
189
- # STAGE 2: PLACEMENT DISCOVERY
190
- # ============================================
191
- class Stage2Placement:
192
- def __init__(self, kb, client):
193
- self.kb = kb
194
- self.client = client
195
-
196
- def analyze(self, orphan_url, orphan_title, orphan_keyword, selected_sources):
197
- """Find exact placement locations in selected source pages"""
198
 
199
- placements = []
 
200
 
201
- for source in selected_sources:
202
- # Find all paragraphs from this source
203
- source_paragraphs = [
204
- p for p in self.kb.knowledge_base
205
- if p['url'] == source['url']
206
- ]
207
-
208
- if not source_paragraphs:
209
- continue
210
-
211
- # Get embedding for orphan
212
- orphan_embedding = self.client.get_embedding(f"{orphan_title} {orphan_keyword}")
213
- orphan_embedding = np.array(orphan_embedding).reshape(1, -1)
214
-
215
- # Calculate similarity for each paragraph
216
- para_scores = []
217
- for para in source_paragraphs:
218
- para_embedding = np.array(para['embedding']).reshape(1, -1)
219
- similarity = cosine_similarity(orphan_embedding, para_embedding)[0][0]
220
- para_scores.append({
221
- 'paragraph_index': para['paragraph_index'],
222
- 'text': para['text'],
223
- 'score': int(similarity * 100)
224
- })
225
-
226
  # Get best paragraph
227
- best_para = max(para_scores, key=lambda x: x['score'])
228
 
229
- # Use LLM to generate anchor text
230
- prompt = f"""You are an SEO expert. Generate a natural anchor text (2-4 words) to link to this page:
231
 
232
- Target Page: {orphan_title}
233
- Target Keyword: {orphan_keyword}
234
 
235
- Context paragraph where link will be inserted:
236
- {best_para['text'][:300]}...
237
 
238
- Provide ONLY the anchor text, nothing else."""
239
 
240
  anchor_text = self.client.chat([
241
- {"role": "user", "content": prompt}
242
  ]).strip().strip('"').strip("'")
243
 
244
- placements.append({
245
- 'source_url': source['url'],
246
- 'source_title': source['title'],
247
- 'paragraph_index': best_para['paragraph_index'],
248
- 'current_text': best_para['text'],
249
- 'score': best_para['score'],
250
- 'anchor_text': anchor_text
251
- })
252
-
253
- return placements
254
-
255
- # ============================================
256
- # STAGE 3: REPORT GENERATION
257
- # ============================================
258
- class Stage3Report:
259
- def __init__(self, client):
260
- self.client = client
261
-
262
- def generate(self, orphan_url, orphan_title, placements):
263
- """Generate implementation report with HTML code"""
264
-
265
- implementations = []
266
-
267
- for placement in placements:
268
- # Use LLM to create natural sentence modification
269
- prompt = f"""You are an SEO expert. Modify this sentence to naturally include an internal link.
270
 
271
  Current sentence:
272
- {placement['current_text'][:400]}
273
 
274
  Link details:
275
- - Anchor text: "{placement['anchor_text']}"
276
- - Target page: {orphan_title}
277
- - Target URL: {orphan_url}
278
 
279
- Provide the modified sentence with the anchor text naturally integrated. Keep the modification minimal and natural. Provide ONLY the modified sentence, nothing else."""
280
 
281
- modified_text = self.client.chat([
282
- {"role": "user", "content": prompt}
283
  ]).strip()
284
 
285
- # Generate HTML code
286
- html_code = modified_text.replace(
287
- placement['anchor_text'],
288
- f'<a href="{orphan_url}">{placement["anchor_text"]}</a>'
289
- )
290
-
291
- implementations.append({
292
- **placement,
293
- 'modified_text': modified_text,
294
- 'html_code': html_code
295
- })
296
-
297
- return implementations
298
-
299
- # ============================================
300
- # GLOBAL STATE
301
- # ============================================
302
- kb = KnowledgeBase()
303
- stage1 = None
304
- stage2 = None
305
- stage3 = None
306
-
307
- # ============================================
308
- # GRADIO INTERFACE FUNCTIONS
309
- # ============================================
310
- def setup_api_key(api_key):
311
- """Initialize OpenRouter client"""
312
- global stage1, stage2, stage3
313
-
314
- if not api_key or not api_key.strip():
315
- return "❌ Please enter a valid API key"
316
-
317
- try:
318
- client = OpenRouterClient(api_key)
319
- stage1 = Stage1Discovery(kb, client)
320
- stage2 = Stage2Placement(kb, client)
321
- stage3 = Stage3Report(client)
322
- return "✅ API Key configured successfully!"
323
- except Exception as e:
324
- return f"❌ Error: {str(e)}"
325
-
326
- def load_kb(hf_token):
327
- """Load knowledge base from HF"""
328
- token = hf_token.strip() if hf_token else None
329
- success, message = kb.load_from_huggingface(HF_DATASET_REPO, token)
330
- return message
331
-
332
- def run_stage1(orphan_url, orphan_title, orphan_keyword, orphan_category):
333
- """Run Stage 1 analysis"""
334
- if not kb.loaded:
335
- return "❌ Please load the knowledge base first!", None, None
336
-
337
- if not stage1:
338
- return "❌ Please configure your API key first!", None, None
339
-
340
- if not orphan_url or not orphan_title:
341
- return "❌ Please provide at least URL and Title", None, None
342
-
343
- try:
344
- # Store in session
345
- session.current_orphan_url = orphan_url
346
- session.current_orphan_title = orphan_title
347
- session.current_orphan_keyword = orphan_keyword
348
-
349
- all_candidates, top_3 = stage1.analyze(
350
- orphan_url, orphan_title, orphan_keyword, orphan_category
351
- )
352
-
353
- # Store results
354
- session.stage1_results = {
355
- 'all_candidates': all_candidates,
356
- 'top_3': top_3
357
- }
358
-
359
- # Format for display
360
- df_all = pd.DataFrame(all_candidates)[['url', 'title', 'score', 'similarity', 'opportunities']]
361
- df_top3 = pd.DataFrame(top_3)[['url', 'title', 'score']]
362
-
363
- return "✅ Stage 1 complete! Proceed to Stage 2.", df_all, df_top3
364
- except Exception as e:
365
- return f"❌ Error: {str(e)}", None, None
366
-
367
- def run_stage2(orphan_url, orphan_title, orphan_keyword, selected_urls_text):
368
- """Run Stage 2 analysis"""
369
- if not stage2:
370
- return "❌ Please configure your API key first!", None, gr.update(visible=False)
371
-
372
- # Parse selected URLs
373
- selected_urls = [url.strip() for url in selected_urls_text.split('\n') if url.strip()]
374
-
375
- if len(selected_urls) != 3:
376
- return f"❌ Please provide exactly 3 URLs (you provided {len(selected_urls)})", None, gr.update(visible=False)
377
-
378
- # Get source details from KB
379
- selected_sources = []
380
- for url in selected_urls:
381
- matching = [p for p in kb.knowledge_base if p['url'] == url]
382
- if matching:
383
- selected_sources.append({
384
- 'url': url,
385
- 'title': matching[0]['title']
386
  })
387
-
388
- if len(selected_sources) != 3:
389
- return f"❌ Some URLs not found in knowledge base", None, gr.update(visible=False)
390
-
391
- try:
392
- # Update session
393
- session.current_orphan_url = orphan_url
394
- session.current_orphan_title = orphan_title
395
- session.current_orphan_keyword = orphan_keyword
396
 
397
- placements = stage2.analyze(orphan_url, orphan_title, orphan_keyword, selected_sources)
 
398
 
399
- # Store in session for Stage 3
400
- session.stage2_results = placements
401
-
402
- # Format for display
403
  df = pd.DataFrame([{
404
- 'Source URL': p['source_url'],
405
- 'Source Title': p['source_title'],
406
- 'Para #': p['paragraph_index'],
407
- 'Score': p['score'],
408
- 'Anchor Text': p['anchor_text'],
409
- 'Current Text (preview)': p['current_text'][:100] + '...'
410
- } for p in placements])
411
-
412
- return "✅ Stage 2 complete! Click 'Stage 3' tab to generate implementation report.", df, gr.update(visible=True)
413
- except Exception as e:
414
- return f"❌ Error: {str(e)}", None, gr.update(visible=False)
415
-
416
- def run_stage3():
417
- """Run Stage 3 report generation - automatically uses data from Stage 2"""
418
- if not stage3:
419
- return "❌ Please configure your API key first!", "", None, ""
420
-
421
- if not session.stage2_results:
422
- return "❌ Please complete Stage 2 first!", "", None, ""
423
 
424
- try:
425
- # Generate implementations using stored data
426
- implementations = stage3.generate(
427
- session.current_orphan_url,
428
- session.current_orphan_title,
429
- session.stage2_results
430
- )
431
-
432
- # Format summary
433
- avg_score = sum(p['score'] for p in implementations) // len(implementations)
434
- summary_md = f"""
435
- ### 📊 Implementation Summary
436
-
437
- **Orphan Page:** {session.current_orphan_title}
438
- **Target URL:** {session.current_orphan_url}
439
-
440
- **Statistics:**
441
- - ✅ Total links to implement: **{len(implementations)}**
442
- - 📈 Average placement score: **{avg_score}/100**
443
- - 🎯 Anchor text diversity: **Excellent** (all unique)
444
- - 🔗 Total backlinks created: **{len(implementations)} unique inbound links**
445
-
446
- **Next Steps:**
447
- 1. Review the implementation table below
448
- 2. Copy the HTML code snippets
449
- 3. Navigate to each source page in Webflow
450
- 4. Replace the current text with the HTML code
451
- 5. Publish changes
452
- """
453
 
454
- # Format table
455
- df = pd.DataFrame([{
456
- 'Source Page': impl['source_title'][:40],
457
- 'Para #': impl['paragraph_index'],
458
- 'Anchor Text': impl['anchor_text'],
459
- 'Score': impl['score'],
460
- 'Current Text (first 80 chars)': impl['current_text'][:80] + '...',
461
- 'Modified Text (first 80 chars)': impl['modified_text'][:80] + '...'
462
- } for impl in implementations])
463
-
464
- # Format HTML output with detailed instructions
465
- html_sections = []
466
- for i, impl in enumerate(implementations):
467
- html_sections.append(f"""
468
- {'='*80}
469
- LINK {i+1} of {len(implementations)}
470
- {'='*80}
471
-
472
- SOURCE PAGE: {impl['source_title']}
473
- URL: {impl['source_url']}
474
- PARAGRAPH #: {impl['paragraph_index']}
475
- PLACEMENT SCORE: {impl['score']}/100
476
 
477
- ---
478
- CURRENT TEXT (FIND THIS IN WEBFLOW):
479
- ---
480
- {impl['current_text'][:300]}...
481
 
482
- ---
483
- REPLACE WITH THIS HTML CODE:
484
- ---
485
- {impl['html_code']}
486
-
487
- ---
488
- ANCHOR TEXT: "{impl['anchor_text']}"
489
- TARGET URL: {session.current_orphan_url}
490
  ---
491
 
492
- """)
493
 
494
- html_output = "\n".join(html_sections)
495
-
496
- return "✅ Stage 3 complete! Review and implement the suggestions below.", summary_md, df, html_output
497
-
498
- except Exception as e:
499
- return f"❌ Error: {str(e)}", "", None, ""
500
 
501
- # ============================================
502
- # BUILD INTERFACE
503
- # ============================================
504
- with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft()) as app:
505
- gr.Markdown("# 🔗 Edstellar Internal Linking RAG Tool")
506
- gr.Markdown("AI-powered 3-stage analysis to find optimal internal linking opportunities for orphan pages")
507
-
508
- with gr.Tab("⚙️ Setup"):
509
- gr.Markdown("## Step 1: Configure API Access")
510
-
511
- with gr.Row():
512
- api_key_input = gr.Textbox(
513
- label="OpenRouter API Key",
514
- placeholder="sk-or-v1-...",
515
- type="password"
516
- )
517
- api_setup_btn = gr.Button("Configure API Key", variant="primary")
518
-
519
- api_status = gr.Textbox(label="Status", interactive=False)
520
-
521
- gr.Markdown("---")
522
- gr.Markdown("## Step 2: Load Knowledge Base")
523
- gr.Markdown("*This loads your pre-built knowledge base with 523 searchable blog paragraphs*")
524
-
525
- with gr.Row():
526
- hf_token_input = gr.Textbox(
527
- label="Hugging Face Token (optional for private repos)",
528
- placeholder="hf_...",
529
- type="password"
530
- )
531
- kb_load_btn = gr.Button("Load Knowledge Base", variant="primary")
532
-
533
- kb_status = gr.Textbox(label="Status", interactive=False)
534
-
535
- with gr.Tab("📊 Stage 1: Find Source Pages"):
536
- gr.Markdown("## Identify Top 15 Candidates → Select Best 3")
537
- gr.Markdown("Enter your orphan page details to find the best source pages for internal links")
538
-
539
- with gr.Row():
540
- with gr.Column():
541
- s1_orphan_url = gr.Textbox(
542
- label="Orphan Page URL",
543
- placeholder="https://edstellar.com/blog/employee-training-tips"
544
- )
545
- s1_orphan_title = gr.Textbox(
546
- label="Orphan Page Title",
547
- placeholder="Employee Training Tips"
548
- )
549
- s1_orphan_keyword = gr.Textbox(
550
- label="Primary Keyword",
551
- placeholder="employee training"
552
- )
553
- s1_orphan_category = gr.Textbox(
554
- label="Category",
555
- placeholder="Learning & Development"
556
- )
557
- s1_analyze_btn = gr.Button("🔍 Find Source Pages", variant="primary", size="lg")
558
-
559
- with gr.Column():
560
- s1_status = gr.Textbox(label="Status", lines=3)
561
-
562
- gr.Markdown("### 📋 All Candidates (Top 15)")
563
- s1_all_candidates = gr.Dataframe(
564
- label="All Candidates",
565
- interactive=False,
566
- wrap=True
567
- )
568
-
569
- gr.Markdown("### ⭐ Recommended Top 3")
570
- gr.Markdown("*These are automatically selected based on relevance, category match, and linking potential*")
571
- s1_top3 = gr.Dataframe(
572
- label="Top 3 Sources",
573
- interactive=False
574
- )
575
-
576
- with gr.Tab("📍 Stage 2: Find Placements"):
577
- gr.Markdown("## Identify Exact Link Placement Locations")
578
- gr.Markdown("Paste 3 source URLs (from Stage 1) to find optimal paragraph placements")
579
-
580
- with gr.Row():
581
- with gr.Column():
582
- s2_orphan_url = gr.Textbox(
583
- label="Orphan Page URL",
584
- placeholder="(Copy from Stage 1)"
585
- )
586
- s2_orphan_title = gr.Textbox(
587
- label="Orphan Page Title",
588
- placeholder="(Copy from Stage 1)"
589
- )
590
- s2_orphan_keyword = gr.Textbox(
591
- label="Primary Keyword",
592
- placeholder="(Copy from Stage 1)"
593
- )
594
- s2_selected_urls = gr.Textbox(
595
- label="Selected 3 URLs (one per line)",
596
- placeholder="https://edstellar.com/blog/page1\nhttps://edstellar.com/blog/page2\nhttps://edstellar.com/blog/page3",
597
- lines=4
598
- )
599
- s2_analyze_btn = gr.Button("🎯 Find Placements", variant="primary", size="lg")
600
-
601
- with gr.Column():
602
- s2_status = gr.Textbox(label="Status", lines=5)
603
-
604
- s2_placements = gr.Dataframe(
605
- label="Placement Recommendations",
606
- interactive=False,
607
- wrap=True
608
- )
609
-
610
- s2_proceed_notice = gr.Markdown(
611
- "✅ **Data saved!** Click the **Stage 3** tab to generate implementation report.",
612
- visible=False
613
- )
614
-
615
- with gr.Tab("📄 Stage 3: Implementation Report"):
616
- gr.Markdown("## Generate Ready-to-Use HTML Code")
617
- gr.Markdown("Automatically generates implementation guide using results from Stage 2")
618
-
619
- gr.Markdown("### ⚡ Quick Start")
620
- gr.Markdown("Click the button below to generate your implementation report. No manual input needed!")
621
-
622
- s3_generate_btn = gr.Button(
623
- "📋 Generate Implementation Report",
624
- variant="primary",
625
- size="lg"
626
- )
627
-
628
- s3_status = gr.Textbox(label="Status", lines=2)
629
-
630
- s3_summary = gr.Markdown()
631
-
632
- gr.Markdown("### 📊 Implementation Table")
633
- s3_report = gr.Dataframe(
634
- label="Detailed Recommendations",
635
- interactive=False,
636
- wrap=True
637
- )
638
-
639
- gr.Markdown("### 💻 HTML Code Snippets")
640
- gr.Markdown("Copy each section and paste into the corresponding Webflow page")
641
- s3_html_output = gr.Code(
642
- label="Copy-Paste Ready Implementation Guide",
643
- language="html",
644
- lines=20
645
- )
646
-
647
- # Wire up events
648
- api_setup_btn.click(
649
- setup_api_key,
650
- inputs=[api_key_input],
651
- outputs=[api_status]
652
- )
653
-
654
- kb_load_btn.click(
655
- load_kb,
656
- inputs=[hf_token_input],
657
- outputs=[kb_status]
658
- )
659
-
660
- s1_analyze_btn.click(
661
- run_stage1,
662
- inputs=[s1_orphan_url, s1_orphan_title, s1_orphan_keyword, s1_orphan_category],
663
- outputs=[s1_status, s1_all_candidates, s1_top3]
664
- )
665
-
666
- s2_analyze_btn.click(
667
- run_stage2,
668
- inputs=[s2_orphan_url, s2_orphan_title, s2_orphan_keyword, s2_selected_urls],
669
- outputs=[s2_status, s2_placements, s2_proceed_notice]
670
- )
671
-
672
- s3_generate_btn.click(
673
- run_stage3,
674
- inputs=[], # No inputs needed - uses session data
675
- outputs=[s3_status, s3_summary, s3_report, s3_html_output]
676
- )
677
 
678
- # Launch
679
- if __name__ == "__main__":
680
- app.launch()
 
3
  import numpy as np
4
  from openai import OpenAI
5
  import pickle
 
6
  from huggingface_hub import hf_hub_download
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  import httpx
9
+ from bs4 import BeautifulSoup
10
+ import re
11
 
12
  # ============================================
13
  # CONFIGURATION
 
15
  HF_DATASET_REPO = "vijaykumaredstellar/edstellar-internal-linking-kb"
16
  EMBEDDING_MODEL = "openai/text-embedding-3-small"
17
  CHAT_MODEL = "deepseek/deepseek-chat"
 
 
18
 
19
  # ============================================
20
+ # KNOWLEDGE BASE
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # ============================================
22
  class KnowledgeBase:
23
  def __init__(self):
 
28
  def load_from_huggingface(self, repo_id, hf_token=None):
29
  """Load knowledge base from Hugging Face"""
30
  try:
 
 
31
  kb_path = hf_hub_download(
32
  repo_id=repo_id,
33
  filename='knowledge_base.pkl',
 
42
  self.embeddings = data['embeddings']
43
  self.loaded = True
44
 
45
+ return True, f"✅ Loaded {len(self.knowledge_base)} searchable paragraphs from {len(set(p['url'] for p in self.knowledge_base))} blog posts"
 
46
 
47
  except Exception as e:
48
+ return False, f"❌ Error: {str(e)}"
49
 
50
+ def search(self, query_embedding, top_k=50):
51
  """Find most similar paragraphs"""
52
  if not self.loaded:
53
  return []
54
 
55
  query_embedding = np.array(query_embedding).reshape(1, -1)
56
  similarities = cosine_similarity(query_embedding, self.embeddings)[0]
 
57
  top_indices = np.argsort(similarities)[-top_k:][::-1]
58
 
59
  results = []
 
70
  # ============================================
71
  class OpenRouterClient:
72
  def __init__(self, api_key):
 
73
  http_client = httpx.Client(
74
  headers={
75
  "HTTP-Referer": "https://edstellar.com",
 
102
  return response.choices[0].message.content
103
 
104
  # ============================================
105
+ # ORPHAN PAGE ANALYZER
106
  # ============================================
107
+ class OrphanPageAnalyzer:
108
  def __init__(self, kb, client):
109
  self.kb = kb
110
  self.client = client
111
 
112
+ def get_orphan_metadata(self, orphan_url):
113
+ """Extract metadata for orphan page from knowledge base"""
114
+ matches = [p for p in self.kb.knowledge_base if p['url'] == orphan_url]
115
+ if matches:
116
+ return {
117
+ 'title': matches[0]['title'],
118
+ 'keyword': matches[0]['keyword'],
119
+ 'category': matches[0]['category']
120
+ }
121
+ return None
122
+
123
+ def analyze(self, orphan_url, num_sources=3):
124
+ """
125
+ Complete analysis: Find sources, placements, and generate report
126
+ Returns: markdown report with implementation details
127
+ """
128
 
129
+ # Get orphan page metadata
130
+ orphan_meta = self.get_orphan_metadata(orphan_url)
131
 
132
+ if not orphan_meta:
133
+ return "❌ Orphan page not found in knowledge base. Please check the URL.", None
134
+
135
+ orphan_title = orphan_meta['title']
136
+ orphan_keyword = orphan_meta['keyword']
137
+ orphan_category = orphan_meta['category']
138
 
139
+ # Step 1: Find relevant source pages
140
+ search_query = f"{orphan_title} {orphan_keyword} {orphan_category}"
141
+ query_embedding = self.client.get_embedding(search_query)
142
+ candidates = self.kb.search(query_embedding, top_k=50)
143
 
144
+ # Group by URL and score
145
  url_scores = {}
146
  for item in candidates:
147
  url = item['url']
148
+ if url == orphan_url:
149
  continue
150
 
151
  if url not in url_scores:
 
154
  'title': item['title'],
155
  'category': item['category'],
156
  'keyword': item['keyword'],
157
+ 'paragraphs': []
 
158
  }
159
 
160
+ url_scores[url]['paragraphs'].append({
161
+ 'index': item['paragraph_index'],
162
+ 'text': item['text'],
163
+ 'similarity': item['similarity_score']
164
+ })
165
 
166
+ # Rank sources
167
+ ranked_sources = []
168
  for url, data in url_scores.items():
169
+ avg_sim = np.mean([p['similarity'] for p in data['paragraphs']])
170
+ max_sim = max([p['similarity'] for p in data['paragraphs']])
171
 
 
172
  score = (
173
+ avg_sim * 0.4 +
174
+ max_sim * 0.4 +
175
+ (1 if data['category'] == orphan_category else 0) * 0.2
 
176
  )
177
 
178
+ ranked_sources.append({
179
  **data,
180
+ 'score': score
 
181
  })
182
 
183
+ ranked_sources.sort(key=lambda x: x['score'], reverse=True)
184
+ top_sources = ranked_sources[:num_sources]
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # Step 2: Find best placements and generate modifications
187
+ results = []
188
 
189
+ for source in top_sources:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  # Get best paragraph
191
+ best_para = max(source['paragraphs'], key=lambda x: x['similarity'])
192
 
193
+ # Generate anchor text using LLM
194
+ anchor_prompt = f"""Generate a natural 2-4 word anchor text to link to this page:
195
 
196
+ Target: {orphan_title}
197
+ Keyword: {orphan_keyword}
198
 
199
+ Context: {best_para['text'][:200]}...
 
200
 
201
+ Provide ONLY the anchor text."""
202
 
203
  anchor_text = self.client.chat([
204
+ {"role": "user", "content": anchor_prompt}
205
  ]).strip().strip('"').strip("'")
206
 
207
+ # Generate modified sentence using LLM
208
+ modify_prompt = f"""Modify this sentence to naturally include an internal link.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  Current sentence:
211
+ {best_para['text']}
212
 
213
  Link details:
214
+ - Anchor text: "{anchor_text}"
215
+ - Target: {orphan_title}
 
216
 
217
+ Provide ONLY the modified sentence with the anchor text naturally integrated."""
218
 
219
+ new_sentence = self.client.chat([
220
+ {"role": "user", "content": modify_prompt}
221
  ]).strip()
222
 
223
+ results.append({
224
+ 'source_url': source['url'],
225
+ 'source_title': source['title'],
226
+ 'score': int(source['score'] * 100),
227
+ 'paragraph_index': best_para['index'],
228
+ 'current_sentence': best_para['text'],
229
+ 'new_sentence': new_sentence,
230
+ 'anchor_text': anchor_text,
231
+ 'target_url': orphan_url
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  })
 
 
 
 
 
 
 
 
 
233
 
234
+ # Generate report
235
+ report = self.generate_report(orphan_url, orphan_title, results)
236
 
237
+ # Generate table
 
 
 
238
  df = pd.DataFrame([{
239
+ 'Source Page': r['source_title'][:50],
240
+ 'Paragraph #': r['paragraph_index'],
241
+ 'Score': r['score'],
242
+ 'Anchor Text': r['anchor_text'],
243
+ 'Current Sentence': r['current_sentence'][:100] + '...',
244
+ 'New Sentence': r['new_sentence'][:100] + '...'
245
+ } for r in results])
246
+
247
+ return report, df
 
 
 
 
 
 
 
 
 
 
248
 
249
+ def generate_report(self, orphan_url, orphan_title, results):
250
+ """Generate markdown report"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
+ report = f"""# 🔗 Internal Linking Report
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
+ **Orphan Page:** {orphan_title}
255
+ **Target URL:** `{orphan_url}`
256
+ **Links Found:** {len(results)}
 
257
 
 
 
 
 
 
 
 
 
258
  ---
259
 
260
+ """
261
 
262
+ for i, result in enumerate(results, 1):
263
+ report += f"""
264
+ ## Link {i}: {result['source_title']}
 
 
 
265
 
266
+ **Source URL:** `{result['source_url']}`
267
+ **Paragraph #:** {result['paragraph_index']}
268
+ **Relevance Score:** {result['score']}/100
269
+ **Anchor Text:** "{result['anchor_text']}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+ ### Current Sentence: