Spaces:

edstellar
/

internallinksuggestor

Sleeping

App Files Community

vijaykumaredstellar committed on Dec 24, 2025

Commit

cb5e2da

verified ·

1 Parent(s): 8feb880

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -519

app.py CHANGED Viewed

@@ -3,10 +3,11 @@ import pandas as pd
 import numpy as np
 from openai import OpenAI
 import pickle
-import json
 from huggingface_hub import hf_hub_download
 from sklearn.metrics.pairwise import cosine_similarity
 import httpx
 # ============================================
 # CONFIGURATION
@@ -14,24 +15,9 @@ import httpx
 HF_DATASET_REPO = "vijaykumaredstellar/edstellar-internal-linking-kb"
 EMBEDDING_MODEL = "openai/text-embedding-3-small"
 CHAT_MODEL = "deepseek/deepseek-chat"
-TOP_K_CANDIDATES = 15
-TOP_N_SOURCES = 3
 # ============================================
-# GLOBAL STATE FOR DATA PASSING
-# ============================================
-class SessionState:
-    def __init__(self):
-        self.stage1_results = None
-        self.stage2_results = None
-        self.current_orphan_url = None
-        self.current_orphan_title = None
-        self.current_orphan_keyword = None
-session = SessionState()
-# ============================================
-# KNOWLEDGE BASE LOADER
 # ============================================
 class KnowledgeBase:
     def __init__(self):
@@ -42,8 +28,6 @@ class KnowledgeBase:
     def load_from_huggingface(self, repo_id, hf_token=None):
         """Load knowledge base from Hugging Face"""
         try:
-            print(f"📥 Downloading knowledge base from {repo_id}...")
             kb_path = hf_hub_download(
                 repo_id=repo_id,
                 filename='knowledge_base.pkl',
@@ -58,20 +42,18 @@ class KnowledgeBase:
             self.embeddings = data['embeddings']
             self.loaded = True
-            print(f"✅ Loaded {len(self.knowledge_base)} paragraphs")
-            return True, f"✅ Successfully loaded {len(self.knowledge_base)} searchable paragraphs"
         except Exception as e:
-            return False, f"❌ Error loading knowledge base: {str(e)}"
-    def search(self, query_embedding, top_k=15):
         """Find most similar paragraphs"""
         if not self.loaded:
             return []
         query_embedding = np.array(query_embedding).reshape(1, -1)
         similarities = cosine_similarity(query_embedding, self.embeddings)[0]
         top_indices = np.argsort(similarities)[-top_k:][::-1]
         results = []
@@ -88,7 +70,6 @@ class KnowledgeBase:
 # ============================================
 class OpenRouterClient:
     def __init__(self, api_key):
-        # Create custom HTTP client with headers
         http_client = httpx.Client(
             headers={
                 "HTTP-Referer": "https://edstellar.com",
@@ -121,30 +102,50 @@ class OpenRouterClient:
         return response.choices[0].message.content
 # ============================================
-# STAGE 1: SOURCE PAGE DISCOVERY
 # ============================================
-class Stage1Discovery:
     def __init__(self, kb, client):
         self.kb = kb
         self.client = client
-    def analyze(self, orphan_url, orphan_title, orphan_keyword, orphan_category):
-        """Find top 15 candidate pages, recommend top 3"""
-        # Create search query
-        search_query = f"{orphan_title} {orphan_keyword} {orphan_category}"
-        # Get embedding
-        query_embedding = self.client.get_embedding(search_query)
-        # Search knowledge base
-        candidates = self.kb.search(query_embedding, top_k=TOP_K_CANDIDATES * 3)
-        # Group by URL and calculate scores
         url_scores = {}
         for item in candidates:
             url = item['url']
-            if url == orphan_url:  # Skip self-references
                 continue
             if url not in url_scores:
@@ -153,528 +154,118 @@ class Stage1Discovery:
                     'title': item['title'],
                     'category': item['category'],
                     'keyword': item['keyword'],
-                    'similarity_scores': [],
-                    'opportunities': 0
                 }
-            url_scores[url]['similarity_scores'].append(item['similarity_score'])
-            url_scores[url]['opportunities'] += 1
-        # Calculate final scores
-        results = []
         for url, data in url_scores.items():
-            avg_similarity = np.mean(data['similarity_scores'])
-            max_similarity = max(data['similarity_scores'])
-            # Scoring formula
             score = (
-                avg_similarity * 0.4 +
-                max_similarity * 0.3 +
-                (1 if data['category'] == orphan_category else 0) * 0.2 +
-                min(data['opportunities'] / 10, 1) * 0.1
             )
-            results.append({
                 **data,
-                'score': int(score * 100),
-                'similarity': int(avg_similarity * 100)
             })
-        # Sort by score
-        results.sort(key=lambda x: x['score'], reverse=True)
-        return results[:TOP_K_CANDIDATES], results[:TOP_N_SOURCES]
-# ============================================
-# STAGE 2: PLACEMENT DISCOVERY
-# ============================================
-class Stage2Placement:
-    def __init__(self, kb, client):
-        self.kb = kb
-        self.client = client
-    def analyze(self, orphan_url, orphan_title, orphan_keyword, selected_sources):
-        """Find exact placement locations in selected source pages"""
-        placements = []
-        for source in selected_sources:
-            # Find all paragraphs from this source
-            source_paragraphs = [
-                p for p in self.kb.knowledge_base
-                if p['url'] == source['url']
-            ]
-            if not source_paragraphs:
-                continue
-            # Get embedding for orphan
-            orphan_embedding = self.client.get_embedding(f"{orphan_title} {orphan_keyword}")
-            orphan_embedding = np.array(orphan_embedding).reshape(1, -1)
-            # Calculate similarity for each paragraph
-            para_scores = []
-            for para in source_paragraphs:
-                para_embedding = np.array(para['embedding']).reshape(1, -1)
-                similarity = cosine_similarity(orphan_embedding, para_embedding)[0][0]
-                para_scores.append({
-                    'paragraph_index': para['paragraph_index'],
-                    'text': para['text'],
-                    'score': int(similarity * 100)
-                })
             # Get best paragraph
-            best_para = max(para_scores, key=lambda x: x['score'])
-            # Use LLM to generate anchor text
-            prompt = f"""You are an SEO expert. Generate a natural anchor text (2-4 words) to link to this page:
-Target Page: {orphan_title}
-Target Keyword: {orphan_keyword}
-Context paragraph where link will be inserted:
-{best_para['text'][:300]}...
-Provide ONLY the anchor text, nothing else."""
             anchor_text = self.client.chat([
-                {"role": "user", "content": prompt}
             ]).strip().strip('"').strip("'")
-            placements.append({
-                'source_url': source['url'],
-                'source_title': source['title'],
-                'paragraph_index': best_para['paragraph_index'],
-                'current_text': best_para['text'],
-                'score': best_para['score'],
-                'anchor_text': anchor_text
-            })
-        return placements
-# ============================================
-# STAGE 3: REPORT GENERATION
-# ============================================
-class Stage3Report:
-    def __init__(self, client):
-        self.client = client
-    def generate(self, orphan_url, orphan_title, placements):
-        """Generate implementation report with HTML code"""
-        implementations = []
-        for placement in placements:
-            # Use LLM to create natural sentence modification
-            prompt = f"""You are an SEO expert. Modify this sentence to naturally include an internal link.
 Current sentence:
-{placement['current_text'][:400]}
 Link details:
-- Anchor text: "{placement['anchor_text']}"
-- Target page: {orphan_title}
-- Target URL: {orphan_url}
-Provide the modified sentence with the anchor text naturally integrated. Keep the modification minimal and natural. Provide ONLY the modified sentence, nothing else."""
-            modified_text = self.client.chat([
-                {"role": "user", "content": prompt}
             ]).strip()
-            # Generate HTML code
-            html_code = modified_text.replace(
-                placement['anchor_text'],
-                f'<a href="{orphan_url}">{placement["anchor_text"]}</a>'
-            )
-            implementations.append({
-                **placement,
-                'modified_text': modified_text,
-                'html_code': html_code
-            })
-        return implementations
-# ============================================
-# GLOBAL STATE
-# ============================================
-kb = KnowledgeBase()
-stage1 = None
-stage2 = None
-stage3 = None
-# ============================================
-# GRADIO INTERFACE FUNCTIONS
-# ============================================
-def setup_api_key(api_key):
-    """Initialize OpenRouter client"""
-    global stage1, stage2, stage3
-    if not api_key or not api_key.strip():
-        return "❌ Please enter a valid API key"
-    try:
-        client = OpenRouterClient(api_key)
-        stage1 = Stage1Discovery(kb, client)
-        stage2 = Stage2Placement(kb, client)
-        stage3 = Stage3Report(client)
-        return "✅ API Key configured successfully!"
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
-def load_kb(hf_token):
-    """Load knowledge base from HF"""
-    token = hf_token.strip() if hf_token else None
-    success, message = kb.load_from_huggingface(HF_DATASET_REPO, token)
-    return message
-def run_stage1(orphan_url, orphan_title, orphan_keyword, orphan_category):
-    """Run Stage 1 analysis"""
-    if not kb.loaded:
-        return "❌ Please load the knowledge base first!", None, None
-    if not stage1:
-        return "❌ Please configure your API key first!", None, None
-    if not orphan_url or not orphan_title:
-        return "❌ Please provide at least URL and Title", None, None
-    try:
-        # Store in session
-        session.current_orphan_url = orphan_url
-        session.current_orphan_title = orphan_title
-        session.current_orphan_keyword = orphan_keyword
-        all_candidates, top_3 = stage1.analyze(
-            orphan_url, orphan_title, orphan_keyword, orphan_category
-        )
-        # Store results
-        session.stage1_results = {
-            'all_candidates': all_candidates,
-            'top_3': top_3
-        }
-        # Format for display
-        df_all = pd.DataFrame(all_candidates)[['url', 'title', 'score', 'similarity', 'opportunities']]
-        df_top3 = pd.DataFrame(top_3)[['url', 'title', 'score']]
-        return "✅ Stage 1 complete! Proceed to Stage 2.", df_all, df_top3
-    except Exception as e:
-        return f"❌ Error: {str(e)}", None, None
-def run_stage2(orphan_url, orphan_title, orphan_keyword, selected_urls_text):
-    """Run Stage 2 analysis"""
-    if not stage2:
-        return "❌ Please configure your API key first!", None, gr.update(visible=False)
-    # Parse selected URLs
-    selected_urls = [url.strip() for url in selected_urls_text.split('\n') if url.strip()]
-    if len(selected_urls) != 3:
-        return f"❌ Please provide exactly 3 URLs (you provided {len(selected_urls)})", None, gr.update(visible=False)
-    # Get source details from KB
-    selected_sources = []
-    for url in selected_urls:
-        matching = [p for p in kb.knowledge_base if p['url'] == url]
-        if matching:
-            selected_sources.append({
-                'url': url,
-                'title': matching[0]['title']
             })
-    if len(selected_sources) != 3:
-        return f"❌ Some URLs not found in knowledge base", None, gr.update(visible=False)
-    try:
-        # Update session
-        session.current_orphan_url = orphan_url
-        session.current_orphan_title = orphan_title
-        session.current_orphan_keyword = orphan_keyword
-        placements = stage2.analyze(orphan_url, orphan_title, orphan_keyword, selected_sources)
-        # Store in session for Stage 3
-        session.stage2_results = placements
-        # Format for display
         df = pd.DataFrame([{
-            'Source URL': p['source_url'],
-            'Source Title': p['source_title'],
-            'Para #': p['paragraph_index'],
-            'Score': p['score'],
-            'Anchor Text': p['anchor_text'],
-            'Current Text (preview)': p['current_text'][:100] + '...'
-        } for p in placements])
-        return "✅ Stage 2 complete! Click 'Stage 3' tab to generate implementation report.", df, gr.update(visible=True)
-    except Exception as e:
-        return f"❌ Error: {str(e)}", None, gr.update(visible=False)
-def run_stage3():
-    """Run Stage 3 report generation - automatically uses data from Stage 2"""
-    if not stage3:
-        return "❌ Please configure your API key first!", "", None, ""
-    if not session.stage2_results:
-        return "❌ Please complete Stage 2 first!", "", None, ""
-    try:
-        # Generate implementations using stored data
-        implementations = stage3.generate(
-            session.current_orphan_url,
-            session.current_orphan_title,
-            session.stage2_results
-        )
-        # Format summary
-        avg_score = sum(p['score'] for p in implementations) // len(implementations)
-        summary_md = f"""
-### 📊 Implementation Summary
-**Orphan Page:** {session.current_orphan_title}
-**Target URL:** {session.current_orphan_url}
-**Statistics:**
-- ✅ Total links to implement: **{len(implementations)}**
-- 📈 Average placement score: **{avg_score}/100**
-- 🎯 Anchor text diversity: **Excellent** (all unique)
-- 🔗 Total backlinks created: **{len(implementations)} unique inbound links**
-**Next Steps:**
-1. Review the implementation table below
-2. Copy the HTML code snippets
-3. Navigate to each source page in Webflow
-4. Replace the current text with the HTML code
-5. Publish changes
-        """
-        # Format table
-        df = pd.DataFrame([{
-            'Source Page': impl['source_title'][:40],
-            'Para #': impl['paragraph_index'],
-            'Anchor Text': impl['anchor_text'],
-            'Score': impl['score'],
-            'Current Text (first 80 chars)': impl['current_text'][:80] + '...',
-            'Modified Text (first 80 chars)': impl['modified_text'][:80] + '...'
-        } for impl in implementations])
-        # Format HTML output with detailed instructions
-        html_sections = []
-        for i, impl in enumerate(implementations):
-            html_sections.append(f"""
-{'='*80}
-LINK {i+1} of {len(implementations)}
-{'='*80}
-SOURCE PAGE: {impl['source_title']}
-URL: {impl['source_url']}
-PARAGRAPH #: {impl['paragraph_index']}
-PLACEMENT SCORE: {impl['score']}/100
----
-CURRENT TEXT (FIND THIS IN WEBFLOW):
----
-{impl['current_text'][:300]}...
----
-REPLACE WITH THIS HTML CODE:
----
-{impl['html_code']}
----
-ANCHOR TEXT: "{impl['anchor_text']}"
-TARGET URL: {session.current_orphan_url}
 ---
-""")
-        html_output = "\n".join(html_sections)
-        return "✅ Stage 3 complete! Review and implement the suggestions below.", summary_md, df, html_output
-    except Exception as e:
-        return f"❌ Error: {str(e)}", "", None, ""
-# ============================================
-# BUILD INTERFACE
-# ============================================
-with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft()) as app:
-    gr.Markdown("# 🔗 Edstellar Internal Linking RAG Tool")
-    gr.Markdown("AI-powered 3-stage analysis to find optimal internal linking opportunities for orphan pages")
-    with gr.Tab("⚙️ Setup"):
-        gr.Markdown("## Step 1: Configure API Access")
-        with gr.Row():
-            api_key_input = gr.Textbox(
-                label="OpenRouter API Key",
-                placeholder="sk-or-v1-...",
-                type="password"
-            )
-            api_setup_btn = gr.Button("Configure API Key", variant="primary")
-        api_status = gr.Textbox(label="Status", interactive=False)
-        gr.Markdown("---")
-        gr.Markdown("## Step 2: Load Knowledge Base")
-        gr.Markdown("*This loads your pre-built knowledge base with 523 searchable blog paragraphs*")
-        with gr.Row():
-            hf_token_input = gr.Textbox(
-                label="Hugging Face Token (optional for private repos)",
-                placeholder="hf_...",
-                type="password"
-            )
-            kb_load_btn = gr.Button("Load Knowledge Base", variant="primary")
-        kb_status = gr.Textbox(label="Status", interactive=False)
-    with gr.Tab("📊 Stage 1: Find Source Pages"):
-        gr.Markdown("## Identify Top 15 Candidates → Select Best 3")
-        gr.Markdown("Enter your orphan page details to find the best source pages for internal links")
-        with gr.Row():
-            with gr.Column():
-                s1_orphan_url = gr.Textbox(
-                    label="Orphan Page URL",
-                    placeholder="https://edstellar.com/blog/employee-training-tips"
-                )
-                s1_orphan_title = gr.Textbox(
-                    label="Orphan Page Title",
-                    placeholder="Employee Training Tips"
-                )
-                s1_orphan_keyword = gr.Textbox(
-                    label="Primary Keyword",
-                    placeholder="employee training"
-                )
-                s1_orphan_category = gr.Textbox(
-                    label="Category",
-                    placeholder="Learning & Development"
-                )
-                s1_analyze_btn = gr.Button("🔍 Find Source Pages", variant="primary", size="lg")
-            with gr.Column():
-                s1_status = gr.Textbox(label="Status", lines=3)
-        gr.Markdown("### 📋 All Candidates (Top 15)")
-        s1_all_candidates = gr.Dataframe(
-            label="All Candidates",
-            interactive=False,
-            wrap=True
-        )
-        gr.Markdown("### ⭐ Recommended Top 3")
-        gr.Markdown("*These are automatically selected based on relevance, category match, and linking potential*")
-        s1_top3 = gr.Dataframe(
-            label="Top 3 Sources",
-            interactive=False
-        )
-    with gr.Tab("📍 Stage 2: Find Placements"):
-        gr.Markdown("## Identify Exact Link Placement Locations")
-        gr.Markdown("Paste 3 source URLs (from Stage 1) to find optimal paragraph placements")
-        with gr.Row():
-            with gr.Column():
-                s2_orphan_url = gr.Textbox(
-                    label="Orphan Page URL",
-                    placeholder="(Copy from Stage 1)"
-                )
-                s2_orphan_title = gr.Textbox(
-                    label="Orphan Page Title",
-                    placeholder="(Copy from Stage 1)"
-                )
-                s2_orphan_keyword = gr.Textbox(
-                    label="Primary Keyword",
-                    placeholder="(Copy from Stage 1)"
-                )
-                s2_selected_urls = gr.Textbox(
-                    label="Selected 3 URLs (one per line)",
-                    placeholder="https://edstellar.com/blog/page1\nhttps://edstellar.com/blog/page2\nhttps://edstellar.com/blog/page3",
-                    lines=4
-                )
-                s2_analyze_btn = gr.Button("🎯 Find Placements", variant="primary", size="lg")
-            with gr.Column():
-                s2_status = gr.Textbox(label="Status", lines=5)
-        s2_placements = gr.Dataframe(
-            label="Placement Recommendations",
-            interactive=False,
-            wrap=True
-        )
-        s2_proceed_notice = gr.Markdown(
-            "✅ **Data saved!** Click the **Stage 3** tab to generate implementation report.",
-            visible=False
-        )
-    with gr.Tab("📄 Stage 3: Implementation Report"):
-        gr.Markdown("## Generate Ready-to-Use HTML Code")
-        gr.Markdown("Automatically generates implementation guide using results from Stage 2")
-        gr.Markdown("### ⚡ Quick Start")
-        gr.Markdown("Click the button below to generate your implementation report. No manual input needed!")
-        s3_generate_btn = gr.Button(
-            "📋 Generate Implementation Report",
-            variant="primary",
-            size="lg"
-        )
-        s3_status = gr.Textbox(label="Status", lines=2)
-        s3_summary = gr.Markdown()
-        gr.Markdown("### 📊 Implementation Table")
-        s3_report = gr.Dataframe(
-            label="Detailed Recommendations",
-            interactive=False,
-            wrap=True
-        )
-        gr.Markdown("### 💻 HTML Code Snippets")
-        gr.Markdown("Copy each section and paste into the corresponding Webflow page")
-        s3_html_output = gr.Code(
-            label="Copy-Paste Ready Implementation Guide",
-            language="html",
-            lines=20
-        )
-    # Wire up events
-    api_setup_btn.click(
-        setup_api_key,
-        inputs=[api_key_input],
-        outputs=[api_status]
-    )
-    kb_load_btn.click(
-        load_kb,
-        inputs=[hf_token_input],
-        outputs=[kb_status]
-    )
-    s1_analyze_btn.click(
-        run_stage1,
-        inputs=[s1_orphan_url, s1_orphan_title, s1_orphan_keyword, s1_orphan_category],
-        outputs=[s1_status, s1_all_candidates, s1_top3]
-    )
-    s2_analyze_btn.click(
-        run_stage2,
-        inputs=[s2_orphan_url, s2_orphan_title, s2_orphan_keyword, s2_selected_urls],
-        outputs=[s2_status, s2_placements, s2_proceed_notice]
-    )
-    s3_generate_btn.click(
-        run_stage3,
-        inputs=[],  # No inputs needed - uses session data
-        outputs=[s3_status, s3_summary, s3_report, s3_html_output]
-    )
-# Launch
-if __name__ == "__main__":
-    app.launch()

 import numpy as np
 from openai import OpenAI
 import pickle
 from huggingface_hub import hf_hub_download
 from sklearn.metrics.pairwise import cosine_similarity
 import httpx
+from bs4 import BeautifulSoup
+import re
 # ============================================
 # CONFIGURATION
 HF_DATASET_REPO = "vijaykumaredstellar/edstellar-internal-linking-kb"
 EMBEDDING_MODEL = "openai/text-embedding-3-small"
 CHAT_MODEL = "deepseek/deepseek-chat"
 # ============================================
+# KNOWLEDGE BASE
 # ============================================
 class KnowledgeBase:
     def __init__(self):
     def load_from_huggingface(self, repo_id, hf_token=None):
         """Load knowledge base from Hugging Face"""
         try:
             kb_path = hf_hub_download(
                 repo_id=repo_id,
                 filename='knowledge_base.pkl',
             self.embeddings = data['embeddings']
             self.loaded = True
+            return True, f"✅ Loaded {len(self.knowledge_base)} searchable paragraphs from {len(set(p['url'] for p in self.knowledge_base))} blog posts"
         except Exception as e:
+            return False, f"❌ Error: {str(e)}"
+    def search(self, query_embedding, top_k=50):
         """Find most similar paragraphs"""
         if not self.loaded:
             return []
         query_embedding = np.array(query_embedding).reshape(1, -1)
         similarities = cosine_similarity(query_embedding, self.embeddings)[0]
         top_indices = np.argsort(similarities)[-top_k:][::-1]
         results = []
 # ============================================
 class OpenRouterClient:
     def __init__(self, api_key):
         http_client = httpx.Client(
             headers={
                 "HTTP-Referer": "https://edstellar.com",
         return response.choices[0].message.content
 # ============================================
+# ORPHAN PAGE ANALYZER
 # ============================================
+class OrphanPageAnalyzer:
     def __init__(self, kb, client):
         self.kb = kb
         self.client = client
+    def get_orphan_metadata(self, orphan_url):
+        """Extract metadata for orphan page from knowledge base"""
+        matches = [p for p in self.kb.knowledge_base if p['url'] == orphan_url]
+        if matches:
+            return {
+                'title': matches[0]['title'],
+                'keyword': matches[0]['keyword'],
+                'category': matches[0]['category']
+            }
+        return None
+    def analyze(self, orphan_url, num_sources=3):
+        """
+        Complete analysis: Find sources, placements, and generate report
+        Returns: markdown report with implementation details
+        """
+        # Get orphan page metadata
+        orphan_meta = self.get_orphan_metadata(orphan_url)
+        if not orphan_meta:
+            return "❌ Orphan page not found in knowledge base. Please check the URL.", None
+        orphan_title = orphan_meta['title']
+        orphan_keyword = orphan_meta['keyword']
+        orphan_category = orphan_meta['category']
+        # Step 1: Find relevant source pages
+        search_query = f"{orphan_title} {orphan_keyword} {orphan_category}"
+        query_embedding = self.client.get_embedding(search_query)
+        candidates = self.kb.search(query_embedding, top_k=50)
+        # Group by URL and score
         url_scores = {}
         for item in candidates:
             url = item['url']
+            if url == orphan_url:
                 continue
             if url not in url_scores:
                     'title': item['title'],
                     'category': item['category'],
                     'keyword': item['keyword'],
+                    'paragraphs': []
                 }
+            url_scores[url]['paragraphs'].append({
+                'index': item['paragraph_index'],
+                'text': item['text'],
+                'similarity': item['similarity_score']
+            })
+        # Rank sources
+        ranked_sources = []
         for url, data in url_scores.items():
+            avg_sim = np.mean([p['similarity'] for p in data['paragraphs']])
+            max_sim = max([p['similarity'] for p in data['paragraphs']])
             score = (
+                avg_sim * 0.4 +
+                max_sim * 0.4 +
+                (1 if data['category'] == orphan_category else 0) * 0.2
             )
+            ranked_sources.append({
                 **data,
+                'score': score
             })
+        ranked_sources.sort(key=lambda x: x['score'], reverse=True)
+        top_sources = ranked_sources[:num_sources]
+        # Step 2: Find best placements and generate modifications
+        results = []
+        for source in top_sources:
             # Get best paragraph
+            best_para = max(source['paragraphs'], key=lambda x: x['similarity'])
+            # Generate anchor text using LLM
+            anchor_prompt = f"""Generate a natural 2-4 word anchor text to link to this page:
+Target: {orphan_title}
+Keyword: {orphan_keyword}
+Context: {best_para['text'][:200]}...
+Provide ONLY the anchor text."""
             anchor_text = self.client.chat([
+                {"role": "user", "content": anchor_prompt}
             ]).strip().strip('"').strip("'")
+            # Generate modified sentence using LLM
+            modify_prompt = f"""Modify this sentence to naturally include an internal link.
 Current sentence:
+{best_para['text']}
 Link details:
+- Anchor text: "{anchor_text}"
+- Target: {orphan_title}
+Provide ONLY the modified sentence with the anchor text naturally integrated."""
+            new_sentence = self.client.chat([
+                {"role": "user", "content": modify_prompt}
             ]).strip()
+            results.append({
+                'source_url': source['url'],
+                'source_title': source['title'],
+                'score': int(source['score'] * 100),
+                'paragraph_index': best_para['index'],
+                'current_sentence': best_para['text'],
+                'new_sentence': new_sentence,
+                'anchor_text': anchor_text,
+                'target_url': orphan_url
             })
+        # Generate report
+        report = self.generate_report(orphan_url, orphan_title, results)
+        # Generate table
         df = pd.DataFrame([{
+            'Source Page': r['source_title'][:50],
+            'Paragraph #': r['paragraph_index'],
+            'Score': r['score'],
+            'Anchor Text': r['anchor_text'],
+            'Current Sentence': r['current_sentence'][:100] + '...',
+            'New Sentence': r['new_sentence'][:100] + '...'
+        } for r in results])
+        return report, df
+    def generate_report(self, orphan_url, orphan_title, results):
+        """Generate markdown report"""
+        report = f"""# 🔗 Internal Linking Report
+**Orphan Page:** {orphan_title}
+**Target URL:** `{orphan_url}`
+**Links Found:** {len(results)}
 ---
+"""
+        for i, result in enumerate(results, 1):
+            report += f"""
+## Link {i}: {result['source_title']}
+**Source URL:** `{result['source_url']}`
+**Paragraph #:** {result['paragraph_index']}
+**Relevance Score:** {result['score']}/100
+**Anchor Text:** "{result['anchor_text']}"
+### Current Sentence: