Spaces:

edstellar
/

internallinksuggestor

Sleeping

App Files Community

vijaykumaredstellar committed on Dec 24, 2025

Commit

35c63dd

verified ·

1 Parent(s): 98f6c84

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -70

app.py CHANGED Viewed

@@ -26,11 +26,13 @@ class KnowledgeBase:
     def load_from_huggingface(self, repo_id, hf_token=None):
         """Load knowledge base from Hugging Face"""
         try:
             kb_path = hf_hub_download(
                 repo_id=repo_id,
                 filename='knowledge_base.pkl',
                 repo_type='dataset',
-                token=hf_token if hf_token else None
             )
             with open(kb_path, 'rb') as f:
@@ -108,41 +110,31 @@ class OrphanPageAnalyzer:
         self.kb = kb
         self.client = client
-    def get_orphan_metadata(self, orphan_url):
-        """Extract metadata for orphan page from knowledge base"""
-        matches = [p for p in self.kb.knowledge_base if p['url'] == orphan_url]
-        if matches:
-            return {
-                'title': matches[0]['title'],
-                'keyword': matches[0]['keyword'],
-                'category': matches[0]['category']
-            }
-        return None
-    def analyze(self, orphan_url, num_sources=3):
         """
-        Complete analysis: Find sources, placements, and generate report
         """
-        # Get orphan page metadata
-        orphan_meta = self.get_orphan_metadata(orphan_url)
-        if not orphan_meta:
-            return "❌ Orphan page not found in knowledge base. Please check the URL.", None
-        orphan_title = orphan_meta['title']
-        orphan_keyword = orphan_meta['keyword']
-        orphan_category = orphan_meta['category']
-        # Step 1: Find relevant source pages
-        search_query = f"{orphan_title} {orphan_keyword} {orphan_category}"
         query_embedding = self.client.get_embedding(search_query)
         candidates = self.kb.search(query_embedding, top_k=50)
-        # Group by URL and score
         url_scores = {}
         for item in candidates:
             url = item['url']
             if url == orphan_url:
                 continue
@@ -161,17 +153,15 @@ class OrphanPageAnalyzer:
                 'similarity': item['similarity_score']
             })
-        # Rank sources
         ranked_sources = []
         for url, data in url_scores.items():
             avg_sim = np.mean([p['similarity'] for p in data['paragraphs']])
             max_sim = max([p['similarity'] for p in data['paragraphs']])
-            score = (
-                avg_sim * 0.4 +
-                max_sim * 0.4 +
-                (1 if data['category'] == orphan_category else 0) * 0.2
-            )
             ranked_sources.append({
                 **data,
@@ -181,36 +171,42 @@ class OrphanPageAnalyzer:
         ranked_sources.sort(key=lambda x: x['score'], reverse=True)
         top_sources = ranked_sources[:num_sources]
-        # Step 2: Find best placements and generate modifications
         results = []
-        for source in top_sources:
-            # Get best paragraph
             best_para = max(source['paragraphs'], key=lambda x: x['similarity'])
-            # Generate anchor text using LLM
             anchor_prompt = f"""Generate a natural 2-4 word anchor text to link to this page:
-Target: {orphan_title}
-Keyword: {orphan_keyword}
-Context: {best_para['text'][:200]}...
-Provide ONLY the anchor text."""
             anchor_text = self.client.chat([
                 {"role": "user", "content": anchor_prompt}
             ]).strip().strip('"').strip("'")
-            # Generate modified sentence using LLM
             modify_prompt = f"""Modify this sentence to naturally include an internal link.
 Current sentence:
 {best_para['text']}
-Link details:
 - Anchor text: "{anchor_text}"
-- Target: {orphan_title}
 Provide ONLY the modified sentence with the anchor text naturally integrated."""
@@ -250,7 +246,7 @@ Provide ONLY the modified sentence with the anchor text naturally integrated."""
         report = f"# 🔗 Internal Linking Report\n\n"
         report += f"**Orphan Page:** {orphan_title}\n"
         report += f"**Target URL:** `{orphan_url}`\n"
-        report += f"**Links Found:** {len(results)}\n\n"
         report += "---\n\n"
         for i, result in enumerate(results, 1):
@@ -295,21 +291,17 @@ def setup(api_key, hf_token):
     """Setup API and load knowledge base"""
     global analyzer
-    status = []
-    # Setup API
     if not api_key or not api_key.strip():
         return "❌ Please enter your OpenRouter API key", None
     try:
         client = OpenRouterClient(api_key)
-        status.append("✅ API key configured")
     except Exception as e:
         return f"❌ API Error: {str(e)}", None
     # Load knowledge base
-    token = hf_token.strip() if hf_token else None
-    success, message = kb.load_from_huggingface(HF_DATASET_REPO, token)
     if not success:
         return f"✅ API key configured\n{message}", None
@@ -322,7 +314,7 @@ def setup(api_key, hf_token):
     return "\n".join(status), None
-def analyze_orphan(orphan_url, num_sources):
     """Analyze orphan page and generate report"""
     if not analyzer:
@@ -331,11 +323,21 @@ def analyze_orphan(orphan_url, num_sources):
     if not orphan_url or not orphan_url.strip():
         return "❌ Please enter an orphan page URL", None
     try:
-        report, table = analyzer.analyze(orphan_url, num_sources)
         return report, table
     except Exception as e:
-        return f"❌ Error: {str(e)}", None
 # ============================================
 # INTERFACE
@@ -343,11 +345,11 @@ def analyze_orphan(orphan_url, num_sources):
 with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft()) as app:
     gr.Markdown("# 🔗 Edstellar Internal Linking Tool")
-    gr.Markdown("Enter an orphan page URL to get instant internal linking recommendations")
     # Setup Section
-    with gr.Accordion("⚙️ Setup (Click to expand - Do this once)", open=True):
-        gr.Markdown("### Step 1: Configure API Keys")
         with gr.Row():
             api_key = gr.Textbox(
@@ -357,7 +359,7 @@ with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft())
                 scale=2
             )
             hf_token = gr.Textbox(
-                label="Hugging Face Token (optional)",
                 placeholder="hf_...",
                 type="password",
                 scale=2
@@ -370,21 +372,35 @@ with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft())
     # Analysis Section
     gr.Markdown("### 📊 Analyze Orphan Page")
     with gr.Row():
-        orphan_url_input = gr.Textbox(
-            label="Orphan Page URL",
-            placeholder="https://edstellar.com/blog/your-orphan-page",
-            scale=3
-        )
-        num_sources_input = gr.Slider(
-            label="Number of Sources",
-            minimum=3,
-            maximum=5,
-            value=3,
-            step=1,
-            scale=1
-        )
     analyze_btn = gr.Button("🔍 Analyze & Generate Report", variant="primary", size="lg")
@@ -411,7 +427,7 @@ with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft())
     analyze_btn.click(
         analyze_orphan,
-        inputs=[orphan_url_input, num_sources_input],
         outputs=[report_output, table_output]
     )

     def load_from_huggingface(self, repo_id, hf_token=None):
         """Load knowledge base from Hugging Face"""
         try:
+            token = hf_token.strip() if hf_token and hf_token.strip() else None
             kb_path = hf_hub_download(
                 repo_id=repo_id,
                 filename='knowledge_base.pkl',
                 repo_type='dataset',
+                token=token
             )
             with open(kb_path, 'rb') as f:
         self.kb = kb
         self.client = client
+    def analyze(self, orphan_url, orphan_title, orphan_keyword, num_sources=3):
         """
+        Find pages in knowledge base that should link TO the orphan page
+        Orphan page does NOT need to be in the knowledge base
         """
+        # Create search query from orphan page info
+        search_query = f"{orphan_title} {orphan_keyword}"
+        print(f"🔍 Searching for pages related to: {search_query}")
+        # Get embedding for the orphan page topic
         query_embedding = self.client.get_embedding(search_query)
+        # Search knowledge base for relevant paragraphs
         candidates = self.kb.search(query_embedding, top_k=50)
+        print(f"📊 Found {len(candidates)} candidate paragraphs")
+        # Group by URL (to find source pages)
         url_scores = {}
         for item in candidates:
             url = item['url']
+            # Skip if somehow the orphan URL is in KB
             if url == orphan_url:
                 continue
                 'similarity': item['similarity_score']
             })
+        print(f"📄 Found {len(url_scores)} unique source pages")
+        # Rank source pages
         ranked_sources = []
         for url, data in url_scores.items():
             avg_sim = np.mean([p['similarity'] for p in data['paragraphs']])
             max_sim = max([p['similarity'] for p in data['paragraphs']])
+            score = (avg_sim * 0.5 + max_sim * 0.5)
             ranked_sources.append({
                 **data,
         ranked_sources.sort(key=lambda x: x['score'], reverse=True)
         top_sources = ranked_sources[:num_sources]
+        print(f"⭐ Selected top {len(top_sources)} sources")
+        # Generate linking recommendations for each source
         results = []
+        for idx, source in enumerate(top_sources, 1):
+            print(f"🔗 Processing source {idx}/{len(top_sources)}: {source['title']}")
+            # Get best paragraph in this source
             best_para = max(source['paragraphs'], key=lambda x: x['similarity'])
+            # Generate anchor text
             anchor_prompt = f"""Generate a natural 2-4 word anchor text to link to this page:
+Target Page Title: {orphan_title}
+Target Keyword: {orphan_keyword}
+Context where link will be placed:
+{best_para['text'][:200]}...
+Provide ONLY the anchor text, no quotes or explanation."""
             anchor_text = self.client.chat([
                 {"role": "user", "content": anchor_prompt}
             ]).strip().strip('"').strip("'")
+            # Generate modified sentence
             modify_prompt = f"""Modify this sentence to naturally include an internal link.
 Current sentence:
 {best_para['text']}
+Add this internal link:
 - Anchor text: "{anchor_text}"
+- Target page: {orphan_title}
+- Target URL: {orphan_url}
 Provide ONLY the modified sentence with the anchor text naturally integrated."""
         report = f"# 🔗 Internal Linking Report\n\n"
         report += f"**Orphan Page:** {orphan_title}\n"
         report += f"**Target URL:** `{orphan_url}`\n"
+        report += f"**Links Generated:** {len(results)}\n\n"
         report += "---\n\n"
         for i, result in enumerate(results, 1):
     """Setup API and load knowledge base"""
     global analyzer
     if not api_key or not api_key.strip():
         return "❌ Please enter your OpenRouter API key", None
     try:
         client = OpenRouterClient(api_key)
+        status = ["✅ API key configured"]
     except Exception as e:
         return f"❌ API Error: {str(e)}", None
     # Load knowledge base
+    success, message = kb.load_from_huggingface(HF_DATASET_REPO, hf_token)
     if not success:
         return f"✅ API key configured\n{message}", None
     return "\n".join(status), None
+def analyze_orphan(orphan_url, orphan_title, orphan_keyword, num_sources):
     """Analyze orphan page and generate report"""
     if not analyzer:
     if not orphan_url or not orphan_url.strip():
         return "❌ Please enter an orphan page URL", None
+    if not orphan_title or not orphan_title.strip():
+        return "❌ Please enter the orphan page title", None
     try:
+        report, table = analyzer.analyze(
+            orphan_url.strip(),
+            orphan_title.strip(),
+            orphan_keyword.strip() if orphan_keyword else orphan_title.strip(),
+            num_sources
+        )
         return report, table
     except Exception as e:
+        import traceback
+        error_detail = traceback.format_exc()
+        return f"❌ Error: {str(e)}\n\nDetails:\n{error_detail}", None
 # ============================================
 # INTERFACE
 with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft()) as app:
     gr.Markdown("# 🔗 Edstellar Internal Linking Tool")
+    gr.Markdown("Find the best existing blog posts to link to your orphan page")
     # Setup Section
+    with gr.Accordion("⚙️ Setup (Do this once)", open=True):
+        gr.Markdown("### Configure API Keys")
         with gr.Row():
             api_key = gr.Textbox(
                 scale=2
             )
             hf_token = gr.Textbox(
+                label="Hugging Face Token",
                 placeholder="hf_...",
                 type="password",
                 scale=2
     # Analysis Section
     gr.Markdown("### 📊 Analyze Orphan Page")
+    gr.Markdown("Enter details about the orphan page you want to get links FOR")
     with gr.Row():
+        with gr.Column(scale=3):
+            orphan_url_input = gr.Textbox(
+                label="Orphan Page URL",
+                placeholder="https://edstellar.com/blog/your-orphan-page",
+                info="The page that needs backlinks"
+            )
+            orphan_title_input = gr.Textbox(
+                label="Orphan Page Title",
+                placeholder="Business Development Manager Roles",
+                info="The title/topic of your orphan page"
+            )
+            orphan_keyword_input = gr.Textbox(
+                label="Primary Keyword (Optional)",
+                placeholder="business development",
+                info="Main keyword for anchor text generation"
+            )
+        with gr.Column(scale=1):
+            num_sources_input = gr.Slider(
+                label="Number of Sources",
+                minimum=3,
+                maximum=5,
+                value=3,
+                step=1,
+                info="How many source pages to find"
+            )
     analyze_btn = gr.Button("🔍 Analyze & Generate Report", variant="primary", size="lg")
     analyze_btn.click(
         analyze_orphan,
+        inputs=[orphan_url_input, orphan_title_input, orphan_keyword_input, num_sources_input],
         outputs=[report_output, table_output]
     )