vijaykumaredstellar committed on
Commit
af2e520
·
verified ·
1 Parent(s): 57f193d

Update app.py

Files changed (1)
  1. app.py +1033 -0
app.py CHANGED
@@ -0,0 +1,1033 @@
+ # ============================================================================
+ # EDSTELLAR INTERNAL LINKING RAG TOOL
+ # OpenRouter API + DeepSeek V3
+ # Google Colab → Hugging Face Deployment
+ # ============================================================================
+
+ # CELL 1: Install Dependencies
+ # ============================================================================
+ # Note: the `!pip` cell magic only works in Colab/Jupyter; on Hugging Face
+ # Spaces, list these packages in requirements.txt instead.
+ !pip install -q gradio openai pandas numpy scikit-learn
+
+ # CELL 2: Import Libraries
+ # ============================================================================
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+ import json
+ import os
+ import re
+ import html
+ from typing import List, Dict, Tuple
+ import time
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # CELL 3: Configuration
+ # ============================================================================
+ class Config:
+     OPENROUTER_API_KEY = ""  # Will be set via the Gradio interface
+     OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+
+     # DeepSeek V3 models on OpenRouter
+     CHAT_MODEL = "deepseek/deepseek-chat"  # DeepSeek V3
+     EMBEDDING_MODEL = "openai/text-embedding-3-small"  # For embeddings (DeepSeek has no embedding API)
+
+     # Pricing (OpenRouter rates for DeepSeek V3)
+     CHAT_COST_PER_1K_INPUT = 0.0014   # $1.40 per 1M input tokens
+     CHAT_COST_PER_1K_OUTPUT = 0.0028  # $2.80 per 1M output tokens
+     EMBEDDING_COST_PER_1K = 0.00002   # text-embedding-3-small
+
+     TOP_K_CANDIDATES = 15
+     TOP_N_SOURCES = 3  # Sources auto-selected for Stage 2
+
+ config = Config()
+
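+ # Sanity check on the pricing constants (editor's sketch, not part of the
+ # original app): the per-1K rates times 1000 match the per-1M prices quoted in
+ # the comments above, e.g. 0.0014 * 1000 = $1.40 per 1M input tokens. A call
+ # using 2,000 input and 500 output tokens would cost
+ # (2000/1000)*0.0014 + (500/1000)*0.0028 = $0.0042.
+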
+ # CELL 4: OpenRouter API Client
+ # ============================================================================
+ from openai import OpenAI
+
+ class OpenRouterClient:
+     def __init__(self, api_key: str):
+         self.client = OpenAI(
+             api_key=api_key,
+             base_url=config.OPENROUTER_BASE_URL
+         )
+         self.total_cost = 0.0
+
+     def get_embedding(self, text: str) -> List[float]:
+         """Generate an embedding for text using OpenAI's embedding model."""
+         try:
+             # Truncate overly long inputs
+             text = text[:8000]
+
+             response = self.client.embeddings.create(
+                 model=config.EMBEDDING_MODEL,
+                 input=text,
+                 extra_headers={
+                     "HTTP-Referer": "https://edstellar.com",  # Optional: your site
+                     "X-Title": "Edstellar Internal Linking Tool"  # Optional: app name
+                 }
+             )
+
+             # Track cost
+             tokens = response.usage.total_tokens
+             cost = (tokens / 1000) * config.EMBEDDING_COST_PER_1K
+             self.total_cost += cost
+
+             return response.data[0].embedding
+         except Exception as e:
+             raise Exception(f"Embedding error: {str(e)}")
+
+     def chat_completion(self, messages: List[Dict], temperature: float = 0.3) -> Tuple[str, float]:
+         """Generate a chat completion using DeepSeek V3."""
+         try:
+             response = self.client.chat.completions.create(
+                 model=config.CHAT_MODEL,
+                 messages=messages,
+                 temperature=temperature,
+                 extra_headers={
+                     "HTTP-Referer": "https://edstellar.com",
+                     "X-Title": "Edstellar Internal Linking Tool"
+                 }
+             )
+
+             # Track cost (OpenRouter returns usage data)
+             if hasattr(response, 'usage'):
+                 input_tokens = response.usage.prompt_tokens
+                 output_tokens = response.usage.completion_tokens
+
+                 cost = (input_tokens / 1000) * config.CHAT_COST_PER_1K_INPUT
+                 cost += (output_tokens / 1000) * config.CHAT_COST_PER_1K_OUTPUT
+                 self.total_cost += cost
+             else:
+                 cost = 0.0
+
+             return response.choices[0].message.content, cost
+         except Exception as e:
+             raise Exception(f"Chat completion error: {str(e)}")
+
+     def get_total_cost(self) -> float:
+         """Get the total API cost so far."""
+         return self.total_cost
+
+     def reset_cost(self):
+         """Reset the cost counter."""
+         self.total_cost = 0.0
+
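+ # Minimal usage sketch (editor's illustration; assumes a valid OpenRouter key,
+ # kept as comments so it never runs on import):
+ #
+ #   client = OpenRouterClient("sk-or-v1-...")
+ #   vec = client.get_embedding("corporate training tips")
+ #   reply, cost = client.chat_completion(
+ #       [{"role": "user", "content": "Say hello in five words."}])
+ #   print(len(vec), reply, f"${client.get_total_cost():.4f}")
+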
+ # CELL 5: Data Processing
+ # ============================================================================
+ class DataProcessor:
+     @staticmethod
+     def parse_csv(file_path: str) -> pd.DataFrame:
+         """Parse a Webflow CSV export."""
+         df = pd.read_csv(file_path)
+
+         # Rename columns for easier access
+         column_mapping = {
+             'Name': 'title',
+             'Slug': 'slug',
+             'Content': 'content',
+             'Meta Description': 'meta_description',
+             'Primary Keyword': 'primary_keyword',
+             'Training Category': 'category',
+             'Related Tags': 'tags',
+             'Views': 'views',
+             'Main Tag': 'main_tag'
+         }
+
+         # Only rename columns that exist
+         existing_columns = {k: v for k, v in column_mapping.items() if k in df.columns}
+         df = df.rename(columns=existing_columns)
+
+         # Create the full URL
+         df['url'] = df['slug'].apply(lambda x: f"/blog/{x}" if pd.notna(x) else "")
+
+         # Fill NaN values with empty strings for text columns
+         text_columns = ['title', 'content', 'meta_description', 'primary_keyword', 'category', 'tags']
+         for col in text_columns:
+             if col in df.columns:
+                 df[col] = df[col].fillna('')
+
+         # Fill NaN values with 0 for numeric columns
+         if 'views' in df.columns:
+             df['views'] = pd.to_numeric(df['views'], errors='coerce').fillna(0).astype(int)
+
+         return df
+
+     @staticmethod
+     def clean_html(html_text: str) -> str:
+         """Remove HTML tags and clean text, preserving paragraph breaks."""
+         if pd.isna(html_text) or html_text == '':
+             return ""
+
+         # Remove script and style blocks
+         text = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', str(html_text), flags=re.IGNORECASE)
+         text = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', text, flags=re.IGNORECASE)
+
+         # Turn block-level closing tags into paragraph breaks so that
+         # extract_paragraphs can split on '\n\n' below (collapsing all
+         # whitespace first would leave nothing for the splitter to find)
+         text = re.sub(r'</(p|div|li|h[1-6])>', '\n\n', text, flags=re.IGNORECASE)
+         text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
+
+         # Remove the remaining HTML tags
+         text = re.sub(r'<[^>]+>', ' ', text)
+
+         # Decode HTML entities
+         text = html.unescape(text)
+
+         # Collapse horizontal whitespace but keep paragraph breaks
+         text = re.sub(r'[ \t]+', ' ', text)
+         text = re.sub(r' ?\n ?', '\n', text)
+         text = re.sub(r'\n{3,}', '\n\n', text)
+
+         return text.strip()
+
+     @staticmethod
+     def extract_paragraphs(content: str, min_length: int = 100, max_paragraphs: int = 30) -> List[Dict]:
+         """Extract paragraphs from post content."""
+         clean_content = DataProcessor.clean_html(content)
+
+         if not clean_content:
+             return []
+
+         # Try to split on paragraph breaks first
+         raw_paragraphs = re.split(r'\n\n+', clean_content)
+
+         paragraphs = []
+
+         for para in raw_paragraphs:
+             para = para.strip()
+
+             # Skip paragraphs that are too short
+             if len(para) < min_length:
+                 continue
+
+             # If a paragraph is very long, split it by sentences
+             if len(para) > 600:
+                 sentences = re.split(r'(?<=[.!?])\s+', para)
+                 current_chunk = []
+                 current_length = 0
+
+                 for sentence in sentences:
+                     current_chunk.append(sentence)
+                     current_length += len(sentence)
+
+                     if current_length >= 300:  # Target chunk size
+                         chunk_text = ' '.join(current_chunk)
+                         if len(chunk_text) >= min_length:
+                             paragraphs.append({
+                                 'text': chunk_text,
+                                 'length': len(chunk_text)
+                             })
+                         current_chunk = []
+                         current_length = 0
+
+                 # Add any remaining sentences
+                 if current_chunk:
+                     chunk_text = ' '.join(current_chunk)
+                     if len(chunk_text) >= min_length:
+                         paragraphs.append({
+                             'text': chunk_text,
+                             'length': len(chunk_text)
+                         })
+             else:
+                 paragraphs.append({
+                     'text': para,
+                     'length': len(para)
+                 })
+
+             # Limit total paragraphs per post
+             if len(paragraphs) >= max_paragraphs:
+                 break
+
+         return paragraphs
+
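+ # Quick illustration of the cleaning pipeline (editor's sketch, hypothetical input):
+ #
+ #   sample = "<p>First paragraph about onboarding.</p><p>Second paragraph.</p>"
+ #   DataProcessor.clean_html(sample)
+ #   # -> "First paragraph about onboarding.\n\nSecond paragraph."
+ #   DataProcessor.extract_paragraphs(sample, min_length=10)
+ #   # -> [{'text': 'First paragraph about onboarding.', 'length': 33}, ...]
+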
+ # CELL 6: Knowledge Base
+ # ============================================================================
+ class KnowledgeBase:
+     def __init__(self):
+         self.entries = []
+         self.embeddings = []
+         self.build_cost = 0.0
+
+     def build(self, df: pd.DataFrame, client: OpenRouterClient,
+               progress_callback=None) -> Tuple[int, float]:
+         """Build the knowledge base from a DataFrame."""
+         self.entries = []
+         self.embeddings = []
+
+         client.reset_cost()  # Reset the cost counter
+
+         total_posts = len(df)
+
+         for idx, row in df.iterrows():
+             if progress_callback:
+                 progress_callback(
+                     idx + 1,
+                     total_posts,
+                     f"Processing: {row['title'][:50]}... (Cost: ${client.get_total_cost():.3f})"
+                 )
+
+             # Skip posts with no content
+             if not row['content'] or row['content'] == '':
+                 continue
+
+             # Extract paragraphs
+             paragraphs = DataProcessor.extract_paragraphs(row['content'])
+
+             if not paragraphs:
+                 continue
+
+             for para_idx, para in enumerate(paragraphs):
+                 # Create an entry
+                 entry = {
+                     'id': f"{row['url']}_para_{para_idx}",
+                     'post_url': row['url'],
+                     'post_title': row['title'],
+                     'post_category': row.get('category', ''),
+                     'post_keyword': row.get('primary_keyword', ''),
+                     'post_tags': row.get('tags', ''),
+                     'post_views': row.get('views', 0),
+                     'paragraph_index': para_idx,
+                     'paragraph_text': para['text']
+                 }
+
+                 # Generate the embedding
+                 try:
+                     embedding = client.get_embedding(para['text'])
+
+                     self.entries.append(entry)
+                     self.embeddings.append(embedding)
+
+                 except Exception as e:
+                     print(f"Error processing {entry['id']}: {e}")
+                     continue
+
+                 # Rate limiting (OpenRouter allows ~20 requests/second; stay conservative)
+                 time.sleep(0.3)
+
+         # Convert embeddings to a numpy array
+         if self.embeddings:
+             self.embeddings = np.array(self.embeddings)
+
+         self.build_cost = client.get_total_cost()
+
+         return len(self.entries), self.build_cost
+
+     def search(self, query_embedding: np.ndarray, top_k: int = 20,
+                exclude_url: str = None) -> List[Dict]:
+         """Semantic search over the knowledge base."""
+         if len(self.embeddings) == 0:
+             return []
+
+         # Calculate cosine similarity against every indexed paragraph
+         query_embedding = np.array(query_embedding).reshape(1, -1)
+         similarities = cosine_similarity(query_embedding, self.embeddings)[0]
+
+         # Indices sorted from most to least similar
+         top_indices = np.argsort(similarities)[::-1]
+
+         # Filter and return entries with scores
+         results = []
+         for idx in top_indices:
+             entry = self.entries[idx].copy()
+
+             # Skip paragraphs from the excluded (orphan) post itself
+             if exclude_url and entry['post_url'] == exclude_url:
+                 continue
+
+             entry['similarity'] = float(similarities[idx])
+             results.append(entry)
+
+             if len(results) >= top_k:
+                 break
+
+         return results
+
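+ # Usage sketch (editor's illustration; assumes `df` from DataProcessor.parse_csv
+ # and an initialized OpenRouterClient):
+ #
+ #   kb = KnowledgeBase()
+ #   n, cost = kb.build(df, client)  # one embedding call per indexed paragraph
+ #   q = client.get_embedding("leadership training for managers")
+ #   for hit in kb.search(q, top_k=5, exclude_url="/blog/leadership-training"):
+ #       print(f"{hit['similarity']:.2f}  {hit['post_url']}  para {hit['paragraph_index']}")
+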
+ # CELL 7: Stage 1 - Source Page Discovery
+ # ============================================================================
+ class Stage1Discovery:
+     @staticmethod
+     def analyze(orphan_url: str, df: pd.DataFrame, kb: KnowledgeBase,
+                 client: OpenRouterClient) -> Tuple[List[Dict], float]:
+         """Find the top candidate source pages."""
+
+         # Snapshot the cost so this stage's spend can be reported separately
+         initial_cost = client.get_total_cost()
+
+         # Get the orphan page's data
+         orphan_row = df[df['url'] == orphan_url].iloc[0]
+
+         # Create the orphan profile
+         orphan_profile = f"{orphan_row['title']}. {orphan_row.get('meta_description', '')}. "
+         orphan_profile += f"Keywords: {orphan_row.get('primary_keyword', '')}. "
+         orphan_profile += DataProcessor.clean_html(orphan_row['content'])[:2000]
+
+         # Get its embedding
+         orphan_embedding = client.get_embedding(orphan_profile)
+
+         # Search the knowledge base
+         results = kb.search(orphan_embedding, top_k=200, exclude_url=orphan_url)
+
+         # Group by post (aggregate paragraph scores)
+         post_scores = {}
+         for result in results:
+             post_url = result['post_url']
+
+             if post_url not in post_scores:
+                 post_scores[post_url] = {
+                     'url': post_url,
+                     'title': result['post_title'],
+                     'category': result['post_category'],
+                     'keyword': result['post_keyword'],
+                     'tags': result['post_tags'],
+                     'views': result['post_views'],
+                     'similarities': [],
+                     'paragraph_count': 0
+                 }
+
+             post_scores[post_url]['similarities'].append(result['similarity'])
+             post_scores[post_url]['paragraph_count'] += 1
+
+         # Calculate aggregate scores
+         candidates = []
+         for post_url, data in post_scores.items():
+             # Average of the top 3 similarities
+             top_sims = sorted(data['similarities'], reverse=True)[:3]
+             avg_similarity = np.mean(top_sims) if top_sims else 0
+
+             # Base score from similarity (0-100)
+             score = avg_similarity * 100
+
+             # Boost for the same category
+             orphan_category = orphan_row.get('category', '').lower()
+             post_category = data['category'].lower()
+             if orphan_category and post_category and orphan_category == post_category:
+                 score += 8
+
+             # Boost for keyword overlap
+             orphan_keywords = set(str(orphan_row.get('primary_keyword', '')).lower().split())
+             post_keywords = set(str(data['keyword']).lower().split())
+             keyword_overlap = len(orphan_keywords & post_keywords)
+             score += keyword_overlap * 3
+
+             # Slight boost for high-traffic pages
+             if data['views'] > 10000:
+                 score += 3
+             elif data['views'] > 5000:
+                 score += 1
+
+             # Cap at 100
+             score = min(score, 100)
+
+             candidates.append({
+                 'rank': 0,
+                 'url': post_url,
+                 'title': data['title'],
+                 'score': int(score),
+                 'traffic': int(data['views']),
+                 'category': data['category'],
+                 'similarity': round(avg_similarity * 100, 1),
+                 'opportunities': min(data['paragraph_count'], 5)
+             })
+
+         # Sort by score
+         candidates = sorted(candidates, key=lambda x: x['score'], reverse=True)
+
+         # Assign ranks
+         for idx, candidate in enumerate(candidates):
+             candidate['rank'] = idx + 1
+
+         # Cost for this stage alone
+         stage_cost = client.get_total_cost() - initial_cost
+
+         # Return the top 15 candidates
+         return candidates[:config.TOP_K_CANDIDATES], stage_cost
+
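+ # Worked example of the scoring above (editor's illustration, made-up numbers):
+ # a candidate whose top-3 paragraph similarities average 0.72 starts at 72
+ # points; a matching category adds 8 (-> 80), one overlapping keyword adds 3
+ # (-> 83), and 12,000 monthly views add 3 (-> 86). Scores are capped at 100.
+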
+ # CELL 8: Stage 2 - Placement Discovery
+ # ============================================================================
+ class Stage2Placement:
+     @staticmethod
+     def analyze(orphan_url: str, selected_sources: List[str], df: pd.DataFrame,
+                 kb: KnowledgeBase, client: OpenRouterClient) -> Tuple[List[Dict], float]:
+         """Find the best placement in each selected source page."""
+
+         initial_cost = client.get_total_cost()
+
+         orphan_row = df[df['url'] == orphan_url].iloc[0]
+
+         placements = []
+
+         # Get the orphan embedding once
+         orphan_profile = f"{orphan_row['title']}. {orphan_row.get('primary_keyword', '')}"
+         orphan_embedding = client.get_embedding(orphan_profile)
+
+         for source_url in selected_sources:
+             source_row = df[df['url'] == source_url].iloc[0]
+
+             # Collect this source's paragraphs (and their KB indices) in one pass
+             para_indices = [i for i, entry in enumerate(kb.entries)
+                             if entry['post_url'] == source_url]
+             source_paragraphs = [kb.entries[i] for i in para_indices]
+
+             if not source_paragraphs:
+                 continue
+
+             # Find the best paragraph by similarity
+             best_para = None
+             best_score = 0
+
+             # Look up the stored embeddings for these paragraphs
+             para_embeddings = kb.embeddings[para_indices]
+
+             # Calculate similarities
+             similarities = cosine_similarity(
+                 np.array(orphan_embedding).reshape(1, -1),
+                 para_embeddings
+             )[0]
+
+             for para, similarity in zip(source_paragraphs, similarities):
+                 score = similarity * 100
+
+                 # Prefer middle paragraphs
+                 total_paras = len(source_paragraphs)
+                 if total_paras > 4 and 2 < para['paragraph_index'] < total_paras - 2:
+                     score += 5
+
+                 # Prefer medium-length paragraphs
+                 para_len = len(para['paragraph_text'])
+                 if 150 < para_len < 500:
+                     score += 3
+
+                 if score > best_score:
+                     best_score = score
+                     best_para = para
+
+             if best_para:
+                 # Use the LLM to generate the modified sentence
+                 placement = Stage2Placement._generate_placement(
+                     orphan_row, source_row, best_para, client
+                 )
+                 placement['score'] = int(best_score)
+                 placements.append(placement)
+
+         stage_cost = client.get_total_cost() - initial_cost
+
+         return placements, stage_cost
+
+     @staticmethod
+     def _generate_placement(orphan_row, source_row, paragraph, client) -> Dict:
+         """Use the LLM to generate placement details."""
+
+         # Truncate the paragraph if it is too long
+         para_text = paragraph['paragraph_text']
+         if len(para_text) > 400:
+             para_text = para_text[:400] + "..."
+
+         prompt = f"""You are an SEO expert. Analyze this paragraph and suggest how to add an internal link naturally.
+
+ SOURCE ARTICLE: {source_row['title']}
+ PARAGRAPH: "{para_text}"
+
+ TARGET PAGE TO LINK:
+ - Title: {orphan_row['title']}
+ - Keyword: {orphan_row.get('primary_keyword', '')}
+
+ Task: Find a natural spot to add the link.
+
+ Respond in JSON format:
+ {{
+     "current_sentence": "the original sentence to modify",
+     "modified_sentence": "new sentence with [ANCHOR] placeholder where link goes",
+     "anchor_text": "suggested anchor text (2-4 words)",
+     "anchor_alternatives": ["alternative 1", "alternative 2"]
+ }}
+
+ Make the link insertion natural and valuable to readers."""
+
+         messages = [
+             {"role": "system", "content": "You are an SEO expert specializing in natural internal linking."},
+             {"role": "user", "content": prompt}
+         ]
+
+         try:
+             response, cost = client.chat_completion(messages)
+
+             # Try to parse the JSON response
+             try:
+                 result = json.loads(response)
+             except json.JSONDecodeError:
+                 # If not valid JSON, try to extract it from a markdown code block
+                 json_match = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
+                 if json_match:
+                     result = json.loads(json_match.group(1))
+                 else:
+                     # Fallback
+                     result = {
+                         "current_sentence": para_text[:100] + "...",
+                         "modified_sentence": "...with [ANCHOR] for better understanding.",
+                         "anchor_text": orphan_row.get('primary_keyword', 'more information'),
+                         "anchor_alternatives": ["related guide", "detailed tips"]
+                     }
+
+             return {
+                 'source_url': source_row['url'],
+                 'source_title': source_row['title'],
+                 'paragraph_index': paragraph['paragraph_index'],
+                 'paragraph_text': paragraph['paragraph_text'],
+                 'current_sentence': result.get('current_sentence', para_text[:100]),
+                 'modified_sentence': result.get('modified_sentence', ''),
+                 'anchor_text': result.get('anchor_text', orphan_row.get('primary_keyword', '')),
+                 'anchor_alternatives': result.get('anchor_alternatives', [])
+             }
+         except Exception as e:
+             print(f"Error in LLM generation: {e}")
+             # Fallback: a simple templated modification
+             return {
+                 'source_url': source_row['url'],
+                 'source_title': source_row['title'],
+                 'paragraph_index': paragraph['paragraph_index'],
+                 'paragraph_text': para_text,
+                 'current_sentence': para_text[:100] + "...",
+                 'modified_sentence': "...implementing [ANCHOR] can significantly improve results.",
+                 'anchor_text': orphan_row.get('primary_keyword', 'effective strategies'),
+                 'anchor_alternatives': []
+             }
+
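+ # Shape of the JSON the prompt above asks DeepSeek V3 to return (editor's
+ # illustration with hypothetical values):
+ #
+ #   {
+ #       "current_sentence": "Training programs often fail without follow-up.",
+ #       "modified_sentence": "Training programs often fail without [ANCHOR].",
+ #       "anchor_text": "structured follow-up coaching",
+ #       "anchor_alternatives": ["post-training reinforcement", "follow-up coaching"]
+ #   }
+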
+ # CELL 9: Stage 3 - Report Generation
+ # ============================================================================
+ class Stage3Report:
+     @staticmethod
+     def generate(orphan_url: str, placements: List[Dict]) -> Dict:
+         """Generate the final implementation report."""
+
+         links = []
+
+         for idx, placement in enumerate(placements):
+             # Create the HTML snippet by swapping in the real link
+             html_code = placement['modified_sentence'].replace(
+                 '[ANCHOR]',
+                 f'<a href="{orphan_url}">{placement["anchor_text"]}</a>'
+             )
+
+             links.append({
+                 'number': idx + 1,
+                 'source_url': placement['source_url'],
+                 'source_title': placement['source_title'],
+                 'paragraph': placement['paragraph_index'],
+                 'score': placement['score'],
+                 'current_sentence': placement['current_sentence'],
+                 'modified_sentence': placement['modified_sentence'],
+                 'anchor_text': placement['anchor_text'],
+                 'anchor_alternatives': placement.get('anchor_alternatives', []),
+                 'html_code': html_code
+             })
+
+         # Calculate summary metrics
+         avg_score = int(np.mean([l['score'] for l in links])) if links else 0
+         unique_anchors = len(set(l['anchor_text'] for l in links))
+         anchor_diversity = 'Excellent' if unique_anchors == len(links) else ('Good' if unique_anchors >= len(links) - 1 else 'Fair')
+
+         return {
+             'orphan_url': orphan_url,
+             'links': links,
+             'avg_score': avg_score,
+             'anchor_diversity': anchor_diversity,
+             'total_links': len(links)
+         }
+
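+ # Illustration of the [ANCHOR] substitution above (editor's sketch, hypothetical
+ # URL and anchor):
+ #
+ #   "Training programs often fail without [ANCHOR]."
+ # becomes
+ #   'Training programs often fail without
+ #    <a href="/blog/follow-up-coaching">structured follow-up coaching</a>.'
+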
+ # CELL 10: Gradio Interface Functions
+ # ============================================================================
+
+ # Global state shared across the Gradio callbacks
+ app_state = {
+     'df': None,
+     'kb': None,
+     'client': None,
+     'stage1_results': None,
+     'stage2_results': None,
+     'selected_sources': [],
+     'current_orphan_url': None
+ }
+
+ def setup_api_key(api_key: str) -> str:
+     """Initialize the OpenRouter client."""
+     if not api_key or not api_key.startswith('sk-'):
+         return "❌ Please enter a valid OpenRouter API key"
+
+     try:
+         app_state['client'] = OpenRouterClient(api_key)
+         # Test the API key with a simple embedding
+         app_state['client'].get_embedding("test connection")
+         return "✅ API key validated successfully! Ready to use."
+     except Exception as e:
+         return f"❌ Error: {str(e)}\n\nMake sure you're using an OpenRouter API key."
+
+ def upload_csv(file) -> str:
+     """Process the uploaded CSV."""
+     if file is None:
+         return "❌ No file uploaded"
+
+     try:
+         # gr.File(type="filepath") passes the path in as a plain string
+         app_state['df'] = DataProcessor.parse_csv(file)
+
+         # Show stats
+         total_posts = len(app_state['df'])
+         posts_with_content = len(app_state['df'][app_state['df']['content'] != ''])
+
+         return f"✅ CSV loaded successfully!\n\n📊 Stats:\n- Total posts: {total_posts}\n- Posts with content: {posts_with_content}\n- Ready to build knowledge base"
+     except Exception as e:
+         return f"❌ Error parsing CSV: {str(e)}\n\nMake sure it's a valid Webflow export."
+
+ def build_knowledge_base(progress=gr.Progress()) -> str:
+     """Build the knowledge base with embeddings."""
+     if app_state['df'] is None:
+         return "❌ Please upload a CSV first"
+
+     if app_state['client'] is None:
+         return "❌ Please set the API key first"
+
+     try:
+         app_state['kb'] = KnowledgeBase()
+
+         progress(0, desc="Starting knowledge base build...")
+
+         def progress_callback(current, total, message):
+             progress((current, total), desc=message)
+
+         num_entries, cost = app_state['kb'].build(
+             app_state['df'],
+             app_state['client'],
+             progress_callback
+         )
+
+         if num_entries == 0:
+             return "❌ No entries created. Check whether the CSV has content."
+
+         return f"✅ Knowledge base built successfully!\n\n📊 Results:\n- Paragraphs indexed: {num_entries:,}\n- Cost: ${cost:.2f}\n- Ready to analyze orphan pages"
+     except Exception as e:
+         return f"❌ Error building knowledge base: {str(e)}"
+
+ def run_stage1(orphan_url: str) -> Tuple[pd.DataFrame, str]:
+     """Run Stage 1: find candidate sources."""
+     if app_state['kb'] is None or len(app_state['kb'].entries) == 0:
+         return None, "❌ Please build the knowledge base first"
+
+     if not orphan_url:
+         return None, "❌ Please enter an orphan page URL"
+
+     # Normalize the URL to the /blog/slug form (lstrip avoids a double slash)
+     orphan_url = orphan_url.strip()
+     if not orphan_url.startswith('/'):
+         orphan_url = '/' + orphan_url
+     if not orphan_url.startswith('/blog/'):
+         orphan_url = '/blog/' + orphan_url.lstrip('/')
+
+     try:
+         # Validate the orphan URL
+         if orphan_url not in app_state['df']['url'].values:
+             available_urls = app_state['df']['url'].head(5).tolist()
+             return None, "❌ Orphan URL not found in CSV.\n\nFormat should be: /blog/slug-here\n\nExample URLs in your CSV:\n" + "\n".join(available_urls)
+
+         results, cost = Stage1Discovery.analyze(
+             orphan_url,
+             app_state['df'],
+             app_state['kb'],
+             app_state['client']
+         )
+
+         if not results:
+             return None, "❌ No candidates found. Try a different orphan page."
+
+         app_state['stage1_results'] = results
+         app_state['current_orphan_url'] = orphan_url
+
+         # Auto-select the top N sources (guards against fewer than N results)
+         app_state['selected_sources'] = [r['url'] for r in results[:config.TOP_N_SOURCES]]
+
+         # Convert to a DataFrame for display
+         df_display = pd.DataFrame(results)
+         df_display = df_display[['rank', 'score', 'url', 'traffic']]
+         df_display.columns = ['#', 'Score', 'Source Page', 'Traffic/mo']
+
+         status = f"✅ Found {len(results)} candidates (Cost: ${cost:.3f})\n\n"
+         status += "🏆 Top 3 auto-selected:\n"
+         for i in range(min(3, len(results))):
+             status += f"{i+1}. {results[i]['url']} (Score: {results[i]['score']})\n"
+         status += "\nClick 'Find Placements' to continue →"
+
+         return df_display, status
+     except Exception as e:
+         return None, f"❌ Error: {str(e)}"
+
+ def run_stage2() -> Tuple[pd.DataFrame, str]:
+     """Run Stage 2: find placements."""
+     if not app_state['selected_sources']:
+         return None, "❌ Please run Stage 1 first"
+
+     if not app_state['current_orphan_url']:
+         return None, "❌ No orphan URL set. Please run Stage 1."
+
+     try:
+         placements, cost = Stage2Placement.analyze(
+             app_state['current_orphan_url'],
+             app_state['selected_sources'],
+             app_state['df'],
+             app_state['kb'],
+             app_state['client']
+         )
+
+         if not placements:
+             return None, "❌ No placements found. Try selecting different sources."
+
+         app_state['stage2_results'] = placements
+
+         # Convert to a DataFrame
+         df_display = pd.DataFrame([
+             {
+                 'Source Page': p['source_url'],
+                 'Para': p['paragraph_index'],
+                 'Score': p['score'],
+                 'Anchor': p['anchor_text'][:50]
+             }
+             for p in placements
+         ])
+
+         status = f"✅ {len(placements)} placements identified (Cost: ${cost:.3f})\n\n"
+         status += f"Average Score: {int(np.mean([p['score'] for p in placements]))}\n\n"
+         status += "Click 'Generate Report' to see full details →"
+
+         return df_display, status
+     except Exception as e:
+         return None, f"❌ Error: {str(e)}"
+
+ def run_stage3() -> str:
+     """Run Stage 3: generate the report."""
+     if app_state['stage2_results'] is None:
+         return "❌ Please run Stage 2 first"
+
+     if not app_state['current_orphan_url']:
+         return "❌ No orphan URL set"
+
+     try:
+         report = Stage3Report.generate(
+             app_state['current_orphan_url'],
+             app_state['stage2_results']
+         )
+
+         # Format as markdown
+         md = "# 📄 Implementation Report\n\n"
+         md += f"**Orphan Page:** `{report['orphan_url']}`\n\n"
+         md += f"**Total Links:** {report['total_links']} | "
+         md += f"**Avg Score:** {report['avg_score']} | "
+         md += f"**Anchor Diversity:** {report['anchor_diversity']}\n\n"
+         md += f"**Total Cost This Session:** ${app_state['client'].get_total_cost():.3f}\n\n"
+         md += "---\n\n"
+
+         for link in report['links']:
+             md += f"## 🔗 Link #{link['number']}: `{link['source_url']}`\n\n"
+             md += f"**Location:** Paragraph {link['paragraph']} | **Score:** {link['score']}/100\n\n"
+
+             md += "### Current Text:\n"
+             md += f"> {link['current_sentence']}\n\n"
+
+             md += "### Modified Text:\n"
+             anchor_display = f"**[{link['anchor_text']}]**"
+             md += f"> {link['modified_sentence'].replace('[ANCHOR]', anchor_display)}\n\n"
+
+             md += f"**Anchor Text:** `{link['anchor_text']}`\n\n"
+
+             if link['anchor_alternatives']:
+                 md += "**Alternatives:** "
+                 md += ", ".join(f"`{alt}`" for alt in link['anchor_alternatives'])
+                 md += "\n\n"
+
+             md += "### 📋 HTML Code (Copy This):\n\n"
+             md += f"```html\n{link['html_code']}\n```\n\n"
+
+             md += "### 📝 Implementation Steps:\n"
+             md += f"1. Open `{link['source_url']}` in the Webflow CMS\n"
+             md += f"2. Find paragraph {link['paragraph']}\n"
+             md += "3. Replace the sentence with the HTML code above\n"
+             md += "4. Publish the changes\n\n"
+
+             md += "---\n\n"
+
+         md += "## ✅ Next Steps\n\n"
+         md += "1. Copy each HTML code block above\n"
+         md += "2. Implement in the Webflow CMS\n"
+         md += "3. Test the links after publishing\n"
+         md += "4. Monitor traffic to the orphan page\n\n"
+         md += "**Ready to analyze another orphan? Use the Stage 1 tab!**\n"
+
+         return md
+     except Exception as e:
+         return f"❌ Error generating report: {str(e)}"
+
+ # CELL 11: Build Gradio UI
+ # ============================================================================
+
+ with gr.Blocks(
+     title="Edstellar Internal Linking RAG Tool",
+     theme=gr.themes.Soft(),
+     css="""
+     .gradio-container {
+         max-width: 1200px !important;
+     }
+     """
+ ) as demo:
+     gr.Markdown("""
+     # 🔗 Edstellar Internal Linking RAG Tool
+
+     **AI-powered 3-stage analysis** to find optimal internal linking opportunities for orphan pages.
+
+     Uses **DeepSeek V3** via the OpenRouter API for intelligent semantic matching.
+     """)
+
+     with gr.Tab("⚙️ Setup"):
+         gr.Markdown("### Step 1: Configure OpenRouter API Key")
+         gr.Markdown("Get your API key from [OpenRouter.ai](https://openrouter.ai/keys)")
+
+         api_key_input = gr.Textbox(
+             label="OpenRouter API Key",
+             type="password",
+             placeholder="sk-or-v1-...",
+             info="Your API key is never stored and is only used for this session"
+         )
+         api_key_btn = gr.Button("✓ Validate API Key", variant="primary", size="sm")
+         api_key_status = gr.Textbox(label="Status", interactive=False, lines=2)
+
+         api_key_btn.click(
+             fn=setup_api_key,
+             inputs=[api_key_input],
+             outputs=[api_key_status]
+         )
+
+         gr.Markdown("---")
+         gr.Markdown("### Step 2: Upload Blog Posts CSV")
+         gr.Markdown("Upload your Webflow CSV export containing all blog posts")
+
+         csv_upload = gr.File(
+             label="Upload CSV File",
+             file_types=[".csv"],
+             type="filepath"
+         )
+         csv_status = gr.Textbox(label="Status", interactive=False, lines=4)
+
+         csv_upload.change(
+             fn=upload_csv,
+             inputs=[csv_upload],
+             outputs=[csv_status]
+         )
+
+         gr.Markdown("---")
+         gr.Markdown("### Step 3: Build Knowledge Base")
+         gr.Markdown("""
+         ⚠️ **One-time process:**
+         - Takes 30-45 minutes depending on content size
+         - Costs approximately $1-2
+         - Creates a searchable index of all blog content
+         - Only needs to be done once per CSV upload
+         """)
+
+         kb_btn = gr.Button("🔨 Build Knowledge Base", variant="primary", size="lg")
+         kb_status = gr.Textbox(label="Status", interactive=False, lines=5)
+
+         kb_btn.click(
+             fn=build_knowledge_base,
+             outputs=[kb_status]
+         )
+
+     with gr.Tab("🔍 Stage 1: Find Sources"):
+         gr.Markdown("""
+         ### Find the Best Source Pages
+
+         Enter an orphan page URL to find the top candidate pages that should link to it.
+         """)
+
+         orphan_url_1 = gr.Textbox(
+             label="Orphan Page URL",
+             placeholder="/blog/employee-training-tips",
+             info="Format: /blog/slug-name"
+         )
+
+         stage1_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
+
+         stage1_results = gr.Dataframe(
+             label="Candidates Found (Top 3 Auto-Selected)",
+             interactive=False,
+             wrap=True
+         )
+         stage1_status = gr.Textbox(label="Status", interactive=False, lines=5)
+
+         stage1_btn.click(
+             fn=run_stage1,
+             inputs=[orphan_url_1],
+             outputs=[stage1_results, stage1_status]
+         )
+
+     with gr.Tab("📍 Stage 2: Find Placements"):
+         gr.Markdown("""
+         ### Identify Exact Placement Locations
+
+         Find the specific paragraphs in each source page where links should be added.
+         """)
+
+         gr.Markdown("*Uses the orphan URL and sources from Stage 1*")
+
+         stage2_btn = gr.Button("📍 Find Placements", variant="primary", size="lg")
+
+         stage2_results = gr.Dataframe(
+             label="Placements Identified",
+             interactive=False,
+             wrap=True
+         )
+         stage2_status = gr.Textbox(label="Status", interactive=False, lines=4)
+
+         stage2_btn.click(
+             fn=run_stage2,
+             outputs=[stage2_results, stage2_status]
+         )
+
+     with gr.Tab("📄 Stage 3: Implementation Report"):
+         gr.Markdown("""
+         ### Generate a Copy-Paste-Ready Report
+
+         Get detailed HTML code and implementation instructions for each link.
+         """)
+
+         stage3_btn = gr.Button("📄 Generate Report", variant="primary", size="lg")
+
+         stage3_report = gr.Markdown(
+             label="Implementation Report",
+             value="*Report will appear here after generation*"
+         )
+
+         stage3_btn.click(
+             fn=run_stage3,
+             outputs=[stage3_report]
+         )
+
+     gr.Markdown("""
+     ---
+
+     ### 💡 Tips:
+     - Build the knowledge base once, then analyze multiple orphan pages
+     - Each orphan analysis costs ~$0.02-0.05
+     - Copy HTML code directly into the Webflow rich text editor
+     - Review all suggestions before implementing
+
+     ### 🔒 Privacy:
+     - All data stays in your session
+     - API keys are not stored
+     - No data is saved after the session ends
+     """)
+
+ # CELL 12: Launch
+ # ============================================================================
+ if __name__ == "__main__":
+     demo.launch(
+         share=True,             # Public share link (useful when running in Colab)
+         debug=True,
+         server_name="0.0.0.0",  # Bind to all interfaces for Hugging Face deployment
+         server_port=7860        # Default Gradio port
+     )