vijaykumaredstellar committed on
Commit
af2e520
·
verified ·
1 Parent(s): 57f193d

Update app.py

Files changed (1)
  1. app.py +1033 -0
app.py CHANGED
@@ -0,0 +1,1033 @@
+ # ============================================================================
+ # EDSTELLAR INTERNAL LINKING RAG TOOL
+ # OpenRouter API + DeepSeek V3
+ # Google Colab → Hugging Face Deployment
+ # ============================================================================
+
+ # CELL 1: Install Dependencies
+ # ============================================================================
+ # Note: the `!pip` cell magic only works in Colab/Jupyter; on Hugging Face
+ # Spaces, list these packages in requirements.txt instead.
+ !pip install -q gradio openai pandas numpy scikit-learn
+
+ # CELL 2: Import Libraries
+ # ============================================================================
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+ import json
+ import os
+ import re
+ import html
+ from typing import List, Dict, Tuple
+ import time
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # CELL 3: Configuration
+ # ============================================================================
+ class Config:
+     OPENROUTER_API_KEY = ""  # Will be set via the Gradio interface
+     OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+
+     # DeepSeek V3 models on OpenRouter
+     CHAT_MODEL = "deepseek/deepseek-chat"  # DeepSeek V3
+     EMBEDDING_MODEL = "openai/text-embedding-3-small"  # For embeddings (DeepSeek has no embedding API)
+
+     # Pricing (OpenRouter rates for DeepSeek V3)
+     CHAT_COST_PER_1K_INPUT = 0.0014   # $1.40 per 1M input tokens
+     CHAT_COST_PER_1K_OUTPUT = 0.0028  # $2.80 per 1M output tokens
+     EMBEDDING_COST_PER_1K = 0.00002   # text-embedding-3-small
+
+     TOP_K_CANDIDATES = 15
+     TOP_N_SOURCES = 3  # Sources auto-selected for Stage 2
+
+ config = Config()
+
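+ # Sanity check on the pricing constants (editor's sketch, not part of the
+ # original app): the per-1K rates times 1000 match the per-1M prices quoted in
+ # the comments above, e.g. 0.0014 * 1000 = $1.40 per 1M input tokens. A call
+ # using 2,000 input and 500 output tokens would cost
+ # (2000/1000)*0.0014 + (500/1000)*0.0028 = $0.0042.
+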
+ # CELL 4: OpenRouter API Client
+ # ============================================================================
+ from openai import OpenAI
+
+ class OpenRouterClient:
+     def __init__(self, api_key: str):
+         self.client = OpenAI(
+             api_key=api_key,
+             base_url=config.OPENROUTER_BASE_URL
+         )
+         self.total_cost = 0.0
+
+     def get_embedding(self, text: str) -> List[float]:
+         """Generate an embedding for text using OpenAI's embedding model."""
+         try:
+             # Truncate overly long inputs
+             text = text[:8000]
+
+             response = self.client.embeddings.create(
+                 model=config.EMBEDDING_MODEL,
+                 input=text,
+                 extra_headers={
+                     "HTTP-Referer": "https://edstellar.com",  # Optional: your site
+                     "X-Title": "Edstellar Internal Linking Tool"  # Optional: app name
+                 }
+             )
+
+             # Track cost
+             tokens = response.usage.total_tokens
+             cost = (tokens / 1000) * config.EMBEDDING_COST_PER_1K
+             self.total_cost += cost
+
+             return response.data[0].embedding
+         except Exception as e:
+             raise Exception(f"Embedding error: {str(e)}")
+
+     def chat_completion(self, messages: List[Dict], temperature: float = 0.3) -> Tuple[str, float]:
+         """Generate a chat completion using DeepSeek V3."""
+         try:
+             response = self.client.chat.completions.create(
+                 model=config.CHAT_MODEL,
+                 messages=messages,
+                 temperature=temperature,
+                 extra_headers={
+                     "HTTP-Referer": "https://edstellar.com",
+                     "X-Title": "Edstellar Internal Linking Tool"
+                 }
+             )
+
+             # Track cost (OpenRouter returns usage data)
+             if hasattr(response, 'usage'):
+                 input_tokens = response.usage.prompt_tokens
+                 output_tokens = response.usage.completion_tokens
+
+                 cost = (input_tokens / 1000) * config.CHAT_COST_PER_1K_INPUT
+                 cost += (output_tokens / 1000) * config.CHAT_COST_PER_1K_OUTPUT
+                 self.total_cost += cost
+             else:
+                 cost = 0.0
+
+             return response.choices[0].message.content, cost
+         except Exception as e:
+             raise Exception(f"Chat completion error: {str(e)}")
+
+     def get_total_cost(self) -> float:
+         """Get the total API cost so far."""
+         return self.total_cost
+
+     def reset_cost(self):
+         """Reset the cost counter."""
+         self.total_cost = 0.0
+
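+ # Minimal usage sketch (editor's illustration; assumes a valid OpenRouter key,
+ # kept as comments so it never runs on import):
+ #
+ #   client = OpenRouterClient("sk-or-v1-...")
+ #   vec = client.get_embedding("corporate training tips")
+ #   reply, cost = client.chat_completion(
+ #       [{"role": "user", "content": "Say hello in five words."}])
+ #   print(len(vec), reply, f"${client.get_total_cost():.4f}")
+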
+ # CELL 5: Data Processing
+ # ============================================================================
+ class DataProcessor:
+     @staticmethod
+     def parse_csv(file_path: str) -> pd.DataFrame:
+         """Parse a Webflow CSV export."""
+         df = pd.read_csv(file_path)
+
+         # Rename columns for easier access
+         column_mapping = {
+             'Name': 'title',
+             'Slug': 'slug',
+             'Content': 'content',
+             'Meta Description': 'meta_description',
+             'Primary Keyword': 'primary_keyword',
+             'Training Category': 'category',
+             'Related Tags': 'tags',
+             'Views': 'views',
+             'Main Tag': 'main_tag'
+         }
+
+         # Only rename columns that exist
+         existing_columns = {k: v for k, v in column_mapping.items() if k in df.columns}
+         df = df.rename(columns=existing_columns)
+
+         # Create the full URL
+         df['url'] = df['slug'].apply(lambda x: f"/blog/{x}" if pd.notna(x) else "")
+
+         # Fill NaN values with empty strings for text columns
+         text_columns = ['title', 'content', 'meta_description', 'primary_keyword', 'category', 'tags']
+         for col in text_columns:
+             if col in df.columns:
+                 df[col] = df[col].fillna('')
+
+         # Fill NaN values with 0 for numeric columns
+         if 'views' in df.columns:
+             df['views'] = pd.to_numeric(df['views'], errors='coerce').fillna(0).astype(int)
+
+         return df
+
+     @staticmethod
+     def clean_html(html_text: str) -> str:
+         """Remove HTML tags and clean text, preserving paragraph breaks."""
+         if pd.isna(html_text) or html_text == '':
+             return ""
+
+         # Remove script and style blocks
+         text = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', str(html_text), flags=re.IGNORECASE)
+         text = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', text, flags=re.IGNORECASE)
+
+         # Turn block-level closing tags into paragraph breaks so that
+         # extract_paragraphs can split on '\n\n' below (collapsing all
+         # whitespace first would leave nothing for the splitter to find)
+         text = re.sub(r'</(p|div|li|h[1-6])>', '\n\n', text, flags=re.IGNORECASE)
+         text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
+
+         # Remove the remaining HTML tags
+         text = re.sub(r'<[^>]+>', ' ', text)
+
+         # Decode HTML entities
+         text = html.unescape(text)
+
+         # Collapse horizontal whitespace but keep paragraph breaks
+         text = re.sub(r'[ \t]+', ' ', text)
+         text = re.sub(r' ?\n ?', '\n', text)
+         text = re.sub(r'\n{3,}', '\n\n', text)
+
+         return text.strip()
+
+     @staticmethod
+     def extract_paragraphs(content: str, min_length: int = 100, max_paragraphs: int = 30) -> List[Dict]:
+         """Extract paragraphs from post content."""
+         clean_content = DataProcessor.clean_html(content)
+
+         if not clean_content:
+             return []
+
+         # Try to split on paragraph breaks first
+         raw_paragraphs = re.split(r'\n\n+', clean_content)
+
+         paragraphs = []
+
+         for para in raw_paragraphs:
+             para = para.strip()
+
+             # Skip paragraphs that are too short
+             if len(para) < min_length:
+                 continue
+
+             # If a paragraph is very long, split it by sentences
+             if len(para) > 600:
+                 sentences = re.split(r'(?<=[.!?])\s+', para)
+                 current_chunk = []
+                 current_length = 0
+
+                 for sentence in sentences:
+                     current_chunk.append(sentence)
+                     current_length += len(sentence)
+
+                     if current_length >= 300:  # Target chunk size
+                         chunk_text = ' '.join(current_chunk)
+                         if len(chunk_text) >= min_length:
+                             paragraphs.append({
+                                 'text': chunk_text,
+                                 'length': len(chunk_text)
+                             })
+                         current_chunk = []
+                         current_length = 0
+
+                 # Add any remaining sentences
+                 if current_chunk:
+                     chunk_text = ' '.join(current_chunk)
+                     if len(chunk_text) >= min_length:
+                         paragraphs.append({
+                             'text': chunk_text,
+                             'length': len(chunk_text)
+                         })
+             else:
+                 paragraphs.append({
+                     'text': para,
+                     'length': len(para)
+                 })
+
+             # Limit total paragraphs per post
+             if len(paragraphs) >= max_paragraphs:
+                 break
+
+         return paragraphs
+
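+ # Quick illustration of the cleaning pipeline (editor's sketch, hypothetical input):
+ #
+ #   sample = "<p>First paragraph about onboarding.</p><p>Second paragraph.</p>"
+ #   DataProcessor.clean_html(sample)
+ #   # -> "First paragraph about onboarding.\n\nSecond paragraph."
+ #   DataProcessor.extract_paragraphs(sample, min_length=10)
+ #   # -> [{'text': 'First paragraph about onboarding.', 'length': 33}, ...]
+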
+ # CELL 6: Knowledge Base
+ # ============================================================================
+ class KnowledgeBase:
+     def __init__(self):
+         self.entries = []
+         self.embeddings = []
+         self.build_cost = 0.0
+
+     def build(self, df: pd.DataFrame, client: OpenRouterClient,
+               progress_callback=None) -> Tuple[int, float]:
+         """Build the knowledge base from a DataFrame."""
+         self.entries = []
+         self.embeddings = []
+
+         client.reset_cost()  # Reset the cost counter
+
+         total_posts = len(df)
+
+         for idx, row in df.iterrows():
+             if progress_callback:
+                 progress_callback(
+                     idx + 1,
+                     total_posts,
+                     f"Processing: {row['title'][:50]}... (Cost: ${client.get_total_cost():.3f})"
+                 )
+
+             # Skip posts with no content
+             if not row['content'] or row['content'] == '':
+                 continue
+
+             # Extract paragraphs
+             paragraphs = DataProcessor.extract_paragraphs(row['content'])
+
+             if not paragraphs:
+                 continue
+
+             for para_idx, para in enumerate(paragraphs):
+                 # Create an entry
+                 entry = {
+                     'id': f"{row['url']}_para_{para_idx}",
+                     'post_url': row['url'],
+                     'post_title': row['title'],
+                     'post_category': row.get('category', ''),
+                     'post_keyword': row.get('primary_keyword', ''),
+                     'post_tags': row.get('tags', ''),
+                     'post_views': row.get('views', 0),
+                     'paragraph_index': para_idx,
+                     'paragraph_text': para['text']
+                 }
+
+                 # Generate the embedding
+                 try:
+                     embedding = client.get_embedding(para['text'])
+
+                     self.entries.append(entry)
+                     self.embeddings.append(embedding)
+
+                 except Exception as e:
+                     print(f"Error processing {entry['id']}: {e}")
+                     continue
+
+                 # Rate limiting (OpenRouter allows ~20 requests/second; stay conservative)
+                 time.sleep(0.3)
+
+         # Convert embeddings to a numpy array
+         if self.embeddings:
+             self.embeddings = np.array(self.embeddings)
+
+         self.build_cost = client.get_total_cost()
+
+         return len(self.entries), self.build_cost
+
+     def search(self, query_embedding: np.ndarray, top_k: int = 20,
+                exclude_url: str = None) -> List[Dict]:
+         """Semantic search over the knowledge base."""
+         if len(self.embeddings) == 0:
+             return []
+
+         # Calculate cosine similarity against every indexed paragraph
+         query_embedding = np.array(query_embedding).reshape(1, -1)
+         similarities = cosine_similarity(query_embedding, self.embeddings)[0]
+
+         # Indices sorted from most to least similar
+         top_indices = np.argsort(similarities)[::-1]
+
+         # Filter and return entries with scores
+         results = []
+         for idx in top_indices:
+             entry = self.entries[idx].copy()
+
+             # Skip paragraphs from the excluded (orphan) post itself
+             if exclude_url and entry['post_url'] == exclude_url:
+                 continue
+
+             entry['similarity'] = float(similarities[idx])
+             results.append(entry)
+
+             if len(results) >= top_k:
+                 break
+
+         return results
+
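+ # Usage sketch (editor's illustration; assumes `df` from DataProcessor.parse_csv
+ # and an initialized OpenRouterClient):
+ #
+ #   kb = KnowledgeBase()
+ #   n, cost = kb.build(df, client)  # one embedding call per indexed paragraph
+ #   q = client.get_embedding("leadership training for managers")
+ #   for hit in kb.search(q, top_k=5, exclude_url="/blog/leadership-training"):
+ #       print(f"{hit['similarity']:.2f}  {hit['post_url']}  para {hit['paragraph_index']}")
+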
+ # CELL 7: Stage 1 - Source Page Discovery
+ # ============================================================================
+ class Stage1Discovery:
+     @staticmethod
+     def analyze(orphan_url: str, df: pd.DataFrame, kb: KnowledgeBase,
+                 client: OpenRouterClient) -> Tuple[List[Dict], float]:
+         """Find the top candidate source pages."""
+
+         # Snapshot the cost so this stage's spend can be reported separately
+         initial_cost = client.get_total_cost()
+
+         # Get the orphan page's data
+         orphan_row = df[df['url'] == orphan_url].iloc[0]
+
+         # Create the orphan profile
+         orphan_profile = f"{orphan_row['title']}. {orphan_row.get('meta_description', '')}. "
+         orphan_profile += f"Keywords: {orphan_row.get('primary_keyword', '')}. "
+         orphan_profile += DataProcessor.clean_html(orphan_row['content'])[:2000]
+
+         # Get its embedding
+         orphan_embedding = client.get_embedding(orphan_profile)
+
+         # Search the knowledge base
+         results = kb.search(orphan_embedding, top_k=200, exclude_url=orphan_url)
+
+         # Group by post (aggregate paragraph scores)
+         post_scores = {}
+         for result in results:
+             post_url = result['post_url']
+
+             if post_url not in post_scores:
+                 post_scores[post_url] = {
+                     'url': post_url,
+                     'title': result['post_title'],
+                     'category': result['post_category'],
+                     'keyword': result['post_keyword'],
+                     'tags': result['post_tags'],
+                     'views': result['post_views'],
+                     'similarities': [],
+                     'paragraph_count': 0
+                 }
+
+             post_scores[post_url]['similarities'].append(result['similarity'])
+             post_scores[post_url]['paragraph_count'] += 1
+
+         # Calculate aggregate scores
+         candidates = []
+         for post_url, data in post_scores.items():
+             # Average of the top 3 similarities
+             top_sims = sorted(data['similarities'], reverse=True)[:3]
+             avg_similarity = np.mean(top_sims) if top_sims else 0
+
+             # Base score from similarity (0-100)
+             score = avg_similarity * 100
+
+             # Boost for the same category
+             orphan_category = orphan_row.get('category', '').lower()
+             post_category = data['category'].lower()
+             if orphan_category and post_category and orphan_category == post_category:
+                 score += 8
+
+             # Boost for keyword overlap
+             orphan_keywords = set(str(orphan_row.get('primary_keyword', '')).lower().split())
+             post_keywords = set(str(data['keyword']).lower().split())
+             keyword_overlap = len(orphan_keywords & post_keywords)
+             score += keyword_overlap * 3
+
+             # Slight boost for high-traffic pages
+             if data['views'] > 10000:
+                 score += 3
+             elif data['views'] > 5000:
+                 score += 1
+
+             # Cap at 100
+             score = min(score, 100)
+
+             candidates.append({
+                 'rank': 0,
+                 'url': post_url,
+                 'title': data['title'],
+                 'score': int(score),
+                 'traffic': int(data['views']),
+                 'category': data['category'],
+                 'similarity': round(avg_similarity * 100, 1),
+                 'opportunities': min(data['paragraph_count'], 5)
+             })
+
+         # Sort by score
+         candidates = sorted(candidates, key=lambda x: x['score'], reverse=True)
+
+         # Assign ranks
+         for idx, candidate in enumerate(candidates):
+             candidate['rank'] = idx + 1
+
+         # Cost for this stage alone
+         stage_cost = client.get_total_cost() - initial_cost
+
+         # Return the top 15 candidates
+         return candidates[:config.TOP_K_CANDIDATES], stage_cost
+
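+ # Worked example of the scoring above (editor's illustration, made-up numbers):
+ # a candidate whose top-3 paragraph similarities average 0.72 starts at 72
+ # points; a matching category adds 8 (-> 80), one overlapping keyword adds 3
+ # (-> 83), and 12,000 monthly views add 3 (-> 86). Scores are capped at 100.
+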
+ # CELL 8: Stage 2 - Placement Discovery
+ # ============================================================================
+ class Stage2Placement:
+     @staticmethod
+     def analyze(orphan_url: str, selected_sources: List[str], df: pd.DataFrame,
+                 kb: KnowledgeBase, client: OpenRouterClient) -> Tuple[List[Dict], float]:
+         """Find the best placement in each selected source page."""
+
+         initial_cost = client.get_total_cost()
+
+         orphan_row = df[df['url'] == orphan_url].iloc[0]
+
+         placements = []
+
+         # Get the orphan embedding once
+         orphan_profile = f"{orphan_row['title']}. {orphan_row.get('primary_keyword', '')}"
+         orphan_embedding = client.get_embedding(orphan_profile)
+
+         for source_url in selected_sources:
+             source_row = df[df['url'] == source_url].iloc[0]
+
+             # Collect this source's paragraphs (and their KB indices) in one pass
+             para_indices = [i for i, entry in enumerate(kb.entries)
+                             if entry['post_url'] == source_url]
+             source_paragraphs = [kb.entries[i] for i in para_indices]
+
+             if not source_paragraphs:
+                 continue
+
+             # Find the best paragraph by similarity
+             best_para = None
+             best_score = 0
+
+             # Look up the stored embeddings for these paragraphs
+             para_embeddings = kb.embeddings[para_indices]
+
+             # Calculate similarities
+             similarities = cosine_similarity(
+                 np.array(orphan_embedding).reshape(1, -1),
+                 para_embeddings
+             )[0]
+
+             for para, similarity in zip(source_paragraphs, similarities):
+                 score = similarity * 100
+
+                 # Prefer middle paragraphs
+                 total_paras = len(source_paragraphs)
+                 if total_paras > 4 and 2 < para['paragraph_index'] < total_paras - 2:
+                     score += 5
+
+                 # Prefer medium-length paragraphs
+                 para_len = len(para['paragraph_text'])
+                 if 150 < para_len < 500:
+                     score += 3
+
+                 if score > best_score:
+                     best_score = score
+                     best_para = para
+
+             if best_para:
+                 # Use the LLM to generate the modified sentence
+                 placement = Stage2Placement._generate_placement(
+                     orphan_row, source_row, best_para, client
+                 )
+                 placement['score'] = int(best_score)
+                 placements.append(placement)
+
+         stage_cost = client.get_total_cost() - initial_cost
+
+         return placements, stage_cost
+
+     @staticmethod
+     def _generate_placement(orphan_row, source_row, paragraph, client) -> Dict:
+         """Use the LLM to generate placement details."""
+
+         # Truncate the paragraph if it is too long
+         para_text = paragraph['paragraph_text']
+         if len(para_text) > 400:
+             para_text = para_text[:400] + "..."
+
+         prompt = f"""You are an SEO expert. Analyze this paragraph and suggest how to add an internal link naturally.
+
+ SOURCE ARTICLE: {source_row['title']}
+ PARAGRAPH: "{para_text}"
+
+ TARGET PAGE TO LINK:
+ - Title: {orphan_row['title']}
+ - Keyword: {orphan_row.get('primary_keyword', '')}
+
+ Task: Find a natural spot to add the link.
+
+ Respond in JSON format:
+ {{
+     "current_sentence": "the original sentence to modify",
+     "modified_sentence": "new sentence with [ANCHOR] placeholder where link goes",
+     "anchor_text": "suggested anchor text (2-4 words)",
+     "anchor_alternatives": ["alternative 1", "alternative 2"]
+ }}
+
+ Make the link insertion natural and valuable to readers."""
+
+         messages = [
+             {"role": "system", "content": "You are an SEO expert specializing in natural internal linking."},
+             {"role": "user", "content": prompt}
+         ]
+
+         try:
+             response, cost = client.chat_completion(messages)
+
+             # Try to parse the JSON response
+             try:
+                 result = json.loads(response)
+             except json.JSONDecodeError:
+                 # If not valid JSON, try to extract it from a markdown code block
+                 json_match = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
+                 if json_match:
+                     result = json.loads(json_match.group(1))
+                 else:
+                     # Fallback
+                     result = {
+                         "current_sentence": para_text[:100] + "...",
+                         "modified_sentence": "...with [ANCHOR] for better understanding.",
+                         "anchor_text": orphan_row.get('primary_keyword', 'more information'),
+                         "anchor_alternatives": ["related guide", "detailed tips"]
+                     }
+
+             return {
+                 'source_url': source_row['url'],
+                 'source_title': source_row['title'],
+                 'paragraph_index': paragraph['paragraph_index'],
+                 'paragraph_text': paragraph['paragraph_text'],
+                 'current_sentence': result.get('current_sentence', para_text[:100]),
+                 'modified_sentence': result.get('modified_sentence', ''),
+                 'anchor_text': result.get('anchor_text', orphan_row.get('primary_keyword', '')),
+                 'anchor_alternatives': result.get('anchor_alternatives', [])
+             }
+         except Exception as e:
+             print(f"Error in LLM generation: {e}")
+             # Fallback: a simple templated modification
+             return {
+                 'source_url': source_row['url'],
+                 'source_title': source_row['title'],
+                 'paragraph_index': paragraph['paragraph_index'],
+                 'paragraph_text': para_text,
+                 'current_sentence': para_text[:100] + "...",
+                 'modified_sentence': "...implementing [ANCHOR] can significantly improve results.",
+                 'anchor_text': orphan_row.get('primary_keyword', 'effective strategies'),
+                 'anchor_alternatives': []
+             }
+
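+ # Shape of the JSON the prompt above asks DeepSeek V3 to return (editor's
+ # illustration with hypothetical values):
+ #
+ #   {
+ #       "current_sentence": "Training programs often fail without follow-up.",
+ #       "modified_sentence": "Training programs often fail without [ANCHOR].",
+ #       "anchor_text": "structured follow-up coaching",
+ #       "anchor_alternatives": ["post-training reinforcement", "follow-up coaching"]
+ #   }
+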
+ # CELL 9: Stage 3 - Report Generation
+ # ============================================================================
+ class Stage3Report:
+     @staticmethod
+     def generate(orphan_url: str, placements: List[Dict]) -> Dict:
+         """Generate the final implementation report."""
+
+         links = []
+
+         for idx, placement in enumerate(placements):
+             # Create the HTML snippet by swapping in the real link
+             html_code = placement['modified_sentence'].replace(
+                 '[ANCHOR]',
+                 f'<a href="{orphan_url}">{placement["anchor_text"]}</a>'
+             )
+
+             links.append({
+                 'number': idx + 1,
+                 'source_url': placement['source_url'],
+                 'source_title': placement['source_title'],
+                 'paragraph': placement['paragraph_index'],
+                 'score': placement['score'],
+                 'current_sentence': placement['current_sentence'],
+                 'modified_sentence': placement['modified_sentence'],
+                 'anchor_text': placement['anchor_text'],
+                 'anchor_alternatives': placement.get('anchor_alternatives', []),
+                 'html_code': html_code
+             })
+
+         # Calculate summary metrics
+         avg_score = int(np.mean([l['score'] for l in links])) if links else 0
+         unique_anchors = len(set(l['anchor_text'] for l in links))
+         anchor_diversity = 'Excellent' if unique_anchors == len(links) else ('Good' if unique_anchors >= len(links) - 1 else 'Fair')
+
+         return {
+             'orphan_url': orphan_url,
+             'links': links,
+             'avg_score': avg_score,
+             'anchor_diversity': anchor_diversity,
+             'total_links': len(links)
+         }
+
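+ # Illustration of the [ANCHOR] substitution above (editor's sketch, hypothetical
+ # URL and anchor):
+ #
+ #   "Training programs often fail without [ANCHOR]."
+ # becomes
+ #   'Training programs often fail without
+ #    <a href="/blog/follow-up-coaching">structured follow-up coaching</a>.'
+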
+ # CELL 10: Gradio Interface Functions
+ # ============================================================================
+
+ # Global state shared across the Gradio callbacks
+ app_state = {
+     'df': None,
+     'kb': None,
+     'client': None,
+     'stage1_results': None,
+     'stage2_results': None,
+     'selected_sources': [],
+     'current_orphan_url': None
+ }
+
+ def setup_api_key(api_key: str) -> str:
+     """Initialize the OpenRouter client."""
+     if not api_key or not api_key.startswith('sk-'):
+         return "❌ Please enter a valid OpenRouter API key"
+
+     try:
+         app_state['client'] = OpenRouterClient(api_key)
+         # Test the API key with a simple embedding
+         app_state['client'].get_embedding("test connection")
+         return "✅ API key validated successfully! Ready to use."
+     except Exception as e:
+         return f"❌ Error: {str(e)}\n\nMake sure you're using an OpenRouter API key."
+
+ def upload_csv(file) -> str:
+     """Process the uploaded CSV."""
+     if file is None:
+         return "❌ No file uploaded"
+
+     try:
+         # gr.File(type="filepath") passes the path in as a plain string
+         app_state['df'] = DataProcessor.parse_csv(file)
+
+         # Show stats
+         total_posts = len(app_state['df'])
+         posts_with_content = len(app_state['df'][app_state['df']['content'] != ''])
+
+         return f"✅ CSV loaded successfully!\n\n📊 Stats:\n- Total posts: {total_posts}\n- Posts with content: {posts_with_content}\n- Ready to build knowledge base"
+     except Exception as e:
+         return f"❌ Error parsing CSV: {str(e)}\n\nMake sure it's a valid Webflow export."
+
+ def build_knowledge_base(progress=gr.Progress()) -> str:
+     """Build the knowledge base with embeddings."""
+     if app_state['df'] is None:
+         return "❌ Please upload a CSV first"
+
+     if app_state['client'] is None:
+         return "❌ Please set the API key first"
+
+     try:
+         app_state['kb'] = KnowledgeBase()
+
+         progress(0, desc="Starting knowledge base build...")
+
+         def progress_callback(current, total, message):
+             progress((current, total), desc=message)
+
+         num_entries, cost = app_state['kb'].build(
+             app_state['df'],
+             app_state['client'],
+             progress_callback
+         )
+
+         if num_entries == 0:
+             return "❌ No entries created. Check whether the CSV has content."
+
+         return f"✅ Knowledge base built successfully!\n\n📊 Results:\n- Paragraphs indexed: {num_entries:,}\n- Cost: ${cost:.2f}\n- Ready to analyze orphan pages"
+     except Exception as e:
+         return f"❌ Error building knowledge base: {str(e)}"
+
+ def run_stage1(orphan_url: str) -> Tuple[pd.DataFrame, str]:
+     """Run Stage 1: find candidate sources."""
+     if app_state['kb'] is None or len(app_state['kb'].entries) == 0:
+         return None, "❌ Please build the knowledge base first"
+
+     if not orphan_url:
+         return None, "❌ Please enter an orphan page URL"
+
+     # Normalize the URL to the /blog/slug form (lstrip avoids a double slash)
+     orphan_url = orphan_url.strip()
+     if not orphan_url.startswith('/'):
+         orphan_url = '/' + orphan_url
+     if not orphan_url.startswith('/blog/'):
+         orphan_url = '/blog/' + orphan_url.lstrip('/')
+
+     try:
+         # Validate the orphan URL
+         if orphan_url not in app_state['df']['url'].values:
+             available_urls = app_state['df']['url'].head(5).tolist()
+             return None, "❌ Orphan URL not found in CSV.\n\nFormat should be: /blog/slug-here\n\nExample URLs in your CSV:\n" + "\n".join(available_urls)
+
+         results, cost = Stage1Discovery.analyze(
+             orphan_url,
+             app_state['df'],
+             app_state['kb'],
+             app_state['client']
+         )
+
+         if not results:
+             return None, "❌ No candidates found. Try a different orphan page."
+
+         app_state['stage1_results'] = results
+         app_state['current_orphan_url'] = orphan_url
+
+         # Auto-select the top N sources (guards against fewer than N results)
+         app_state['selected_sources'] = [r['url'] for r in results[:config.TOP_N_SOURCES]]
+
+         # Convert to a DataFrame for display
+         df_display = pd.DataFrame(results)
+         df_display = df_display[['rank', 'score', 'url', 'traffic']]
+         df_display.columns = ['#', 'Score', 'Source Page', 'Traffic/mo']
+
+         status = f"✅ Found {len(results)} candidates (Cost: ${cost:.3f})\n\n"
+         status += "🏆 Top 3 auto-selected:\n"
+         for i in range(min(3, len(results))):
+             status += f"{i+1}. {results[i]['url']} (Score: {results[i]['score']})\n"
+         status += "\nClick 'Find Placements' to continue →"
+
+         return df_display, status
+     except Exception as e:
+         return None, f"❌ Error: {str(e)}"
+
+ def run_stage2() -> Tuple[pd.DataFrame, str]:
+     """Run Stage 2: find placements."""
+     if not app_state['selected_sources']:
+         return None, "❌ Please run Stage 1 first"
+
+     if not app_state['current_orphan_url']:
+         return None, "❌ No orphan URL set. Please run Stage 1."
+
+     try:
+         placements, cost = Stage2Placement.analyze(
+             app_state['current_orphan_url'],
+             app_state['selected_sources'],
+             app_state['df'],
+             app_state['kb'],
+             app_state['client']
+         )
+
+         if not placements:
+             return None, "❌ No placements found. Try selecting different sources."
+
+         app_state['stage2_results'] = placements
+
+         # Convert to a DataFrame
+         df_display = pd.DataFrame([
+             {
+                 'Source Page': p['source_url'],
+                 'Para': p['paragraph_index'],
+                 'Score': p['score'],
+                 'Anchor': p['anchor_text'][:50]
+             }
+             for p in placements
+         ])
+
+         status = f"✅ {len(placements)} placements identified (Cost: ${cost:.3f})\n\n"
+         status += f"Average Score: {int(np.mean([p['score'] for p in placements]))}\n\n"
+         status += "Click 'Generate Report' to see full details →"
+
+         return df_display, status
+     except Exception as e:
+         return None, f"❌ Error: {str(e)}"
+
+ def run_stage3() -> str:
+     """Run Stage 3: generate the report."""
+     if app_state['stage2_results'] is None:
+         return "❌ Please run Stage 2 first"
+
+     if not app_state['current_orphan_url']:
+         return "❌ No orphan URL set"
+
+     try:
+         report = Stage3Report.generate(
+             app_state['current_orphan_url'],
+             app_state['stage2_results']
+         )
+
+         # Format as markdown
+         md = "# 📄 Implementation Report\n\n"
+         md += f"**Orphan Page:** `{report['orphan_url']}`\n\n"
+         md += f"**Total Links:** {report['total_links']} | "
+         md += f"**Avg Score:** {report['avg_score']} | "
+         md += f"**Anchor Diversity:** {report['anchor_diversity']}\n\n"
+         md += f"**Total Cost This Session:** ${app_state['client'].get_total_cost():.3f}\n\n"
+         md += "---\n\n"
+
+         for link in report['links']:
+             md += f"## 🔗 Link #{link['number']}: `{link['source_url']}`\n\n"
+             md += f"**Location:** Paragraph {link['paragraph']} | **Score:** {link['score']}/100\n\n"
+
+             md += "### Current Text:\n"
+             md += f"> {link['current_sentence']}\n\n"
+
+             md += "### Modified Text:\n"
+             anchor_display = f"**[{link['anchor_text']}]**"
+             md += f"> {link['modified_sentence'].replace('[ANCHOR]', anchor_display)}\n\n"
+
+             md += f"**Anchor Text:** `{link['anchor_text']}`\n\n"
+
+             if link['anchor_alternatives']:
+                 md += "**Alternatives:** "
+                 md += ", ".join(f"`{alt}`" for alt in link['anchor_alternatives'])
+                 md += "\n\n"
+
+             md += "### 📋 HTML Code (Copy This):\n\n"
+             md += f"```html\n{link['html_code']}\n```\n\n"
+
+             md += "### 📝 Implementation Steps:\n"
+             md += f"1. Open `{link['source_url']}` in the Webflow CMS\n"
+             md += f"2. Find paragraph {link['paragraph']}\n"
+             md += "3. Replace the sentence with the HTML code above\n"
+             md += "4. Publish the changes\n\n"
+
+             md += "---\n\n"
+
+         md += "## ✅ Next Steps\n\n"
+         md += "1. Copy each HTML code block above\n"
+         md += "2. Implement in the Webflow CMS\n"
+         md += "3. Test the links after publishing\n"
+         md += "4. Monitor traffic to the orphan page\n\n"
+         md += "**Ready to analyze another orphan? Use the Stage 1 tab!**\n"
+
+         return md
+     except Exception as e:
+         return f"❌ Error generating report: {str(e)}"
+
+ # CELL 11: Build Gradio UI
+ # ============================================================================
+
+ with gr.Blocks(
+     title="Edstellar Internal Linking RAG Tool",
+     theme=gr.themes.Soft(),
+     css="""
+     .gradio-container {
+         max-width: 1200px !important;
+     }
+     """
+ ) as demo:
+     gr.Markdown("""
+     # 🔗 Edstellar Internal Linking RAG Tool
+
+     **AI-powered 3-stage analysis** to find optimal internal linking opportunities for orphan pages.
+
+     Uses **DeepSeek V3** via the OpenRouter API for intelligent semantic matching.
+     """)
+
+     with gr.Tab("⚙️ Setup"):
+         gr.Markdown("### Step 1: Configure OpenRouter API Key")
+         gr.Markdown("Get your API key from [OpenRouter.ai](https://openrouter.ai/keys)")
+
+         api_key_input = gr.Textbox(
+             label="OpenRouter API Key",
+             type="password",
+             placeholder="sk-or-v1-...",
+             info="Your API key is never stored and is only used for this session"
+         )
+         api_key_btn = gr.Button("✓ Validate API Key", variant="primary", size="sm")
+         api_key_status = gr.Textbox(label="Status", interactive=False, lines=2)
+
+         api_key_btn.click(
+             fn=setup_api_key,
+             inputs=[api_key_input],
+             outputs=[api_key_status]
+         )
+
+         gr.Markdown("---")
+         gr.Markdown("### Step 2: Upload Blog Posts CSV")
+         gr.Markdown("Upload your Webflow CSV export containing all blog posts")
+
+         csv_upload = gr.File(
+             label="Upload CSV File",
+             file_types=[".csv"],
+             type="filepath"
+         )
+         csv_status = gr.Textbox(label="Status", interactive=False, lines=4)
+
+         csv_upload.change(
+             fn=upload_csv,
+             inputs=[csv_upload],
+             outputs=[csv_status]
+         )
+
+         gr.Markdown("---")
+         gr.Markdown("### Step 3: Build Knowledge Base")
+         gr.Markdown("""
+         ⚠️ **One-time process:**
+         - Takes 30-45 minutes depending on content size
+         - Costs approximately $1-2
+         - Creates a searchable index of all blog content
+         - Only needs to be done once per CSV upload
+         """)
+
+         kb_btn = gr.Button("🔨 Build Knowledge Base", variant="primary", size="lg")
+         kb_status = gr.Textbox(label="Status", interactive=False, lines=5)
+
+         kb_btn.click(
+             fn=build_knowledge_base,
+             outputs=[kb_status]
+         )
+
+     with gr.Tab("🔍 Stage 1: Find Sources"):
+         gr.Markdown("""
+         ### Find the Best Source Pages
+
+         Enter an orphan page URL to find the top candidate pages that should link to it.
+         """)
+
+         orphan_url_1 = gr.Textbox(
+             label="Orphan Page URL",
+             placeholder="/blog/employee-training-tips",
+             info="Format: /blog/slug-name"
+         )
+
+         stage1_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
+
+         stage1_results = gr.Dataframe(
+             label="Candidates Found (Top 3 Auto-Selected)",
+             interactive=False,
+             wrap=True
+         )
+         stage1_status = gr.Textbox(label="Status", interactive=False, lines=5)
+
+         stage1_btn.click(
+             fn=run_stage1,
+             inputs=[orphan_url_1],
+             outputs=[stage1_results, stage1_status]
+         )
+
+     with gr.Tab("📍 Stage 2: Find Placements"):
+         gr.Markdown("""
+         ### Identify Exact Placement Locations
+
+         Find the specific paragraphs in each source page where links should be added.
+         """)
+
+         gr.Markdown("*Uses the orphan URL and sources from Stage 1*")
+
+         stage2_btn = gr.Button("📍 Find Placements", variant="primary", size="lg")
+
+         stage2_results = gr.Dataframe(
+             label="Placements Identified",
+             interactive=False,
+             wrap=True
+         )
+         stage2_status = gr.Textbox(label="Status", interactive=False, lines=4)
+
+         stage2_btn.click(
+             fn=run_stage2,
+             outputs=[stage2_results, stage2_status]
+         )
+
+     with gr.Tab("📄 Stage 3: Implementation Report"):
+         gr.Markdown("""
+         ### Generate a Copy-Paste-Ready Report
+
+         Get detailed HTML code and implementation instructions for each link.
+         """)
+
+         stage3_btn = gr.Button("📄 Generate Report", variant="primary", size="lg")
+
+         stage3_report = gr.Markdown(
+             label="Implementation Report",
+             value="*Report will appear here after generation*"
+         )
+
+         stage3_btn.click(
+             fn=run_stage3,
+             outputs=[stage3_report]
+         )
+
+     gr.Markdown("""
+     ---
+
+     ### 💡 Tips:
+     - Build the knowledge base once, then analyze multiple orphan pages
+     - Each orphan analysis costs ~$0.02-0.05
+     - Copy HTML code directly into the Webflow rich text editor
+     - Review all suggestions before implementing
+
+     ### 🔒 Privacy:
+     - All data stays in your session
+     - API keys are not stored
+     - No data is saved after the session ends
+     """)
+
+ # CELL 12: Launch
+ # ============================================================================
+ if __name__ == "__main__":
+     demo.launch(
+         share=True,             # Public share link (useful when running in Colab)
+         debug=True,
+         server_name="0.0.0.0",  # Bind to all interfaces for Hugging Face deployment
+         server_port=7860        # Default Gradio port
+     )