Spaces:

edstellar
/

internallinksuggestor

Sleeping

App Files Files Community

internallinksuggestor / app.py

vijaykumaredstellar

Update app.py

35c63dd verified 2 months ago

raw

history blame contribute delete

14.8 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	from openai import OpenAI
	import pickle
	from huggingface_hub import hf_hub_download
	from sklearn.metrics.pairwise import cosine_similarity
	import httpx

	# ============================================
	# CONFIGURATION
	# ============================================
	# Hugging Face repository id that setup() passes to
	# KnowledgeBase.load_from_huggingface(), which downloads it with
	# repo_type='dataset'.
	HF_DATASET_REPO = "vijaykumaredstellar/edstellar-internal-linking-kb"
	# Model identifier sent with every request made by
	# OpenRouterClient.get_embedding().
	EMBEDDING_MODEL = "openai/text-embedding-3-small"
	# Model identifier sent with every request made by OpenRouterClient.chat().
	CHAT_MODEL = "deepseek/deepseek-chat"

	# ============================================
	# KNOWLEDGE BASE
	# ============================================
	class KnowledgeBase:
	    """Paragraph store plus embedding matrix used for similarity search.

	    Attributes:
	        knowledge_base: list of paragraph records (dicts); each record is
	            expected to carry at least a 'url' key.
	        embeddings: embedding matrix aligned row-for-row with
	            ``knowledge_base``; ``None`` until a load succeeds.
	        loaded: ``True`` once a load has completed successfully.
	    """

	    def __init__(self):
	        self.knowledge_base = []
	        self.embeddings = None
	        self.loaded = False

	    def load_from_huggingface(self, repo_id, hf_token=None):
	        """Download ``knowledge_base.pkl`` from a Hugging Face dataset repo.

	        Args:
	            repo_id: dataset repository id to download from.
	            hf_token: optional access token; blank or whitespace-only
	                values are treated as "no token".

	        Returns:
	            ``(success, message)`` where ``message`` is a human-readable
	            status line. Failures are reported through the return value,
	            never raised.
	        """
	        try:
	            token = hf_token.strip() if hf_token and hf_token.strip() else None

	            kb_path = hf_hub_download(
	                repo_id=repo_id,
	                filename='knowledge_base.pkl',
	                repo_type='dataset',
	                token=token
	            )

	            # SECURITY NOTE: pickle.load can execute arbitrary code embedded
	            # in the file. Only point this loader at a repository you trust.
	            with open(kb_path, 'rb') as f:
	                data = pickle.load(f)

	            # Validate everything before touching instance state, so a
	            # malformed payload cannot leave paragraphs and embeddings out
	            # of sync or flip `loaded` on a half-initialised object.
	            paragraphs = data['knowledge_base']
	            embeddings = data['embeddings']
	            num_posts = len({p['url'] for p in paragraphs})

	            self.knowledge_base = paragraphs
	            self.embeddings = embeddings
	            self.loaded = True

	            return True, f"✅ Loaded {len(self.knowledge_base)} paragraphs from {num_posts} blog posts"

	        except Exception as e:
	            # UI boundary: surface the problem as a status message.
	            return False, f"❌ Error: {str(e)}"

	    def search(self, query_embedding, top_k=50):
	        """Return the ``top_k`` paragraphs most similar to ``query_embedding``.

	        Args:
	            query_embedding: 1-D embedding vector for the query.
	            top_k: maximum number of results; values <= 0 yield no results.

	        Returns:
	            A list of paragraph records (copies), best match first, each
	            extended with a float ``'similarity_score'``. Empty when nothing
	            has been loaded yet.
	        """
	        # A zero top_k would make the `[-top_k:]` slice below select every
	        # row, and a negative one would drop the best matches instead.
	        if not self.loaded or top_k <= 0:
	            return []

	        query_embedding = np.array(query_embedding).reshape(1, -1)
	        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
	        # argsort is ascending: take the tail and reverse for best-first.
	        top_indices = np.argsort(similarities)[-top_k:][::-1]

	        return [
	            {
	                **self.knowledge_base[idx],
	                'similarity_score': float(similarities[idx])
	            }
	            for idx in top_indices
	        ]

	# ============================================
	# OPENROUTER CLIENT
	# ============================================
	class OpenRouterClient:
	    """Wrapper around the OpenAI SDK client aimed at the OpenRouter base URL."""

	    def __init__(self, api_key):
	        """Create the underlying SDK client.

	        Args:
	            api_key: key handed to the SDK client as-is.
	        """
	        # Headers attached to the shared HTTP client (presumably used by
	        # OpenRouter to attribute traffic to this app — confirm).
	        request_headers = {
	            "HTTP-Referer": "https://edstellar.com",
	            "X-Title": "Edstellar Internal Linking Tool"
	        }
	        transport = httpx.Client(headers=request_headers, timeout=60.0)

	        self.client = OpenAI(
	            base_url="https://openrouter.ai/api/v1",
	            api_key=api_key,
	            http_client=transport
	        )

	    def get_embedding(self, text):
	        """Return the embedding vector for ``text``.

	        Only the first 8000 characters of ``text`` are sent.
	        """
	        result = self.client.embeddings.create(
	            model=EMBEDDING_MODEL,
	            input=text[:8000]
	        )
	        first_item = result.data[0]
	        return first_item.embedding

	    def chat(self, messages, temperature=0.3):
	        """Send ``messages`` to the chat model and return the first reply's content."""
	        completion = self.client.chat.completions.create(
	            model=CHAT_MODEL,
	            messages=messages,
	            temperature=temperature
	        )
	        first_choice = completion.choices[0]
	        return first_choice.message.content

	# ============================================
	# ORPHAN PAGE ANALYZER
	# ============================================
	class OrphanPageAnalyzer:
	    """Suggest internal links from knowledge-base pages to an orphan page.

	    Args:
	        kb: object exposing ``search(query_embedding, top_k)`` that returns
	            paragraph records with the keys 'url', 'title', 'category',
	            'keyword', 'paragraph_index', 'text' and 'similarity_score'.
	        client: object exposing ``get_embedding(text)`` and
	            ``chat(messages)``.
	    """

	    def __init__(self, kb, client):
	        self.kb = kb
	        self.client = client

	    def analyze(self, orphan_url, orphan_title, orphan_keyword, num_sources=3):
	        """
	        Find pages in knowledge base that should link TO the orphan page.
	        Orphan page does NOT need to be in the knowledge base.

	        Args:
	            orphan_url: URL of the page that needs inbound links.
	            orphan_title: title of that page.
	            orphan_keyword: keyword used for the search query and the
	                anchor-text prompt.
	            num_sources: how many source pages to produce suggestions for.

	        Returns:
	            ``(report, df)``: a markdown report string and a pandas
	            DataFrame with one summary row per suggestion.
	        """
	        # A UI slider may deliver a float; list slicing requires an int.
	        num_sources = int(num_sources)

	        # Create search query from orphan page info
	        search_query = f"{orphan_title} {orphan_keyword}"
	        print(f"🔍 Searching for pages related to: {search_query}")

	        query_embedding = self.client.get_embedding(search_query)
	        candidates = self.kb.search(query_embedding, top_k=50)
	        print(f"📊 Found {len(candidates)} candidate paragraphs")

	        sources = self._group_by_source(candidates, orphan_url)
	        print(f"📄 Found {len(sources)} unique source pages")

	        top_sources = self._rank_sources(sources)[:num_sources]
	        print(f"⭐ Selected top {len(top_sources)} sources")

	        results = []
	        for idx, source in enumerate(top_sources, 1):
	            print(f"🔗 Processing source {idx}/{len(top_sources)}: {source['title']}")
	            results.append(
	                self._suggest_link(source, orphan_url, orphan_title, orphan_keyword)
	            )

	        report = self.generate_report(orphan_url, orphan_title, results)

	        df = pd.DataFrame([{
	            'Source Page': r['source_title'][:50],
	            'Paragraph #': r['paragraph_index'],
	            'Score': r['score'],
	            'Anchor Text': r['anchor_text'],
	            'Current Sentence': r['current_sentence'][:100] + '...',
	            'New Sentence': r['new_sentence'][:100] + '...'
	        } for r in results])

	        return report, df

	    @staticmethod
	    def _group_by_source(candidates, orphan_url):
	        """Bucket candidate paragraphs by source URL, skipping the orphan page."""
	        # Compare without a trailing slash so ".../page" and ".../page/"
	        # are recognised as the same page and never link to themselves.
	        target = orphan_url.rstrip('/')

	        grouped = {}
	        for item in candidates:
	            url = item['url']
	            if url.rstrip('/') == target:
	                continue

	            if url not in grouped:
	                grouped[url] = {
	                    'url': url,
	                    'title': item['title'],
	                    'category': item['category'],
	                    'keyword': item['keyword'],
	                    'paragraphs': []
	                }

	            grouped[url]['paragraphs'].append({
	                'index': item['paragraph_index'],
	                'text': item['text'],
	                'similarity': item['similarity_score']
	            })
	        return grouped

	    @staticmethod
	    def _rank_sources(sources):
	        """Return source pages sorted best-first by mean/max similarity blend."""
	        ranked = []
	        for data in sources.values():
	            sims = [p['similarity'] for p in data['paragraphs']]
	            # Equal weight on the page's average and its single best paragraph.
	            score = np.mean(sims) * 0.5 + max(sims) * 0.5
	            ranked.append({**data, 'score': score})

	        ranked.sort(key=lambda x: x['score'], reverse=True)
	        return ranked

	    def _suggest_link(self, source, orphan_url, orphan_title, orphan_keyword):
	        """Ask the chat model for anchor text and a rewritten paragraph for one source."""
	        best_para = max(source['paragraphs'], key=lambda x: x['similarity'])

	        anchor_prompt = "\n".join([
	            "Generate a natural 2-4 word anchor text to link to this page:",
	            "",
	            f"Target Page Title: {orphan_title}",
	            f"Target Keyword: {orphan_keyword}",
	            "",
	            "Context where link will be placed:",
	            f"{best_para['text'][:200]}...",
	            "",
	            "Provide ONLY the anchor text, no quotes or explanation.",
	        ])

	        # The chat reply may be None; treat that as an empty answer rather
	        # than crashing on .strip().
	        raw_anchor = self.client.chat([
	            {"role": "user", "content": anchor_prompt}
	        ]) or ""
	        # Fall back to the keyword so the suggestion always has an anchor.
	        anchor_text = raw_anchor.strip().strip('"').strip("'") or orphan_keyword

	        modify_prompt = "\n".join([
	            "Modify this sentence to naturally include an internal link.",
	            "",
	            "Current sentence:",
	            f"{best_para['text']}",
	            "",
	            "Add this internal link:",
	            f"- Anchor text: \"{anchor_text}\"",
	            f"- Target page: {orphan_title}",
	            f"- Target URL: {orphan_url}",
	            "",
	            "Provide ONLY the modified sentence with the anchor text naturally integrated.",
	        ])

	        new_sentence = (self.client.chat([
	            {"role": "user", "content": modify_prompt}
	        ]) or "").strip()

	        return {
	            'source_url': source['url'],
	            'source_title': source['title'],
	            'score': int(source['score'] * 100),
	            'paragraph_index': best_para['index'],
	            'current_sentence': best_para['text'],
	            'new_sentence': new_sentence,
	            'anchor_text': anchor_text,
	            'target_url': orphan_url
	        }

	    @staticmethod
	    def _link_html(sentence, anchor_text, target_url):
	        """Wrap the first occurrence of ``anchor_text`` in ``sentence`` with a link."""
	        import html

	        # An empty anchor would make str.replace insert a link at every
	        # position; an absent anchor leaves nothing to wrap.
	        if not anchor_text or anchor_text not in sentence:
	            return sentence

	        href = html.escape(target_url, quote=True)
	        # count=1: link only the first mention, not every repetition.
	        return sentence.replace(
	            anchor_text,
	            f'<a href="{href}">{anchor_text}</a>',
	            1
	        )

	    def generate_report(self, orphan_url, orphan_title, results):
	        """Generate markdown report.

	        Args:
	            orphan_url: target URL shown in the report header.
	            orphan_title: orphan page title shown in the report header.
	            results: list of dicts with the keys 'source_url',
	                'source_title', 'score', 'paragraph_index',
	                'current_sentence', 'new_sentence', 'anchor_text' and
	                'target_url'.

	        Returns:
	            The report as a single markdown string.
	        """
	        parts = [
	            "# 🔗 Internal Linking Report\n\n",
	            f"Orphan Page: {orphan_title}\n",
	            f"Target URL: `{orphan_url}`\n",
	            f"Links Generated: {len(results)}\n\n",
	            "---\n\n",
	        ]

	        for i, result in enumerate(results, 1):
	            html_code = self._link_html(
	                result['new_sentence'],
	                result['anchor_text'],
	                result['target_url']
	            )
	            parts.extend([
	                f"## Link {i}: {result['source_title']}\n\n",
	                f"Source URL: `{result['source_url']}`\n",
	                f"Paragraph #: {result['paragraph_index']}\n",
	                f"Relevance Score: {result['score']}/100\n",
	                f"Anchor Text: \"{result['anchor_text']}\"\n\n",
	                "### Current Sentence:\n",
	                "```\n",
	                result['current_sentence'] + "\n",
	                "```\n\n",
	                "### New Sentence (with link):\n",
	                "```\n",
	                result['new_sentence'] + "\n",
	                "```\n\n",
	                "### HTML Code:\n",
	                "```html\n",
	                html_code + "\n",
	                "```\n\n",
	                "---\n\n",
	            ])

	        return "".join(parts)

	# ============================================
	# GLOBAL STATE
	# ============================================
	# Module-wide knowledge base instance; setup() fills it by calling
	# kb.load_from_huggingface().
	kb = KnowledgeBase()
	# OrphanPageAnalyzer assigned by setup() once the knowledge base has loaded;
	# analyze_orphan() refuses to run while this is still None.
	analyzer = None

	# ============================================
	# GRADIO FUNCTIONS
	# ============================================
	def setup(api_key, hf_token):
	"""Setup API and load knowledge base"""
	global analyzer

	if not api_key or not api_key.strip():
	return "❌ Please enter your OpenRouter API key", None

	try:
	client = OpenRouterClient(api_key)
	status = ["✅ API key configured"]
	except Exception as e:
	return f"❌ API Error: {str(e)}", None

	# Load knowledge base
	success, message = kb.load_from_huggingface(HF_DATASET_REPO, hf_token)

	if not success:
	return f"✅ API key configured\n{message}", None

	status.append(message)

	# Create analyzer
	analyzer = OrphanPageAnalyzer(kb, client)
	status.append("✅ System ready!")

	return "\n".join(status), None

	def analyze_orphan(orphan_url, orphan_title, orphan_keyword, num_sources):
	    """Analyze orphan page and generate report.

	    Validates the form inputs, then delegates to the module-level
	    ``analyzer``.

	    Args:
	        orphan_url: URL of the orphan page (required).
	        orphan_title: title of the orphan page (required).
	        orphan_keyword: optional keyword; the title is used when it is
	            missing or blank.
	        num_sources: number of source pages to request.

	    Returns:
	        ``(report, table)`` from the analyzer on success, otherwise
	        ``(error_message, None)``. Exceptions raised during analysis are
	        turned into an error message that includes the traceback.
	    """
	    import traceback

	    if not analyzer:
	        return "❌ Please complete setup first", None

	    url = (orphan_url or "").strip()
	    if not url:
	        return "❌ Please enter an orphan page URL", None

	    title = (orphan_title or "").strip()
	    if not title:
	        return "❌ Please enter the orphan page title", None

	    # A whitespace-only keyword must fall back to the title too, not be
	    # passed on as an empty string.
	    keyword = (orphan_keyword or "").strip() or title

	    try:
	        report, table = analyzer.analyze(url, title, keyword, num_sources)
	        return report, table
	    except Exception as e:
	        error_detail = traceback.format_exc()
	        return f"❌ Error: {str(e)}\n\nDetails:\n{error_detail}", None

	# ============================================
	# INTERFACE
	# ============================================
	# Build the Gradio Blocks UI; the resulting `app` object is launched at the
	# bottom of this section.
	# NOTE(review): indentation is flattened in this view, so which components
	# sit inside each `with` container is presumed from statement order only —
	# confirm against the original file before changing the layout.
	with gr.Blocks(title="Edstellar Internal Linking Tool", theme=gr.themes.Soft()) as app:

	gr.Markdown("# 🔗 Edstellar Internal Linking Tool")
	gr.Markdown("Find the best existing blog posts to link to your orphan page")

	# Setup Section
	# Collects the two credentials that setup() receives.
	with gr.Accordion("⚙️ Setup (Do this once)", open=True):
	gr.Markdown("### Configure API Keys")

	with gr.Row():
	api_key = gr.Textbox(
	label="OpenRouter API Key",
	placeholder="sk-or-v1-...",
	type="password",
	scale=2
	)
	hf_token = gr.Textbox(
	label="Hugging Face Token",
	placeholder="hf_...",
	type="password",
	scale=2
	)

	setup_btn = gr.Button("🚀 Setup System", variant="primary", size="lg")
	# Read-only box that shows the status text returned by setup().
	setup_status = gr.Textbox(label="Setup Status", lines=3, interactive=False)

	gr.Markdown("---")

	# Analysis Section
	# Inputs below map, in order, to the parameters of analyze_orphan().
	gr.Markdown("### 📊 Analyze Orphan Page")
	gr.Markdown("Enter details about the orphan page you want to get links FOR")

	with gr.Row():
	with gr.Column(scale=3):
	orphan_url_input = gr.Textbox(
	label="Orphan Page URL",
	placeholder="https://edstellar.com/blog/your-orphan-page",
	info="The page that needs backlinks"
	)
	orphan_title_input = gr.Textbox(
	label="Orphan Page Title",
	placeholder="Business Development Manager Roles",
	info="The title/topic of your orphan page"
	)
	# Optional: analyze_orphan() substitutes the title when this is empty.
	orphan_keyword_input = gr.Textbox(
	label="Primary Keyword (Optional)",
	placeholder="business development",
	info="Main keyword for anchor text generation"
	)

	with gr.Column(scale=1):
	# Slider limited to 3-5 in steps of 1, defaulting to 3.
	num_sources_input = gr.Slider(
	label="Number of Sources",
	minimum=3,
	maximum=5,
	value=3,
	step=1,
	info="How many source pages to find"
	)

	analyze_btn = gr.Button("🔍 Analyze & Generate Report", variant="primary", size="lg")

	gr.Markdown("---")

	# Results Section
	gr.Markdown("### 📄 Report")

	# Receives the markdown report (or error text) from analyze_orphan().
	report_output = gr.Markdown()

	gr.Markdown("### 📊 Summary Table")
	# Receives the DataFrame from analyze_orphan(); also listed as an output of
	# the setup button, where setup() always supplies None for it.
	table_output = gr.Dataframe(
	label="Quick Overview",
	wrap=True,
	interactive=False
	)

	# Wire up events
	# setup() returns (status text, None).
	setup_btn.click(
	setup,
	inputs=[api_key, hf_token],
	outputs=[setup_status, table_output]
	)

	# analyze_orphan() returns (report or error text, DataFrame or None).
	analyze_btn.click(
	analyze_orphan,
	inputs=[orphan_url_input, orphan_title_input, orphan_keyword_input, num_sources_input],
	outputs=[report_output, table_output]
	)

	# Launch
	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	app.launch()