# ai-learning-path-generator / src/ml/resource_search.py
# Author: shubhamdhamal — commit 7644eac ("Deploy Flask app with Docker")
"""OpenAI-powered resource search helper.
This module queries the OpenAI Chat Completion endpoint to retrieve
real, high-quality learning resources (videos, articles, docs) that a user
can click to continue learning. It returns a simple list of dictionaries so
upstream code can map them into `ResourceItem` Pydantic objects.
If the `OPENAI_API_KEY` environment variable is missing, or the API call
fails, we fall back to a single placeholder so the rest of the app
continues to work.
"""
from __future__ import annotations
import json
import logging
import os
import re
import time
from typing import Dict, List
from openai import OpenAI
from langsmith import traceable as langsmith_traceable
from src.utils.observability import get_observability_manager
from src.utils.config import (
PERPLEXITY_PROMPT_COST_PER_1K,
PERPLEXITY_COMPLETION_COST_PER_1K,
)
# Initialize OpenAI client
client = None
def _stub_resources() -> List[Dict[str, str]]:
"""Return a static placeholder when real search is unavailable."""
return [
{
"type": "article",
"url": "https://example.com/placeholder-resource",
"description": "Add your OpenAI API key to see real learning resources.",
}
]
def _extract_keywords(query: str) -> List[str]:
"""Collect meaningful keywords from the query for simple relevance filtering."""
tokens = re.findall(r"[\w']+", query.lower())
stopwords = {
"the",
"with",
"your",
"from",
"this",
"that",
"about",
"topic",
"learn",
"learning",
"skill",
"skills",
"path",
"guide",
"study",
"course",
"for",
"into",
"using",
"based",
"mastery",
"introduction",
"advanced",
"beginner",
"intermediate",
}
# Extract keywords, prioritizing the first word (usually the main topic)
keywords = [tok for tok in tokens if len(tok) > 3 and tok not in stopwords]
# If query has a colon (e.g., "Mandarin: Pronunciation"), extract both parts
if ":" in query:
parts = query.split(":")
main_topic = parts[0].strip().lower()
# Main topic is critical - add all its words
main_tokens = re.findall(r"[\w']+", main_topic)
keywords.extend([tok for tok in main_tokens if len(tok) > 3 and tok not in stopwords])
return list(set(keywords)) # Remove duplicates
def _filter_by_keywords(resources: List[Dict[str, str]], query: str) -> List[Dict[str, str]]:
"""Filter out resources that do not mention any significant query keywords."""
keywords = _extract_keywords(query)
if not keywords:
return resources
# Extract main topic (first word or word before colon)
main_topic = query.split(":")[0].strip().lower() if ":" in query else query.split()[0].lower()
filtered: List[Dict[str, str]] = []
for item in resources:
haystack = " ".join(
[item.get("url", ""), item.get("description", ""), item.get("type", "")]
).lower()
# STRICT: Main topic MUST be present
if main_topic not in haystack:
logging.info(f"⚠️ Filtered out resource (missing main topic '{main_topic}'): {item.get('description', '')[:50]}")
continue
# Also check if any other keyword matches
if any(keyword in haystack for keyword in keywords):
filtered.append(item)
# If everything was filtered out, keep the originals to avoid empty lists
if not filtered:
logging.warning(f"All resources filtered out for query '{query}'. Keeping originals.")
return filtered or resources
@langsmith_traceable(name="perplexity_resource_search")
def search_resources(query: str, k: int = 3, timeout: int = 45, trusted_sources: Dict[str, List[str]] = None) -> List[Dict[str, str]]:
"""Search for learning resources using Perplexity (with OpenAI fallback).
Each dict has keys: `type`, `url`, `description`.
Args:
query: The search query/milestone title
k: Number of resources to return
timeout: API timeout in seconds
trusted_sources: Dict with 'youtube' and 'websites' lists of trusted sources
"""
# Build source-specific instructions
source_instruction = ""
if trusted_sources:
youtube_channels = trusted_sources.get('youtube', [])
websites = trusted_sources.get('websites', [])
if youtube_channels or websites:
source_instruction = "\n\n🎯 CRITICAL - SEARCH ONLY IN THESE CURATED SOURCES:\n"
if youtube_channels:
source_instruction += f"✅ APPROVED YouTube Channels (search ONLY these): {', '.join(youtube_channels)}\n"
source_instruction += " - Go to each channel's videos page\n"
source_instruction += " - Find videos that match the query topic\n"
source_instruction += " - Return DIRECT video watch URLs (youtube.com/watch?v=...)\n"
if websites:
source_instruction += f"✅ APPROVED Websites (search ONLY these): {', '.join(websites)}\n"
source_instruction += " - Search within these domains for relevant content\n"
source_instruction += " - Return direct article/tutorial URLs, not homepages\n"
source_instruction += "\n❌ FORBIDDEN: Do NOT search or suggest content from ANY other sources\n"
source_instruction += "❌ FORBIDDEN: Do NOT make up or hallucinate URLs\n"
source_instruction += "✅ REQUIRED: Every URL must be from the approved list above\n"
source_instruction += "✅ REQUIRED: Every URL must be a real, existing page you found by searching\n"
prompt = (
f"Search the web and find {k} real, working FREE learning resources SPECIFICALLY for: '{query}'. "
"\n"
"🎯 CRITICAL REQUIREMENTS:\n"
"1. PRIORITIZE FREE CONTENT: YouTube videos, free tutorials, open documentation\n"
"2. AVOID PAID COURSES: Do NOT suggest Udemy, Coursera, or any paid platforms unless they have free content\n"
"3. DIRECT VIDEO LINKS ONLY: For YouTube, provide DIRECT VIDEO LINKS (youtube.com/watch?v=...), NOT:\n"
" - Channel homepages\n"
" - Playlist pages\n"
" - Search result pages\n"
"4. SPECIFIC ARTICLES: For websites, link to the SPECIFIC PAGE/ARTICLE, not homepages\n"
"5. EXACT TOPIC MATCH: Every resource MUST be directly about the EXACT topic in the query\n"
"6. VERIFY RELEVANCE: The resource title/description must explicitly mention the main topic\n"
"7. PREFER COMPREHENSIVE CONTENT: Look for 'full course', 'complete tutorial', 'crash course'\n"
f"{source_instruction}"
"\n"
"📺 YOUTUBE PRIORITY: At least 60% of resources should be YouTube videos with direct watch links\n"
"\n"
"Return ONLY valid JSON array (no markdown, no code blocks) with format: "
'[{"type": "video", "url": "https://youtube.com/watch?v=...", "description": "Full Course Title by Channel Name"}, ...]'
"\n"
"✅ VALIDATION: Each URL must be:\n"
"- A real, working link that exists right now\n"
"- Directly clickable and accessible\n"
)
obs_manager = get_observability_manager()
# Try Perplexity first (real-time web search)
perplexity_key = os.getenv("PERPLEXITY_API_KEY")
if perplexity_key:
try:
logging.info("Searching for resources using Perplexity (web search)...")
client = OpenAI(
api_key=perplexity_key,
base_url="https://api.perplexity.ai"
)
start_time = time.time()
completion = client.chat.completions.create(
model="sonar-pro", # Online search model
messages=[
{
"role": "system",
"content": "You are a helpful assistant that searches the web for real learning resources. Always return valid JSON with actual, working URLs.",
},
{"role": "user", "content": prompt},
],
temperature=0.2,
max_tokens=500,
timeout=timeout,
)
latency_ms = (time.time() - start_time) * 1000
content = completion.choices[0].message.content.strip()
# Remove markdown code blocks if present
if content.startswith("```"):
content = content.split("```")[1]
if content.startswith("json"):
content = content[4:]
content = content.strip()
resources: List[Dict[str, str]] = json.loads(content)
cleaned: List[Dict[str, str]] = []
for item in resources[:k]:
cleaned.append({
"type": item.get("type", "article"),
"url": item.get("url", ""),
"description": item.get("description", ""),
})
if cleaned:
cleaned = _filter_by_keywords(cleaned, query)
logging.info(f"✅ Found {len(cleaned)} resources via Perplexity")
# Extract usage (token counts) if provided
prompt_tokens = 0
completion_tokens = 0
total_tokens = 0
usage = getattr(completion, "usage", None)
if usage:
prompt_tokens = getattr(usage, "prompt_tokens", 0) or getattr(usage, "input_tokens", 0)
completion_tokens = getattr(usage, "completion_tokens", 0) or getattr(usage, "output_tokens", 0)
total_tokens = getattr(usage, "total_tokens", 0) or (prompt_tokens + completion_tokens)
else:
# Fallback: some clients expose model_dump / dict style
usage_payload = None
if hasattr(completion, "model_dump") and callable(completion.model_dump):
usage_payload = completion.model_dump().get("usage")
elif isinstance(completion, dict):
usage_payload = completion.get("usage")
if usage_payload:
prompt_tokens = usage_payload.get("prompt_tokens", usage_payload.get("input_tokens", 0))
completion_tokens = usage_payload.get("completion_tokens", usage_payload.get("output_tokens", 0))
total_tokens = usage_payload.get("total_tokens", prompt_tokens + completion_tokens)
# Estimate cost using configured pricing (per 1K tokens)
perplexity_cost = 0.0
if PERPLEXITY_PROMPT_COST_PER_1K > 0 or PERPLEXITY_COMPLETION_COST_PER_1K > 0:
perplexity_cost = (
(prompt_tokens / 1000.0) * PERPLEXITY_PROMPT_COST_PER_1K
+ (completion_tokens / 1000.0) * PERPLEXITY_COMPLETION_COST_PER_1K
)
# Log to observability platforms
obs_manager.log_llm_call(
prompt=prompt,
response=content,
model="perplexity-sonar-pro",
metadata={
"provider": "perplexity",
"query": query,
"trusted_sources": trusted_sources or {},
},
latency_ms=latency_ms,
token_count=total_tokens or None,
cost=perplexity_cost or None,
)
obs_manager.log_metric(
"perplexity_latency_ms",
float(latency_ms),
{
"query": query,
"result_count": len(cleaned),
},
)
if prompt_tokens:
obs_manager.log_metric(
"perplexity_prompt_tokens",
float(prompt_tokens),
{"query": query},
)
if completion_tokens:
obs_manager.log_metric(
"perplexity_completion_tokens",
float(completion_tokens),
{"query": query},
)
if perplexity_cost:
obs_manager.log_metric(
"perplexity_cost_usd",
perplexity_cost,
{"query": query},
)
return cleaned
except Exception as exc:
logging.warning(f"Perplexity resource search failed: {exc}. Falling back to OpenAI...")
# Fallback to OpenAI
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
logging.info("OPENAI_API_KEY not set; returning stub resources")
return _stub_resources()
model = os.getenv("DEFAULT_MODEL", "gpt-4o-mini")
try:
client = OpenAI(api_key=api_key)
completion = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful research assistant that provides real, working URLs to learning resources."},
{"role": "user", "content": prompt},
],
temperature=0.2,
max_tokens=400,
timeout=timeout,
)
content = completion.choices[0].message.content.strip()
# Remove markdown code blocks if present
if content.startswith("```"):
content = content.split("```")[1]
if content.startswith("json"):
content = content[4:]
content = content.strip()
resources: List[Dict[str, str]] = json.loads(content)
cleaned: List[Dict[str, str]] = []
for item in resources[:k]:
cleaned.append({
"type": item.get("type", "article"),
"url": item.get("url", ""),
"description": item.get("description", ""),
})
cleaned = _filter_by_keywords(cleaned, query)
return cleaned or _stub_resources()
except Exception as exc:
logging.warning("OpenAI resource search failed: %s", exc)
return _stub_resources()