Spaces:

vlsiddarth
/

Knowledge-Universe

Running

App Files Files Community

Knowledge-Universe / scripts /competitor_test.py

vlsiddarth

Block 2: Enterprise Endpoints, Smart SSL, and Coverage Fixes

0699b16 about 1 month ago

raw

history blame contribute delete

9.56 kB

	"""
	Knowledge Universe — T1 Competitor Analysis
	Run: python scripts/competitor_test.py

	Requires .env entries:
	API_KEY=...
	TAVILY_API_KEY=tvly-...
	EXA_API_KEY=...
	SERPAPI_KEY=...

	Install: pip install python-dotenv tavily-python exa-py google-search-results httpx
	"""

	import os, time, json
	from dotenv import load_dotenv
	load_dotenv()

	# Search query sent unchanged to every provider under test.
	QUERY = "transformer architecture"
	# Knowledge Universe API key, read from the API_KEY environment variable
	# (after load_dotenv() has run). No fallback value is supplied here, so this
	# is None when the variable is unset.
	API_KEY = os.getenv("API_KEY")
	# Base URL of the Knowledge Universe deployment that test_ku() posts to.
	KU_BASE = "https://vlsiddarth-knowledge-universe.hf.space"


	def test_tavily():
	    """Run QUERY against Tavily, print a short report, and return a summary.

	    Reads the API key from the TAVILY_API_KEY environment variable.

	    Returns:
	        dict with keys ``provider``, ``latency_ms``, ``result_count``,
	        ``has_scores``, ``has_dates``, ``has_decay``, ``raw_fields`` and
	        ``domains`` (sorted, de-duplicated host names of the result URLs).
	    """
	    from urllib.parse import urlparse

	    from tavily import TavilyClient

	    client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
	    # perf_counter is monotonic, so the latency cannot be skewed by
	    # wall-clock adjustments during the request.
	    start = time.perf_counter()
	    result = client.search(query=QUERY, search_depth="advanced", max_results=10)
	    ms = round((time.perf_counter() - start) * 1000, 1)

	    results = result.get("results", [])
	    first = results[0] if results else {}

	    print(f"\n{'='*60}\nTAVILY — {ms}ms\n{'='*60}")
	    print(f"Count: {len(results)} | Fields: {list(first.keys())}")
	    for i, r in enumerate(results[:5], 1):
	        print(f" [{i}] score={r.get('score','N/A'):<6} {r.get('title','')[:55]}")
	        print(f" {r.get('url','')[:60]}")

	    # urlparse tolerates scheme-less URLs (netloc is "" for them) where
	    # indexing the result of split("/") would raise IndexError.
	    hosts = {urlparse(r.get("url") or "").netloc for r in results}
	    hosts.discard("")

	    return {
	        "provider": "tavily",
	        "latency_ms": ms,
	        "result_count": len(results),
	        # Only the first result is inspected for these two fields.
	        "has_scores": "score" in first,
	        "has_dates": "published_date" in first,
	        "has_decay": False,
	        "raw_fields": list(first.keys()),
	        # Sorted so the saved JSON is stable between runs.
	        "domains": sorted(hosts),
	    }


	def test_exa():
	    """Run QUERY against Exa, print a short report, and return a summary.

	    Reads the API key from the EXA_API_KEY environment variable.

	    Returns:
	        dict with keys ``provider``, ``latency_ms``, ``result_count``,
	        ``has_scores``, ``has_dates``, ``has_decay``, ``raw_fields`` (public
	        attribute names of the first result object) and ``domains`` (sorted,
	        de-duplicated host names of the result URLs).
	    """
	    from urllib.parse import urlparse

	    # exa-py 2.x API
	    from exa_py import Exa

	    client = Exa(api_key=os.getenv("EXA_API_KEY"))
	    # perf_counter is monotonic, so the latency cannot be skewed by
	    # wall-clock adjustments during the request.
	    start = time.perf_counter()
	    result = client.search(
	        QUERY,
	        num_results=10,
	        type="auto",
	        contents={"text": True},
	    )
	    ms = round((time.perf_counter() - start) * 1000, 1)

	    results = getattr(result, "results", None) or []
	    first = results[0] if results else None

	    print(f"\n{'='*60}\nEXA — {ms}ms\n{'='*60}")
	    print(f"Count: {len(results)}")

	    for i, r in enumerate(results[:5], 1):
	        # Results are objects, not dicts, so every field goes through getattr.
	        raw_score = getattr(r, "score", None)
	        score_str = f"{raw_score:.4f}" if isinstance(raw_score, (int, float)) else "N/A"

	        raw_date = getattr(r, "published_date", "N/A")
	        date_str = str(raw_date) if raw_date else "N/A"

	        title = str(getattr(r, "title", ""))[:55]
	        url = str(getattr(r, "url", ""))[:60]

	        print(f" [{i}] score={score_str:<10} date={date_str:<12} {title}")
	        print(f" {url}")

	    # Compare against None explicitly: a result object's truthiness is not
	    # something this script controls.
	    first_attrs = (
	        [a for a in dir(first) if not a.startswith("_")] if first is not None else []
	    )

	    # urlparse tolerates scheme-less URLs (netloc is "" for them) where
	    # indexing the result of split("/") would raise IndexError.
	    hosts = {urlparse(str(getattr(r, "url", None) or "")).netloc for r in results}
	    hosts.discard("")

	    return {
	        "provider": "exa",
	        "latency_ms": ms,
	        "result_count": len(results),
	        "has_scores": any(getattr(r, "score", None) is not None for r in results),
	        "has_dates": any(getattr(r, "published_date", None) is not None for r in results),
	        "has_decay": False,
	        "raw_fields": first_attrs,
	        # Sorted so the saved JSON is stable between runs.
	        "domains": sorted(hosts),
	    }


	def test_serpapi():
	    """Run QUERY against SerpAPI (Google), print a report, return a summary.

	    Reads the API key from the SERPAPI_KEY environment variable.

	    Returns:
	        dict with keys ``provider``, ``latency_ms``, ``result_count``,
	        ``has_scores``, ``has_dates``, ``has_decay``, ``raw_fields`` and
	        ``domains`` (sorted, de-duplicated host names of the result links).
	    """
	    from urllib.parse import urlparse

	    from serpapi import GoogleSearch

	    params = {"q": QUERY, "api_key": os.getenv("SERPAPI_KEY"), "num": 10}
	    # perf_counter is monotonic, so the latency cannot be skewed by
	    # wall-clock adjustments during the request.
	    start = time.perf_counter()
	    result = GoogleSearch(params).get_dict()
	    ms = round((time.perf_counter() - start) * 1000, 1)

	    organics = result.get("organic_results", [])
	    first = organics[0] if organics else {}

	    print(f"\n{'='*60}\nSERPAPI — {ms}ms\n{'='*60}")
	    print(f"Count: {len(organics)} | Fields: {list(first.keys())}")
	    for i, r in enumerate(organics[:5], 1):
	        print(f" [{i}] pos={r.get('position')} date={r.get('date','N/A'):<12} {r.get('title','')[:50]}")
	        print(f" {r.get('link','')[:60]}")

	    # urlparse tolerates scheme-less URLs (netloc is "" for them) where
	    # indexing the result of split("/") would raise IndexError.
	    hosts = {urlparse(r.get("link") or "").netloc for r in organics}
	    hosts.discard("")

	    return {
	        "provider": "serpapi",
	        "latency_ms": ms,
	        "result_count": len(organics),
	        # This script reads no score field from SerpAPI results.
	        "has_scores": False,
	        "has_dates": any(r.get("date") for r in organics),
	        "has_decay": False,
	        "raw_fields": list(first.keys()),
	        # Sorted so the saved JSON is stable between runs.
	        "domains": sorted(hosts),
	    }


	def test_ku():
	    """POST QUERY to the Knowledge Universe /v1/discover endpoint and summarize.

	    Uses the module-level KU_BASE and API_KEY; the key is sent in the
	    ``X-API-Key`` header.

	    Returns:
	        dict describing the run. On success it carries ``latency_ms``,
	        ``result_count``, the capability flags, ``output_formats`` and
	        ``platforms_covered``. On a non-200 response or an unparseable body
	        it carries ``latency_ms``, ``result_count`` 0 and every capability
	        flag set to False.

	    Raises:
	        RuntimeError: if API_KEY is not set.
	    """
	    import httpx

	    if not API_KEY:
	        # Fail with a readable message instead of an opaque header-encoding
	        # error from inside the HTTP client.
	        raise RuntimeError("API_KEY is not set")

	    # perf_counter is monotonic, so the latency cannot be skewed by
	    # wall-clock adjustments during the request.
	    start = time.perf_counter()
	    resp = httpx.post(
	        f"{KU_BASE}/v1/discover",
	        headers={"X-API-Key": API_KEY},
	        json={
	            "topic": QUERY,
	            "difficulty": 3,
	            "formats": ["pdf", "github", "jupyter", "video", "stackoverflow"],
	            "max_results": 10,
	        },
	        timeout=60,
	    )
	    ms = round((time.perf_counter() - start) * 1000, 1)

	    def failure():
	        # One shape for every failure path so the comparison table and the
	        # saved JSON always see the same keys.
	        return {
	            "provider": "knowledge_universe", "latency_ms": ms, "result_count": 0,
	            "has_scores": False, "has_decay": False, "has_dates": False,
	            "has_difficulty": False, "has_pedagogical": False, "has_format_filter": False,
	            "has_embeddings": False,
	        }

	    # --- AGGRESSIVE ERROR CATCHING ---
	    if resp.status_code != 200:
	        print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE HTTP ERROR: {resp.status_code}\n{'='*60}")
	        print(f"Raw Error Response:\n{resp.text}")
	        return failure()

	    try:
	        data = resp.json()
	    except ValueError:
	        # json.JSONDecodeError is a ValueError subclass.
	        print(f"\n[DEBUG] Failed to parse JSON. Raw text: {resp.text}")
	        return failure()

	    sources = data.get("sources", [])
	    # "or {}" also covers an explicit null for formats_found.
	    platforms = list((data.get("formats_found") or {}).keys())

	    print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE — {ms}ms (cache={data.get('cache_hit')})\n{'='*60}")
	    print(f"Count: {len(sources)} | Platforms: {platforms}")
	    for i, s in enumerate(sources[:5], 1):
	        d = s.get("decay_report") or {}
	        print(f" [{i}] quality={s.get('quality_score', 0):<5} decay={d.get('decay_score','?')} ({d.get('label','?')})")
	        print(f" [{s.get('source_platform', 'unknown')}] {s.get('title', '')[:55]}")

	    # NOTE(review): the capability flags below are hard-coded constants, not
	    # derived from the response, so they stay True even when no sources are
	    # returned. Confirm that is intended for the comparison.
	    return {
	        "provider": "knowledge_universe",
	        "latency_ms": ms,
	        "result_count": len(sources),
	        "has_scores": True,
	        "has_decay": True,
	        "has_dates": True,
	        "has_difficulty": True,
	        "has_pedagogical": True,
	        "has_format_filter": True,
	        "has_embeddings": True,
	        "output_formats": ["json", "embeddings", "html"],
	        "platforms_covered": platforms,
	    }


	def print_table(results):
	    """Print a feature-by-provider comparison table to stdout.

	    Args:
	        results: dict mapping provider name ("tavily", "exa", "serpapi",
	            "knowledge_universe") to that provider's summary dict. Providers
	            may be absent; their cells render as "✗".

	    Cell rendering: booleans become "✓"/"✗", missing or None values become
	    "✗", anything else is shown via ``str()``. The last four rows are only
	    ever read for Knowledge Universe; the other providers always show "✗".
	    """
	    rule = "=" * 72
	    print(f"\n{rule}")
	    print("FINAL COMPARISON TABLE")
	    print(rule)

	    providers = ("tavily", "exa", "serpapi", "knowledge_universe")
	    ku_only = ("knowledge_universe",)

	    # (row label, summary key, providers for which the key is looked up)
	    rows = [
	        ("Cold latency", "latency_ms", providers),
	        ("Results returned", "result_count", providers),
	        ("Relevance scores", "has_scores", providers),
	        ("Publication dates", "has_dates", providers),
	        ("Freshness/decay score", "has_decay", providers),
	        ("Difficulty rating", "has_difficulty", ku_only),
	        ("Pedagogical fit", "has_pedagogical", ku_only),
	        ("Format filtering", "has_format_filter", ku_only),
	        ("Embeddings output", "has_embeddings", ku_only),
	    ]

	    def cell(provider, key, applies_to):
	        """Render one table cell for *provider* and *key*."""
	        if provider not in applies_to:
	            return "✗"
	        value = results.get(provider, {}).get(key)
	        if value is None:
	            return "✗"
	        # Test bool before falling through to str(), so flags show as marks.
	        if isinstance(value, bool):
	            return "✓" if value else "✗"
	        return str(value)

	    print(f"{'Feature':<28} {'Tavily':>10} {'Exa':>10} {'SerpAPI':>10} {'KU':>10}")
	    print("-" * 72)

	    for label, key, applies_to in rows:
	        cells = [cell(provider, key, applies_to) for provider in providers]
	        print(f"{label:<28} {cells[0]:>10} {cells[1]:>10} {cells[2]:>10} {cells[3]:>10}")


	if __name__ == "__main__":
	    # Script entry point: run each provider test, print the comparison
	    # table, and save the collected summaries to research_notes_t1.json.
	    results = {}
	    print(f"Testing query: '{QUERY}'\n")
	    print(f"Targeting remote API: {KU_BASE}\n")

	    # Environment variable each provider needs. Built once rather than on
	    # every loop iteration.
	    key_map = {
	        "tavily": "TAVILY_API_KEY",
	        "exa": "EXA_API_KEY",
	        "serpapi": "SERPAPI_KEY",
	        "knowledge_universe": "API_KEY",
	    }
	    provider_tests = [
	        ("tavily", test_tavily),
	        ("exa", test_exa),
	        ("serpapi", test_serpapi),
	        ("knowledge_universe", test_ku),
	    ]

	    for name, fn in provider_tests:
	        env_key = key_map.get(name)

	        if env_key and not os.getenv(env_key):
	            print(f"\n⚠ Skipping {name} — {env_key} not set in .env")
	            continue
	        try:
	            results[name] = fn()
	        except Exception as e:
	            # Deliberately broad: one provider failing (missing package,
	            # network error, bad key) must not abort the other comparisons.
	            print(f"\n✗ {name} failed: {e}")

	    print_table(results)

	    # Explicit encoding so the file is written the same way on every platform.
	    with open("research_notes_t1.json", "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=2)
	    print("\n✓ Results saved to research_notes_t1.json")