Knowledge-Universe / scripts /competitor_test.py
vlsiddarth's picture
Block 2: Enterprise Endpoints, Smart SSL, and Coverage Fixes
0699b16
"""
Knowledge Universe β€” T1 Competitor Analysis
Run: python scripts/competitor_test.py
Requires .env entries:
TAVILY_API_KEY=tvly-...
EXA_API_KEY=...
SERPAPI_KEY=...
Install: pip install tavily-python exa-py google-search-results httpx
"""
import os, time, json
from dotenv import load_dotenv
load_dotenv()
# Search topic passed to every provider's test function.
QUERY = "transformer architecture"
# Knowledge Universe API key, read from the API_KEY environment variable (or .env).
# There is no built-in fallback: this is None when the variable is not set.
API_KEY = os.getenv("API_KEY")
# Base URL of the remote Knowledge Universe deployment queried by test_ku().
KU_BASE = "https://vlsiddarth-knowledge-universe.hf.space"
def test_tavily():
    """Run QUERY through Tavily's advanced search and summarise the response.

    Prints the latency, the result count, the field names of the first hit
    and the top five hits.

    Returns:
        A JSON-serialisable dict with the keys ``provider``, ``latency_ms``,
        ``result_count``, ``has_scores``, ``has_dates``, ``has_decay``,
        ``raw_fields`` and ``domains`` (sorted, de-duplicated host names).
    """
    from urllib.parse import urlparse

    from tavily import TavilyClient

    client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    result = client.search(query=QUERY, search_depth="advanced", max_results=10)
    ms = round((time.perf_counter() - start) * 1000, 1)

    results = result.get("results") or []
    first = results[0] if results else {}
    print(f"\n{'='*60}\nTAVILY — {ms}ms\n{'='*60}")
    print(f"Count: {len(results)} | Fields: {list(first.keys())}")
    for i, r in enumerate(results[:5], 1):
        # Stringify before padding/slicing: a null score or title would
        # otherwise raise TypeError.
        score = r.get("score")
        score_str = "N/A" if score is None else str(score)
        title = str(r.get("title") or "")[:55]
        url = str(r.get("url") or "")[:60]
        print(f" [{i}] score={score_str:<6} {title}")
        print(f" {url}")

    # urlparse() copes with scheme-less or malformed URLs, where indexing
    # url.split("/")[2] would raise IndexError or return a path segment.
    domains = set()
    for r in results:
        url = r.get("url")
        if url:
            domains.add(urlparse(str(url)).netloc)
    domains.discard("")

    return {
        "provider": "tavily",
        "latency_ms": ms,
        "result_count": len(results),
        "has_scores": "score" in first,
        "has_dates": "published_date" in first,
        "has_decay": False,
        "raw_fields": list(first.keys()),
        # Sorted so the saved JSON is stable between runs.
        "domains": sorted(domains),
    }
def test_exa():
    """Run QUERY through Exa search and summarise the response.

    Prints the latency, the result count and the top five hits (score,
    published date, title, URL).

    Returns:
        A JSON-serialisable dict with the keys ``provider``, ``latency_ms``,
        ``result_count``, ``has_scores``, ``has_dates``, ``has_decay``,
        ``raw_fields`` (public attribute names of the first result) and
        ``domains`` (sorted, de-duplicated host names).
    """
    from urllib.parse import urlparse

    # exa-py 2.x API
    from exa_py import Exa

    client = Exa(api_key=os.getenv("EXA_API_KEY"))
    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    result = client.search(
        QUERY,
        num_results=10,
        type="auto",
        contents={"text": True},
    )
    ms = round((time.perf_counter() - start) * 1000, 1)

    results = getattr(result, "results", None) or []
    first = results[0] if results else None
    print(f"\n{'='*60}\nEXA — {ms}ms\n{'='*60}")
    print(f"Count: {len(results)}")
    for i, r in enumerate(results[:5], 1):
        raw_score = getattr(r, "score", None)
        score_str = f"{raw_score:.4f}" if isinstance(raw_score, (int, float)) else "N/A"
        raw_date = getattr(r, "published_date", None)
        date_str = str(raw_date) if raw_date else "N/A"
        # "or ''" keeps a missing title from being printed as the text "None".
        title = str(getattr(r, "title", "") or "")[:55]
        url = str(getattr(r, "url", "") or "")[:60]
        print(f" [{i}] score={score_str:<10} date={date_str:<12} {title}")
        print(f" {url}")

    # Explicit None check: do not depend on the result object's truthiness.
    first_attrs = [a for a in dir(first) if not a.startswith("_")] if first is not None else []

    # urlparse() copes with scheme-less or malformed URLs, where indexing
    # url.split("/")[2] would raise IndexError or return a path segment.
    domains = set()
    for r in results:
        url = getattr(r, "url", None)
        if url:
            domains.add(urlparse(str(url)).netloc)
    domains.discard("")

    return {
        "provider": "exa",
        "latency_ms": ms,
        "result_count": len(results),
        "has_scores": any(getattr(r, "score", None) is not None for r in results),
        "has_dates": any(getattr(r, "published_date", None) is not None for r in results),
        "has_decay": False,
        "raw_fields": first_attrs,
        # Sorted so the saved JSON is stable between runs.
        "domains": sorted(domains),
    }
def test_serpapi():
    """Run QUERY through SerpAPI's Google search and summarise the response.

    Prints the latency, the organic result count, the field names of the
    first hit and the top five hits.

    Returns:
        A JSON-serialisable dict with the keys ``provider``, ``latency_ms``,
        ``result_count``, ``has_scores``, ``has_dates``, ``has_decay``,
        ``raw_fields`` and ``domains`` (sorted, de-duplicated host names).
    """
    from urllib.parse import urlparse

    from serpapi import GoogleSearch

    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    result = GoogleSearch({"q": QUERY, "api_key": os.getenv("SERPAPI_KEY"), "num": 10}).get_dict()
    ms = round((time.perf_counter() - start) * 1000, 1)

    organics = result.get("organic_results") or []
    first = organics[0] if organics else {}
    print(f"\n{'='*60}\nSERPAPI — {ms}ms\n{'='*60}")
    print(f"Count: {len(organics)} | Fields: {list(first.keys())}")
    for i, r in enumerate(organics[:5], 1):
        # Stringify before padding/slicing: a null date or title would
        # otherwise raise TypeError.
        date_str = str(r.get("date") or "N/A")
        title = str(r.get("title") or "")[:50]
        link = str(r.get("link") or "")[:60]
        print(f" [{i}] pos={r.get('position')} date={date_str:<12} {title}")
        print(f" {link}")

    # urlparse() copes with scheme-less or malformed URLs, where indexing
    # link.split("/")[2] would raise IndexError or return a path segment.
    domains = set()
    for r in organics:
        link = r.get("link")
        if link:
            domains.add(urlparse(str(link)).netloc)
    domains.discard("")

    return {
        "provider": "serpapi",
        "latency_ms": ms,
        "result_count": len(organics),
        "has_scores": False,
        "has_dates": any(r.get("date") for r in organics),
        "has_decay": False,
        "raw_fields": list(first.keys()),
        # Sorted so the saved JSON is stable between runs.
        "domains": sorted(domains),
    }
def test_ku():
    """Query the Knowledge Universe ``/v1/discover`` endpoint and summarise it.

    POSTs QUERY to ``{KU_BASE}/v1/discover`` with API_KEY in the ``X-API-Key``
    header, then prints the latency, cache flag, source count, platforms and
    the top five sources.

    Returns:
        A JSON-serialisable summary dict. On a non-200 response or an
        unparseable body, the dict has ``result_count`` 0 and every feature
        flag False; both failure paths share the same shape and include
        ``latency_ms``. On success the feature flags are the hard-coded
        values below (they are not derived from the response) and
        ``platforms_covered`` lists the keys of ``formats_found``.
    """
    import httpx

    def failure_summary(latency_ms):
        # Single shape for every failure path so the comparison table and the
        # saved JSON always contain the same keys.
        return {
            "provider": "knowledge_universe", "latency_ms": latency_ms, "result_count": 0,
            "has_scores": False, "has_decay": False, "has_dates": False,
            "has_difficulty": False, "has_pedagogical": False, "has_format_filter": False,
            "has_embeddings": False,
        }

    # perf_counter() is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    resp = httpx.post(
        f"{KU_BASE}/v1/discover",
        headers={"X-API-Key": API_KEY},
        json={"topic": QUERY, "difficulty": 3,
              "formats": ["pdf", "github", "jupyter", "video", "stackoverflow"], "max_results": 10},
        timeout=60,
    )
    ms = round((time.perf_counter() - start) * 1000, 1)

    # Surface the raw body on any non-200 status so server errors are visible.
    if resp.status_code != 200:
        print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE HTTP ERROR: {resp.status_code}\n{'='*60}")
        print(f"Raw Error Response:\n{resp.text}")
        return failure_summary(ms)

    try:
        data = resp.json()
    except ValueError:
        # json.JSONDecodeError (and UnicodeDecodeError) are ValueError subclasses.
        print(f"\n[DEBUG] Failed to parse JSON. Raw text: {resp.text}")
        return failure_summary(ms)

    # "or" guards against explicit nulls in the payload, not just missing keys.
    sources = data.get("sources") or []
    formats_found = data.get("formats_found") or {}
    print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE — {ms}ms (cache={data.get('cache_hit')})\n{'='*60}")
    print(f"Count: {len(sources)} | Platforms: {list(formats_found.keys())}")
    for i, s in enumerate(sources[:5], 1):
        d = s.get("decay_report") or {}
        quality = s.get("quality_score")
        # Stringify before padding: a null quality score would otherwise raise.
        quality_str = str(0 if quality is None else quality)
        print(f" [{i}] quality={quality_str:<5} decay={d.get('decay_score','?')} ({d.get('label','?')})")
        print(f" [{s.get('source_platform', 'unknown')}] {str(s.get('title') or '')[:55]}")

    return {
        "provider": "knowledge_universe",
        "latency_ms": ms,
        "result_count": len(sources),
        "has_scores": True,
        "has_decay": True,
        "has_dates": True,
        "has_difficulty": True,
        "has_pedagogical": True,
        "has_format_filter": True,
        "has_embeddings": True,
        "output_formats": ["json", "embeddings", "html"],
        "platforms_covered": list(formats_found.keys()),
    }
def print_table(results):
    """Print a feature-by-provider comparison table to stdout.

    Args:
        results: Mapping of provider name (``"tavily"``, ``"exa"``,
            ``"serpapi"``, ``"knowledge_universe"``) to the summary dict
            returned by that provider's test function. Providers may be
            missing, e.g. when they were skipped or failed.

    Booleans are rendered as ✓ / ✗. A missing provider or missing key is
    rendered as ✗. Any other value is shown via ``str()``.
    """
    providers = ["tavily", "exa", "serpapi", "knowledge_universe"]
    ku = "knowledge_universe"
    # (row label, summary key, True when only Knowledge Universe reports it)
    rows = [
        ("Cold latency", "latency_ms", False),
        ("Results returned", "result_count", False),
        ("Relevance scores", "has_scores", False),
        ("Publication dates", "has_dates", False),
        ("Freshness/decay score", "has_decay", False),
        ("Difficulty rating", "has_difficulty", True),
        ("Pedagogical fit", "has_pedagogical", True),
        ("Format filtering", "has_format_filter", True),
        ("Embeddings output", "has_embeddings", True),
    ]

    def cell(prov, key):
        # Render one table cell for a provider/key pair.
        v = (results.get(prov) or {}).get(key)
        if isinstance(v, bool):
            return "✓" if v else "✗"
        return "✗" if v is None else str(v)

    print(f"\n{'='*72}")
    print("FINAL COMPARISON TABLE")
    print(f"{'='*72}")
    print(f"{'Feature':<28} {'Tavily':>10} {'Exa':>10} {'SerpAPI':>10} {'KU':>10}")
    print("-" * 72)
    for label, key, ku_only in rows:
        cells = [
            "✗" if ku_only and prov != ku else cell(prov, key)
            for prov in providers
        ]
        print(f"{label:<28} {cells[0]:>10} {cells[1]:>10} {cells[2]:>10} {cells[3]:>10}")
if __name__ == "__main__":
    # Environment variable each provider needs. A provider whose key is not
    # set is skipped rather than attempted without credentials; this now
    # includes Knowledge Universe, whose X-API-Key header would otherwise
    # be sent as None.
    key_map = {
        "tavily": "TAVILY_API_KEY",
        "exa": "EXA_API_KEY",
        "serpapi": "SERPAPI_KEY",
        "knowledge_universe": "API_KEY",
    }
    results = {}
    print(f"Testing query: '{QUERY}'\n")
    print(f"Targeting remote API: {KU_BASE}\n")
    for name, fn in [("tavily", test_tavily),
                     ("exa", test_exa),
                     ("serpapi", test_serpapi),
                     ("knowledge_universe", test_ku)]:
        env_key = key_map.get(name)
        if env_key and not os.getenv(env_key):
            print(f"\n⚠ Skipping {name} — {env_key} not set in .env")
            continue
        try:
            results[name] = fn()
        except Exception as e:
            # Top-level boundary: one provider failing must not abort the
            # remaining providers or the final table.
            print(f"\n✗ {name} failed: {e}")
    print_table(results)
    with open("research_notes_t1.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("\n✓ Results saved to research_notes_t1.json")