""" Knowledge Universe — T1 Competitor Analysis Run: python scripts/competitor_test.py Requires .env entries: TAVILY_API_KEY=tvly-... EXA_API_KEY=... SERPAPI_KEY=... Install: pip install tavily-python exa-py google-search-results httpx """ import os, time, json from dotenv import load_dotenv load_dotenv() QUERY = "transformer architecture" # Defaulting to your active HF test key so it works instantly without .env configuration API_KEY = os.getenv("API_KEY") KU_BASE = "https://vlsiddarth-knowledge-universe.hf.space" def test_tavily(): from tavily import TavilyClient client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY")) start = time.time() result = client.search(query=QUERY, search_depth="advanced", max_results=10) ms = round((time.time() - start) * 1000, 1) results = result.get("results", []) first = results[0] if results else {} print(f"\n{'='*60}\nTAVILY — {ms}ms\n{'='*60}") print(f"Count: {len(results)} | Fields: {list(first.keys())}") for i, r in enumerate(results[:5], 1): print(f" [{i}] score={r.get('score','N/A'):<6} {r.get('title','')[:55]}") print(f" {r.get('url','')[:60]}") return { "provider": "tavily", "latency_ms": ms, "result_count": len(results), "has_scores": "score" in first, "has_dates": "published_date" in first, "has_decay": False, "raw_fields": list(first.keys()), "domains": list(set(r.get("url","").split("/")[2] for r in results if r.get("url"))), } def test_exa(): # exa-py 2.x API from exa_py import Exa client = Exa(api_key=os.getenv("EXA_API_KEY")) start = time.time() result = client.search( QUERY, num_results=10, type="auto", contents={"text": True}, ) ms = round((time.time() - start) * 1000, 1) results = result.results if hasattr(result, "results") else [] first = results[0] if results else None print(f"\n{'='*60}\nEXA — {ms}ms\n{'='*60}") print(f"Count: {len(results)}") for i, r in enumerate(results[:5], 1): raw_score = getattr(r, "score", None) score_str = f"{raw_score:.4f}" if isinstance(raw_score, (int, float)) else "N/A" raw_date = getattr(r, "published_date", "N/A") date_str = str(raw_date) if raw_date else "N/A" title = str(getattr(r, "title", ""))[:55] url = str(getattr(r, "url", ""))[:60] print(f" [{i}] score={score_str:<10} date={date_str:<12} {title}") print(f" {url}") first_attrs = [a for a in dir(first) if not a.startswith("_")] if first else [] return { "provider": "exa", "latency_ms": ms, "result_count": len(results), "has_scores": any(getattr(r, "score", None) is not None for r in results), "has_dates": any(getattr(r, "published_date", None) is not None for r in results), "has_decay": False, "raw_fields": first_attrs, "domains": list(set(str(getattr(r,"url","")).split("/")[2] for r in results if getattr(r, "url", None))), } def test_serpapi(): from serpapi import GoogleSearch start = time.time() result = GoogleSearch({"q": QUERY, "api_key": os.getenv("SERPAPI_KEY"), "num": 10}).get_dict() ms = round((time.time() - start) * 1000, 1) organics = result.get("organic_results", []) first = organics[0] if organics else {} print(f"\n{'='*60}\nSERPAPI — {ms}ms\n{'='*60}") print(f"Count: {len(organics)} | Fields: {list(first.keys())}") for i, r in enumerate(organics[:5], 1): print(f" [{i}] pos={r.get('position')} date={r.get('date','N/A'):<12} {r.get('title','')[:50]}") print(f" {r.get('link','')[:60]}") return { "provider": "serpapi", "latency_ms": ms, "result_count": len(organics), "has_scores": False, "has_dates": any(r.get("date") for r in organics), "has_decay": False, "raw_fields": list(first.keys()), "domains": list(set(r.get("link","").split("/")[2] for r in organics if r.get("link"))), } def test_ku(): import httpx start = time.time() resp = httpx.post( f"{KU_BASE}/v1/discover", headers={"X-API-Key": API_KEY}, json={"topic": QUERY, "difficulty": 3, "formats": ["pdf","github","jupyter","video","stackoverflow"], "max_results": 10}, timeout=60, ) ms = round((time.time() - start) * 1000, 1) # --- AGGRESSIVE ERROR CATCHING --- if resp.status_code != 200: print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE HTTP ERROR: {resp.status_code}\n{'='*60}") print(f"Raw Error Response:\n{resp.text}") return { "provider": "knowledge_universe", "latency_ms": ms, "result_count": 0, "has_scores": False, "has_decay": False, "has_dates": False, "has_difficulty": False, "has_pedagogical": False, "has_format_filter": False, "has_embeddings": False } try: data = resp.json() except Exception as e: print(f"\n[DEBUG] Failed to parse JSON. Raw text: {resp.text}") return {"provider": "knowledge_universe", "result_count": 0} sources = data.get("sources", []) print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE — {ms}ms (cache={data.get('cache_hit')})\n{'='*60}") print(f"Count: {len(sources)} | Platforms: {list(data.get('formats_found',{}).keys())}") for i, s in enumerate(sources[:5], 1): d = s.get("decay_report") or {} print(f" [{i}] quality={s.get('quality_score', 0):<5} decay={d.get('decay_score','?')} ({d.get('label','?')})") print(f" [{s.get('source_platform', 'unknown')}] {s.get('title', '')[:55]}") return { "provider": "knowledge_universe", "latency_ms": ms, "result_count": len(sources), "has_scores": True, "has_decay": True, "has_dates": True, "has_difficulty": True, "has_pedagogical": True, "has_format_filter": True, "has_embeddings": True, "output_formats": ["json", "embeddings", "html"], "platforms_covered": list(data.get("formats_found", {}).keys()), } def print_table(results): print(f"\n{'='*72}") print("FINAL COMPARISON TABLE") print(f"{'='*72}") def val(prov, key, true_val="✓", false_val="✗"): v = results.get(prov, {}).get(key) if isinstance(v, bool): return true_val if v else false_val return str(v) if v is not None else "N/A" rows = [ ("Cold latency", "latency_ms", "latency_ms", "latency_ms", "latency_ms"), ("Results returned", "result_count", "result_count", "result_count", "result_count"), ("Relevance scores", "has_scores", "has_scores", "has_scores", "has_scores"), ("Publication dates", "has_dates", "has_dates", "has_dates", "has_dates"), ("Freshness/decay score", "has_decay", "has_decay", "has_decay", "has_decay"), ("Difficulty rating", None, None, None, "has_difficulty"), ("Pedagogical fit", None, None, None, "has_pedagogical"), ("Format filtering", None, None, None, "has_format_filter"), ("Embeddings output", None, None, None, "has_embeddings"), ] print(f"{'Feature':<28} {'Tavily':>10} {'Exa':>10} {'SerpAPI':>10} {'KU':>10}") print("-" * 72) providers = ["tavily", "exa", "serpapi", "knowledge_universe"] for row in rows: label = row[0] cells = [] for i, prov in enumerate(providers): key = row[i+1] if key is None: cells.append("✗") else: v = results.get(prov, {}).get(key) if isinstance(v, bool): cells.append("✓" if v else "✗") elif v is None: cells.append("✗") else: cells.append(str(v)) print(f"{label:<28} {cells[0]:>10} {cells[1]:>10} {cells[2]:>10} {cells[3]:>10}") if __name__ == "__main__": results = {} print(f"Testing query: '{QUERY}'\n") print(f"Targeting remote API: {KU_BASE}\n") for name, fn in [("tavily", test_tavily), ("exa", test_exa), ("serpapi", test_serpapi), ("knowledge_universe", test_ku)]: key_map = {"tavily": "TAVILY_API_KEY", "exa": "EXA_API_KEY", "serpapi": "SERPAPI_KEY"} env_key = key_map.get(name) if env_key and not os.getenv(env_key): print(f"\n⚠ Skipping {name} — {env_key} not set in .env") continue try: results[name] = fn() except Exception as e: print(f"\n✗ {name} failed: {e}") print_table(results) with open("research_notes_t1.json", "w") as f: json.dump(results, f, indent=2) print(f"\n✓ Results saved to research_notes_t1.json")