Spaces:
Running
Running
| """ | |
| Knowledge Universe — T1 Competitor Analysis | |
| Run: python scripts/competitor_test.py | |
| Requires .env entries: | |
| TAVILY_API_KEY=tvly-... | |
| EXA_API_KEY=... | |
| SERPAPI_KEY=... | |
| API_KEY=... (Knowledge Universe key, sent as the X-API-Key header) | |
| Install: pip install tavily-python exa-py google-search-results httpx python-dotenv | |
| """ | |
| import os, time, json | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# Search phrase sent to every provider under test.
QUERY = "transformer architecture"
# Knowledge Universe API key, read from the environment (or .env). There is
# no built-in fallback: this is None when API_KEY is not set.
API_KEY = os.environ.get("API_KEY")
# Base URL of the remote Knowledge Universe API.
KU_BASE = "https://vlsiddarth-knowledge-universe.hf.space"
def test_tavily():
    """Run QUERY through Tavily and summarise what comes back.

    Prints the latency, the result count, the field names of the first hit
    and the top five hits.

    Returns:
        A JSON-serialisable dict with the keys ``provider``, ``latency_ms``,
        ``result_count``, ``has_scores``, ``has_dates``, ``has_decay``,
        ``raw_fields`` and ``domains`` (sorted, de-duplicated hosts of the
        result URLs).
    """
    from urllib.parse import urlparse

    from tavily import TavilyClient

    client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    result = client.search(query=QUERY, search_depth="advanced", max_results=10)
    ms = round((time.perf_counter() - start) * 1000, 1)

    results = result.get("results") or []
    first = results[0] if results else {}

    print(f"\n{'='*60}\nTAVILY — {ms}ms\n{'='*60}")
    print(f"Count: {len(results)} | Fields: {list(first.keys())}")
    for i, r in enumerate(results[:5], 1):
        # A key that is present but None would break the format spec/slice.
        score = r.get("score")
        score = "N/A" if score is None else score
        title = (r.get("title") or "")[:55]
        url = (r.get("url") or "")[:60]
        print(f" [{i}] score={score:<6} {title}")
        print(f" {url}")

    # urlparse copes with scheme-less or path-less URLs, where indexing the
    # result of split("/") raises IndexError or returns part of the query.
    hosts = {urlparse(r["url"]).netloc for r in results if r.get("url")}
    hosts.discard("")

    return {
        "provider": "tavily",
        "latency_ms": ms,
        "result_count": len(results),
        "has_scores": "score" in first,
        "has_dates": "published_date" in first,
        "has_decay": False,
        "raw_fields": list(first.keys()),
        "domains": sorted(hosts),  # sorted so the saved JSON is stable
    }
def test_exa():
    """Run QUERY through Exa and summarise what comes back.

    Prints the latency, the result count and the top five hits with their
    score and publication date when available.

    Returns:
        A JSON-serialisable dict with the keys ``provider``, ``latency_ms``,
        ``result_count``, ``has_scores``, ``has_dates``, ``has_decay``,
        ``raw_fields`` and ``domains`` (sorted, de-duplicated hosts of the
        result URLs).
    """
    from urllib.parse import urlparse

    # exa-py 2.x API
    from exa_py import Exa

    client = Exa(api_key=os.getenv("EXA_API_KEY"))

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    result = client.search(
        QUERY,
        num_results=10,
        type="auto",
        contents={"text": True},
    )
    ms = round((time.perf_counter() - start) * 1000, 1)

    # Tolerate both a missing attribute and an explicit None.
    results = getattr(result, "results", None) or []
    first = results[0] if results else None

    print(f"\n{'='*60}\nEXA — {ms}ms\n{'='*60}")
    print(f"Count: {len(results)}")
    for i, r in enumerate(results[:5], 1):
        raw_score = getattr(r, "score", None)
        score_str = f"{raw_score:.4f}" if isinstance(raw_score, (int, float)) else "N/A"
        date_str = str(getattr(r, "published_date", None) or "N/A")
        title = str(getattr(r, "title", ""))[:55]
        url = str(getattr(r, "url", ""))[:60]
        print(f" [{i}] score={score_str:<10} date={date_str:<12} {title}")
        print(f" {url}")

    # dir() lists every public attribute of the result object, so this can
    # include methods as well as data fields.
    first_attrs = (
        [a for a in dir(first) if not a.startswith("_")] if first is not None else []
    )

    # urlparse copes with scheme-less or path-less URLs, where indexing the
    # result of split("/") raises IndexError or returns part of the query.
    hosts = {
        urlparse(str(r.url)).netloc for r in results if getattr(r, "url", None)
    }
    hosts.discard("")

    return {
        "provider": "exa",
        "latency_ms": ms,
        "result_count": len(results),
        "has_scores": any(getattr(r, "score", None) is not None for r in results),
        "has_dates": any(getattr(r, "published_date", None) is not None for r in results),
        "has_decay": False,
        "raw_fields": first_attrs,
        "domains": sorted(hosts),  # sorted so the saved JSON is stable
    }
def test_serpapi():
    """Run QUERY through SerpAPI (Google) and summarise the organic results.

    Prints the latency, the result count, the field names of the first hit
    and the top five hits with their position and date.

    Returns:
        A JSON-serialisable dict with the keys ``provider``, ``latency_ms``,
        ``result_count``, ``has_scores``, ``has_dates``, ``has_decay``,
        ``raw_fields`` and ``domains`` (sorted, de-duplicated hosts of the
        result links).
    """
    from urllib.parse import urlparse

    from serpapi import GoogleSearch

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    result = GoogleSearch(
        {"q": QUERY, "api_key": os.getenv("SERPAPI_KEY"), "num": 10}
    ).get_dict()
    ms = round((time.perf_counter() - start) * 1000, 1)

    organics = result.get("organic_results") or []
    first = organics[0] if organics else {}

    print(f"\n{'='*60}\nSERPAPI — {ms}ms\n{'='*60}")
    print(f"Count: {len(organics)} | Fields: {list(first.keys())}")
    for i, r in enumerate(organics[:5], 1):
        # A key that is present but None would break the format spec/slice.
        date = r.get("date") or "N/A"
        title = (r.get("title") or "")[:50]
        link = (r.get("link") or "")[:60]
        print(f" [{i}] pos={r.get('position')} date={date:<12} {title}")
        print(f" {link}")

    # urlparse copes with scheme-less or path-less URLs, where indexing the
    # result of split("/") raises IndexError or returns part of the query.
    hosts = {urlparse(r["link"]).netloc for r in organics if r.get("link")}
    hosts.discard("")

    return {
        "provider": "serpapi",
        "latency_ms": ms,
        "result_count": len(organics),
        "has_scores": False,
        "has_dates": any(r.get("date") for r in organics),
        "has_decay": False,
        "raw_fields": list(first.keys()),
        "domains": sorted(hosts),  # sorted so the saved JSON is stable
    }
def test_ku():
    """POST QUERY to the Knowledge Universe ``/v1/discover`` endpoint.

    Prints the latency, cache status, source count and the top five sources
    with their quality and decay information.

    Returns:
        A JSON-serialisable summary dict. When the request returns a non-200
        status or a body that is not valid JSON, a failure summary is
        returned instead: same ``provider``/``latency_ms`` keys,
        ``result_count`` 0 and every ``has_*`` flag False.
    """
    import httpx

    # Send the key only when one is configured; a None header value is not a
    # valid str/bytes header and would abort the request before it is sent.
    headers = {"X-API-Key": API_KEY} if API_KEY else {}

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    resp = httpx.post(
        f"{KU_BASE}/v1/discover",
        headers=headers,
        json={
            "topic": QUERY,
            "difficulty": 3,
            "formats": ["pdf", "github", "jupyter", "video", "stackoverflow"],
            "max_results": 10,
        },
        timeout=60,
    )
    ms = round((time.perf_counter() - start) * 1000, 1)

    # Shared by both failure paths so the saved summary always has the same
    # keys, whichever way the call failed.
    failure = {
        "provider": "knowledge_universe", "latency_ms": ms, "result_count": 0,
        "has_scores": False, "has_decay": False, "has_dates": False,
        "has_difficulty": False, "has_pedagogical": False, "has_format_filter": False,
        "has_embeddings": False,
    }

    # --- AGGRESSIVE ERROR CATCHING ---
    if resp.status_code != 200:
        print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE HTTP ERROR: {resp.status_code}\n{'='*60}")
        print(f"Raw Error Response:\n{resp.text}")
        return failure

    try:
        data = resp.json()
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print(f"\n[DEBUG] Failed to parse JSON. Raw text: {resp.text}")
        return failure

    # "or" guards against keys that are present but null in the response.
    sources = data.get("sources") or []
    platforms = list(data.get("formats_found") or {})

    print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE — {ms}ms (cache={data.get('cache_hit')})\n{'='*60}")
    print(f"Count: {len(sources)} | Platforms: {platforms}")
    for i, s in enumerate(sources[:5], 1):
        d = s.get("decay_report") or {}
        quality = s.get("quality_score")
        quality = 0 if quality is None else quality
        title = (s.get("title") or "")[:55]
        print(f" [{i}] quality={quality:<5} decay={d.get('decay_score','?')} ({d.get('label','?')})")
        print(f" [{s.get('source_platform', 'unknown')}] {title}")

    # NOTE(review): the has_* flags below are asserted as API capabilities;
    # they are not derived from the response that was just received.
    return {
        "provider": "knowledge_universe",
        "latency_ms": ms,
        "result_count": len(sources),
        "has_scores": True,
        "has_decay": True,
        "has_dates": True,
        "has_difficulty": True,
        "has_pedagogical": True,
        "has_format_filter": True,
        "has_embeddings": True,
        "output_formats": ["json", "embeddings", "html"],
        "platforms_covered": platforms,
    }
def print_table(results):
    """Print a feature-comparison table for the four providers.

    Args:
        results: Mapping of provider name (``"tavily"``, ``"exa"``,
            ``"serpapi"``, ``"knowledge_universe"``) to the summary dict
            returned by that provider's test function. Providers that are
            absent, and keys that are absent, are rendered as ``—``.

    Booleans are rendered as ``✓`` / ``✗``; every other value is rendered
    with ``str()``.
    """
    yes, no, blank = "✓", "✗", "—"
    ku = "knowledge_universe"
    providers = ["tavily", "exa", "serpapi", ku]

    # (row label, result key, True when only Knowledge Universe reports it)
    rows = [
        ("Cold latency", "latency_ms", False),
        ("Results returned", "result_count", False),
        ("Relevance scores", "has_scores", False),
        ("Publication dates", "has_dates", False),
        ("Freshness/decay score", "has_decay", False),
        ("Difficulty rating", "has_difficulty", True),
        ("Pedagogical fit", "has_pedagogical", True),
        ("Format filtering", "has_format_filter", True),
        ("Embeddings output", "has_embeddings", True),
    ]

    def cell(provider, key):
        """Render one table cell for *provider* and *key*."""
        value = (results.get(provider) or {}).get(key)
        # bool must be tested first: it would otherwise print as True/False.
        if isinstance(value, bool):
            return yes if value else no
        return blank if value is None else str(value)

    print(f"\n{'='*72}")
    print("FINAL COMPARISON TABLE")
    print(f"{'='*72}")
    print(f"{'Feature':<28} {'Tavily':>10} {'Exa':>10} {'SerpAPI':>10} {'KU':>10}")
    print("-" * 72)

    for label, key, ku_only in rows:
        cells = [
            blank if ku_only and provider != ku else cell(provider, key)
            for provider in providers
        ]
        print(f"{label:<28} " + " ".join(f"{c:>10}" for c in cells))
def main():
    """Run every provider test, print the comparison table and save it.

    Providers whose API key is missing from the environment are skipped.
    A provider that raises is reported and left out of the results. The
    collected summaries are written to ``research_notes_t1.json`` in the
    current working directory.
    """
    # Environment variable each provider requires. Providers not listed
    # here are always attempted.
    required_env = {
        "tavily": "TAVILY_API_KEY",
        "exa": "EXA_API_KEY",
        "serpapi": "SERPAPI_KEY",
    }
    tests = [
        ("tavily", test_tavily),
        ("exa", test_exa),
        ("serpapi", test_serpapi),
        ("knowledge_universe", test_ku),
    ]

    results = {}
    print(f"Testing query: '{QUERY}'\n")
    print(f"Targeting remote API: {KU_BASE}\n")

    for name, fn in tests:
        env_key = required_env.get(name)
        if env_key and not os.getenv(env_key):
            print(f"\n⚠ Skipping {name} — {env_key} not set in .env")
            continue
        try:
            results[name] = fn()
        except Exception as e:
            # Top-level boundary: one failing provider (missing package,
            # network error, bad response) must not abort the others.
            print(f"\n✗ {name} failed: {e}")

    print_table(results)

    # Explicit encoding so the file does not depend on the platform default.
    with open("research_notes_t1.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("\n✓ Results saved to research_notes_t1.json")


if __name__ == "__main__":
    main()