Spaces:

vlsiddarth
/

Knowledge-Universe

Running

File size: 9,564 Bytes

"""
Knowledge Universe — T1 Competitor Analysis
Run: python scripts/competitor_test.py

Requires .env entries:
  TAVILY_API_KEY=tvly-...
  EXA_API_KEY=...
  SERPAPI_KEY=...
  API_KEY=...          (Knowledge Universe key, sent as the X-API-Key header)

Install: pip install tavily-python exa-py google-search-results httpx python-dotenv
"""

import os, time, json
from dotenv import load_dotenv
load_dotenv()

# Search topic sent unchanged to every provider.
QUERY   = "transformer architecture"
# Knowledge Universe API key, read from the environment (or .env via load_dotenv).
# There is no built-in fallback: when API_KEY is unset this is None.
API_KEY = os.getenv("API_KEY")
# Base URL of the hosted Knowledge Universe API.
KU_BASE = "https://vlsiddarth-knowledge-universe.hf.space"


def test_tavily():
    """Run QUERY against Tavily and summarise latency and result metadata.

    Prints the result count, the field names of the first hit and the top
    five hits, then returns a JSON-serialisable dict with the keys
    ``provider``, ``latency_ms``, ``result_count``, ``has_scores``,
    ``has_dates``, ``has_decay``, ``raw_fields`` and ``domains``.

    Reads TAVILY_API_KEY from the environment. Exceptions raised by the
    Tavily client propagate to the caller.
    """
    from urllib.parse import urlparse

    from tavily import TavilyClient

    def domain_of(url):
        # Host part of *url*, or "" when it has no scheme/host or cannot be parsed.
        try:
            return urlparse(url).netloc
        except ValueError:
            return ""

    client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
    # perf_counter is monotonic, so a system clock adjustment during the
    # request cannot distort the measured latency.
    start = time.perf_counter()
    response = client.search(query=QUERY, search_depth="advanced", max_results=10)
    ms = round((time.perf_counter() - start) * 1000, 1)

    results = response.get("results") or []
    first = results[0] if results else {}

    print(f"\n{'='*60}\nTAVILY — {ms}ms\n{'='*60}")
    print(f"Count: {len(results)}  |  Fields: {list(first.keys())}")
    for i, r in enumerate(results[:5], 1):
        score = r.get("score")
        # A present-but-null score or title must not abort the whole run.
        score_str = "N/A" if score is None else score
        print(f"  [{i}] score={score_str:<6}  {(r.get('title') or '')[:55]}")
        print(f"       {(r.get('url') or '')[:60]}")

    # Sorted so the saved JSON is stable between runs; empty hosts are dropped.
    domains = sorted({domain_of(r["url"]) for r in results if r.get("url")} - {""})

    return {
        "provider":       "tavily",
        "latency_ms":     ms,
        "result_count":   len(results),
        "has_scores":     "score"          in first,
        "has_dates":      "published_date" in first,
        "has_decay":      False,
        "raw_fields":     list(first.keys()),
        "domains":        domains,
    }


def test_exa():
    """Run QUERY against Exa and summarise latency and result metadata.

    Prints the result count and the top five hits (score, published date,
    title, URL), then returns a JSON-serialisable dict with the keys
    ``provider``, ``latency_ms``, ``result_count``, ``has_scores``,
    ``has_dates``, ``has_decay``, ``raw_fields`` and ``domains``.

    Reads EXA_API_KEY from the environment. Exceptions raised by the Exa
    client propagate to the caller.
    """
    from urllib.parse import urlparse

    # exa-py 2.x API
    from exa_py import Exa

    def domain_of(url):
        # Host part of *url*, or "" when it has no scheme/host or cannot be parsed.
        try:
            return urlparse(url).netloc
        except ValueError:
            return ""

    client = Exa(api_key=os.getenv("EXA_API_KEY"))
    # perf_counter is monotonic, so a system clock adjustment during the
    # request cannot distort the measured latency.
    start = time.perf_counter()
    result = client.search(
        QUERY,
        num_results=10,
        type="auto",
        contents={"text": True},
    )
    ms = round((time.perf_counter() - start) * 1000, 1)

    # Tolerate both a missing attribute and an explicit None.
    results = getattr(result, "results", None) or []
    first = results[0] if results else None

    print(f"\n{'='*60}\nEXA — {ms}ms\n{'='*60}")
    print(f"Count: {len(results)}")

    for i, r in enumerate(results[:5], 1):
        raw_score = getattr(r, "score", None)
        score_str = f"{raw_score:.4f}" if isinstance(raw_score, (int, float)) else "N/A"

        raw_date = getattr(r, "published_date", "N/A")
        date_str = str(raw_date) if raw_date else "N/A"

        title = str(getattr(r, "title", ""))[:55]
        url   = str(getattr(r, "url",   ""))[:60]

        print(f"  [{i}] score={score_str:<10} date={date_str:<12} {title}")
        print(f"       {url}")

    # NOTE: dir() lists every public attribute, so this can include methods
    # as well as data fields of the result object.
    first_attrs = [a for a in dir(first) if not a.startswith("_")] if first else []

    # Sorted so the saved JSON is stable between runs; empty hosts are dropped.
    domains = sorted(
        {domain_of(str(r.url)) for r in results if getattr(r, "url", None)} - {""}
    )

    return {
        "provider":     "exa",
        "latency_ms":   ms,
        "result_count": len(results),
        "has_scores":   any(getattr(r, "score", None) is not None for r in results),
        "has_dates":    any(getattr(r, "published_date", None) is not None for r in results),
        "has_decay":    False,
        "raw_fields":   first_attrs,
        "domains":      domains,
    }


def test_serpapi():
    """Run QUERY against SerpAPI (Google) and summarise latency and results.

    Prints the organic result count, the field names of the first hit and the
    top five hits, then returns a JSON-serialisable dict with the keys
    ``provider``, ``latency_ms``, ``result_count``, ``has_scores``,
    ``has_dates``, ``has_decay``, ``raw_fields`` and ``domains``.

    Reads SERPAPI_KEY from the environment. Exceptions raised by the SerpAPI
    client propagate to the caller.
    """
    from urllib.parse import urlparse

    from serpapi import GoogleSearch

    def domain_of(url):
        # Host part of *url*, or "" when it has no scheme/host or cannot be parsed.
        try:
            return urlparse(url).netloc
        except ValueError:
            return ""

    # perf_counter is monotonic, so a system clock adjustment during the
    # request cannot distort the measured latency.
    start = time.perf_counter()
    result = GoogleSearch({"q": QUERY, "api_key": os.getenv("SERPAPI_KEY"), "num": 10}).get_dict()
    ms = round((time.perf_counter() - start) * 1000, 1)

    organics = result.get("organic_results") or []
    first = organics[0] if organics else {}

    print(f"\n{'='*60}\nSERPAPI — {ms}ms\n{'='*60}")
    # Surface an API-level error message instead of silently reporting zero
    # results (assumes failures arrive in a top-level "error" field — see the
    # SerpApi status/error documentation).
    if result.get("error"):
        print(f"  ⚠  SerpAPI reported: {result['error']}")
    print(f"Count: {len(organics)}  |  Fields: {list(first.keys())}")
    for i, r in enumerate(organics[:5], 1):
        # A present-but-null date or title must not abort the whole run.
        date_str = r.get("date") or "N/A"
        print(f"  [{i}] pos={r.get('position')}  date={date_str:<12}  {(r.get('title') or '')[:50]}")
        print(f"       {(r.get('link') or '')[:60]}")

    # Sorted so the saved JSON is stable between runs; empty hosts are dropped.
    domains = sorted({domain_of(r["link"]) for r in organics if r.get("link")} - {""})

    return {
        "provider":     "serpapi",
        "latency_ms":   ms,
        "result_count": len(organics),
        "has_scores":   False,
        "has_dates":    any(r.get("date") for r in organics),
        "has_decay":    False,
        "raw_fields":   list(first.keys()),
        "domains":      domains,
    }


def test_ku():
    """Run QUERY against the Knowledge Universe /v1/discover endpoint.

    POSTs to ``{KU_BASE}/v1/discover`` with the module-level API_KEY in the
    ``X-API-Key`` header, prints the top five sources and returns a
    JSON-serialisable summary dict.

    On a non-200 response, an unparsable body or a non-object JSON body, the
    raw response text is printed and a summary with ``result_count`` 0 and
    every capability flag False is returned (always including
    ``latency_ms``).

    Raises:
        RuntimeError: if API_KEY is not set.
        Exception: transport errors from httpx (e.g. timeouts) propagate.
    """
    import httpx

    if not API_KEY:
        # Fail with a readable message rather than a header-encoding error
        # from deep inside the HTTP client.
        raise RuntimeError("API_KEY is not set — cannot call the Knowledge Universe API")

    def failed(latency_ms):
        # Uniform summary for every failure path so the table and the saved
        # JSON always carry the same keys.
        return {
            "provider": "knowledge_universe", "latency_ms": latency_ms, "result_count": 0,
            "has_scores": False, "has_decay": False, "has_dates": False,
            "has_difficulty": False, "has_pedagogical": False, "has_format_filter": False,
            "has_embeddings": False
        }

    # perf_counter is monotonic, so a system clock adjustment during the
    # request cannot distort the measured latency.
    start = time.perf_counter()
    resp  = httpx.post(
        f"{KU_BASE}/v1/discover",
        headers={"X-API-Key": API_KEY},
        json={"topic": QUERY, "difficulty": 3,
              "formats": ["pdf","github","jupyter","video","stackoverflow"], "max_results": 10},
        timeout=60,
    )
    ms = round((time.perf_counter() - start) * 1000, 1)

    if resp.status_code != 200:
        print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE HTTP ERROR: {resp.status_code}\n{'='*60}")
        print(f"Raw Error Response:\n{resp.text}")
        return failed(ms)

    try:
        data = resp.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        data = None
    if not isinstance(data, dict):
        print(f"\n[DEBUG] Failed to parse JSON. Raw text: {resp.text}")
        return failed(ms)

    # "or" guards against keys that are present but null.
    sources = data.get("sources") or []
    platforms = list(data.get("formats_found") or {})

    print(f"\n{'='*60}\nKNOWLEDGE UNIVERSE — {ms}ms (cache={data.get('cache_hit')})\n{'='*60}")
    print(f"Count: {len(sources)}  |  Platforms: {platforms}")
    for i, s in enumerate(sources[:5], 1):
        d = s.get("decay_report") or {}
        quality = s.get("quality_score")
        quality = 0 if quality is None else quality
        print(f"  [{i}] quality={quality:<5} decay={d.get('decay_score','?')} ({d.get('label','?')})")
        print(f"       [{s.get('source_platform', 'unknown')}] {(s.get('title') or '')[:55]}")

    return {
        "provider":           "knowledge_universe",
        "latency_ms":         ms,
        "result_count":       len(sources),
        # Measured from the response, like the other providers, so an empty
        # or degraded response is not reported as a full-featured one.
        "has_scores":         any(s.get("quality_score") is not None for s in sources),
        "has_decay":          any(s.get("decay_report") for s in sources),
        # NOTE(review): the flags below are asserted, not derived from the
        # response — this function never reads the fields that would confirm
        # them. TODO: measure them the same way as has_scores/has_decay.
        "has_dates":          True,
        "has_difficulty":     True,
        "has_pedagogical":    True,
        "has_format_filter":  True,
        "has_embeddings":     True,
        "output_formats":     ["json", "embeddings", "html"],
        "platforms_covered":  platforms,
    }


def print_table(results):
    """Print a feature-by-provider comparison table to stdout.

    Args:
        results: dict mapping a provider name ("tavily", "exa", "serpapi",
            "knowledge_universe") to the summary dict produced by that
            provider's test function. Providers that were skipped or failed
            may be absent.

    Cell contents:
        "✓" / "✗"  for boolean values,
        "✗"        for Knowledge-Universe-only features in the other columns,
        "N/A"      when the provider has no measurement for that row (it was
                   not run, or its summary lacks the key), so "not measured"
                   is not mistaken for "feature absent",
        str(value) for anything else (latency, counts).
    """
    providers = ["tavily", "exa", "serpapi", "knowledge_universe"]

    # (row label, summary key, True when only Knowledge Universe reports it)
    rows = [
        ("Cold latency",          "latency_ms",        False),
        ("Results returned",      "result_count",      False),
        ("Relevance scores",      "has_scores",        False),
        ("Publication dates",     "has_dates",         False),
        ("Freshness/decay score", "has_decay",         False),
        ("Difficulty rating",     "has_difficulty",    True),
        ("Pedagogical fit",       "has_pedagogical",   True),
        ("Format filtering",      "has_format_filter", True),
        ("Embeddings output",     "has_embeddings",    True),
    ]

    def render(provider, key, ku_only):
        if ku_only and provider != "knowledge_universe":
            return "✗"
        value = results.get(provider, {}).get(key)
        if value is None:
            return "N/A"
        # bool is checked before the generic str() branch so that flags
        # render as symbols rather than "True"/"False".
        if isinstance(value, bool):
            return "✓" if value else "✗"
        return str(value)

    print(f"\n{'='*72}")
    print("FINAL COMPARISON TABLE")
    print(f"{'='*72}")
    print(f"{'Feature':<28} {'Tavily':>10} {'Exa':>10} {'SerpAPI':>10} {'KU':>10}")
    print("-" * 72)

    for label, key, ku_only in rows:
        cells = [render(provider, key, ku_only) for provider in providers]
        print(f"{label:<28} " + " ".join(f"{cell:>10}" for cell in cells))


def main():
    """Run every provider benchmark, print the comparison and save the results.

    Providers whose API key environment variable is unset are skipped with a
    warning. A provider that raises is reported and left out of the results
    so the remaining providers still run. The collected summaries are passed
    to print_table() and written to ``research_notes_t1.json`` in the current
    working directory.
    """
    # Environment variable each provider needs before it can be called.
    required_env = {
        "tavily":             "TAVILY_API_KEY",
        "exa":                "EXA_API_KEY",
        "serpapi":            "SERPAPI_KEY",
        "knowledge_universe": "API_KEY",
    }
    runners = [
        ("tavily",             test_tavily),
        ("exa",                test_exa),
        ("serpapi",            test_serpapi),
        ("knowledge_universe", test_ku),
    ]

    results = {}
    print(f"Testing query: '{QUERY}'\n")
    print(f"Targeting remote API: {KU_BASE}\n")

    for name, fn in runners:
        env_key = required_env.get(name)
        if env_key and not os.getenv(env_key):
            print(f"\n⚠  Skipping {name} — {env_key} not set in .env")
            continue
        try:
            results[name] = fn()
        except Exception as e:
            # Top-level boundary: one provider failing must not abort the
            # rest of the comparison.
            print(f"\n✗ {name} failed: {e}")

    print_table(results)

    with open("research_notes_t1.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("\n✓ Results saved to research_notes_t1.json")


if __name__ == "__main__":
    main()