Spaces:

vlsiddarth
/

Knowledge-Universe

Running

File size: 11,881 Bytes

"""
Knowledge Universe — Block 3 Gap Analysis (Cycle 2)
=====================================================
Run: python scripts/gap_analysis.py

Cycle 2 auto-grader fix:
  The Cycle 1 auto-grader gave HuggingFace RLHF datasets a grade of 0
  because "Anthropic/hh-rlhf" doesn't contain all the words
  "reward model training" in the dataset name.

  Fix: HuggingFace datasets are graded by whether their name/title
  contains ANY of the significant query words — not all of them.
  "hh-rlhf" contains "rlhf" → grade 2 for RLHF query.
  "Anthropic/hh-rlhf" from Anthropic for RLHF query → grade 2.

  Also: "Generalisation of RLHF under Reward Shift" arXiv paper
  contains "RLHF" and "Reward" → grade 3, not 1.
"""

import os
import httpx
import time
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv("API_KEY")
BASE    = "http://localhost:8000"

TEST_QUERIES = [
    (
        "mixture of experts architecture", 4,
        "Expect: arxiv papers on MoE (sparse, gating, routing). "
        "Bug if: OpenLibrary returns building-architecture books."
    ),
    (
        "Claude 3.5 sonnet", 2,
        "Fast-moving topic. Expect: recent blog posts, GitHub repos, YouTube. "
        "Bug if: anything labeled 'decayed' ranks in top 3."
    ),
    (
        "LangChain streaming callbacks", 3,
        "Practical coding. Expect: GitHub + StackOverflow specifically about "
        "streaming callbacks. Bug if: generic LangChain results with no "
        "streaming or callback mention."
    ),
    (
        "RLHF reward model training", 5,
        "Research-heavy. Expect: arxiv papers, difficulty 4-5 results. "
        "Bug if: beginner YouTube tutorials rank above research papers."
    ),
    (
        "what is machine learning", 1,
        "Beginner. Expect: Wikipedia, YouTube explainers, difficulty 1-2. "
        "Bug if: arxiv papers with difficulty 4+ dominate top 3."
    ),
]

GRADING = """
GRADE each result 0-3:
  3 = Exactly what an expert would recommend
  2 = Relevant, not the best
  1 = Tangentially related
  0 = Irrelevant (e.g. restaurant recommendation paper for ML query)
"""


def auto_grade(source: dict, topic: str, requested_difficulty: int) -> int:
    """
    Improved auto-grader:
    - Checks title AND summary for topic word matches
    - HuggingFace datasets: any 1 topic word match = grade 2
    - arXiv: match in title = grade 3, match in summary = grade 2
    - Difficulty mismatch > 2 = grade 0 regardless
    """
    title    = (source.get("title") or "").lower()
    summary  = (source.get("summary") or "").lower()
    platform = source.get("source_platform", "")
    diff     = source.get("difficulty", 3)

    topic_lower = topic.lower()
    topic_words = [
        w for w in topic_lower.split()
        if len(w) > 3 and w not in {
            "what", "does", "how", "the", "and", "for", "with", "from"
        }
    ]

    # Hard penalty for severe difficulty mismatch
    diff_gap = abs(int(diff) - requested_difficulty)
    if diff_gap > 2:
        return 0

    # Check matches in title and summary
    title_matches   = sum(1 for w in topic_words if w in title)
    summary_matches = sum(1 for w in topic_words if w in summary)
    total_matches   = title_matches + summary_matches

    # Platform-specific grading
    if platform == "arxiv":
        if title_matches >= 2:
            return 3  # Multiple topic words in arXiv title → perfect
        elif title_matches >= 1 or summary_matches >= 2:
            return 2  # Partial title or strong summary match
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "wikipedia":
        if title_matches >= 1:
            return 3  # Wikipedia article directly about the topic
        elif summary_matches >= 1:
            return 2
        return 1  # Wikipedia is always somewhat relevant

    if platform == "stackoverflow":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "github":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1 or summary_matches >= 2:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "youtube":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "huggingface":
        # HuggingFace datasets: any 1 word match in name = relevant
        # "Anthropic/hh-rlhf" → contains "rlhf" → grade 2 for RLHF query
        if title_matches >= 1 or summary_matches >= 1:
            return 2
        return 0

    if platform == "kaggle":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "mit_ocw":
        # MIT OCW courses: if topic is in their description, it's grade 2
        if total_matches >= 1:
            return 2
        return 1  # MIT OCW is always somewhat relevant for ML queries

    # Default: any match = 1, good match = 2
    if title_matches >= 1:
        return 2
    elif summary_matches >= 1:
        return 1
    return 0


def run_query(topic, difficulty):
    resp = httpx.post(
        f"{BASE}/v1/discover",
        headers={"X-API-Key": API_KEY},
        json={
            "topic":       topic,
            "difficulty":  difficulty,
            "formats":     ["pdf", "github", "jupyter", "video", "stackoverflow", "html"],
            "max_results": 10,
        },
        timeout=90,
    )
    return resp.json()


def flag_issues(sources, topic, difficulty):
    flags = []
    platforms = [s.get("source_platform", "") for s in sources]

    # 1. OpenLibrary mismatch for ML queries
    ol = [s for s in sources if s.get("source_platform") == "openlibrary"]
    if ol:
        flags.append(f"⚠ OpenLibrary: {[s.get('title', '')[:45] for s in ol]}")

    # 2. Stale content on fast-moving topics
    fast_keywords = ["claude", "sonnet", "gpt-4", "gemini", "llama 3", "mistral"]
    if any(kw in topic.lower() for kw in fast_keywords):
        stale = [
            s for s in sources
            if s.get("decay_report", {}).get("label") in ("stale", "decayed")
        ]
        if stale:
            flags.append(
                f"⚠ {len(stale)}/10 stale/decayed on fast topic: "
                f"{[s.get('title', '')[:35] for s in stale[:2]]}"
            )

    # 3. Difficulty ceiling violations (should be 0 after fix)
    wrong = [
        s for s in sources
        if abs(s.get("difficulty", 3) - difficulty) > 2
    ]
    if wrong:
        wrong_desc = [
            str(s.get("source_platform", "")) + " d=" + str(s.get("difficulty"))
            for s in wrong[:3]
        ]
        flags.append(f"⚠ Difficulty ceiling violated: {wrong_desc}")

    # 4. Platform dominance
    for p in set(platforms):
        if not p: continue
        count = platforms.count(p)
        if count > 4:
            flags.append(f"⚠ {p} dominates: {count}/10 results")

    # 5. Result count too low
    if len(sources) < 7:
        flags.append(
            f"⚠ LOW RESULTS: only {len(sources)}/10 returned. "
            f"Platforms: {list(set(platforms))}"
        )

    # 6. Compound query specific check
    if "streaming" in topic.lower() and "callback" in topic.lower():
        streaming_hits = [
            s for s in sources
            if "streaming" in (s.get("title") or "").lower()
            or "callback" in (s.get("title") or "").lower()
        ]
        if len(streaming_hits) < 2:
            flags.append(
                f"⚠ COMPOUND QUERY: only {len(streaming_hits)} results "
                f"mention 'streaming' or 'callback' in title"
            )

    return flags


def main():
    print("KU API GAP ANALYSIS — BLOCK 3 CYCLE 2")
    print("=" * 62)
    print(GRADING)
    print("TIP: Auto-grades shown. Override manually if wrong.")
    print("     Any query with avg < 2.0 is a bug to fix.\n")

    summary          = []
    total_auto_grades = []

    for topic, difficulty, note in TEST_QUERIES:
        print(f"\n{'─'*62}")
        print(f"QUERY:      '{topic}'")
        print(f"DIFFICULTY: {difficulty}")
        print(f"EXPECTED:   {note}")
        print(f"{'─'*62}")

        start = time.time()
        try:
            data    = run_query(topic, difficulty)
            ms      = round((time.time() - start) * 1000)
            sources = data.get("sources", [])
            cache   = data.get("cache_hit", False)

            print(f"Returned {len(sources)} results in {ms}ms  cache={cache}")
            print()

            query_grades = []
            for i, s in enumerate(sources[:5], 1):
                d         = s.get("decay_report") or {}
                diff_delta = abs(s.get("difficulty", 3) - difficulty)
                diff_icon  = "✓" if diff_delta <= 1 else f"⚠d±{diff_delta}"

                auto_g = auto_grade(s, topic, difficulty)
                query_grades.append(auto_g)
                total_auto_grades.append(auto_g)

                grade_icon = "✅" if auto_g >= 2 else ("🟡" if auto_g == 1 else "🔴")

                print(
                    f"  [{i}] {diff_icon:<8} "
                    f"platform={s.get('source_platform', 'unknown'):<16} "
                    f"quality={s.get('quality_score', 0):<5} "
                    f"decay={d.get('decay_score', '?')} "
                    f"({d.get('label', '?')}, {d.get('age_days', '?')}d)"
                )
                print(f"       {s.get('title', '')[:62]}")
                print(f"       AUTO-GRADE: {grade_icon} {auto_g}/3  |  MANUAL GRADE: [ /3]")
                print()

            avg     = sum(query_grades) / len(query_grades) if query_grades else 0
            verdict = "✅ PASS" if avg >= 2.0 else "❌ FAIL"
            print(f"  Auto avg: {avg:.1f}/3  →  {verdict}")

            flags = flag_issues(sources, topic, difficulty)
            if flags:
                print("\n  AUTO-FLAGS:")
                for f in flags:
                    print(f"    {f}")

            summary.append({
                "query":      topic,
                "difficulty": difficulty,
                "count":      len(sources),
                "ms":         ms,
                "avg_grade":  avg,
                "verdict":    verdict,
                "flags":      flags,
            })

        except httpx.ConnectError:
            print("  ✗ Cannot connect. Run: uvicorn src.api.main:app --reload --port 8000")
        except Exception as e:
            print(f"  ✗ Error: {e}")

    # Summary table
    print(f"\n{'='*62}")
    print("BLOCK 3 CYCLE 2 SUMMARY")
    print(f"{'='*62}")
    print(f"{'Query':<40} {'Results':>8} {'ms':>6} {'Avg':>5} {'Verdict':>8}")
    print("-" * 62)
    for s in summary:
        print(
            f"{s['query'][:40]:<40} "
            f"{s['count']:>8} "
            f"{s['ms']:>6} "
            f"{s['avg_grade']:>5.1f} "
            f"{s['verdict']:>8}"
        )

    passed       = sum(1 for s in summary if "PASS" in s.get("verdict", ""))
    total        = len(summary)
    overall_avg  = (
        sum(total_auto_grades) / len(total_auto_grades)
        if total_auto_grades else 0
    )

    print(f"\n{'='*62}")
    print(
        f"OVERALL: {passed}/{total} queries pass  |  "
        f"Avg grade: {overall_avg:.2f}/3"
    )

    if passed == total:
        print("🎉 Block 3 complete. Record the demo video. Post to Reddit.")
    else:
        print(f"⚠  {total - passed} queries still failing. Check AUTO-FLAGS above.")

    print(f"{'='*62}")


if __name__ == "__main__":
    main()