""" Knowledge Universe — Block 3 Gap Analysis (Cycle 2) ===================================================== Run: python scripts/gap_analysis.py Cycle 2 auto-grader fix: The Cycle 1 auto-grader gave HuggingFace RLHF datasets a grade of 0 because "Anthropic/hh-rlhf" doesn't contain all the words "reward model training" in the dataset name. Fix: HuggingFace datasets are graded by whether their name/title contains ANY of the significant query words — not all of them. "hh-rlhf" contains "rlhf" → grade 2 for RLHF query. "Anthropic/hh-rlhf" from Anthropic for RLHF query → grade 2. Also: "Generalisation of RLHF under Reward Shift" arXiv paper contains "RLHF" and "Reward" → grade 3, not 1. """ import os import httpx import time from dotenv import load_dotenv load_dotenv() API_KEY = os.getenv("API_KEY") BASE = "http://localhost:8000" TEST_QUERIES = [ ( "mixture of experts architecture", 4, "Expect: arxiv papers on MoE (sparse, gating, routing). " "Bug if: OpenLibrary returns building-architecture books." ), ( "Claude 3.5 sonnet", 2, "Fast-moving topic. Expect: recent blog posts, GitHub repos, YouTube. " "Bug if: anything labeled 'decayed' ranks in top 3." ), ( "LangChain streaming callbacks", 3, "Practical coding. Expect: GitHub + StackOverflow specifically about " "streaming callbacks. Bug if: generic LangChain results with no " "streaming or callback mention." ), ( "RLHF reward model training", 5, "Research-heavy. Expect: arxiv papers, difficulty 4-5 results. " "Bug if: beginner YouTube tutorials rank above research papers." ), ( "what is machine learning", 1, "Beginner. Expect: Wikipedia, YouTube explainers, difficulty 1-2. " "Bug if: arxiv papers with difficulty 4+ dominate top 3." ), ] GRADING = """ GRADE each result 0-3: 3 = Exactly what an expert would recommend 2 = Relevant, not the best 1 = Tangentially related 0 = Irrelevant (e.g. 
def auto_grade(source: dict, topic: str, requested_difficulty: int) -> int:
    """
    Improved auto-grader:
    - Checks title AND summary for topic word matches
    - HuggingFace datasets: any 1 topic word match = grade 2
    - arXiv: match in title = grade 3, match in summary = grade 2
    - Difficulty mismatch > 2 = grade 0 regardless
    """
    title = (source.get("title") or "").lower()
    summary = (source.get("summary") or "").lower()
    platform = source.get("source_platform", "")
    diff = source.get("difficulty", 3)
    topic_lower = topic.lower()
    topic_words = [
        w for w in topic_lower.split()
        if len(w) > 3 and w not in {
            "what", "does", "how", "the", "and", "for", "with", "from"
        }
    ]

    # Hard penalty for severe difficulty mismatch
    diff_gap = abs(int(diff) - requested_difficulty)
    if diff_gap > 2:
        return 0

    # Check matches in title and summary
    title_matches = sum(1 for w in topic_words if w in title)
    summary_matches = sum(1 for w in topic_words if w in summary)
    total_matches = title_matches + summary_matches

    # Platform-specific grading
    if platform == "arxiv":
        if title_matches >= 2:
            return 3  # Multiple topic words in arXiv title → perfect
        elif title_matches >= 1 or summary_matches >= 2:
            return 2  # Partial title or strong summary match
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "wikipedia":
        if title_matches >= 1:
            return 3  # Wikipedia article directly about the topic
        elif summary_matches >= 1:
            return 2
        return 1  # Wikipedia is always somewhat relevant

    if platform == "stackoverflow":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "github":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1 or summary_matches >= 2:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "youtube":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "huggingface":
        # HuggingFace datasets: any 1 word match in name = relevant
        # "Anthropic/hh-rlhf" → contains "rlhf" → grade 2 for RLHF query
        if title_matches >= 1 or summary_matches >= 1:
            return 2
        return 0

    if platform == "kaggle":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "mit_ocw":
        # MIT OCW courses: if topic is in their description, it's grade 2
        if total_matches >= 1:
            return 2
        return 1  # MIT OCW is always somewhat relevant for ML queries

    # Default: any match = 1, good match = 2
    if title_matches >= 1:
        return 2
    elif summary_matches >= 1:
        return 1
    return 0


def run_query(topic, difficulty):
    resp = httpx.post(
        f"{BASE}/v1/discover",
        headers={"X-API-Key": API_KEY},
        json={
            "topic": topic,
            "difficulty": difficulty,
            "formats": ["pdf", "github", "jupyter", "video",
                        "stackoverflow", "html"],
            "max_results": 10,
        },
        timeout=90,
    )
    return resp.json()
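
# Sanity check for the Cycle 2 grading fix described in the module docstring.
# A minimal sketch: the two payloads below are hand-built fixtures, not real
# API responses, so every field value here is an assumption for illustration.
def _selftest_auto_grade():
    hf = {
        "title": "Anthropic/hh-rlhf",
        "summary": "",
        "source_platform": "huggingface",
        "difficulty": 4,  # assumed difficulty, within the ±2 tolerance
    }
    # "hh-rlhf" contains "rlhf" — one significant query word → grade 2
    assert auto_grade(hf, "RLHF reward model training", 5) == 2

    paper = {
        "title": "Generalisation of RLHF under Reward Shift",
        "summary": "",
        "source_platform": "arxiv",
        "difficulty": 5,
    }
    # Two topic words ("rlhf", "reward") in an arXiv title → grade 3
    assert auto_grade(paper, "RLHF reward model training", 5) == 3
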
def flag_issues(sources, topic, difficulty):
    flags = []
    platforms = [s.get("source_platform", "") for s in sources]

    # 1. OpenLibrary mismatch for ML queries
    ol = [s for s in sources if s.get("source_platform") == "openlibrary"]
    if ol:
        flags.append(f"⚠ OpenLibrary: {[s.get('title', '')[:45] for s in ol]}")

    # 2. Stale content on fast-moving topics
    fast_keywords = ["claude", "sonnet", "gpt-4", "gemini", "llama 3", "mistral"]
    if any(kw in topic.lower() for kw in fast_keywords):
        stale = [
            s for s in sources
            if s.get("decay_report", {}).get("label") in ("stale", "decayed")
        ]
        if stale:
            flags.append(
                f"⚠ {len(stale)}/10 stale/decayed on fast topic: "
                f"{[s.get('title', '')[:35] for s in stale[:2]]}"
            )

    # 3. Difficulty ceiling violations (should be 0 after fix)
    wrong = [
        s for s in sources
        if abs(s.get("difficulty", 3) - difficulty) > 2
    ]
    if wrong:
        wrong_desc = [
            str(s.get("source_platform", "")) + " d=" + str(s.get("difficulty"))
            for s in wrong[:3]
        ]
        flags.append(f"⚠ Difficulty ceiling violated: {wrong_desc}")

    # 4. Platform dominance
    for p in set(platforms):
        if not p:
            continue
        count = platforms.count(p)
        if count > 4:
            flags.append(f"⚠ {p} dominates: {count}/10 results")

    # 5. Result count too low
    if len(sources) < 7:
        flags.append(
            f"⚠ LOW RESULTS: only {len(sources)}/10 returned. "
            f"Platforms: {list(set(platforms))}"
        )

    # 6. Compound query specific check
    if "streaming" in topic.lower() and "callback" in topic.lower():
        streaming_hits = [
            s for s in sources
            if "streaming" in (s.get("title") or "").lower()
            or "callback" in (s.get("title") or "").lower()
        ]
        if len(streaming_hits) < 2:
            flags.append(
                f"⚠ COMPOUND QUERY: only {len(streaming_hits)} results "
                f"mention 'streaming' or 'callback' in title"
            )

    return flags
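
# Companion sanity check for flag_issues, again on a hypothetical fixture:
# a single building-architecture book on an ML query should trip both the
# OpenLibrary flag and the low-result-count flag. The title is invented for
# illustration only.
def _selftest_flag_issues():
    sources = [{
        "source_platform": "openlibrary",
        "title": "Gothic Architecture",  # assumed title, not a real result
        "difficulty": 3,
    }]
    flags = flag_issues(sources, "mixture of experts architecture", 4)
    assert any("OpenLibrary" in f for f in flags)
    assert any("LOW RESULTS" in f for f in flags)
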
def main():
    print("KU API GAP ANALYSIS — BLOCK 3 CYCLE 2")
    print("=" * 62)
    print(GRADING)
    print("TIP: Auto-grades shown. Override manually if wrong.")
    print("     Any query with avg < 2.0 is a bug to fix.\n")

    summary = []
    total_auto_grades = []

    for topic, difficulty, note in TEST_QUERIES:
        print(f"\n{'─'*62}")
        print(f"QUERY: '{topic}'")
        print(f"DIFFICULTY: {difficulty}")
        print(f"EXPECTED: {note}")
        print(f"{'─'*62}")

        start = time.time()
        try:
            data = run_query(topic, difficulty)
            ms = round((time.time() - start) * 1000)
            sources = data.get("sources", [])
            cache = data.get("cache_hit", False)
            print(f"Returned {len(sources)} results in {ms}ms cache={cache}")
            print()

            query_grades = []
            for i, s in enumerate(sources[:5], 1):
                d = s.get("decay_report") or {}
                diff_delta = abs(s.get("difficulty", 3) - difficulty)
                diff_icon = "✓" if diff_delta <= 1 else f"⚠d±{diff_delta}"
                auto_g = auto_grade(s, topic, difficulty)
                query_grades.append(auto_g)
                total_auto_grades.append(auto_g)
                grade_icon = "✅" if auto_g >= 2 else ("🟡" if auto_g == 1 else "🔴")
                print(
                    f"  [{i}] {diff_icon:<8} "
                    f"platform={s.get('source_platform', 'unknown'):<16} "
                    f"quality={s.get('quality_score', 0):<5} "
                    f"decay={d.get('decay_score', '?')} "
                    f"({d.get('label', '?')}, {d.get('age_days', '?')}d)"
                )
                print(f"      {s.get('title', '')[:62]}")
                print(f"      AUTO-GRADE: {grade_icon} {auto_g}/3 | MANUAL GRADE: [ /3]")
                print()

            avg = sum(query_grades) / len(query_grades) if query_grades else 0
            verdict = "✅ PASS" if avg >= 2.0 else "❌ FAIL"
            print(f"  Auto avg: {avg:.1f}/3 → {verdict}")

            flags = flag_issues(sources, topic, difficulty)
            if flags:
                print("\n  AUTO-FLAGS:")
                for f in flags:
                    print(f"    {f}")

            summary.append({
                "query": topic,
                "difficulty": difficulty,
                "count": len(sources),
                "ms": ms,
                "avg_grade": avg,
                "verdict": verdict,
                "flags": flags,
            })
        except httpx.ConnectError:
            print("  ✗ Cannot connect. Run: uvicorn src.api.main:app --reload --port 8000")
        except Exception as e:
            print(f"  ✗ Error: {e}")

    # Summary table
    print(f"\n{'='*62}")
    print("BLOCK 3 CYCLE 2 SUMMARY")
    print(f"{'='*62}")
    print(f"{'Query':<40} {'Results':>8} {'ms':>6} {'Avg':>5} {'Verdict':>8}")
    print("-" * 62)
    for s in summary:
        print(
            f"{s['query'][:40]:<40} "
            f"{s['count']:>8} "
            f"{s['ms']:>6} "
            f"{s['avg_grade']:>5.1f} "
            f"{s['verdict']:>8}"
        )

    passed = sum(1 for s in summary if "PASS" in s.get("verdict", ""))
    total = len(summary)
    overall_avg = (
        sum(total_auto_grades) / len(total_auto_grades)
        if total_auto_grades else 0
    )
    print(f"\n{'='*62}")
    print(
        f"OVERALL: {passed}/{total} queries pass | "
        f"Avg grade: {overall_avg:.2f}/3"
    )
    if passed == total:
        print("🎉 Block 3 complete. Record the demo video. Post to Reddit.")
    else:
        print(f"⚠ {total - passed} queries still failing. Check AUTO-FLAGS above.")
    print(f"{'='*62}")


if __name__ == "__main__":
    main()
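
# Hypothetical helper (an assumption — nothing in main() calls it, so it is
# parked below the entry point): if you pencil manual grades into the
# "[ /3]" slots above, this recomputes a query's average with those
# overrides. `overrides` maps the printed 1-based result index to your
# 0-3 grade.
def manual_avg(auto_grades: list, overrides: dict) -> float:
    merged = [overrides.get(i, g) for i, g in enumerate(auto_grades, 1)]
    return sum(merged) / len(merged) if merged else 0.0

# e.g. manual_avg([3, 0, 2], {2: 1}) == 2.0  (the auto 0 overridden to 1)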