c43d43b | """ | |
| Knowledge Universe β Block 3 Gap Analysis (Cycle 2) | |
| ===================================================== | |
| Run: python scripts/gap_analysis.py | |
| Cycle 2 auto-grader fix: | |
| The Cycle 1 auto-grader gave HuggingFace RLHF datasets a grade of 0 | |
| because "Anthropic/hh-rlhf" doesn't contain all the words | |
| "reward model training" in the dataset name. | |
| Fix: HuggingFace datasets are graded by whether their name/title | |
| contains ANY of the significant query words β not all of them. | |
| "hh-rlhf" contains "rlhf" β grade 2 for RLHF query. | |
| "Anthropic/hh-rlhf" from Anthropic for RLHF query β grade 2. | |
| Also: "Generalisation of RLHF under Reward Shift" arXiv paper | |
| contains "RLHF" and "Reward" β grade 3, not 1. | |
| """ | |
import os
import httpx
import time
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv("API_KEY")
BASE = "http://localhost:8000"

TEST_QUERIES = [
    (
        "mixture of experts architecture", 4,
        "Expect: arxiv papers on MoE (sparse, gating, routing). "
        "Bug if: OpenLibrary returns building-architecture books."
    ),
    (
        "Claude 3.5 sonnet", 2,
        "Fast-moving topic. Expect: recent blog posts, GitHub repos, YouTube. "
        "Bug if: anything labeled 'decayed' ranks in top 3."
    ),
    (
        "LangChain streaming callbacks", 3,
        "Practical coding. Expect: GitHub + StackOverflow specifically about "
        "streaming callbacks. Bug if: generic LangChain results with no "
        "streaming or callback mention."
    ),
    (
        "RLHF reward model training", 5,
        "Research-heavy. Expect: arxiv papers, difficulty 4-5 results. "
        "Bug if: beginner YouTube tutorials rank above research papers."
    ),
    (
        "what is machine learning", 1,
        "Beginner. Expect: Wikipedia, YouTube explainers, difficulty 1-2. "
        "Bug if: arxiv papers with difficulty 4+ dominate top 3."
    ),
]
| GRADING = """ | |
| GRADE each result 0-3: | |
| 3 = Exactly what an expert would recommend | |
| 2 = Relevant, not the best | |
| 1 = Tangentially related | |
| 0 = Irrelevant (e.g. restaurant recommendation paper for ML query) | |
| """ | |

def auto_grade(source: dict, topic: str, requested_difficulty: int) -> int:
    """
    Improved auto-grader:
    - Checks title AND summary for topic word matches
    - HuggingFace datasets: any 1 topic word match = grade 2
    - arXiv: match in title = grade 3, match in summary = grade 2
    - Difficulty mismatch > 2 = grade 0 regardless
    """
    title = (source.get("title") or "").lower()
    summary = (source.get("summary") or "").lower()
    platform = source.get("source_platform", "")
    diff = source.get("difficulty", 3)
    topic_lower = topic.lower()
    topic_words = [
        w for w in topic_lower.split()
        if len(w) > 3 and w not in {
            "what", "does", "how", "the", "and", "for", "with", "from"
        }
    ]

    # Hard penalty for severe difficulty mismatch
    diff_gap = abs(int(diff) - requested_difficulty)
    if diff_gap > 2:
        return 0

    # Check matches in title and summary
    title_matches = sum(1 for w in topic_words if w in title)
    summary_matches = sum(1 for w in topic_words if w in summary)
    total_matches = title_matches + summary_matches

    # Platform-specific grading
    if platform == "arxiv":
        if title_matches >= 2:
            return 3  # Multiple topic words in arXiv title -> perfect
        elif title_matches >= 1 or summary_matches >= 2:
            return 2  # Partial title or strong summary match
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "wikipedia":
        if title_matches >= 1:
            return 3  # Wikipedia article directly about the topic
        elif summary_matches >= 1:
            return 2
        return 1  # Wikipedia is always somewhat relevant

    if platform == "stackoverflow":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "github":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1 or summary_matches >= 2:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "youtube":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "huggingface":
        # HuggingFace datasets: any 1 word match in name = relevant
        # "Anthropic/hh-rlhf" -> contains "rlhf" -> grade 2 for RLHF query
        if title_matches >= 1 or summary_matches >= 1:
            return 2
        return 0

    if platform == "kaggle":
        if title_matches >= 2:
            return 3
        elif title_matches >= 1:
            return 2
        elif summary_matches >= 1:
            return 1
        return 0

    if platform == "mit_ocw":
        # MIT OCW courses: if the topic is in their description, it's grade 2
        if total_matches >= 1:
            return 2
        return 1  # MIT OCW is always somewhat relevant for ML queries

    # Default: any match = 1, good match = 2
    if title_matches >= 1:
        return 2
    elif summary_matches >= 1:
        return 1
    return 0
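
# Illustrative sanity check of the Cycle 2 grading rules described in the module
# docstring. The source dicts below are made-up stand-ins, not real API responses;
# this helper is not called by main() and only documents the expected behaviour.
def _selfcheck_auto_grade():
    hf = {
        "title": "Anthropic/hh-rlhf",
        "summary": "",
        "source_platform": "huggingface",
        "difficulty": 4,  # assumed difficulty, within the +/-2 ceiling
    }
    # Name contains "rlhf" -> grade 2, per the Cycle 2 HuggingFace rule
    assert auto_grade(hf, "RLHF reward model training", 5) == 2

    arxiv = {
        "title": "Generalisation of RLHF under Reward Shift",
        "summary": "",
        "source_platform": "arxiv",
        "difficulty": 5,
    }
    # Two topic words ("rlhf", "reward") in an arXiv title -> grade 3
    assert auto_grade(arxiv, "RLHF reward model training", 5) == 3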

def run_query(topic, difficulty):
    resp = httpx.post(
        f"{BASE}/v1/discover",
        headers={"X-API-Key": API_KEY},
        json={
            "topic": topic,
            "difficulty": difficulty,
            "formats": ["pdf", "github", "jupyter", "video", "stackoverflow", "html"],
            "max_results": 10,
        },
        timeout=90,
    )
    return resp.json()
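
# Fields this script reads from each /v1/discover response, inferred from the
# grading and flagging code in this file (illustrative notes, not an API schema):
#   data["sources"]   -> list of dicts with title, summary, source_platform,
#                        difficulty, quality_score and decay_report
#                        ({decay_score, label, age_days})
#   data["cache_hit"] -> bool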

def flag_issues(sources, topic, difficulty):
    flags = []
    platforms = [s.get("source_platform", "") for s in sources]

    # 1. OpenLibrary mismatch for ML queries
    ol = [s for s in sources if s.get("source_platform") == "openlibrary"]
    if ol:
        flags.append(f"⚠ OpenLibrary: {[s.get('title', '')[:45] for s in ol]}")

    # 2. Stale content on fast-moving topics
    fast_keywords = ["claude", "sonnet", "gpt-4", "gemini", "llama 3", "mistral"]
    if any(kw in topic.lower() for kw in fast_keywords):
        stale = [
            s for s in sources
            if (s.get("decay_report") or {}).get("label") in ("stale", "decayed")
        ]
        if stale:
            flags.append(
                f"⚠ {len(stale)}/10 stale/decayed on fast topic: "
                f"{[s.get('title', '')[:35] for s in stale[:2]]}"
            )

    # 3. Difficulty ceiling violations (should be 0 after fix)
    wrong = [
        s for s in sources
        if abs(s.get("difficulty", 3) - difficulty) > 2
    ]
    if wrong:
        wrong_desc = [
            str(s.get("source_platform", "")) + " d=" + str(s.get("difficulty"))
            for s in wrong[:3]
        ]
        flags.append(f"⚠ Difficulty ceiling violated: {wrong_desc}")

    # 4. Platform dominance
    for p in set(platforms):
        if not p:
            continue
        count = platforms.count(p)
        if count > 4:
            flags.append(f"⚠ {p} dominates: {count}/10 results")

    # 5. Result count too low
    if len(sources) < 7:
        flags.append(
            f"⚠ LOW RESULTS: only {len(sources)}/10 returned. "
            f"Platforms: {list(set(platforms))}"
        )

    # 6. Compound query specific check
    if "streaming" in topic.lower() and "callback" in topic.lower():
        streaming_hits = [
            s for s in sources
            if "streaming" in (s.get("title") or "").lower()
            or "callback" in (s.get("title") or "").lower()
        ]
        if len(streaming_hits) < 2:
            flags.append(
                f"⚠ COMPOUND QUERY: only {len(streaming_hits)} results "
                f"mention 'streaming' or 'callback' in title"
            )

    return flags
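
# Illustrative sanity check for the stale-content flag on a fast-moving topic.
# The single-result payload is a made-up stand-in; like _selfcheck_auto_grade,
# this helper is not called by main().
def _selfcheck_flag_issues():
    stale_source = {
        "source_platform": "github",
        "title": "Claude 3 overview",
        "difficulty": 2,
        "decay_report": {"label": "decayed", "decay_score": 0.2, "age_days": 400},
    }
    flags = flag_issues([stale_source], "Claude 3.5 sonnet", 2)
    # "claude"/"sonnet" mark the topic as fast-moving, so a decayed result is flagged
    assert any("stale/decayed" in f for f in flags)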

def main():
    print("KU API GAP ANALYSIS - BLOCK 3 CYCLE 2")
    print("=" * 62)
    print(GRADING)
    print("TIP: Auto-grades shown. Override manually if wrong.")
    print("     Any query with avg < 2.0 is a bug to fix.\n")

    summary = []
    total_auto_grades = []

    for topic, difficulty, note in TEST_QUERIES:
        print(f"\n{'─'*62}")
        print(f"QUERY: '{topic}'")
        print(f"DIFFICULTY: {difficulty}")
        print(f"EXPECTED: {note}")
        print(f"{'─'*62}")

        start = time.time()
        try:
            data = run_query(topic, difficulty)
            ms = round((time.time() - start) * 1000)
            sources = data.get("sources", [])
            cache = data.get("cache_hit", False)
            print(f"Returned {len(sources)} results in {ms}ms cache={cache}")
            print()

            query_grades = []
            for i, s in enumerate(sources[:5], 1):
                d = s.get("decay_report") or {}
                diff_delta = abs(s.get("difficulty", 3) - difficulty)
                diff_icon = "✓" if diff_delta <= 1 else f"⚠ d±{diff_delta}"
                auto_g = auto_grade(s, topic, difficulty)
                query_grades.append(auto_g)
                total_auto_grades.append(auto_g)
                grade_icon = "✅" if auto_g >= 2 else ("🟡" if auto_g == 1 else "🔴")
                print(
                    f"  [{i}] {diff_icon:<8} "
                    f"platform={s.get('source_platform', 'unknown'):<16} "
                    f"quality={s.get('quality_score', 0):<5} "
                    f"decay={d.get('decay_score', '?')} "
                    f"({d.get('label', '?')}, {d.get('age_days', '?')}d)"
                )
                print(f"      {s.get('title', '')[:62]}")
                print(f"      AUTO-GRADE: {grade_icon} {auto_g}/3 | MANUAL GRADE: [ /3]")
                print()

            avg = sum(query_grades) / len(query_grades) if query_grades else 0
            verdict = "✅ PASS" if avg >= 2.0 else "❌ FAIL"
            print(f"  Auto avg: {avg:.1f}/3 -> {verdict}")

            flags = flag_issues(sources, topic, difficulty)
            if flags:
                print("\n  AUTO-FLAGS:")
                for f in flags:
                    print(f"    {f}")

            summary.append({
                "query": topic,
                "difficulty": difficulty,
                "count": len(sources),
                "ms": ms,
                "avg_grade": avg,
                "verdict": verdict,
                "flags": flags,
            })
        except httpx.ConnectError:
            print("  ❌ Cannot connect. Run: uvicorn src.api.main:app --reload --port 8000")
        except Exception as e:
            print(f"  ❌ Error: {e}")

    # Summary table
    print(f"\n{'='*62}")
    print("BLOCK 3 CYCLE 2 SUMMARY")
    print(f"{'='*62}")
    print(f"{'Query':<40} {'Results':>8} {'ms':>6} {'Avg':>5} {'Verdict':>8}")
    print("-" * 62)
    for s in summary:
        print(
            f"{s['query'][:40]:<40} "
            f"{s['count']:>8} "
            f"{s['ms']:>6} "
            f"{s['avg_grade']:>5.1f} "
            f"{s['verdict']:>8}"
        )

    passed = sum(1 for s in summary if "PASS" in s.get("verdict", ""))
    total = len(summary)
    overall_avg = (
        sum(total_auto_grades) / len(total_auto_grades)
        if total_auto_grades else 0
    )

    print(f"\n{'='*62}")
    print(
        f"OVERALL: {passed}/{total} queries pass | "
        f"Avg grade: {overall_avg:.2f}/3"
    )
    if passed == total:
        print("🎉 Block 3 complete. Record the demo video. Post to Reddit.")
    else:
        print(f"❌ {total - passed} queries still failing. Check AUTO-FLAGS above.")
    print(f"{'='*62}")


if __name__ == "__main__":
    main()