"""
Knowledge Universe - Block 3 Gap Analysis (Cycle 2)
=====================================================
Run: python scripts/gap_analysis.py
Cycle 2 auto-grader fix:
The Cycle 1 auto-grader gave HuggingFace RLHF datasets a grade of 0
because "Anthropic/hh-rlhf" doesn't contain all the words
"reward model training" in the dataset name.
Fix: HuggingFace datasets are graded by whether their name/title
contains ANY of the significant query words, not all of them.
"hh-rlhf" contains "rlhf" → grade 2 for the RLHF query, so
"Anthropic/hh-rlhf" now grades 2 instead of 0.
Also: the "Generalisation of RLHF under Reward Shift" arXiv paper
contains "RLHF" and "Reward" → grade 3, not 1.
"""
import os
import httpx
import time
from dotenv import load_dotenv
load_dotenv()
API_KEY = os.getenv("API_KEY")
BASE = "http://localhost:8000"
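# Assumes the dev server (src.api.main on port 8000; see the ConnectError
# hint in main()) is running locally, with API_KEY loaded from .env.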
TEST_QUERIES = [
(
"mixture of experts architecture", 4,
"Expect: arxiv papers on MoE (sparse, gating, routing). "
"Bug if: OpenLibrary returns building-architecture books."
),
(
"Claude 3.5 sonnet", 2,
"Fast-moving topic. Expect: recent blog posts, GitHub repos, YouTube. "
"Bug if: anything labeled 'decayed' ranks in top 3."
),
(
"LangChain streaming callbacks", 3,
"Practical coding. Expect: GitHub + StackOverflow specifically about "
"streaming callbacks. Bug if: generic LangChain results with no "
"streaming or callback mention."
),
(
"RLHF reward model training", 5,
"Research-heavy. Expect: arxiv papers, difficulty 4-5 results. "
"Bug if: beginner YouTube tutorials rank above research papers."
),
(
"what is machine learning", 1,
"Beginner. Expect: Wikipedia, YouTube explainers, difficulty 1-2. "
"Bug if: arxiv papers with difficulty 4+ dominate top 3."
),
]
GRADING = """
GRADE each result 0-3:
3 = Exactly what an expert would recommend
2 = Relevant, not the best
1 = Tangentially related
0 = Irrelevant (e.g. restaurant recommendation paper for ML query)
"""
def auto_grade(source: dict, topic: str, requested_difficulty: int) -> int:
"""
Improved auto-grader:
- Checks title AND summary for topic word matches
- HuggingFace datasets: any 1 topic word match = grade 2
- arXiv: match in title = grade 3, match in summary = grade 2
- Difficulty mismatch > 2 = grade 0 regardless
"""
title = (source.get("title") or "").lower()
summary = (source.get("summary") or "").lower()
platform = source.get("source_platform", "")
diff = source.get("difficulty", 3)
topic_lower = topic.lower()
topic_words = [
w for w in topic_lower.split()
if len(w) > 3 and w not in {
"what", "does", "how", "the", "and", "for", "with", "from"
}
]
# Hard penalty for severe difficulty mismatch
diff_gap = abs(int(diff) - requested_difficulty)
if diff_gap > 2:
return 0
# Check matches in title and summary
title_matches = sum(1 for w in topic_words if w in title)
summary_matches = sum(1 for w in topic_words if w in summary)
total_matches = title_matches + summary_matches
# Platform-specific grading
if platform == "arxiv":
if title_matches >= 2:
            return 3  # Multiple topic words in arXiv title → perfect
elif title_matches >= 1 or summary_matches >= 2:
return 2 # Partial title or strong summary match
elif summary_matches >= 1:
return 1
return 0
if platform == "wikipedia":
if title_matches >= 1:
return 3 # Wikipedia article directly about the topic
elif summary_matches >= 1:
return 2
return 1 # Wikipedia is always somewhat relevant
if platform == "stackoverflow":
if title_matches >= 2:
return 3
elif title_matches >= 1:
return 2
elif summary_matches >= 1:
return 1
return 0
if platform == "github":
if title_matches >= 2:
return 3
elif title_matches >= 1 or summary_matches >= 2:
return 2
elif summary_matches >= 1:
return 1
return 0
if platform == "youtube":
if title_matches >= 2:
return 3
elif title_matches >= 1:
return 2
elif summary_matches >= 1:
return 1
return 0
if platform == "huggingface":
# HuggingFace datasets: any 1 word match in name = relevant
# "Anthropic/hh-rlhf" β†’ contains "rlhf" β†’ grade 2 for RLHF query
if title_matches >= 1 or summary_matches >= 1:
return 2
return 0
if platform == "kaggle":
if title_matches >= 2:
return 3
elif title_matches >= 1:
return 2
elif summary_matches >= 1:
return 1
return 0
if platform == "mit_ocw":
# MIT OCW courses: if topic is in their description, it's grade 2
if total_matches >= 1:
return 2
return 1 # MIT OCW is always somewhat relevant for ML queries
# Default: any match = 1, good match = 2
if title_matches >= 1:
return 2
elif summary_matches >= 1:
return 1
return 0
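def _grader_self_test():
    """Offline sanity checks for the Cycle 2 grader fix (no API needed).

    Minimal sketch: these dicts are synthetic and carry only the fields
    auto_grade() reads; real /v1/discover responses have more.
    """
    # "Anthropic/hh-rlhf" contains "rlhf" -> grade 2 for the RLHF query
    # (the Cycle 1 grader returned 0 here).
    assert auto_grade(
        {"title": "Anthropic/hh-rlhf", "source_platform": "huggingface",
         "difficulty": 5},
        "RLHF reward model training", 5,
    ) == 2
    # arXiv title matching two topic words ("rlhf", "reward") -> grade 3.
    assert auto_grade(
        {"title": "Generalisation of RLHF under Reward Shift",
         "source_platform": "arxiv", "difficulty": 5},
        "RLHF reward model training", 5,
    ) == 3
    # Severe difficulty mismatch -> hard 0, regardless of topic match.
    assert auto_grade(
        {"title": "Machine Learning", "source_platform": "arxiv",
         "difficulty": 5},
        "what is machine learning", 1,
    ) == 0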
def run_query(topic, difficulty):
resp = httpx.post(
f"{BASE}/v1/discover",
headers={"X-API-Key": API_KEY},
json={
"topic": topic,
"difficulty": difficulty,
"formats": ["pdf", "github", "jupyter", "video", "stackoverflow", "html"],
"max_results": 10,
},
timeout=90,
)
return resp.json()
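# Response shape consumed below (any other fields are ignored):
#   {"cache_hit": bool,
#    "sources": [{"title", "summary", "source_platform", "difficulty",
#                 "quality_score", "decay_report": {"decay_score", "label",
#                                                   "age_days"}}, ...]}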
def flag_issues(sources, topic, difficulty):
flags = []
platforms = [s.get("source_platform", "") for s in sources]
    # 1. OpenLibrary results: prone to off-topic matches on ML queries
    #    (e.g. building-architecture books), so flag for manual review
ol = [s for s in sources if s.get("source_platform") == "openlibrary"]
if ol:
flags.append(f"⚠ OpenLibrary: {[s.get('title', '')[:45] for s in ol]}")
# 2. Stale content on fast-moving topics
fast_keywords = ["claude", "sonnet", "gpt-4", "gemini", "llama 3", "mistral"]
if any(kw in topic.lower() for kw in fast_keywords):
        stale = [
            s for s in sources
            if (s.get("decay_report") or {}).get("label") in ("stale", "decayed")
        ]
if stale:
flags.append(
f"⚠ {len(stale)}/10 stale/decayed on fast topic: "
f"{[s.get('title', '')[:35] for s in stale[:2]]}"
)
# 3. Difficulty ceiling violations (should be 0 after fix)
wrong = [
s for s in sources
if abs(s.get("difficulty", 3) - difficulty) > 2
]
if wrong:
wrong_desc = [
str(s.get("source_platform", "")) + " d=" + str(s.get("difficulty"))
for s in wrong[:3]
]
flags.append(f"⚠ Difficulty ceiling violated: {wrong_desc}")
# 4. Platform dominance
for p in set(platforms):
        if not p:
            continue
count = platforms.count(p)
if count > 4:
flags.append(f"⚠ {p} dominates: {count}/10 results")
# 5. Result count too low
if len(sources) < 7:
flags.append(
f"⚠ LOW RESULTS: only {len(sources)}/10 returned. "
f"Platforms: {list(set(platforms))}"
)
# 6. Compound query specific check
if "streaming" in topic.lower() and "callback" in topic.lower():
streaming_hits = [
s for s in sources
if "streaming" in (s.get("title") or "").lower()
or "callback" in (s.get("title") or "").lower()
]
if len(streaming_hits) < 2:
flags.append(
f"⚠ COMPOUND QUERY: only {len(streaming_hits)} results "
f"mention 'streaming' or 'callback' in title"
)
return flags
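def _flags_self_test():
    """Offline sanity checks for flag_issues() (synthetic inputs only)."""
    # An empty result set must trip the LOW RESULTS flag.
    assert any("LOW RESULTS" in f for f in flag_issues([], "moe", 3))
    # A compound streaming/callbacks query whose titles mention neither
    # word must trip the COMPOUND QUERY flag.
    srcs = [{"title": "LangChain intro", "source_platform": "github",
             "difficulty": 3}] * 7
    assert any("COMPOUND QUERY" in f
               for f in flag_issues(srcs, "LangChain streaming callbacks", 3))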
def main():
print("KU API GAP ANALYSIS β€” BLOCK 3 CYCLE 2")
print("=" * 62)
print(GRADING)
print("TIP: Auto-grades shown. Override manually if wrong.")
print(" Any query with avg < 2.0 is a bug to fix.\n")
summary = []
total_auto_grades = []
for topic, difficulty, note in TEST_QUERIES:
print(f"\n{'─'*62}")
print(f"QUERY: '{topic}'")
print(f"DIFFICULTY: {difficulty}")
print(f"EXPECTED: {note}")
print(f"{'─'*62}")
start = time.time()
try:
data = run_query(topic, difficulty)
ms = round((time.time() - start) * 1000)
sources = data.get("sources", [])
cache = data.get("cache_hit", False)
print(f"Returned {len(sources)} results in {ms}ms cache={cache}")
print()
query_grades = []
            for i, s in enumerate(sources[:5], 1):  # grade the top 5 only
d = s.get("decay_report") or {}
diff_delta = abs(s.get("difficulty", 3) - difficulty)
diff_icon = "βœ“" if diff_delta <= 1 else f"⚠dΒ±{diff_delta}"
auto_g = auto_grade(s, topic, difficulty)
query_grades.append(auto_g)
total_auto_grades.append(auto_g)
grade_icon = "βœ…" if auto_g >= 2 else ("🟑" if auto_g == 1 else "πŸ”΄")
print(
f" [{i}] {diff_icon:<8} "
f"platform={s.get('source_platform', 'unknown'):<16} "
f"quality={s.get('quality_score', 0):<5} "
f"decay={d.get('decay_score', '?')} "
f"({d.get('label', '?')}, {d.get('age_days', '?')}d)"
)
print(f" {s.get('title', '')[:62]}")
print(f" AUTO-GRADE: {grade_icon} {auto_g}/3 | MANUAL GRADE: [ /3]")
print()
avg = sum(query_grades) / len(query_grades) if query_grades else 0
verdict = "βœ… PASS" if avg >= 2.0 else "❌ FAIL"
print(f" Auto avg: {avg:.1f}/3 β†’ {verdict}")
flags = flag_issues(sources, topic, difficulty)
if flags:
print("\n AUTO-FLAGS:")
for f in flags:
print(f" {f}")
summary.append({
"query": topic,
"difficulty": difficulty,
"count": len(sources),
"ms": ms,
"avg_grade": avg,
"verdict": verdict,
"flags": flags,
})
except httpx.ConnectError:
print(" βœ— Cannot connect. Run: uvicorn src.api.main:app --reload --port 8000")
except Exception as e:
print(f" βœ— Error: {e}")
# Summary table
print(f"\n{'='*62}")
print("BLOCK 3 CYCLE 2 SUMMARY")
print(f"{'='*62}")
print(f"{'Query':<40} {'Results':>8} {'ms':>6} {'Avg':>5} {'Verdict':>8}")
print("-" * 62)
for s in summary:
print(
f"{s['query'][:40]:<40} "
f"{s['count']:>8} "
f"{s['ms']:>6} "
f"{s['avg_grade']:>5.1f} "
f"{s['verdict']:>8}"
)
passed = sum(1 for s in summary if "PASS" in s.get("verdict", ""))
total = len(summary)
overall_avg = (
sum(total_auto_grades) / len(total_auto_grades)
if total_auto_grades else 0
)
print(f"\n{'='*62}")
print(
f"OVERALL: {passed}/{total} queries pass | "
f"Avg grade: {overall_avg:.2f}/3"
)
if passed == total:
print("πŸŽ‰ Block 3 complete. Record the demo video. Post to Reddit.")
else:
print(f"⚠ {total - passed} queries still failing. Check AUTO-FLAGS above.")
print(f"{'='*62}")
if __name__ == "__main__":
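    # Optional offline checks; KU_SELF_TEST is an ad-hoc flag introduced
    # for the sanity-check sketches above (no server needed).
    if os.getenv("KU_SELF_TEST"):
        _grader_self_test()
        _flags_self_test()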
main()