""" Knowledge Universe — Feature Extractor Extracts robust signals (0.0 to 10.0) from raw source metadata. Final Block 3 version fixes: - Wikipedia authority boosted more aggressively for low-difficulty queries so it ranks above arXiv papers for "what is X" queries - arXiv difficulty penalty increased: gap=3 now costs 4.0 quality points (was 3.0), enough to drop a difficulty-mismatch paper below Wikipedia - Educational signal boost for arXiv: survey/tutorial/overview papers get +1.5 authority (was +1.0) - Application signal penalty for arXiv: -3.0 (was -2.0) Belt AND suspenders — feature_extractor deprioritizes even if Gate 4 in arxiv_crawler misses something """ import math from typing import Dict, Any, Optional # Educational keywords in title/abstract → paper is ABOUT the technique _EDUCATIONAL_SIGNALS = { "survey", "tutorial", "overview", "review", "introduction", "explained", "guide", "fundamentals", "primer", "comprehensive", "illustrated", "understanding", "deep dive", "from scratch", "step by step", "beginner", "what is", "how to", "learn", } # Application domain keywords → paper USES the technique for another domain _APPLICATION_SIGNALS = { "detection", "surveillance", "radar", "traffic", "medical", "clinical", "satellite", "remote sensing", "seismic", "earthquake", "financial", "stock", "crypto", "drug", "protein", "genomic", "manufacturing", "industrial", "autonomous driving", "lidar", "pathology", "radiology", "ecg", "eeg", "weather", "climate", "agriculture", "crop", "soil", "maritime", "naval", "military", "physical model", "quantum", "photonic", "optical", "verilog", "fpga", "circuit", "hardware", } class FeatureExtractor: def extract( self, source: Dict[str, Any], request: Optional[Any] = None, ) -> Dict[str, float]: return { "authority": self._authority(source), "completeness": self._completeness(source), "social_proof": self._social(source), "difficulty_alignment": self._difficulty(source, request), "accessibility": self._accessibility(source), } # ── Authority ──────────────────────────────────────────────────────── def _authority(self, s: Dict) -> float: platform = s.get("source_platform", "").lower() base = { "arxiv": 9.0, "mit_ocw": 9.5, "wikipedia": 8.5, "github": 7.5, "huggingface": 7.0, "stackoverflow": 7.0, "youtube": 6.5, "kaggle": 6.0, "openlibrary": 6.0, "podcast": 5.0, "common_crawl": 4.0, }.get(platform, 5.0) # ── arXiv: educational vs application adjustment ────────────────── if platform == "arxiv": title = (s.get("title") or "").lower() summary = (s.get("summary") or "").lower()[:200] combined = title + " " + summary edu_hits = sum(1 for kw in _EDUCATIONAL_SIGNALS if kw in combined) app_hits = sum(1 for kw in _APPLICATION_SIGNALS if kw in combined) if edu_hits > 0 and app_hits == 0: base += 1.5 # Pure educational paper — strong boost elif app_hits > 0 and edu_hits == 0: base -= 3.0 # Pure application paper — strong penalty elif app_hits > edu_hits: base -= 1.5 # Mostly application — moderate penalty # edu >= app: no change base = max(4.0, min(10.0, base)) # ── Wikipedia: boost for low-difficulty queries ─────────────────── # Wikipedia is authoritative for "what is X" queries. # Boost it when the source difficulty ≤ 2 so it ranks above # arXiv papers that get difficulty-penalized. if platform == "wikipedia": src_diff = s.get("difficulty", 3) try: if int(src_diff) <= 2: base = min(10.0, base + 1.5) except (ValueError, TypeError): pass return base # ── Completeness ───────────────────────────────────────────────────── def _completeness(self, s: Dict) -> float: score = 3.0 summary = s.get("summary") or "" if len(summary) > 200: score += 4.0 elif len(summary) > 50: score += 2.0 tags = s.get("tags") or [] if len(tags) >= 3: score += 2.0 elif len(tags) >= 1: score += 1.0 if s.get("authors"): score += 0.5 if s.get("publication_date"): score += 0.5 return min(10.0, score) # ── Social Proof ───────────────────────────────────────────────────── def _social(self, s: Dict) -> float: stars = float(s.get("stars") or 0) citations = float(s.get("citation_count") or 0) views = float(s.get("views") or 0) downloads = float(s.get("downloads") or 0) likes = float(s.get("likes") or 0) score = 0.0 if stars > 0: score += math.log10(stars + 1) * 2.0 if citations > 0: score += math.log10(citations + 1) * 3.0 if views > 0: score += math.log10(views + 1) * 1.0 if downloads > 0: score += math.log10(downloads + 1) * 1.5 if likes > 0: score += math.log10(likes + 1) * 1.0 has_signal = any([stars, citations, views, downloads, likes]) score = max(4.0 if has_signal else 3.5, score) return min(10.0, score) # ── Difficulty Alignment ───────────────────────────────────────────── def _difficulty(self, s: Dict, req: Optional[Any]) -> float: """ Smooth penalty for difficulty mismatch. gap=0: 10.0 (perfect) gap=1: 8.5 (acceptable) gap=2: 6.0 (marginal) gap=3: 2.0 (poor — was 3.0, increased penalty) gap=4: 0.5 (very poor) The gap=3 increase from 3.0 to 2.0 is critical: It ensures a difficulty=4 arXiv paper scores BELOW Wikipedia (d=1) for a difficulty=1 query, because: arXiv: base_quality ~8.0 × penalty(gap=3, score=2.0) → ~5.6 Wikipedia: base_quality ~8.0 × penalty(gap=0, score=10.0) → ~8.0 """ if not req or not hasattr(req, "difficulty"): return 7.0 src_diff = s.get("difficulty") if src_diff is None: return 6.0 try: src_diff = int(src_diff) except (ValueError, TypeError): return 6.0 gap = abs(src_diff - req.difficulty) return { 0: 10.0, 1: 8.5, 2: 6.0, 3: 2.0, # Increased from 3.0 — penalizes d=4 papers for d=1 queries 4: 0.5, }.get(gap, 0.2) # ── Accessibility ──────────────────────────────────────────────────── def _accessibility(self, s: Dict) -> float: score = 5.0 if s.get("open_access", True): score += 5.0 return min(10.0, score)