# Knowledge-Universe / src/scoring/feature_extractor.py
# (page-scrape header converted to comments so the file parses;
#  provenance: author "vlsiddarth", commit 7abb8fd —
#  "Block 3 complete: decay scoring, relevance gates, Wikipedia fix, latency optimization")
"""
Knowledge Universe β€” Feature Extractor
Extracts robust signals (0.0 to 10.0) from raw source metadata.
Final Block 3 version fixes:
- Wikipedia authority boosted more aggressively for low-difficulty queries
so it ranks above arXiv papers for "what is X" queries
- arXiv difficulty penalty increased: gap=3 now costs 4.0 quality points
(was 3.0), enough to drop a difficulty-mismatch paper below Wikipedia
- Educational signal boost for arXiv: survey/tutorial/overview papers
get +1.5 authority (was +1.0)
- Application signal penalty for arXiv: -3.0 (was -2.0)
Belt AND suspenders β€” feature_extractor deprioritizes even if
Gate 4 in arxiv_crawler misses something
"""
import math
from typing import Dict, Any, Optional
# Educational keywords in title/abstract β†’ paper is ABOUT the technique
_EDUCATIONAL_SIGNALS = {
"survey", "tutorial", "overview", "review", "introduction",
"explained", "guide", "fundamentals", "primer", "comprehensive",
"illustrated", "understanding", "deep dive", "from scratch",
"step by step", "beginner", "what is", "how to", "learn",
}
# Application domain keywords β†’ paper USES the technique for another domain
_APPLICATION_SIGNALS = {
"detection", "surveillance", "radar", "traffic", "medical",
"clinical", "satellite", "remote sensing", "seismic", "earthquake",
"financial", "stock", "crypto", "drug", "protein", "genomic",
"manufacturing", "industrial", "autonomous driving", "lidar",
"pathology", "radiology", "ecg", "eeg", "weather", "climate",
"agriculture", "crop", "soil", "maritime", "naval", "military",
"physical model", "quantum", "photonic", "optical",
"verilog", "fpga", "circuit", "hardware",
}
class FeatureExtractor:
    """Extracts robust quality signals (each scaled 0.0–10.0) from raw source metadata.

    Five features are produced per source: ``authority``, ``completeness``,
    ``social_proof``, ``difficulty_alignment`` and ``accessibility``.
    The tuning rationale for the arXiv/Wikipedia adjustments is in the
    module docstring (Block 3).
    """

    def extract(
        self,
        source: Dict[str, Any],
        request: Optional[Any] = None,
    ) -> Dict[str, float]:
        """Return all five features for one raw source record.

        Args:
            source: Raw source metadata dict (platform, title, summary,
                tags, engagement counts, ...). Missing or None values are
                tolerated by every feature.
            request: Optional request object; only its ``difficulty``
                attribute is read. ``None`` yields a neutral alignment.

        Returns:
            Mapping of feature name to a float in [0.0, 10.0].
        """
        return {
            "authority": self._authority(source),
            "completeness": self._completeness(source),
            "social_proof": self._social(source),
            "difficulty_alignment": self._difficulty(source, request),
            "accessibility": self._accessibility(source),
        }

    # ── Authority ────────────────────────────────────────────────────────
    def _authority(self, s: Dict) -> float:
        """Platform base authority plus arXiv and Wikipedia adjustments.

        Fix: raw records can carry an explicit ``source_platform: None``;
        ``s.get("source_platform", "")`` only covers a *missing* key, so
        ``.lower()`` crashed on None. Guard with ``or ""`` — the same
        pattern this method already uses for title/summary.
        """
        platform = (s.get("source_platform") or "").lower()
        base = {
            "arxiv": 9.0,
            "mit_ocw": 9.5,
            "wikipedia": 8.5,
            "github": 7.5,
            "huggingface": 7.0,
            "stackoverflow": 7.0,
            "youtube": 6.5,
            "kaggle": 6.0,
            "openlibrary": 6.0,
            "podcast": 5.0,
            "common_crawl": 4.0,
        }.get(platform, 5.0)  # unknown platforms get a neutral 5.0
        # ── arXiv: educational vs application adjustment ──────────────────
        # Keyword hits are plain substring matches over title + the first
        # 200 chars of the abstract. NOTE(review): substring matching also
        # fires inside longer words (e.g. "learn" in "learning") —
        # presumably acceptable given Block 3 tuning; confirm if precision
        # matters.
        if platform == "arxiv":
            title = (s.get("title") or "").lower()
            summary = (s.get("summary") or "").lower()[:200]
            combined = title + " " + summary
            edu_hits = sum(1 for kw in _EDUCATIONAL_SIGNALS if kw in combined)
            app_hits = sum(1 for kw in _APPLICATION_SIGNALS if kw in combined)
            if edu_hits > 0 and app_hits == 0:
                base += 1.5  # Pure educational paper — strong boost
            elif app_hits > 0 and edu_hits == 0:
                base -= 3.0  # Pure application paper — strong penalty
            elif app_hits > edu_hits:
                base -= 1.5  # Mostly application — moderate penalty
            # edu_hits >= app_hits (both non-zero): no change
            base = max(4.0, min(10.0, base))
        # ── Wikipedia: boost for low-difficulty queries ───────────────────
        # Wikipedia is authoritative for "what is X" queries: boost when
        # the source difficulty ≤ 2 so it ranks above arXiv papers that
        # get difficulty-penalized.
        if platform == "wikipedia":
            src_diff = s.get("difficulty", 3)
            try:
                if int(src_diff) <= 2:
                    base = min(10.0, base + 1.5)
            except (ValueError, TypeError):
                pass  # non-numeric difficulty → keep the plain base
        return base

    # ── Completeness ─────────────────────────────────────────────────────
    def _completeness(self, s: Dict) -> float:
        """Score metadata richness: summary length, tags, authors, date.

        Baseline 3.0; capped at 10.0. A long summary (>200 chars) is worth
        more than everything else combined, since it feeds later stages.
        """
        score = 3.0
        summary = s.get("summary") or ""
        if len(summary) > 200:
            score += 4.0
        elif len(summary) > 50:
            score += 2.0
        tags = s.get("tags") or []
        if len(tags) >= 3:
            score += 2.0
        elif len(tags) >= 1:
            score += 1.0
        if s.get("authors"):
            score += 0.5
        if s.get("publication_date"):
            score += 0.5
        return min(10.0, score)

    # ── Social Proof ─────────────────────────────────────────────────────
    def _social(self, s: Dict) -> float:
        """Log-scaled engagement score across platform-specific counters.

        Citations are weighted heaviest (3.0x), then downloads (1.5x).
        Floor: 4.0 when any signal exists at all, 3.5 when none do —
        so "has some engagement" always beats "no data".
        """
        stars = float(s.get("stars") or 0)
        citations = float(s.get("citation_count") or 0)
        views = float(s.get("views") or 0)
        downloads = float(s.get("downloads") or 0)
        likes = float(s.get("likes") or 0)
        score = 0.0
        if stars > 0:
            score += math.log10(stars + 1) * 2.0
        if citations > 0:
            score += math.log10(citations + 1) * 3.0
        if views > 0:
            score += math.log10(views + 1) * 1.0
        if downloads > 0:
            score += math.log10(downloads + 1) * 1.5
        if likes > 0:
            score += math.log10(likes + 1) * 1.0
        has_signal = any([stars, citations, views, downloads, likes])
        score = max(4.0 if has_signal else 3.5, score)
        return min(10.0, score)

    # ── Difficulty Alignment ─────────────────────────────────────────────
    def _difficulty(self, s: Dict, req: Optional[Any]) -> float:
        """Smooth penalty for |source difficulty − request difficulty|.

        gap=0: 10.0 (perfect)
        gap=1:  8.5 (acceptable)
        gap=2:  6.0 (marginal)
        gap=3:  2.0 (poor — was 3.0, increased penalty)
        gap=4:  0.5 (very poor)

        The gap=3 increase from 3.0 to 2.0 is critical: it ensures a
        difficulty=4 arXiv paper scores BELOW Wikipedia (d=1) for a
        difficulty=1 query (see module docstring for the worked numbers).
        Neutral fallbacks: 7.0 when there is no request difficulty at all,
        6.0 when the source's difficulty is missing or non-numeric.
        """
        if not req or not hasattr(req, "difficulty"):
            return 7.0
        src_diff = s.get("difficulty")
        if src_diff is None:
            return 6.0
        try:
            src_diff = int(src_diff)
        except (ValueError, TypeError):
            return 6.0
        gap = abs(src_diff - req.difficulty)
        return {
            0: 10.0,
            1: 8.5,
            2: 6.0,
            3: 2.0,  # Increased from 3.0 — penalizes d=4 papers for d=1 queries
            4: 0.5,
        }.get(gap, 0.2)

    # ── Accessibility ────────────────────────────────────────────────────
    def _accessibility(self, s: Dict) -> float:
        """10.0 for open-access sources, 5.0 otherwise (missing key = open)."""
        score = 5.0
        if s.get("open_access", True):
            score += 5.0
        return min(10.0, score)