# Knowledge-Universe / src/scoring/feature_extractor.py
# (page-scrape header converted to comments so the file parses;
#  provenance: author "vlsiddarth", commit 7abb8fd —
#  "Block 3 complete: decay scoring, relevance gates, Wikipedia fix, latency optimization")
"""
Knowledge Universe β€” Feature Extractor
Extracts robust signals (0.0 to 10.0) from raw source metadata.
Final Block 3 version fixes:
- Wikipedia authority boosted more aggressively for low-difficulty queries
so it ranks above arXiv papers for "what is X" queries
- arXiv difficulty penalty increased: gap=3 now costs 4.0 quality points
(was 3.0), enough to drop a difficulty-mismatch paper below Wikipedia
- Educational signal boost for arXiv: survey/tutorial/overview papers
get +1.5 authority (was +1.0)
- Application signal penalty for arXiv: -3.0 (was -2.0)
Belt AND suspenders β€” feature_extractor deprioritizes even if
Gate 4 in arxiv_crawler misses something
"""
import math
from typing import Dict, Any, Optional
# Educational keywords in title/abstract β†’ paper is ABOUT the technique
_EDUCATIONAL_SIGNALS = {
"survey", "tutorial", "overview", "review", "introduction",
"explained", "guide", "fundamentals", "primer", "comprehensive",
"illustrated", "understanding", "deep dive", "from scratch",
"step by step", "beginner", "what is", "how to", "learn",
}
# Application domain keywords β†’ paper USES the technique for another domain
_APPLICATION_SIGNALS = {
"detection", "surveillance", "radar", "traffic", "medical",
"clinical", "satellite", "remote sensing", "seismic", "earthquake",
"financial", "stock", "crypto", "drug", "protein", "genomic",
"manufacturing", "industrial", "autonomous driving", "lidar",
"pathology", "radiology", "ecg", "eeg", "weather", "climate",
"agriculture", "crop", "soil", "maritime", "naval", "military",
"physical model", "quantum", "photonic", "optical",
"verilog", "fpga", "circuit", "hardware",
}
class FeatureExtractor:
    """Extracts robust quality signals (each scaled 0.0–10.0) from raw source metadata.

    Five features are produced per source: ``authority``, ``completeness``,
    ``social_proof``, ``difficulty_alignment`` and ``accessibility``.
    The tuning rationale for the arXiv/Wikipedia adjustments is in the
    module docstring (Block 3).
    """

    def extract(
        self,
        source: Dict[str, Any],
        request: Optional[Any] = None,
    ) -> Dict[str, float]:
        """Return all five features for one raw source record.

        Args:
            source: Raw source metadata dict (platform, title, summary,
                tags, engagement counts, ...). Missing or None values are
                tolerated by every feature.
            request: Optional request object; only its ``difficulty``
                attribute is read. ``None`` yields a neutral alignment.

        Returns:
            Mapping of feature name to a float in [0.0, 10.0].
        """
        return {
            "authority": self._authority(source),
            "completeness": self._completeness(source),
            "social_proof": self._social(source),
            "difficulty_alignment": self._difficulty(source, request),
            "accessibility": self._accessibility(source),
        }

    # ── Authority ────────────────────────────────────────────────────────
    def _authority(self, s: Dict) -> float:
        """Platform base authority plus arXiv and Wikipedia adjustments.

        Fix: raw records can carry an explicit ``source_platform: None``;
        ``s.get("source_platform", "")`` only covers a *missing* key, so
        ``.lower()`` crashed on None. Guard with ``or ""`` — the same
        pattern this method already uses for title/summary.
        """
        platform = (s.get("source_platform") or "").lower()
        base = {
            "arxiv": 9.0,
            "mit_ocw": 9.5,
            "wikipedia": 8.5,
            "github": 7.5,
            "huggingface": 7.0,
            "stackoverflow": 7.0,
            "youtube": 6.5,
            "kaggle": 6.0,
            "openlibrary": 6.0,
            "podcast": 5.0,
            "common_crawl": 4.0,
        }.get(platform, 5.0)  # unknown platforms get a neutral 5.0
        # ── arXiv: educational vs application adjustment ──────────────────
        # Keyword hits are plain substring matches over title + the first
        # 200 chars of the abstract. NOTE(review): substring matching also
        # fires inside longer words (e.g. "learn" in "learning") —
        # presumably acceptable given Block 3 tuning; confirm if precision
        # matters.
        if platform == "arxiv":
            title = (s.get("title") or "").lower()
            summary = (s.get("summary") or "").lower()[:200]
            combined = title + " " + summary
            edu_hits = sum(1 for kw in _EDUCATIONAL_SIGNALS if kw in combined)
            app_hits = sum(1 for kw in _APPLICATION_SIGNALS if kw in combined)
            if edu_hits > 0 and app_hits == 0:
                base += 1.5  # Pure educational paper — strong boost
            elif app_hits > 0 and edu_hits == 0:
                base -= 3.0  # Pure application paper — strong penalty
            elif app_hits > edu_hits:
                base -= 1.5  # Mostly application — moderate penalty
            # edu_hits >= app_hits (both non-zero): no change
            base = max(4.0, min(10.0, base))
        # ── Wikipedia: boost for low-difficulty queries ───────────────────
        # Wikipedia is authoritative for "what is X" queries: boost when
        # the source difficulty ≤ 2 so it ranks above arXiv papers that
        # get difficulty-penalized.
        if platform == "wikipedia":
            src_diff = s.get("difficulty", 3)
            try:
                if int(src_diff) <= 2:
                    base = min(10.0, base + 1.5)
            except (ValueError, TypeError):
                pass  # non-numeric difficulty → keep the plain base
        return base

    # ── Completeness ─────────────────────────────────────────────────────
    def _completeness(self, s: Dict) -> float:
        """Score metadata richness: summary length, tags, authors, date.

        Baseline 3.0; capped at 10.0. A long summary (>200 chars) is worth
        more than everything else combined, since it feeds later stages.
        """
        score = 3.0
        summary = s.get("summary") or ""
        if len(summary) > 200:
            score += 4.0
        elif len(summary) > 50:
            score += 2.0
        tags = s.get("tags") or []
        if len(tags) >= 3:
            score += 2.0
        elif len(tags) >= 1:
            score += 1.0
        if s.get("authors"):
            score += 0.5
        if s.get("publication_date"):
            score += 0.5
        return min(10.0, score)

    # ── Social Proof ─────────────────────────────────────────────────────
    def _social(self, s: Dict) -> float:
        """Log-scaled engagement score across platform-specific counters.

        Citations are weighted heaviest (3.0x), then downloads (1.5x).
        Floor: 4.0 when any signal exists at all, 3.5 when none do —
        so "has some engagement" always beats "no data".
        """
        stars = float(s.get("stars") or 0)
        citations = float(s.get("citation_count") or 0)
        views = float(s.get("views") or 0)
        downloads = float(s.get("downloads") or 0)
        likes = float(s.get("likes") or 0)
        score = 0.0
        if stars > 0:
            score += math.log10(stars + 1) * 2.0
        if citations > 0:
            score += math.log10(citations + 1) * 3.0
        if views > 0:
            score += math.log10(views + 1) * 1.0
        if downloads > 0:
            score += math.log10(downloads + 1) * 1.5
        if likes > 0:
            score += math.log10(likes + 1) * 1.0
        has_signal = any([stars, citations, views, downloads, likes])
        score = max(4.0 if has_signal else 3.5, score)
        return min(10.0, score)

    # ── Difficulty Alignment ─────────────────────────────────────────────
    def _difficulty(self, s: Dict, req: Optional[Any]) -> float:
        """Smooth penalty for |source difficulty − request difficulty|.

        gap=0: 10.0 (perfect)
        gap=1:  8.5 (acceptable)
        gap=2:  6.0 (marginal)
        gap=3:  2.0 (poor — was 3.0, increased penalty)
        gap=4:  0.5 (very poor)

        The gap=3 increase from 3.0 to 2.0 is critical: it ensures a
        difficulty=4 arXiv paper scores BELOW Wikipedia (d=1) for a
        difficulty=1 query (see module docstring for the worked numbers).
        Neutral fallbacks: 7.0 when there is no request difficulty at all,
        6.0 when the source's difficulty is missing or non-numeric.
        """
        if not req or not hasattr(req, "difficulty"):
            return 7.0
        src_diff = s.get("difficulty")
        if src_diff is None:
            return 6.0
        try:
            src_diff = int(src_diff)
        except (ValueError, TypeError):
            return 6.0
        gap = abs(src_diff - req.difficulty)
        return {
            0: 10.0,
            1: 8.5,
            2: 6.0,
            3: 2.0,  # Increased from 3.0 — penalizes d=4 papers for d=1 queries
            4: 0.5,
        }.get(gap, 0.2)

    # ── Accessibility ────────────────────────────────────────────────────
    def _accessibility(self, s: Dict) -> float:
        """10.0 for open-access sources, 5.0 otherwise (missing key = open)."""
        score = 5.0
        if s.get("open_access", True):
            score += 5.0
        return min(10.0, score)