Spaces:
Running
Running
| """ | |
| Knowledge Universe β Feature Extractor | |
| Extracts robust signals (0.0 to 10.0) from raw source metadata. | |
| Final Block 3 version fixes: | |
| - Wikipedia authority boosted more aggressively for low-difficulty queries | |
| so it ranks above arXiv papers for "what is X" queries | |
| - arXiv difficulty penalty increased: gap=3 now costs 4.0 quality points | |
| (was 3.0), enough to drop a difficulty-mismatch paper below Wikipedia | |
| - Educational signal boost for arXiv: survey/tutorial/overview papers | |
| get +1.5 authority (was +1.0) | |
| - Application signal penalty for arXiv: -3.0 (was -2.0) | |
| Belt AND suspenders β feature_extractor deprioritizes even if | |
| Gate 4 in arxiv_crawler misses something | |
| """ | |
| import math | |
| from typing import Dict, Any, Optional | |
| # Educational keywords in title/abstract β paper is ABOUT the technique | |
| _EDUCATIONAL_SIGNALS = { | |
| "survey", "tutorial", "overview", "review", "introduction", | |
| "explained", "guide", "fundamentals", "primer", "comprehensive", | |
| "illustrated", "understanding", "deep dive", "from scratch", | |
| "step by step", "beginner", "what is", "how to", "learn", | |
| } | |
| # Application domain keywords β paper USES the technique for another domain | |
| _APPLICATION_SIGNALS = { | |
| "detection", "surveillance", "radar", "traffic", "medical", | |
| "clinical", "satellite", "remote sensing", "seismic", "earthquake", | |
| "financial", "stock", "crypto", "drug", "protein", "genomic", | |
| "manufacturing", "industrial", "autonomous driving", "lidar", | |
| "pathology", "radiology", "ecg", "eeg", "weather", "climate", | |
| "agriculture", "crop", "soil", "maritime", "naval", "military", | |
| "physical model", "quantum", "photonic", "optical", | |
| "verilog", "fpga", "circuit", "hardware", | |
| } | |
class FeatureExtractor:
    """Turn raw source metadata into quality features, each on a 0.0-10.0 scale.

    ``extract()`` returns five independent signals:
      authority            -- platform reputation, adjusted per source
      completeness         -- richness of the metadata record
      social_proof         -- log-scaled engagement counts
      difficulty_alignment -- penalty for source/request difficulty gap
      accessibility        -- open-access bonus
    """

    def extract(
        self,
        source: Dict[str, Any],
        request: Optional[Any] = None,
    ) -> Dict[str, float]:
        """Return the full feature vector for *source*.

        *request* is optional and only needs a ``difficulty`` attribute;
        it drives the ``difficulty_alignment`` feature.
        """
        return {
            "authority": self._authority(source),
            "completeness": self._completeness(source),
            "social_proof": self._social(source),
            "difficulty_alignment": self._difficulty(source, request),
            "accessibility": self._accessibility(source),
        }

    # -- Authority ---------------------------------------------------------
    def _authority(self, s: Dict) -> float:
        """Platform base reputation, adjusted for arXiv paper intent and for
        Wikipedia's strength on low-difficulty ("what is X") queries."""
        # FIX: `or ""` also guards an explicit None value; the previous
        # `s.get("source_platform", "")` only covered a *missing* key and
        # crashed on None (title/summary below already use this pattern).
        platform = (s.get("source_platform") or "").lower()
        base = {
            "arxiv": 9.0,
            "mit_ocw": 9.5,
            "wikipedia": 8.5,
            "github": 7.5,
            "huggingface": 7.0,
            "stackoverflow": 7.0,
            "youtube": 6.5,
            "kaggle": 6.0,
            "openlibrary": 6.0,
            "podcast": 5.0,
            "common_crawl": 4.0,
        }.get(platform, 5.0)

        # -- arXiv: educational vs application adjustment ------------------
        if platform == "arxiv":
            title = (s.get("title") or "").lower()
            summary = (s.get("summary") or "").lower()[:200]
            combined = title + " " + summary
            edu_hits = sum(1 for kw in _EDUCATIONAL_SIGNALS if kw in combined)
            app_hits = sum(1 for kw in _APPLICATION_SIGNALS if kw in combined)
            if edu_hits > 0 and app_hits == 0:
                base += 1.5  # Pure educational paper -> strong boost
            elif app_hits > 0 and edu_hits == 0:
                base -= 3.0  # Pure application paper -> strong penalty
            elif app_hits > edu_hits:
                base -= 1.5  # Mostly application -> moderate penalty
            # edu >= app: no change
            base = max(4.0, min(10.0, base))

        # -- Wikipedia: boost for low-difficulty queries -------------------
        # Wikipedia is authoritative for "what is X" queries; boost it when
        # the source difficulty <= 2 so it ranks above arXiv papers that
        # get difficulty-penalized.
        if platform == "wikipedia":
            src_diff = s.get("difficulty", 3)
            try:
                if int(src_diff) <= 2:
                    base = min(10.0, base + 1.5)
            except (ValueError, TypeError):
                pass  # unparseable difficulty: leave base untouched
        return base

    # -- Completeness ------------------------------------------------------
    def _completeness(self, s: Dict) -> float:
        """Score metadata richness: summary length, tags, authors, date."""
        score = 3.0
        summary = s.get("summary") or ""
        if len(summary) > 200:
            score += 4.0
        elif len(summary) > 50:
            score += 2.0
        tags = s.get("tags") or []
        if len(tags) >= 3:
            score += 2.0
        elif len(tags) >= 1:
            score += 1.0
        if s.get("authors"):
            score += 0.5
        if s.get("publication_date"):
            score += 0.5
        return min(10.0, score)

    # -- Social Proof ------------------------------------------------------
    @staticmethod
    def _metric(s: Dict, key: str) -> float:
        """Best-effort float read of one engagement field; 0.0 on junk.

        FIX: non-numeric strings (e.g. ``"n/a"``) previously raised
        ValueError straight out of ``float()``.
        """
        try:
            return float(s.get(key) or 0)
        except (ValueError, TypeError):
            return 0.0

    def _social(self, s: Dict) -> float:
        """Log-scaled engagement score; floor 3.5 (no data) / 4.0 (any data)."""
        stars = self._metric(s, "stars")
        citations = self._metric(s, "citation_count")
        views = self._metric(s, "views")
        downloads = self._metric(s, "downloads")
        likes = self._metric(s, "likes")
        score = 0.0
        # log10 damping: each metric contributes sub-linearly, citations
        # weighted highest, views/likes lowest.
        if stars > 0:
            score += math.log10(stars + 1) * 2.0
        if citations > 0:
            score += math.log10(citations + 1) * 3.0
        if views > 0:
            score += math.log10(views + 1) * 1.0
        if downloads > 0:
            score += math.log10(downloads + 1) * 1.5
        if likes > 0:
            score += math.log10(likes + 1) * 1.0
        has_signal = any([stars, citations, views, downloads, likes])
        score = max(4.0 if has_signal else 3.5, score)
        return min(10.0, score)

    # -- Difficulty Alignment ----------------------------------------------
    def _difficulty(self, s: Dict, req: Optional[Any]) -> float:
        """
        Smooth penalty for difficulty mismatch.
            gap=0: 10.0 (perfect)
            gap=1:  8.5 (acceptable)
            gap=2:  6.0 (marginal)
            gap=3:  2.0 (poor -- was 3.0, increased penalty)
            gap=4:  0.5 (very poor)
        The gap=3 increase from 3.0 to 2.0 is critical:
        it ensures a difficulty=4 arXiv paper scores BELOW Wikipedia (d=1)
        for a difficulty=1 query, because:
            arXiv:     base_quality ~8.0 x penalty(gap=3, score=2.0)  -> ~5.6
            Wikipedia: base_quality ~8.0 x penalty(gap=0, score=10.0) -> ~8.0
        Returns 7.0 (neutral) when no usable request difficulty exists and
        6.0 when the source has no usable difficulty.
        """
        if not req or not hasattr(req, "difficulty"):
            return 7.0
        # FIX: coerce the request difficulty too; a non-int value (e.g. a
        # string from a web form) previously raised TypeError inside abs().
        try:
            req_diff = int(req.difficulty)
        except (ValueError, TypeError):
            return 7.0  # no usable request difficulty -> neutral score
        src_diff = s.get("difficulty")
        if src_diff is None:
            return 6.0
        try:
            src_diff = int(src_diff)
        except (ValueError, TypeError):
            return 6.0
        gap = abs(src_diff - req_diff)
        return {
            0: 10.0,
            1: 8.5,
            2: 6.0,
            3: 2.0,  # Increased from 3.0 -> penalizes d=4 papers for d=1 queries
            4: 0.5,
        }.get(gap, 0.2)

    # -- Accessibility -----------------------------------------------------
    def _accessibility(self, s: Dict) -> float:
        """Open-access sources get the full bonus; unknown defaults to open."""
        score = 5.0
        if s.get("open_access", True):
            score += 5.0
        return min(10.0, score)