""" PIOE Relevance Scorer Scores opportunities based on relevance to user interests. Uses sentence-transformers for semantic similarity. """ from sentence_transformers import SentenceTransformer import numpy as np from typing import Optional class RelevanceScorer: """ Scores opportunities for relevance using embeddings. Uses a lightweight sentence transformer model optimized for: - Fast inference - Low memory (works on HF Spaces 16GB) - Good semantic understanding """ # Using a smaller, efficient model that works well on limited resources MODEL_NAME = "all-MiniLM-L6-v2" # 80MB, fast, good quality # Keywords that indicate high-value opportunities HIGH_VALUE_KEYWORDS = [ "computer vision", "robotics", "ROS", "PyTorch", "TensorFlow", "machine learning", "deep learning", "neural network", "internship", "fellowship", "scholarship", "grant", "funding", "hackathon", "competition", "challenge", "bounty", "research assistant", "PhD", "postdoc", "hiring", "AI", "artificial intelligence", "data science", "NLP", "startup", "seed", "Series A", "early-stage" ] def __init__(self, custom_keywords: Optional[list[str]] = None): """Initialize the scorer with optional custom keywords.""" self._model = None # Lazy load to save memory self.keywords = custom_keywords or self.HIGH_VALUE_KEYWORDS @property def model(self): """Lazy load model only when needed.""" if self._model is None: print("Loading sentence transformer model...") self._model = SentenceTransformer(self.MODEL_NAME) print("Model loaded.") return self._model def score(self, text: str, title: str = "") -> dict: """ Score an opportunity for relevance. Returns dict with: - relevance_score: 0.0 to 1.0 - keyword_matches: list of matched keywords - method: scoring method used """ full_text = f"{title} {text}".lower() # Method 1: Keyword matching (fast, always works) keyword_score, matches = self._keyword_score(full_text) # If keyword score is high enough, use it (saves embedding computation) if keyword_score >= 0.5: return { "relevance_score": min(keyword_score, 1.0), "keyword_matches": matches, "method": "keywords" } # Method 2: For borderline cases, boost with semantic similarity try: semantic_score = self._semantic_score(full_text) combined_score = 0.6 * keyword_score + 0.4 * semantic_score return { "relevance_score": min(combined_score, 1.0), "keyword_matches": matches, "semantic_score": semantic_score, "method": "hybrid" } except Exception as e: # Fall back to keyword-only if embedding fails print(f"Semantic scoring failed: {e}") return { "relevance_score": keyword_score, "keyword_matches": matches, "method": "keywords_fallback" } def _keyword_score(self, text: str) -> tuple[float, list[str]]: """Score based on keyword matching.""" matches = [] for keyword in self.keywords: if keyword.lower() in text: matches.append(keyword) # More matches = higher score if not matches: return 0.1, [] # Diminishing returns for many matches score = min(0.3 + (len(matches) * 0.15), 1.0) return score, matches def _semantic_score(self, text: str) -> float: """Score based on semantic similarity to ideal opportunities.""" # Create an "ideal opportunity" embedding ideal_text = " ".join(self.keywords[:10]) # Use top keywords as reference # Get embeddings text_embedding = self.model.encode(text[:500]) # Limit text length ideal_embedding = self.model.encode(ideal_text) # Cosine similarity similarity = np.dot(text_embedding, ideal_embedding) / ( np.linalg.norm(text_embedding) * np.linalg.norm(ideal_embedding) ) # Normalize to 0-1 range (similarity is typically -1 to 1) return float((similarity + 1) / 2) def get_embedding(self, text: str) -> np.ndarray: """Get embedding for a text (used by novelty detector).""" return self.model.encode(text[:1000]) def batch_score(self, opportunities: list[dict]) -> list[dict]: """Score multiple opportunities efficiently.""" results = [] for opp in opportunities: score = self.score( opp.get("raw_text", ""), opp.get("title", "") ) results.append({ **opp, "relevance_score": score["relevance_score"], "keyword_matches": score.get("keyword_matches", []) }) return results