"""
PIOE Relevance Scorer

Scores opportunities based on relevance to user interests.
Combines keyword matching with sentence-transformers semantic similarity.
"""
import re
from typing import Optional

import numpy as np
from sentence_transformers import SentenceTransformer
|
|
|
|
class RelevanceScorer:
    """
    Scores opportunities for relevance using keywords and embeddings.

    Keyword matching runs first. When it is not conclusive on its own
    (keyword score below 0.5), the keyword score is blended with a semantic
    similarity score computed from sentence embeddings.

    Uses a lightweight sentence transformer model optimized for:
    - Fast inference
    - Low memory (works on HF Spaces 16GB)
    - Good semantic understanding
    """

    # Name of the sentence-transformers model; loaded lazily by `model`.
    MODEL_NAME = "all-MiniLM-L6-v2"

    # Default keywords used when no custom keywords are supplied.
    HIGH_VALUE_KEYWORDS = [
        "computer vision", "robotics", "ROS", "PyTorch", "TensorFlow",
        "machine learning", "deep learning", "neural network",
        "internship", "fellowship", "scholarship", "grant", "funding",
        "hackathon", "competition", "challenge", "bounty",
        "research assistant", "PhD", "postdoc", "hiring",
        "AI", "artificial intelligence", "data science", "NLP",
        "startup", "seed", "Series A", "early-stage",
    ]

    def __init__(self, custom_keywords: Optional[list[str]] = None):
        """
        Initialize the scorer with optional custom keywords.

        Args:
            custom_keywords: Keywords to use instead of HIGH_VALUE_KEYWORDS.
                ``None`` or an empty list selects the defaults.
        """
        self._model = None
        # (ideal_text, embedding) pair reused across _semantic_score calls.
        self._ideal_cache = None
        # Copy so that mutating this instance's list can never alter the
        # shared class-level default (or the caller's own list).
        self.keywords = list(custom_keywords or self.HIGH_VALUE_KEYWORDS)

    @property
    def model(self):
        """Lazy load model only when needed."""
        if self._model is None:
            print("Loading sentence transformer model...")
            self._model = SentenceTransformer(self.MODEL_NAME)
            print("Model loaded.")
        return self._model

    def score(self, text: str, title: str = "") -> dict:
        """
        Score an opportunity for relevance.

        Args:
            text: Body text of the opportunity. ``None`` is treated as "".
            title: Optional title, placed before the body text.
                ``None`` is treated as "".

        Returns:
            dict with:
            - relevance_score: 0.0 to 1.0
            - keyword_matches: list of matched keywords
            - method: "keywords", "hybrid" or "keywords_fallback"
            - semantic_score: only present when method is "hybrid"
        """
        # Guard against None so the literal word "None" is never scored.
        safe_title = "" if title is None else title
        safe_text = "" if text is None else text
        full_text = f"{safe_title} {safe_text}".lower()

        keyword_score, matches = self._keyword_score(full_text)

        # Strong keyword evidence: skip the (expensive) embedding model.
        if keyword_score >= 0.5:
            return {
                "relevance_score": min(keyword_score, 1.0),
                "keyword_matches": matches,
                "method": "keywords",
            }

        try:
            semantic_score = self._semantic_score(full_text)
        except Exception as e:
            # Deliberate best-effort: any model failure degrades to the
            # keyword-only score instead of failing the whole scoring call.
            print(f"Semantic scoring failed: {e}")
            return {
                "relevance_score": keyword_score,
                "keyword_matches": matches,
                "method": "keywords_fallback",
            }

        combined_score = 0.6 * keyword_score + 0.4 * semantic_score
        return {
            "relevance_score": min(combined_score, 1.0),
            "keyword_matches": matches,
            "semantic_score": semantic_score,
            "method": "hybrid",
        }

    def _keyword_score(self, text: str) -> tuple[float, list[str]]:
        """
        Score based on whole-word keyword matching.

        A keyword matches only when it is not embedded in a longer word, so
        "AI" does not match "available" and "ROS" does not match "across".
        A trailing plural "s" is accepted ("internships" matches
        "internship"). Matching is case-insensitive.

        Returns:
            (score, matches): 0.1 with no matches, otherwise
            0.3 + 0.15 per matched keyword, capped at 1.0. ``matches`` keeps
            the keywords in their configured order and original casing.
        """
        lowered = text.lower()
        matches = []
        seen = set()

        for keyword in self.keywords:
            term = keyword.strip().lower()
            # Empty terms would match everything; duplicates would be
            # counted twice and inflate the score.
            if not term or term in seen:
                continue
            seen.add(term)
            # Lookarounds instead of \b so terms that start or end with a
            # non-word character still match. The re module caches compiled
            # patterns, so rebuilding the pattern string here is cheap.
            pattern = rf"(?<!\w){re.escape(term)}s?(?!\w)"
            if re.search(pattern, lowered):
                matches.append(keyword)

        if not matches:
            return 0.1, []

        score = min(0.3 + (len(matches) * 0.15), 1.0)
        return score, matches

    @staticmethod
    def _unit_cosine(first, second) -> float:
        """
        Return the cosine similarity of two vectors mapped onto [0, 1].

        An undefined similarity (zero-length or non-finite norm) is treated
        as cosine 0, i.e. 0.5 after mapping, instead of propagating NaN.
        """
        denominator = float(np.linalg.norm(first) * np.linalg.norm(second))
        if denominator == 0.0 or not np.isfinite(denominator):
            return 0.5
        cosine = float(np.dot(first, second)) / denominator
        # Rounding can push the ratio marginally outside [-1, 1].
        cosine = max(-1.0, min(1.0, cosine))
        return (cosine + 1) / 2

    def _semantic_score(self, text: str) -> float:
        """
        Score based on semantic similarity to ideal opportunities.

        The "ideal" reference text is the first ten keywords joined by
        spaces. Only the first 500 characters of ``text`` are embedded.

        Returns:
            Cosine similarity between the two embeddings, mapped to [0, 1].
        """
        ideal_text = " ".join(self.keywords[:10])

        # The reference embedding only changes when the keywords change, so
        # reuse it instead of running the model twice on every call.
        # getattr keeps this working for instances built without __init__.
        cached = getattr(self, "_ideal_cache", None)
        if cached is None or cached[0] != ideal_text:
            cached = (ideal_text, self.model.encode(ideal_text))
            self._ideal_cache = cached

        text_embedding = self.model.encode(text[:500])
        return self._unit_cosine(text_embedding, cached[1])

    def get_embedding(self, text: str) -> np.ndarray:
        """Get embedding for the first 1000 characters of a text (used by novelty detector)."""
        return self.model.encode(text[:1000])

    def batch_score(self, opportunities: list[dict]) -> list[dict]:
        """
        Score multiple opportunities, one at a time.

        Args:
            opportunities: Dicts read via their "raw_text" and "title" keys;
                missing keys default to "".

        Returns:
            A new list of dicts: each input dict's items plus
            "relevance_score" and "keyword_matches". Inputs are not mutated.
        """
        results = []
        for opp in opportunities:
            scored = self.score(
                opp.get("raw_text", ""),
                opp.get("title", ""),
            )
            results.append({
                **opp,
                "relevance_score": scored["relevance_score"],
                "keyword_matches": scored.get("keyword_matches", []),
            })
        return results
|
|