PIOE / backend /intelligence /scorer.py
B1acB1rd
Add Hugging Face Spaces deployment support
24a8e5d
"""
PIOE Relevance Scorer
Scores opportunities based on relevance to user interests.
Uses sentence-transformers for semantic similarity.
"""
import re
from typing import Optional

import numpy as np
from sentence_transformers import SentenceTransformer
class RelevanceScorer:
    """
    Scores opportunities for relevance using keywords and embeddings.

    Scoring is two-stage: a cheap keyword pass always runs first, and the
    sentence transformer is only consulted when the keyword score is
    below 0.5.

    Uses a lightweight sentence transformer model optimized for:
    - Fast inference
    - Low memory (works on HF Spaces 16GB)
    - Good semantic understanding
    """

    # Using a smaller, efficient model that works well on limited resources
    MODEL_NAME = "all-MiniLM-L6-v2"  # 80MB, fast, good quality

    # Keywords that indicate high-value opportunities
    HIGH_VALUE_KEYWORDS = [
        "computer vision", "robotics", "ROS", "PyTorch", "TensorFlow",
        "machine learning", "deep learning", "neural network",
        "internship", "fellowship", "scholarship", "grant", "funding",
        "hackathon", "competition", "challenge", "bounty",
        "research assistant", "PhD", "postdoc", "hiring",
        "AI", "artificial intelligence", "data science", "NLP",
        "startup", "seed", "Series A", "early-stage",
    ]

    # Keywords of at most this many characters (acronyms such as "AI" or
    # "ROS") must match as whole words; as plain substrings they would hit
    # unrelated words like "email" or "across".
    _WHOLE_WORD_MAX_LEN = 3

    def __init__(self, custom_keywords: Optional[list[str]] = None):
        """
        Initialize the scorer with optional custom keywords.

        Args:
            custom_keywords: Keywords to use instead of HIGH_VALUE_KEYWORDS.
                ``None`` or an empty list selects the defaults.
        """
        self._model = None  # Lazy load to save memory
        # Copy so that mutating ``self.keywords`` never alters the caller's
        # list or the class-level defaults shared by every instance.
        self.keywords = list(custom_keywords or self.HIGH_VALUE_KEYWORDS)
        # (reference text, embedding) of the "ideal opportunity"; filled on
        # first semantic scoring and refreshed if the reference text changes.
        self._ideal_cache: Optional[tuple[str, np.ndarray]] = None

    @property
    def model(self):
        """Lazy load model only when needed."""
        if self._model is None:
            print("Loading sentence transformer model...")
            self._model = SentenceTransformer(self.MODEL_NAME)
            print("Model loaded.")
        return self._model

    def score(self, text: str, title: str = "") -> dict:
        """
        Score an opportunity for relevance.

        Args:
            text: Body text of the opportunity.
            title: Optional title, prepended to the text before scoring.

        Returns:
            dict with:
            - relevance_score: 0.0 to 1.0
            - keyword_matches: list of matched keywords
            - method: "keywords", "hybrid" or "keywords_fallback"
            - semantic_score: only present when method is "hybrid"
        """
        full_text = f"{title} {text}".lower()

        # Method 1: Keyword matching (fast, always works)
        keyword_score, matches = self._keyword_score(full_text)

        # If keyword score is high enough, use it (saves embedding computation)
        if keyword_score >= 0.5:
            return {
                "relevance_score": min(keyword_score, 1.0),
                "keyword_matches": matches,
                "method": "keywords",
            }

        # Method 2: For borderline cases, boost with semantic similarity
        try:
            semantic_score = self._semantic_score(full_text)
        except Exception as e:
            # Deliberate best-effort: any model/embedding failure falls back
            # to the keyword-only result instead of failing the whole score.
            print(f"Semantic scoring failed: {e}")
            return {
                "relevance_score": keyword_score,
                "keyword_matches": matches,
                "method": "keywords_fallback",
            }

        combined_score = 0.6 * keyword_score + 0.4 * semantic_score
        return {
            "relevance_score": min(combined_score, 1.0),
            "keyword_matches": matches,
            "semantic_score": semantic_score,
            "method": "hybrid",
        }

    @classmethod
    def _contains_keyword(cls, keyword: str, text: str) -> bool:
        """
        Return True if ``keyword`` occurs in the already-lowercased ``text``.

        Short keywords must match as whole words; longer ones match as
        substrings so inflected forms ("internships") still count.
        """
        needle = keyword.lower()
        if not needle:
            # An empty keyword would otherwise "match" every text.
            return False
        if len(needle) > cls._WHOLE_WORD_MAX_LEN:
            return needle in text
        pattern = rf"(?<!\w){re.escape(needle)}(?!\w)"
        return re.search(pattern, text) is not None

    def _keyword_score(self, text: str) -> tuple[float, list[str]]:
        """
        Score based on keyword matching.

        Args:
            text: Lowercased text to search.

        Returns:
            (score, matches): 0.1 when nothing matches, otherwise
            0.3 + 0.15 per match capped at 1.0, together with the matched
            keywords in their original casing and list order.
        """
        matches = [
            keyword for keyword in self.keywords
            if self._contains_keyword(keyword, text)
        ]

        if not matches:
            return 0.1, []

        # More matches = higher score, capped at 1.0
        score = min(0.3 + (len(matches) * 0.15), 1.0)
        return score, matches

    def _ideal_embedding(self) -> np.ndarray:
        """Return the embedding of the "ideal opportunity" reference text."""
        ideal_text = " ".join(self.keywords[:10])  # Use top keywords as reference
        cached = self._ideal_cache
        # Re-encode only when the reference text changed (e.g. keywords edited).
        if cached is None or cached[0] != ideal_text:
            cached = (ideal_text, self.model.encode(ideal_text))
            self._ideal_cache = cached
        return cached[1]

    def _semantic_score(self, text: str) -> float:
        """
        Score based on semantic similarity to ideal opportunities.

        Returns:
            Cosine similarity between the text (first 500 characters) and
            the reference text, rescaled from [-1, 1] to [0, 1]. A zero-norm
            embedding is treated as similarity 0, i.e. a score of 0.5.
        """
        text_embedding = self.model.encode(text[:500])  # Limit text length
        ideal_embedding = self._ideal_embedding()

        denominator = np.linalg.norm(text_embedding) * np.linalg.norm(ideal_embedding)
        if denominator == 0:
            # Cosine similarity is undefined here; avoid propagating NaN.
            return 0.5

        similarity = np.dot(text_embedding, ideal_embedding) / denominator
        # Guard against floating-point drift just outside [-1, 1].
        similarity = float(np.clip(similarity, -1.0, 1.0))

        # Normalize to 0-1 range
        return (similarity + 1) / 2

    def get_embedding(self, text: str) -> np.ndarray:
        """Get embedding for the first 1000 characters of a text (used by novelty detector)."""
        return self.model.encode(text[:1000])

    def batch_score(self, opportunities: list[dict]) -> list[dict]:
        """
        Score multiple opportunities.

        Args:
            opportunities: Dicts read via their "raw_text" and "title" keys;
                missing or ``None`` values are treated as empty strings.

        Returns:
            A new list with one copy of each input dict, extended with
            "relevance_score" and "keyword_matches".
        """
        results = []
        for opp in opportunities:
            # ``or ""`` keeps an explicit None from being scored as the
            # literal text "None".
            score = self.score(
                opp.get("raw_text") or "",
                opp.get("title") or "",
            )
            results.append({
                **opp,
                "relevance_score": score["relevance_score"],
                "keyword_matches": score.get("keyword_matches", []),
            })
        return results