Spaces:

Tremick
/

PIOE

Runtime error

B1acB1rd

Add Hugging Face Spaces deployment support

24a8e5d 4 months ago

5.25 kB

	"""
	PIOE Relevance Scorer

	Scores opportunities based on relevance to user interests.
	Uses sentence-transformers for semantic similarity.
	"""
	import re
	from typing import Optional

	import numpy as np
	from sentence_transformers import SentenceTransformer


	class RelevanceScorer:
	    """
	    Scores opportunities for relevance using keywords and embeddings.

	    Keyword matching is always tried first. The sentence-transformer model
	    is loaded lazily and only consulted when the keyword score alone is
	    below the fast-path threshold used in ``score``.

	    Uses a lightweight sentence transformer model optimized for:
	    - Fast inference
	    - Low memory (works on HF Spaces 16GB)
	    - Good semantic understanding
	    """

	    # Using a smaller, efficient model that works well on limited resources
	    MODEL_NAME = "all-MiniLM-L6-v2"  # 80MB, fast, good quality

	    # Keywords that indicate high-value opportunities
	    HIGH_VALUE_KEYWORDS = [
	        "computer vision", "robotics", "ROS", "PyTorch", "TensorFlow",
	        "machine learning", "deep learning", "neural network",
	        "internship", "fellowship", "scholarship", "grant", "funding",
	        "hackathon", "competition", "challenge", "bounty",
	        "research assistant", "PhD", "postdoc", "hiring",
	        "AI", "artificial intelligence", "data science", "NLP",
	        "startup", "seed", "Series A", "early-stage",
	    ]

	    def __init__(self, custom_keywords: Optional[list[str]] = None):
	        """
	        Initialize the scorer with optional custom keywords.

	        Args:
	            custom_keywords: Keywords to use instead of
	                ``HIGH_VALUE_KEYWORDS``. ``None`` or an empty list selects
	                the defaults.
	        """
	        self._model = None  # Lazy load to save memory
	        # Copy, so mutating ``self.keywords`` never alters the caller's list
	        # or the class-level default shared by every instance.
	        self.keywords = list(custom_keywords or self.HIGH_VALUE_KEYWORDS)
	        # keyword -> compiled whole-word pattern, filled on first use
	        self._keyword_patterns = {}
	        # (reference text, embedding) of the last "ideal opportunity" encoded
	        self._ideal_cache = None

	    @property
	    def model(self):
	        """Lazy load model only when needed."""
	        if self._model is None:
	            print("Loading sentence transformer model...")
	            self._model = SentenceTransformer(self.MODEL_NAME)
	            print("Model loaded.")
	        return self._model

	    def score(self, text: str, title: str = "") -> dict:
	        """
	        Score an opportunity for relevance.

	        Args:
	            text: Body text of the opportunity.
	            title: Optional title, scored together with ``text``.

	        Returns dict with:
	        - relevance_score: 0.0 to 1.0
	        - keyword_matches: list of matched keywords
	        - method: "keywords", "hybrid" or "keywords_fallback"
	        - semantic_score: present only when method is "hybrid"
	        """
	        full_text = f"{title} {text}".lower()

	        # Method 1: Keyword matching (fast, always works)
	        keyword_score, matches = self._keyword_score(full_text)

	        # If keyword score is high enough, use it (saves embedding computation)
	        if keyword_score >= 0.5:
	            return {
	                "relevance_score": min(keyword_score, 1.0),
	                "keyword_matches": matches,
	                "method": "keywords"
	            }

	        # Method 2: For borderline cases, boost with semantic similarity.
	        # Only the embedding call is guarded: it is the step that can fail
	        # (model download, out of memory, ...).
	        try:
	            semantic_score = self._semantic_score(full_text)
	        except Exception as e:
	            # Fall back to keyword-only if embedding fails
	            print(f"Semantic scoring failed: {e}")
	            return {
	                "relevance_score": keyword_score,
	                "keyword_matches": matches,
	                "method": "keywords_fallback"
	            }

	        combined_score = 0.6 * keyword_score + 0.4 * semantic_score
	        return {
	            "relevance_score": min(combined_score, 1.0),
	            "keyword_matches": matches,
	            "semantic_score": semantic_score,
	            "method": "hybrid"
	        }

	    def _keyword_pattern(self, keyword: str):
	        """Return the cached whole-word regex for ``keyword``."""
	        pattern = self._keyword_patterns.get(keyword)
	        if pattern is None:
	            # Whitespace inside a keyword matches any run of whitespace, and
	            # an optional trailing "s" accepts simple plurals. The look-arounds
	            # stop short keywords such as "AI" or "ROS" from matching inside
	            # unrelated words ("training", "across").
	            body = r"\s+".join(re.escape(part) for part in keyword.split())
	            pattern = re.compile(r"(?<!\w)" + body + r"s?(?!\w)", re.IGNORECASE)
	            self._keyword_patterns[keyword] = pattern
	        return pattern

	    def _keyword_score(self, text: str) -> tuple[float, list[str]]:
	        """
	        Score based on keyword matching.

	        A keyword matches when it occurs in ``text`` as a whole word or
	        phrase (case-insensitive, optional plural "s"). Blank keywords are
	        ignored.

	        Returns:
	            ``(score, matches)``: ``0.1`` and an empty list when nothing
	            matches, otherwise ``0.3 + 0.15 * len(matches)`` capped at 1.0,
	            with matches listed in ``self.keywords`` order.
	        """
	        matches = [
	            keyword
	            for keyword in self.keywords
	            if keyword.strip() and self._keyword_pattern(keyword).search(text)
	        ]

	        if not matches:
	            return 0.1, []

	        # Grows linearly with the number of matches, capped at 1.0
	        score = min(0.3 + (len(matches) * 0.15), 1.0)
	        return score, matches

	    def _semantic_score(self, text: str) -> float:
	        """
	        Score based on semantic similarity to ideal opportunities.

	        The reference is the first ten keywords joined into one string. Its
	        embedding is cached and re-encoded only when that string changes.

	        Returns:
	            Cosine similarity mapped from [-1, 1] to [0, 1]; 0.5 when the
	            similarity is undefined (zero-length embedding).
	        """
	        text_embedding = self.model.encode(text[:500])  # Limit text length

	        # Create (or reuse) the "ideal opportunity" embedding
	        ideal_text = " ".join(self.keywords[:10])  # Use top keywords as reference
	        if self._ideal_cache is None or self._ideal_cache[0] != ideal_text:
	            self._ideal_cache = (ideal_text, self.model.encode(ideal_text))
	        ideal_embedding = self._ideal_cache[1]

	        denominator = np.linalg.norm(text_embedding) * np.linalg.norm(ideal_embedding)
	        if not denominator > 0:
	            # Zero (or NaN) norm: cosine similarity is undefined, treat as neutral
	            return 0.5

	        similarity = float(np.dot(text_embedding, ideal_embedding) / denominator)
	        # Guard against floating-point overshoot outside [-1, 1]
	        similarity = max(-1.0, min(1.0, similarity))

	        # Normalize to 0-1 range
	        return (similarity + 1) / 2

	    def get_embedding(self, text: str) -> np.ndarray:
	        """Get embedding for the first 1000 characters of a text (used by novelty detector)."""
	        return self.model.encode(text[:1000])

	    def batch_score(self, opportunities: list[dict]) -> list[dict]:
	        """
	        Score multiple opportunities, one ``score`` call per item.

	        Args:
	            opportunities: Dicts read via their "raw_text" and "title" keys;
	                missing or ``None`` values are treated as empty strings.

	        Returns:
	            A new list of dicts: each input dict copied and extended with
	            "relevance_score" and "keyword_matches".
	        """
	        results = []
	        for opp in opportunities:
	            score = self.score(
	                opp.get("raw_text") or "",
	                opp.get("title") or ""
	            )
	            results.append({
	                **opp,
	                "relevance_score": score["relevance_score"],
	                "keyword_matches": score.get("keyword_matches", [])
	            })
	        return results