"""
PIOE Relevance Scorer

Scores opportunities based on relevance to user interests.
Uses sentence-transformers for semantic similarity.
"""
import re
from typing import Optional

import numpy as np
from sentence_transformers import SentenceTransformer


class RelevanceScorer:
    """
    Scores opportunities for relevance using keywords and embeddings.

    Scoring is two-staged:

    1. Whole-word keyword matching (cheap, no model required).
    2. Only when the keyword score is below 0.5, a sentence-transformer
       embedding of the text is compared against a reference embedding built
       from the leading keywords, and the two scores are blended.

    The embedding model is loaded lazily on first use so that keyword-only
    workloads never pay the memory or start-up cost.
    """

    # Using a smaller, efficient model that works well on limited resources
    MODEL_NAME = "all-MiniLM-L6-v2"  # 80MB, fast, good quality

    # Keywords that indicate high-value opportunities
    HIGH_VALUE_KEYWORDS = [
        "computer vision", "robotics", "ROS", "PyTorch", "TensorFlow",
        "machine learning", "deep learning", "neural network",
        "internship", "fellowship", "scholarship", "grant", "funding",
        "hackathon", "competition", "challenge", "bounty",
        "research assistant", "PhD", "postdoc", "hiring",
        "AI", "artificial intelligence", "data science", "NLP",
        "startup", "seed", "Series A", "early-stage"
    ]

    def __init__(self, custom_keywords: Optional[list[str]] = None):
        """
        Initialize the scorer.

        Args:
            custom_keywords: Keywords to use instead of HIGH_VALUE_KEYWORDS.
                ``None`` or an empty list selects the defaults.
        """
        self._model = None  # Lazy load to save memory
        # (reference text, embedding) pair; rebuilt if the keywords change.
        self._ideal_cache = None
        # Copy so that mutating an instance's keywords can never alter the
        # class-level default list shared by every other scorer.
        self.keywords = list(custom_keywords or self.HIGH_VALUE_KEYWORDS)

    @property
    def model(self):
        """Return the sentence-transformer model, loading it on first access."""
        if self._model is None:
            print("Loading sentence transformer model...")
            self._model = SentenceTransformer(self.MODEL_NAME)
            print("Model loaded.")
        return self._model

    def score(self, text: str, title: str = "") -> dict:
        """
        Score an opportunity for relevance.

        Args:
            text: Body text of the opportunity. ``None`` is treated as empty.
            title: Optional title, prepended to the text before scoring.

        Returns:
            dict with:
            - relevance_score: 0.0 to 1.0
            - keyword_matches: list of matched keywords
            - method: "keywords", "hybrid" or "keywords_fallback"
            - semantic_score: only present when method is "hybrid"
        """
        # Coerce None to "" so a missing field is not scored as the word "None".
        full_text = f"{title or ''} {text or ''}".lower()

        # Method 1: Keyword matching (fast, always works)
        keyword_score, matches = self._keyword_score(full_text)

        # If keyword score is high enough, use it (saves embedding computation)
        if keyword_score >= 0.5:
            return {
                "relevance_score": min(keyword_score, 1.0),
                "keyword_matches": matches,
                "method": "keywords"
            }

        # Method 2: For borderline cases, boost with semantic similarity
        try:
            semantic_score = self._semantic_score(full_text)
        except Exception as e:
            # Deliberate best-effort boundary: any failure to load or run the
            # model degrades to the keyword-only score instead of crashing.
            print(f"Semantic scoring failed: {e}")
            return {
                "relevance_score": keyword_score,
                "keyword_matches": matches,
                "method": "keywords_fallback"
            }

        combined_score = 0.6 * keyword_score + 0.4 * semantic_score
        return {
            "relevance_score": min(combined_score, 1.0),
            "keyword_matches": matches,
            "semantic_score": semantic_score,
            "method": "hybrid"
        }

    @staticmethod
    def _keyword_pattern(keyword: str):
        """Build a whole-word regex for *keyword*, allowing a plural "s"."""
        # Lookarounds are used instead of \b so that keywords which start or
        # end with punctuation (e.g. "C++") still match correctly.
        return re.compile(r"(?<!\w)" + re.escape(keyword.lower()) + r"s?(?!\w)")

    def _keyword_score(self, text: str) -> tuple[float, list[str]]:
        """
        Score based on keyword matching.

        Keywords are matched as whole words (with an optional trailing "s"),
        so "AI" does not match inside "said" and "ROS" does not match inside
        "across", while "internships" still counts for "internship".

        Args:
            text: Text to search; expected to be lowercased already, because
                keywords are lowercased before matching.

        Returns:
            (score, matches): score is 0.1 when nothing matches, otherwise
            0.3 plus 0.15 per matched keyword, capped at 1.0. Matches are
            returned in keyword-list order with their original casing.
        """
        matches = [
            keyword
            for keyword in self.keywords
            # Blank keywords would match every text; ignore them.
            if keyword.strip() and self._keyword_pattern(keyword).search(text)
        ]

        if not matches:
            return 0.1, []

        # Linear in the number of matches, capped at 1.0
        score = min(0.3 + (len(matches) * 0.15), 1.0)
        return score, matches

    def _reference_embedding(self):
        """Return the embedding of the reference text built from the keywords."""
        ideal_text = " ".join(self.keywords[:10])  # Use top keywords as reference
        cached = self._ideal_cache
        # Key the cache on the text so it refreshes if self.keywords changes.
        if cached is None or cached[0] != ideal_text:
            cached = (ideal_text, self.model.encode(ideal_text))
            self._ideal_cache = cached
        return cached[1]

    def _semantic_score(self, text: str) -> float:
        """
        Score based on semantic similarity to the reference keywords.

        Args:
            text: Text to embed; only the first 500 characters are used.

        Returns:
            Cosine similarity rescaled from [-1, 1] to [0, 1]. A degenerate
            (zero-norm or non-finite) embedding yields the neutral value 0.5
            rather than NaN.
        """
        text_embedding = self.model.encode(text[:500])  # Limit text length
        ideal_embedding = self._reference_embedding()

        denominator = float(
            np.linalg.norm(text_embedding) * np.linalg.norm(ideal_embedding)
        )
        if denominator == 0.0 or not np.isfinite(denominator):
            # Cosine similarity is undefined here; report "no information".
            return 0.5

        similarity = float(np.dot(text_embedding, ideal_embedding)) / denominator

        # Rescale to 0-1 and clamp away floating-point overshoot.
        return min(max((similarity + 1) / 2, 0.0), 1.0)

    def get_embedding(self, text: str) -> np.ndarray:
        """Get embedding for the first 1000 characters of a text (used by novelty detector)."""
        return self.model.encode(text[:1000])

    def batch_score(self, opportunities: list[dict]) -> list[dict]:
        """
        Score multiple opportunities.

        Args:
            opportunities: Dicts read via the optional "raw_text" and "title"
                keys; missing or ``None`` values are treated as empty strings.

        Returns:
            A new list of dicts, each a copy of the input record with
            "relevance_score" and "keyword_matches" added or overwritten.
        """
        results = []
        for opp in opportunities:
            score = self.score(
                opp.get("raw_text") or "",
                opp.get("title") or ""
            )
            results.append({
                **opp,
                "relevance_score": score["relevance_score"],
                "keyword_matches": score.get("keyword_matches", [])
            })
        return results