Spaces:

Tremick
/

PIOE

Runtime error

App Files Files Community

B1acB1rd committed on Jan 20

Commit

24a8e5d

1 Parent(s): 8e9fb76

Add Hugging Face Spaces deployment support

Browse files

Files changed (3) hide show

Dockerfile +22 -15
backend/intelligence/scorer.py +120 -127
requirements.txt +4 -3

Dockerfile CHANGED Viewed

@@ -1,4 +1,6 @@
-# PIOE Docker Image
 FROM python:3.11-slim
 # Set working directory
@@ -7,25 +9,30 @@ WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     gcc \
     && rm -rf /var/lib/apt/lists/*
-# Copy requirements first for caching
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-# Copy application code
-COPY . .
-# Create non-root user for security
-RUN useradd -m appuser && chown -R appuser:appuser /app
-USER appuser
-# Expose port
-EXPOSE 8000
 # Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
-    CMD curl -f http://localhost:8000/api/stats || exit 1
-# Run the application
-CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]

+# Hugging Face Spaces Dockerfile for PIOE
+# Uses Docker SDK for custom FastAPI deployment
 FROM python:3.11-slim
 # Set working directory
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     gcc \
+    curl \
     && rm -rf /var/lib/apt/lists/*
+# Create non-root user (required by HF Spaces)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Copy requirements and install dependencies
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --user -r requirements.txt
+# Copy application code
+COPY --chown=user . .
+# Expose port 7860 (Hugging Face Spaces default)
+EXPOSE 7860
 # Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:7860/api/stats || exit 1
+# Run the application on port 7860
+CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]

backend/intelligence/scorer.py CHANGED Viewed

@@ -1,149 +1,142 @@
 """
-PIOE Relevance Scorer - Lightweight Version
-Uses keyword matching only (no ML models) to fit in Render's 512MB free tier.
 """
 from typing import Optional
-import re
-import hashlib
-from ..config import get_settings
 class RelevanceScorer:
     """
-    Scores opportunities based on relevance to user interests.
-    Lightweight version uses keyword matching only (no sentence-transformers).
-    This keeps memory usage under 100MB for Render free tier.
     """
-    def __init__(self):
-        self.settings = get_settings()
-        # Build keyword sets for efficient matching
-        self.keywords = set(kw.lower() for kw in self.settings.high_priority_keywords)
-        # Additional high-value keywords
-        self.bonus_keywords = {
-            # High-value opportunities
-            "fully funded", "paid", "stipend", "salary", "remote",
-            "deadline", "apply now", "applications open",
-            # Tech-specific
-            "python", "pytorch", "tensorflow", "opencv", "ros",
-            "transformer", "llm", "gpt", "neural",
-            # Opportunity types
-            "intern", "fellowship", "scholarship", "grant",
-            "hackathon", "competition", "bounty", "job",
-            "phd", "postdoc", "research assistant"
-        }
-    def get_embedding(self, text: str) -> list[float]:
-        """
-        Generate a simple hash-based 'embedding' for text.
-        Not a real embedding, but works for basic novelty detection.
-        """
-        if not text:
-            return [0.0] * 64
-        # Create a consistent hash-based vector
-        text_lower = text.lower()[:1000]
-        hash_bytes = hashlib.sha256(text_lower.encode()).digest()
-        # Convert to list of floats between 0 and 1
-        embedding = [b / 255.0 for b in hash_bytes]
-        # Pad to 64 dimensions
-        embedding = (embedding * 2)[:64]
-        return embedding
-    def score_keywords(self, text: str) -> float:
         """
-        Score based on keyword presence.
-        Returns 0.0 to 1.0
         """
-        if not text:
-            return 0.0
-        text_lower = text.lower()
-        # Count primary keyword matches
-        primary_matches = sum(
-            1 for keyword in self.keywords
-            if keyword in text_lower
-        )
-        # Count bonus keyword matches
-        bonus_matches = sum(
-            1 for keyword in self.bonus_keywords
-            if keyword in text_lower
         )
-        # Weighted score
-        primary_score = min(primary_matches / 5, 1.0)  # 5+ primary = full score
-        bonus_score = min(bonus_matches / 3, 0.5)      # Bonus adds up to 0.5
-        return min(primary_score + bonus_score * 0.3, 1.0)
-    def score_title_quality(self, title: str) -> float:
-        """
-        Score based on title indicators.
-        Titles with action words score higher.
-        """
-        if not title:
-            return 0.5
-        title_lower = title.lower()
-        # Positive indicators (action opportunities)
-        positive_patterns = [
-            r'\$\d+',        # Money amounts
-            r'\bhiring\b',
-            r'\bapply\b',
-            r'\bopening\b',
-            r'\bseeking\b',
-            r'\bfunded\b',
-            r'\bremote\b',
-        ]
-        # Negative indicators (discussions, not opportunities)
-        negative_patterns = [
-            r'^how (do|to|can)',
-            r'^why (do|is|are)',
-            r'^what (is|are)',
-            r'\bvs\b',
-            r'\bopinion\b',
-            r'\brant\b',
-        ]
-        score = 0.5  # Neutral baseline
-        for pattern in positive_patterns:
-            if re.search(pattern, title_lower):
-                score += 0.1
-        for pattern in negative_patterns:
-            if re.search(pattern, title_lower):
-                score -= 0.2
-        return max(0.0, min(score, 1.0))
-    def score(self, text: str, title: str = "") -> dict:
-        """
-        Calculate combined relevance score.
-        Returns dict with individual and combined scores.
-        """
-        full_text = f"{title} {text}".strip()
-        keyword_score = self.score_keywords(full_text)
-        title_score = self.score_title_quality(title)
-        # Combined: keywords 70%, title quality 30%
-        # (Semantic score removed for lightweight version)
-        combined = 0.7 * keyword_score + 0.3 * title_score
-        return {
-            "keyword_score": round(keyword_score, 3),
-            "semantic_score": round(title_score, 3),  # Renamed for backwards compat
-            "relevance_score": round(combined, 3)
-        }

 """
+PIOE Relevance Scorer
+Scores opportunities based on relevance to user interests.
+Uses sentence-transformers for semantic similarity.
 """
+import re
 from typing import Optional
+import numpy as np
+from sentence_transformers import SentenceTransformer
 class RelevanceScorer:
     """
+    Scores opportunities for relevance to the user's interests.
+    Keyword matching always runs. A sentence-transformer embedding
+    comparison is added only when the keyword score alone is below 0.5,
+    and the model is loaded lazily on first use, so keyword-only scoring
+    never triggers a model load.
     """
+    # Sentence-transformers model name loaded by the `model` property
+    MODEL_NAME = "all-MiniLM-L6-v2"  # 80MB, fast, good quality
+    # Default keywords that indicate high-value opportunities.
+    # Matching is case-insensitive and on whole words (see `_keyword_score`).
+    HIGH_VALUE_KEYWORDS = [
+        "computer vision", "robotics", "ROS", "PyTorch", "TensorFlow",
+        "machine learning", "deep learning", "neural network",
+        "internship", "fellowship", "scholarship", "grant", "funding",
+        "hackathon", "competition", "challenge", "bounty",
+        "research assistant", "PhD", "postdoc", "hiring",
+        "AI", "artificial intelligence", "data science", "NLP",
+        "startup", "seed", "Series A", "early-stage"
+    ]
+    def __init__(self, custom_keywords: Optional[list[str]] = None):
+        """
+        Initialize the scorer.
+        Args:
+            custom_keywords: Keywords to use instead of HIGH_VALUE_KEYWORDS.
+                None or an empty list selects the defaults.
+        """
+        self._model = None  # Loaded on first access to `model`
+        # Copy so that editing one instance's keywords never mutates the
+        # shared class-level default (or the caller's own list).
+        self.keywords = list(custom_keywords or self.HIGH_VALUE_KEYWORDS)
+        # (reference text, embedding) used by `_semantic_score`; rebuilt
+        # whenever the reference text derived from `keywords` changes.
+        self._ideal_cache = None
+    @property
+    def model(self):
+        """Return the sentence-transformer model, loading it on first use."""
+        if self._model is None:
+            print("Loading sentence transformer model...")
+            self._model = SentenceTransformer(self.MODEL_NAME)
+            print("Model loaded.")
+        return self._model
+    def score(self, text: str, title: str = "") -> dict:
         """
+        Score an opportunity for relevance.
+        Args:
+            text: Body text of the opportunity (None is treated as empty).
+            title: Optional title (None is treated as empty).
+        Returns dict with:
+        - relevance_score: 0.0 to 1.0
+        - keyword_matches: list of matched keywords
+        - method: "keywords", "hybrid" or "keywords_fallback"
+        - semantic_score: present only when method is "hybrid"
         """
+        # Missing fields must not be scored as the literal string "none"
+        full_text = f"{title or ''} {text or ''}".lower()
+        # Method 1: Keyword matching (fast, needs no model)
+        keyword_score, matches = self._keyword_score(full_text)
+        # A confident keyword score is used as-is (saves embedding computation)
+        if keyword_score >= 0.5:
+            return {
+                "relevance_score": min(keyword_score, 1.0),
+                "keyword_matches": matches,
+                "method": "keywords"
+            }
+        # Method 2: For borderline cases, blend in semantic similarity
+        try:
+            semantic_score = self._semantic_score(full_text)
+        except Exception as e:
+            # Deliberate best-effort: any model/encoding failure degrades
+            # to the keyword-only score instead of failing the request.
+            print(f"Semantic scoring failed: {e}")
+            return {
+                "relevance_score": keyword_score,
+                "keyword_matches": matches,
+                "method": "keywords_fallback"
+            }
+        combined_score = 0.6 * keyword_score + 0.4 * semantic_score
+        return {
+            "relevance_score": min(combined_score, 1.0),
+            "keyword_matches": matches,
+            "semantic_score": semantic_score,
+            "method": "hybrid"
+        }
+    @staticmethod
+    def _keyword_pattern(keyword: str) -> "re.Pattern[str]":
+        """Build the whole-word pattern for one keyword.
+        The keyword must not be glued to surrounding word characters, so
+        "AI" no longer matches inside "training" or "email". An optional
+        plural "s"/"es" is accepted so "internships" still counts as
+        "internship". `re.compile` caches compiled patterns internally, so
+        calling this repeatedly for the same keyword is cheap.
+        """
+        return re.compile(
+            rf"(?<!\w){re.escape(keyword.lower())}(?:s|es)?(?!\w)"
+        )
+    def _keyword_score(self, text: str) -> tuple[float, list[str]]:
+        """Score based on whole-word, case-insensitive keyword matching.
+        Returns:
+            (score, matches): score is 0.1 when nothing matches, otherwise
+            0.3 plus 0.15 per matched keyword, capped at 1.0; matches lists
+            the matched keywords in `self.keywords` order.
+        """
+        haystack = text.lower()
+        matches = [
+            keyword for keyword in self.keywords
+            # Blank keywords would match everything, so they are skipped
+            if keyword and self._keyword_pattern(keyword).search(haystack)
+        ]
+        if not matches:
+            return 0.1, []
+        # Linear in the number of matches, capped at 1.0
+        score = min(0.3 + (len(matches) * 0.15), 1.0)
+        return score, matches
+    def _semantic_score(self, text: str) -> float:
+        """Score based on semantic similarity to an "ideal opportunity".
+        The reference text is the first ten keywords joined by spaces.
+        Returns the cosine similarity rescaled from [-1, 1] to [0, 1].
+        """
+        ideal_text = " ".join(self.keywords[:10])
+        # The reference embedding only changes when the keywords do, so it
+        # is encoded once and reused across calls.
+        if self._ideal_cache is None or self._ideal_cache[0] != ideal_text:
+            self._ideal_cache = (ideal_text, self.model.encode(ideal_text))
+        ideal_embedding = self._ideal_cache[1]
+        text_embedding = self.model.encode(text[:500])  # Limit text length
+        denominator = (
+            np.linalg.norm(text_embedding) * np.linalg.norm(ideal_embedding)
+        )
+        if denominator == 0:
+            # A zero vector has no direction; report a neutral score
+            # instead of propagating NaN into relevance_score.
+            return 0.5
+        similarity = np.dot(text_embedding, ideal_embedding) / denominator
+        # Rescale to 0-1 and clamp away floating-point overshoot
+        return float(min(max((similarity + 1) / 2, 0.0), 1.0))
+    def get_embedding(self, text: str) -> np.ndarray:
+        """Get embedding for the first 1000 characters of a text (used by novelty detector)."""
+        return self.model.encode((text or "")[:1000])
+    def batch_score(self, opportunities: list[dict]) -> list[dict]:
+        """Score each opportunity in turn.
+        Args:
+            opportunities: Dicts read via their "raw_text" and "title"
+                keys; missing or None values are treated as empty.
+        Returns:
+            A new list of dicts, each a copy of the input dict with
+            "relevance_score" and "keyword_matches" added or overwritten.
+        """
+        results = []
+        for opp in opportunities:
+            score = self.score(
+                opp.get("raw_text") or "",
+                opp.get("title") or ""
+            )
+            results.append({
+                **opp,
+                "relevance_score": score["relevance_score"],
+                "keyword_matches": score.get("keyword_matches", [])
+            })
+        return results

requirements.txt CHANGED Viewed

@@ -1,11 +1,11 @@
 # PIOE 2.0 - Personal Intelligence & Opportunity Engine
-# Lightweight version for Render free tier (512MB limit)
 # Web Framework
 fastapi
 uvicorn[standard]
-# Database (SQLite local, PostgreSQL for production)
 sqlalchemy
 psycopg2-binary
@@ -19,7 +19,8 @@ aiofiles
 # Scheduling
 apscheduler
-# AI & ML (Lightweight - no sentence-transformers to save memory)
 google-generativeai
 numpy

 # PIOE 2.0 - Personal Intelligence & Opportunity Engine
+# Optimized for Hugging Face Spaces deployment
 # Web Framework
 fastapi
 uvicorn[standard]
+# Database
 sqlalchemy
 psycopg2-binary
 # Scheduling
 apscheduler
+# AI & ML (sentence-transformers works on HF Spaces - 16GB RAM)
+sentence-transformers
 google-generativeai
 numpy