B1acB1rd committed on
Commit
24a8e5d
·
1 Parent(s): 8e9fb76

Add Hugging Face Spaces deployment support

Browse files
Files changed (3) hide show
  1. Dockerfile +22 -15
  2. backend/intelligence/scorer.py +120 -127
  3. requirements.txt +4 -3
Dockerfile CHANGED
@@ -1,4 +1,6 @@
1
- # PIOE Docker Image
 
 
2
  FROM python:3.11-slim
3
 
4
  # Set working directory
@@ -7,25 +9,30 @@ WORKDIR /app
7
  # Install system dependencies
8
  RUN apt-get update && apt-get install -y \
9
  gcc \
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
- # Copy requirements first for caching
13
- COPY requirements.txt .
14
- RUN pip install --no-cache-dir -r requirements.txt
 
 
15
 
16
- # Copy application code
17
- COPY . .
18
 
19
- # Create non-root user for security
20
- RUN useradd -m appuser && chown -R appuser:appuser /app
21
- USER appuser
 
 
 
22
 
23
- # Expose port
24
- EXPOSE 8000
25
 
26
  # Health check
27
- HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
28
- CMD curl -f http://localhost:8000/api/stats || exit 1
29
 
30
- # Run the application
31
- CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
 
1
+ # Hugging Face Spaces Dockerfile for PIOE
2
+ # Uses Docker SDK for custom FastAPI deployment
3
+
4
  FROM python:3.11-slim
5
 
6
  # Set working directory
 
9
  # Install system dependencies
10
  RUN apt-get update && apt-get install -y \
11
  gcc \
12
+ curl \
13
  && rm -rf /var/lib/apt/lists/*
14
 
15
+ # Create non-root user (required by HF Spaces)
16
+ RUN useradd -m -u 1000 user
17
+ USER user
18
+ ENV HOME=/home/user \
19
+ PATH=/home/user/.local/bin:$PATH
20
 
21
+ WORKDIR $HOME/app
 
22
 
23
+ # Copy requirements and install dependencies
24
+ COPY --chown=user requirements.txt .
25
+ RUN pip install --no-cache-dir --user -r requirements.txt
26
+
27
+ # Copy application code
28
+ COPY --chown=user . .
29
 
30
+ # Expose port 7860 (Hugging Face Spaces default)
31
+ EXPOSE 7860
32
 
33
  # Health check
34
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
35
+ CMD curl -f http://localhost:7860/api/stats || exit 1
36
 
37
+ # Run the application on port 7860
38
+ CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
backend/intelligence/scorer.py CHANGED
@@ -1,149 +1,142 @@
1
  """
2
- PIOE Relevance Scorer - Lightweight Version
3
 
4
- Uses keyword matching only (no ML models) to fit in Render's 512MB free tier.
 
5
  """
 
 
6
  from typing import Optional
7
- import re
8
- import hashlib
9
-
10
- from ..config import get_settings
11
 
12
 
13
  class RelevanceScorer:
14
  """
15
- Scores opportunities based on relevance to user interests.
16
 
17
- Lightweight version uses keyword matching only (no sentence-transformers).
18
- This keeps memory usage under 100MB for Render free tier.
 
 
19
  """
20
 
21
- def __init__(self):
22
- self.settings = get_settings()
23
-
24
- # Build keyword sets for efficient matching
25
- self.keywords = set(kw.lower() for kw in self.settings.high_priority_keywords)
26
-
27
- # Additional high-value keywords
28
- self.bonus_keywords = {
29
- # High-value opportunities
30
- "fully funded", "paid", "stipend", "salary", "remote",
31
- "deadline", "apply now", "applications open",
32
- # Tech-specific
33
- "python", "pytorch", "tensorflow", "opencv", "ros",
34
- "transformer", "llm", "gpt", "neural",
35
- # Opportunity types
36
- "intern", "fellowship", "scholarship", "grant",
37
- "hackathon", "competition", "bounty", "job",
38
- "phd", "postdoc", "research assistant"
39
- }
40
 
41
- def get_embedding(self, text: str) -> list[float]:
42
- """
43
- Generate a simple hash-based 'embedding' for text.
44
- Not a real embedding, but works for basic novelty detection.
45
- """
46
- if not text:
47
- return [0.0] * 64
48
-
49
- # Create a consistent hash-based vector
50
- text_lower = text.lower()[:1000]
51
- hash_bytes = hashlib.sha256(text_lower.encode()).digest()
52
-
53
- # Convert to list of floats between 0 and 1
54
- embedding = [b / 255.0 for b in hash_bytes]
55
- # Pad to 64 dimensions
56
- embedding = (embedding * 2)[:64]
57
-
58
- return embedding
59
 
60
- def score_keywords(self, text: str) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  """
62
- Score based on keyword presence.
63
- Returns 0.0 to 1.0
 
 
 
 
64
  """
65
- if not text:
66
- return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- text_lower = text.lower()
 
 
69
 
70
- # Count primary keyword matches
71
- primary_matches = sum(
72
- 1 for keyword in self.keywords
73
- if keyword in text_lower
74
- )
75
 
76
- # Count bonus keyword matches
77
- bonus_matches = sum(
78
- 1 for keyword in self.bonus_keywords
79
- if keyword in text_lower
 
 
 
 
 
 
 
 
 
 
 
 
80
  )
81
 
82
- # Weighted score
83
- primary_score = min(primary_matches / 5, 1.0) # 5+ primary = full score
84
- bonus_score = min(bonus_matches / 3, 0.5) # Bonus adds up to 0.5
85
-
86
- return min(primary_score + bonus_score * 0.3, 1.0)
87
 
88
- def score_title_quality(self, title: str) -> float:
89
- """
90
- Score based on title indicators.
91
- Titles with action words score higher.
92
- """
93
- if not title:
94
- return 0.5
95
-
96
- title_lower = title.lower()
97
-
98
- # Positive indicators (action opportunities)
99
- positive_patterns = [
100
- r'\$\d+', # Money amounts
101
- r'\bhiring\b',
102
- r'\bapply\b',
103
- r'\bopening\b',
104
- r'\bseeking\b',
105
- r'\bfunded\b',
106
- r'\bremote\b',
107
- ]
108
-
109
- # Negative indicators (discussions, not opportunities)
110
- negative_patterns = [
111
- r'^how (do|to|can)',
112
- r'^why (do|is|are)',
113
- r'^what (is|are)',
114
- r'\bvs\b',
115
- r'\bopinion\b',
116
- r'\brant\b',
117
- ]
118
-
119
- score = 0.5 # Neutral baseline
120
-
121
- for pattern in positive_patterns:
122
- if re.search(pattern, title_lower):
123
- score += 0.1
124
-
125
- for pattern in negative_patterns:
126
- if re.search(pattern, title_lower):
127
- score -= 0.2
128
-
129
- return max(0.0, min(score, 1.0))
130
 
131
- def score(self, text: str, title: str = "") -> dict:
132
- """
133
- Calculate combined relevance score.
134
- Returns dict with individual and combined scores.
135
- """
136
- full_text = f"{title} {text}".strip()
137
-
138
- keyword_score = self.score_keywords(full_text)
139
- title_score = self.score_title_quality(title)
140
-
141
- # Combined: keywords 70%, title quality 30%
142
- # (Semantic score removed for lightweight version)
143
- combined = 0.7 * keyword_score + 0.3 * title_score
144
-
145
- return {
146
- "keyword_score": round(keyword_score, 3),
147
- "semantic_score": round(title_score, 3), # Renamed for backwards compat
148
- "relevance_score": round(combined, 3)
149
- }
 
1
  """
2
+ PIOE Relevance Scorer
3
 
4
+ Scores opportunities based on relevance to user interests.
5
+ Uses sentence-transformers for semantic similarity.
6
  """
7
+ from sentence_transformers import SentenceTransformer
8
+ import numpy as np
9
  from typing import Optional
 
 
 
 
10
 
11
 
12
  class RelevanceScorer:
13
  """
14
+ Scores opportunities for relevance using embeddings.
15
 
16
+ Uses a lightweight sentence transformer model optimized for:
17
+ - Fast inference
18
+ - Low memory (works on HF Spaces 16GB)
19
+ - Good semantic understanding
20
  """
21
 
22
+ # Using a smaller, efficient model that works well on limited resources
23
+ MODEL_NAME = "all-MiniLM-L6-v2" # 80MB, fast, good quality
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # Keywords that indicate high-value opportunities
26
+ HIGH_VALUE_KEYWORDS = [
27
+ "computer vision", "robotics", "ROS", "PyTorch", "TensorFlow",
28
+ "machine learning", "deep learning", "neural network",
29
+ "internship", "fellowship", "scholarship", "grant", "funding",
30
+ "hackathon", "competition", "challenge", "bounty",
31
+ "research assistant", "PhD", "postdoc", "hiring",
32
+ "AI", "artificial intelligence", "data science", "NLP",
33
+ "startup", "seed", "Series A", "early-stage"
34
+ ]
 
 
 
 
 
 
 
 
35
 
36
+ def __init__(self, custom_keywords: Optional[list[str]] = None):
37
+ """Initialize the scorer with optional custom keywords."""
38
+ self._model = None # Lazy load to save memory
39
+ self.keywords = custom_keywords or self.HIGH_VALUE_KEYWORDS
40
+
41
+ @property
42
+ def model(self):
43
+ """Lazy load model only when needed."""
44
+ if self._model is None:
45
+ print("Loading sentence transformer model...")
46
+ self._model = SentenceTransformer(self.MODEL_NAME)
47
+ print("Model loaded.")
48
+ return self._model
49
+
50
+ def score(self, text: str, title: str = "") -> dict:
51
  """
52
+ Score an opportunity for relevance.
53
+
54
+ Returns dict with:
55
+ - relevance_score: 0.0 to 1.0
56
+ - keyword_matches: list of matched keywords
57
+ - method: scoring method used
58
  """
59
+ full_text = f"{title} {text}".lower()
60
+
61
+ # Method 1: Keyword matching (fast, always works)
62
+ keyword_score, matches = self._keyword_score(full_text)
63
+
64
+ # If keyword score is high enough, use it (saves embedding computation)
65
+ if keyword_score >= 0.5:
66
+ return {
67
+ "relevance_score": min(keyword_score, 1.0),
68
+ "keyword_matches": matches,
69
+ "method": "keywords"
70
+ }
71
+
72
+ # Method 2: For keyword scores below 0.5, blend in semantic similarity (60% keyword, 40% semantic)
73
+ try:
74
+ semantic_score = self._semantic_score(full_text)
75
+ combined_score = 0.6 * keyword_score + 0.4 * semantic_score
76
+
77
+ return {
78
+ "relevance_score": min(combined_score, 1.0),
79
+ "keyword_matches": matches,
80
+ "semantic_score": semantic_score,
81
+ "method": "hybrid"
82
+ }
83
+ except Exception as e:
84
+ # Fall back to keyword-only if embedding fails
85
+ print(f"Semantic scoring failed: {e}")
86
+ return {
87
+ "relevance_score": keyword_score,
88
+ "keyword_matches": matches,
89
+ "method": "keywords_fallback"
90
+ }
91
+
92
+ def _keyword_score(self, text: str) -> tuple[float, list[str]]:
93
+ """Score based on keyword matching."""
94
+ matches = []
95
 
96
+ for keyword in self.keywords:
97
+ if keyword.lower() in text:
98
+ matches.append(keyword)
99
 
100
+ # More matches = higher score
101
+ if not matches:
102
+ return 0.1, []
 
 
103
 
104
+ # Linear in match count: 0.3 base plus 0.15 per match, capped at 1.0
105
+ score = min(0.3 + (len(matches) * 0.15), 1.0)
106
+ return score, matches
107
+
108
+ def _semantic_score(self, text: str) -> float:
109
+ """Score based on semantic similarity to ideal opportunities."""
110
+ # Create an "ideal opportunity" embedding
111
+ ideal_text = " ".join(self.keywords[:10]) # Use top keywords as reference
112
+
113
+ # Get embeddings
114
+ text_embedding = self.model.encode(text[:500]) # Limit text length
115
+ ideal_embedding = self.model.encode(ideal_text)
116
+
117
+ # Cosine similarity
118
+ similarity = np.dot(text_embedding, ideal_embedding) / (
119
+ np.linalg.norm(text_embedding) * np.linalg.norm(ideal_embedding)
120
  )
121
 
122
+ # Map cosine similarity from [-1, 1] linearly onto the 0-1 range
123
+ return float((similarity + 1) / 2)
 
 
 
124
 
125
+ def get_embedding(self, text: str) -> np.ndarray:
126
+ """Get embedding for a text (used by novelty detector)."""
127
+ return self.model.encode(text[:1000])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ def batch_score(self, opportunities: list[dict]) -> list[dict]:
130
+ """Score multiple opportunities efficiently."""
131
+ results = []
132
+ for opp in opportunities:
133
+ score = self.score(
134
+ opp.get("raw_text", ""),
135
+ opp.get("title", "")
136
+ )
137
+ results.append({
138
+ **opp,
139
+ "relevance_score": score["relevance_score"],
140
+ "keyword_matches": score.get("keyword_matches", [])
141
+ })
142
+ return results
 
 
 
 
 
requirements.txt CHANGED
@@ -1,11 +1,11 @@
1
  # PIOE 2.0 - Personal Intelligence & Opportunity Engine
2
- # Lightweight version for Render free tier (512MB limit)
3
 
4
  # Web Framework
5
  fastapi
6
  uvicorn[standard]
7
 
8
- # Database (SQLite local, PostgreSQL for production)
9
  sqlalchemy
10
  psycopg2-binary
11
 
@@ -19,7 +19,8 @@ aiofiles
19
  # Scheduling
20
  apscheduler
21
 
22
- # AI & ML (Lightweight - no sentence-transformers to save memory)
 
23
  google-generativeai
24
  numpy
25
 
 
1
  # PIOE 2.0 - Personal Intelligence & Opportunity Engine
2
+ # Optimized for Hugging Face Spaces deployment
3
 
4
  # Web Framework
5
  fastapi
6
  uvicorn[standard]
7
 
8
+ # Database
9
  sqlalchemy
10
  psycopg2-binary
11
 
 
19
  # Scheduling
20
  apscheduler
21
 
22
+ # AI & ML (sentence-transformers works on HF Spaces - 16GB RAM)
23
+ sentence-transformers
24
  google-generativeai
25
  numpy
26