"""
PIOE Opportunity Classifier
Classifies opportunities into categories using rules and LLM.
"""
from ..models import OpportunityCategory, Domain
class OpportunityClassifier:
    """
    Classifies opportunities into categories and domains.

    Classification is attempted in this order:

    1. Source-based: the source type / source name decides the category.
    2. Rule-based: keyword (substring) matching against the text.
    3. Optional LLM fallback, only when no source matched and the
       rule-based confidence is below 0.5.
    """

    # Source type to category mapping (high priority)
    SOURCE_CATEGORY_MAP = {
        "arxiv": OpportunityCategory.RESEARCH,
        "github": OpportunityCategory.OPEN_SOURCE,
        "superteam": OpportunityCategory.BOUNTY,
        "grant_platform": OpportunityCategory.GRANT,
        "gov_portal": OpportunityCategory.GRANT,
    }

    # Keyword patterns for each category (expanded for better matching).
    # Patterns are matched as plain lowercase substrings; some carry a
    # trailing space or comma (e.g. "intern ") to avoid matching longer words.
    CATEGORY_PATTERNS = {
        OpportunityCategory.SCHOLARSHIP: [
            "scholarship", "tuition", "financial aid", "merit award", "bursary",
            "study abroad", "educational grant", "student funding", "tuition waiver",
            "fully funded", "partial funding", "academic scholarship", "need-based",
            "scholars4dev", "profellow", "scholars program", "student scholarship",
            "undergraduate scholarship", "graduate scholarship", "phd funding",
            "masters scholarship", "study opportunity", "education funding",
        ],
        OpportunityCategory.FELLOWSHIP: [
            "fellowship", "fellow program", "research fellow", "visiting fellow",
            "postdoctoral fellowship", "predoctoral fellowship", "faculty fellowship",
            "leadership fellowship", "professional fellowship", "policy fellowship",
            "mandela rhodes", "chevening", "fulbright", "rhodes scholar", "gates cambridge",
        ],
        OpportunityCategory.INTERNSHIP: [
            "internship", "intern ", "intern,", "interns ", "summer program", "co-op",
            "summer internship", "fall internship", "spring internship", "winter internship",
            "student intern", "undergraduate intern", "graduate intern",
            "internship program", "intern position", "paid internship", "remote internship",
            "virtual internship", "intern opportunity", "entry level", "early career",
            "new grad", "new graduate", "recent graduate", "campus hire", "university hire",
        ],
        OpportunityCategory.JOB: [
            "hiring", "job opening", "position available", "career opportunity",
            "we're looking for", "full-time", "remote job", "we are hiring",
            "join our team", "senior engineer", "staff engineer", "principal engineer",
            "software developer", "data scientist", "ml engineer", "ai engineer",
            "open position", "job posting", "employment", "role available",
        ],
        OpportunityCategory.RESEARCH: [
            "research assistant", "ra position", "research opportunity", "arxiv",
            "abstract:", "we present", "we propose", "our method", "research paper",
            "phd position", "postdoc position", "research position", "lab assistant",
            "research internship", "research program",
        ],
        OpportunityCategory.HACKATHON: [
            "hackathon", "buildathon", "hackers wanted", "hack day", "hackerearth",
            "devpost", "mlh ", "major league hacking", "eth global", "ethglobal",
            "hackathon.io", "coding competition", "code sprint", "codeathon",
            "24 hour", "48 hour", "weekend hack", "virtual hackathon",
            "prize pool", "grand prize", "first prize", "finalist",
            "submit your", "build something", "demo day", "pitch day",
        ],
        OpportunityCategory.COMPETITION: [
            "competition", "challenge", "contest", "kaggle", "data challenge",
            "ai challenge", "ml competition", "coding contest",
            "programming competition", "algorithm contest", "competitive programming",
            "topcoder", "codeforces", "leetcode contest",
        ],
        OpportunityCategory.GRANT: [
            "grant program", "grant application", "grant funding", "grant deadline",
            "grant opportunity", "project grant", "research grant", "innovation grant",
            "startup grant", "seed grant", "small grant", "micro grant",
            "grant call", "funding opportunity", "request for proposals", "rfp",
            "government grant", "foundation grant", "apply for grant",
        ],
        OpportunityCategory.ECOSYSTEM_GRANT: [
            "ecosystem grant", "web3 grant", "blockchain grant", "crypto grant",
            "solana grant", "ethereum grant", "polygon grant", "near grant",
            "foundation grant", "protocol grant", "developer grant",
            "builder grant", "ecosystem fund", "developer fund",
        ],
        OpportunityCategory.CONFERENCE: [
            "conference", "call for papers", "summit", "symposium", "workshop",
            "speaker application", "paper submission", "abstract submission",
        ],
        OpportunityCategory.OPEN_SOURCE: [
            "open source", "gsoc", "google summer of code", "outreachy",
            "contributor wanted", "hacktoberfest", "open source contribution",
            "oss program", "open source internship",
        ],
        OpportunityCategory.INVESTMENT: [
            "funding round", "series a", "series b", "vc funding", "raised $",
            "pre-seed", "seed round", "angel investment", "startup funding",
        ],
        OpportunityCategory.BOUNTY: [
            "bounty", "bug bounty", "earn reward", "usdc reward", "sol reward",
            "crypto bounty", "superteam", "earn crypto", "bounty board",
        ],
    }

    # Domain patterns (matched as plain lowercase substrings)
    DOMAIN_PATTERNS = {
        Domain.COMPUTER_VISION: [
            "computer vision", "image", "visual", "object detection", "segmentation", "opencv",
        ],
        Domain.ROBOTICS: [
            "robot", "ros", "autonomous", "manipulation", "navigation",
        ],
        Domain.AI: [
            "ai", "artificial intelligence", "machine learning", "deep learning",
            "neural network", "llm", "transformer", "gpt",
        ],
        Domain.FINANCE: [
            "finance", "fintech", "trading", "investment", "stock", "quantitative",
        ],
        Domain.CRYPTO: [
            "crypto", "blockchain", "web3", "defi", "solana", "ethereum", "nft",
        ],
        Domain.ACADEMIA: [
            "research", "phd", "postdoc", "university", "academic", "professor",
        ],
    }

    @staticmethod
    def _count_matches(text_lower: str, patterns: list[str]) -> int:
        """Return how many of ``patterns`` occur as substrings of ``text_lower``."""
        return sum(1 for pattern in patterns if pattern in text_lower)

    def classify_by_source(self, source_type: str, source_name: str = "") -> OpportunityCategory | None:
        """
        Classify primarily by source type.

        Args:
            source_type: Source type key; compared case-insensitively against
                ``SOURCE_CATEGORY_MAP``. ``None``/empty is treated as "".
            source_name: Human-readable source name; checked
                case-insensitively for known substrings.

        Returns:
            The category implied by the source, or ``None`` if the source
            doesn't determine a category.
        """
        source_lower = (source_type or "").lower()
        source_name_lower = (source_name or "").lower()

        # Check direct source mapping
        if source_lower in self.SOURCE_CATEGORY_MAP:
            return self.SOURCE_CATEGORY_MAP[source_lower]

        # === Scholarship/Fellowship Sources ===
        if any(x in source_name_lower for x in ["profellow", "scholars4dev", "opportunity desk"]):
            # Check if it's specifically a fellowship or scholarship
            if "fellowship" in source_name_lower:
                return OpportunityCategory.FELLOWSHIP
            return OpportunityCategory.SCHOLARSHIP

        # === Internship Sources ===
        if any(x in source_name_lower for x in ["intern", "entry level", "new grad"]):
            return OpportunityCategory.INTERNSHIP

        # === Hackathon Sources ===
        if any(x in source_name_lower for x in ["devpost", "devfolio", "mlh", "hackathon", "ethglobal"]):
            return OpportunityCategory.HACKATHON

        # === Research Sources ===
        if "arxiv" in source_name_lower:
            return OpportunityCategory.RESEARCH

        # === Open Source Sources ===
        if "github" in source_name_lower:
            return OpportunityCategory.OPEN_SOURCE

        # === Job Sources ===
        if any(x in source_name_lower for x in ["remotive", "arbeitnow", "themuse", "adzuna", "jooble", "linkedin"]):
            # NOTE: the opportunity title is not available to this method, so a
            # job-board posting is always reported as JOB here, even when its
            # title mentions an internship.
            return OpportunityCategory.JOB
        if "hacker news" in source_name_lower and "jobs" in source_name_lower:
            return OpportunityCategory.JOB

        # === Bounty/Ecosystem Sources ===
        if "superteam" in source_name_lower:
            return OpportunityCategory.BOUNTY

        return None

    def classify_by_rules(self, text: str) -> tuple[OpportunityCategory, Domain, float]:
        """
        Classify using keyword matching.

        The category with the most keyword hits wins; ties go to the category
        listed first in ``CATEGORY_PATTERNS``. Confidence is 0.3 per hit of the
        winning category, capped at 0.9. The domain is the one with strictly
        the most hits; if no domain matches, or several tie for the top count,
        the domain is ``Domain.MIXED``.

        Args:
            text: Free text to classify. Matching is case-insensitive.

        Returns:
            ``(category, domain, confidence)``. Empty text yields
            ``(OpportunityCategory.OTHER, Domain.MIXED, 0.0)``.
        """
        if not text:
            return OpportunityCategory.OTHER, Domain.MIXED, 0.0

        text_lower = text.lower()

        # Find matching category. Compare hit counts with hit counts (not with
        # the capped confidence), so a category with more hits is never
        # displaced by a later category with fewer.
        category = OpportunityCategory.OTHER
        best_matches = 0
        for cat, patterns in self.CATEGORY_PATTERNS.items():
            matches = self._count_matches(text_lower, patterns)
            if matches > best_matches:
                category = cat
                best_matches = matches
        cat_confidence = min(best_matches * 0.3, 0.9)

        # Find matching domain: count hits once per domain.
        domain_counts = {
            dom: self._count_matches(text_lower, patterns)
            for dom, patterns in self.DOMAIN_PATTERNS.items()
        }
        top_count = max(domain_counts.values(), default=0)
        top_domains = [dom for dom, count in domain_counts.items() if count == top_count]

        # No hits at all, or several domains tied for the lead: keep as mixed.
        if top_count > 0 and len(top_domains) == 1:
            domain = top_domains[0]
        else:
            domain = Domain.MIXED

        return category, domain, cat_confidence

    def classify(
        self,
        text: str,
        title: str = "",
        source_type: str = "",
        source_name: str = "",
        use_llm: bool = False,
        llm_client=None,
    ) -> dict:
        """
        Classify opportunity with optional LLM enhancement.

        Args:
            text: Body text of the opportunity.
            title: Title; prepended to ``text`` before keyword matching.
            source_type: Passed to ``classify_by_source``.
            source_name: Passed to ``classify_by_source``.
            use_llm: Allow the LLM fallback for low-confidence results.
            llm_client: Object whose ``classify(text)`` is called for the
                fallback; its result is read with ``.get("category")``,
                ``.get("domain")`` and ``.get("confidence")``.

        Returns:
            Dict with keys ``category``, ``domain``, ``confidence`` and
            ``method`` (``"source"``, ``"rules"`` or ``"llm"``).
        """
        full_text = f"{title} {text}".strip()

        # PRIORITY 1: Source-based classification (most reliable)
        source_category = self.classify_by_source(source_type, source_name)

        # PRIORITY 2: Rule-based keyword matching (always supplies the domain)
        rule_category, domain, confidence = self.classify_by_rules(full_text)

        # Use source category if available (overrides keyword matching)
        if source_category:
            category = source_category
            confidence = 0.85  # High confidence for source-based
            method = "source"
        else:
            category = rule_category
            method = "rules"

        # Use LLM for low-confidence or ambiguous cases (only if no source match)
        if use_llm and llm_client and confidence < 0.5 and not source_category:
            try:
                llm_result = llm_client.classify(full_text)
                # Only accept the LLM answer when it is more confident than the rules.
                if llm_result.get("confidence", 0) > confidence:
                    return {
                        "category": llm_result.get("category", category.value),
                        "domain": llm_result.get("domain", domain.value),
                        "confidence": llm_result.get("confidence", confidence),
                        "method": "llm",
                    }
            except Exception as e:
                # Best-effort: any LLM failure falls back to the rule-based result.
                print(f"LLM classification failed: {e}")

        return {
            "category": category.value,
            "domain": domain.value,
            "confidence": confidence,
            "method": method,
        }
|