# PIOE/backend/intelligence/classifier.py
# B1acB1rd
# Enhanced classifier for better hackathon/internship/scholarship categorization
# 54b7ea7
"""
PIOE Opportunity Classifier
Classifies opportunities into categories using rules and LLM.
"""
import re

from ..models import OpportunityCategory, Domain
class OpportunityClassifier:
    """
    Classifies opportunities into categories and domains.

    Classification runs in up to three stages:

    1. Source-based: the source type / source name decides the category
       outright when it is recognised (see ``classify_by_source``).
    2. Rule-based: keyword counting over the title and body text
       (see ``classify_by_rules``).
    3. LLM (optional): consulted only when no source matched and the
       rule-based confidence is below 0.5.

    All keyword patterns are matched as plain lower-cased substrings, so
    short patterns can hit inside longer words (for example ``"ai"`` is a
    substring of ``"available"``).
    """

    # Source type to category mapping (high priority)
    SOURCE_CATEGORY_MAP = {
        "arxiv": OpportunityCategory.RESEARCH,
        "github": OpportunityCategory.OPEN_SOURCE,
        "superteam": OpportunityCategory.BOUNTY,
        "grant_platform": OpportunityCategory.GRANT,
        "gov_portal": OpportunityCategory.GRANT,
    }

    # Keyword patterns for each category (expanded for better matching).
    # Dict order matters: when two categories have the same number of
    # keyword hits, the one listed first wins.
    CATEGORY_PATTERNS = {
        OpportunityCategory.SCHOLARSHIP: [
            "scholarship", "tuition", "financial aid", "merit award", "bursary",
            "study abroad", "educational grant", "student funding", "tuition waiver",
            "fully funded", "partial funding", "academic scholarship", "need-based",
            "scholars4dev", "profellow", "scholars program", "student scholarship",
            "undergraduate scholarship", "graduate scholarship", "phd funding",
            "masters scholarship", "study opportunity", "education funding"
        ],
        OpportunityCategory.FELLOWSHIP: [
            "fellowship", "fellow program", "research fellow", "visiting fellow",
            "postdoctoral fellowship", "predoctoral fellowship", "faculty fellowship",
            "leadership fellowship", "professional fellowship", "policy fellowship",
            "mandela rhodes", "chevening", "fulbright", "rhodes scholar", "gates cambridge"
        ],
        OpportunityCategory.INTERNSHIP: [
            "internship", "intern ", "intern,", "interns ", "summer program", "co-op",
            "summer internship", "fall internship", "spring internship", "winter internship",
            "student intern", "undergraduate intern", "graduate intern",
            "internship program", "intern position", "paid internship", "remote internship",
            "virtual internship", "intern opportunity", "entry level", "early career",
            "new grad", "new graduate", "recent graduate", "campus hire", "university hire"
        ],
        OpportunityCategory.JOB: [
            "hiring", "job opening", "position available", "career opportunity",
            "we're looking for", "full-time", "remote job", "we are hiring",
            "join our team", "senior engineer", "staff engineer", "principal engineer",
            "software developer", "data scientist", "ml engineer", "ai engineer",
            "open position", "job posting", "employment", "role available"
        ],
        OpportunityCategory.RESEARCH: [
            "research assistant", "ra position", "research opportunity", "arxiv",
            "abstract:", "we present", "we propose", "our method", "research paper",
            "phd position", "postdoc position", "research position", "lab assistant",
            "research internship", "research program"
        ],
        OpportunityCategory.HACKATHON: [
            "hackathon", "buildathon", "hackers wanted", "hack day", "hackerearth",
            "devpost", "mlh ", "major league hacking", "eth global", "ethglobal",
            "hackathon.io", "coding competition", "code sprint", "codeathon",
            "24 hour", "48 hour", "weekend hack", "virtual hackathon",
            "prize pool", "grand prize", "first prize", "finalist",
            "submit your", "build something", "demo day", "pitch day"
        ],
        OpportunityCategory.COMPETITION: [
            "competition", "challenge", "contest", "kaggle", "data challenge",
            "ai challenge", "ml competition", "coding contest",
            "programming competition", "algorithm contest", "competitive programming",
            "topcoder", "codeforces", "leetcode contest"
        ],
        OpportunityCategory.GRANT: [
            "grant program", "grant application", "grant funding", "grant deadline",
            "grant opportunity", "project grant", "research grant", "innovation grant",
            "startup grant", "seed grant", "small grant", "micro grant",
            "grant call", "funding opportunity", "request for proposals", "rfp",
            "government grant", "foundation grant", "apply for grant"
        ],
        OpportunityCategory.ECOSYSTEM_GRANT: [
            "ecosystem grant", "web3 grant", "blockchain grant", "crypto grant",
            "solana grant", "ethereum grant", "polygon grant", "near grant",
            "foundation grant", "protocol grant", "developer grant",
            "builder grant", "ecosystem fund", "developer fund"
        ],
        OpportunityCategory.CONFERENCE: [
            "conference", "call for papers", "summit", "symposium", "workshop",
            "speaker application", "paper submission", "abstract submission"
        ],
        OpportunityCategory.OPEN_SOURCE: [
            "open source", "gsoc", "google summer of code", "outreachy",
            "contributor wanted", "hacktoberfest", "open source contribution",
            "oss program", "open source internship"
        ],
        OpportunityCategory.INVESTMENT: [
            "funding round", "series a", "series b", "vc funding", "raised $",
            "pre-seed", "seed round", "angel investment", "startup funding"
        ],
        OpportunityCategory.BOUNTY: [
            "bounty", "bug bounty", "earn reward", "usdc reward", "sol reward",
            "crypto bounty", "superteam", "earn crypto", "bounty board"
        ],
    }

    # Domain patterns
    DOMAIN_PATTERNS = {
        Domain.COMPUTER_VISION: [
            "computer vision", "image", "visual", "object detection", "segmentation", "opencv"
        ],
        Domain.ROBOTICS: [
            "robot", "ros", "autonomous", "manipulation", "navigation"
        ],
        Domain.AI: [
            "ai", "artificial intelligence", "machine learning", "deep learning",
            "neural network", "llm", "transformer", "gpt"
        ],
        Domain.FINANCE: [
            "finance", "fintech", "trading", "investment", "stock", "quantitative"
        ],
        Domain.CRYPTO: [
            "crypto", "blockchain", "web3", "defi", "solana", "ethereum", "nft"
        ],
        Domain.ACADEMIA: [
            "research", "phd", "postdoc", "university", "academic", "professor"
        ],
    }

    # Whole-word match for "intern", "interns", "internship", "internships".
    # Word boundaries keep "international" / "internal" from matching.
    _INTERN_TITLE_RE = re.compile(r"\bintern(?:s|ships?)?\b")

    def _job_board_category(self, title: str) -> OpportunityCategory:
        """Return INTERNSHIP if the posting title mentions an intern role, else JOB."""
        if self._INTERN_TITLE_RE.search((title or "").lower()):
            return OpportunityCategory.INTERNSHIP
        return OpportunityCategory.JOB

    def classify_by_source(
        self,
        source_type: str,
        source_name: str = "",
        title: str = "",
    ) -> OpportunityCategory | None:
        """
        Classify primarily by source type.

        Args:
            source_type: Source kind; looked up (case-insensitively) in
                ``SOURCE_CATEGORY_MAP``. ``None`` is treated as empty.
            source_name: Human-readable source name; checked for known
                substrings. ``None`` is treated as empty.
            title: Optional posting title. Only used for job-board sources,
                where a title mentioning an intern role yields INTERNSHIP
                instead of JOB.

        Returns:
            The category, or None if the source doesn't determine it.
        """
        source_lower = (source_type or "").lower()
        source_name_lower = (source_name or "").lower()

        # Check direct source mapping
        if source_lower in self.SOURCE_CATEGORY_MAP:
            return self.SOURCE_CATEGORY_MAP[source_lower]

        # === Scholarship/Fellowship Sources ===
        if any(x in source_name_lower for x in ["profellow", "scholars4dev", "opportunity desk"]):
            # Check if it's specifically a fellowship or scholarship
            if "fellowship" in source_name_lower:
                return OpportunityCategory.FELLOWSHIP
            return OpportunityCategory.SCHOLARSHIP

        # === Internship Sources ===
        # Deliberately a substring test on the source name (not the
        # whole-word regex) so compound names containing "intern" still match.
        if any(x in source_name_lower for x in ["intern", "entry level", "new grad"]):
            return OpportunityCategory.INTERNSHIP

        # === Hackathon Sources ===
        if any(x in source_name_lower for x in ["devpost", "devfolio", "mlh", "hackathon", "ethglobal"]):
            return OpportunityCategory.HACKATHON

        # === Research Sources ===
        if "arxiv" in source_name_lower:
            return OpportunityCategory.RESEARCH

        # === Open Source Sources ===
        if "github" in source_name_lower:
            return OpportunityCategory.OPEN_SOURCE

        # === Job Sources ===
        if any(x in source_name_lower for x in ["remotive", "arbeitnow", "themuse", "adzuna", "jooble", "linkedin"]):
            # But if "intern" is in the title, it's an internship
            return self._job_board_category(title)
        if "hacker news" in source_name_lower and "jobs" in source_name_lower:
            return self._job_board_category(title)

        # === Bounty/Ecosystem Sources ===
        if "superteam" in source_name_lower:
            return OpportunityCategory.BOUNTY

        return None

    def classify_by_rules(self, text: str) -> tuple[OpportunityCategory, Domain, float]:
        """
        Classify using keyword matching.

        The category with the most keyword hits wins; ties go to the
        category listed first in ``CATEGORY_PATTERNS``. Confidence is
        ``0.3`` per hit, capped at ``0.9``. The domain is the single
        best-matching entry of ``DOMAIN_PATTERNS``; if nothing matches, or
        several domains tie for the top count, the domain is ``MIXED``.

        Returns:
            (category, domain, confidence). Empty text yields
            ``(OTHER, MIXED, 0.0)``.
        """
        if not text:
            return OpportunityCategory.OTHER, Domain.MIXED, 0.0

        text_lower = text.lower()

        # Find matching category. The best *count* is tracked separately
        # from the confidence: comparing a raw count against the capped
        # confidence (<= 0.9) would let any later category with a single
        # hit override an earlier one with many hits.
        category = OpportunityCategory.OTHER
        best_matches = 0
        for cat, patterns in self.CATEGORY_PATTERNS.items():
            matches = sum(1 for p in patterns if p in text_lower)
            if matches > best_matches:
                category = cat
                best_matches = matches
        cat_confidence = min(best_matches * 0.3, 0.9)

        # Find matching domain: count hits once per domain.
        domain_counts = {
            dom: sum(1 for p in patterns if p in text_lower)
            for dom, patterns in self.DOMAIN_PATTERNS.items()
        }
        top_count = max(domain_counts.values(), default=0)

        domain = Domain.MIXED
        if top_count > 0:
            leaders = [d for d, c in domain_counts.items() if c == top_count]
            # If multiple domains match equally well, keep as mixed
            if len(leaders) == 1:
                domain = leaders[0]

        return category, domain, cat_confidence

    def classify(
        self,
        text: str,
        title: str = "",
        source_type: str = "",
        source_name: str = "",
        use_llm: bool = False,
        llm_client=None,
    ) -> dict:
        """
        Classify opportunity with optional LLM enhancement.

        Args:
            text: Body text of the opportunity.
            title: Title; prepended to ``text`` for keyword matching and
                used to tell internships from jobs on job-board sources.
            source_type: Passed to ``classify_by_source``.
            source_name: Passed to ``classify_by_source``.
            use_llm: Allow the LLM fallback.
            llm_client: Object exposing ``classify(text)`` that returns a
                dict; its ``category`` / ``domain`` / ``confidence`` keys
                are read with ``.get``.

        Returns:
            Dict with ``category``, ``domain``, ``confidence`` and
            ``method`` (``"source"``, ``"rules"`` or ``"llm"``). For the
            ``"llm"`` method the category/domain values are returned as
            the client supplied them.
        """
        # Treat None as empty so the literal "None" never leaks into the
        # text that keywords are matched against.
        full_text = f"{title or ''} {text or ''}".strip()

        # PRIORITY 1: Source-based classification (most reliable)
        source_category = self.classify_by_source(source_type, source_name, title)

        # PRIORITY 2: Rule-based keyword matching (always run: it also
        # supplies the domain when the source decides the category)
        rule_category, domain, confidence = self.classify_by_rules(full_text)

        # Use source category if available (overrides keyword matching)
        if source_category is not None:
            category = source_category
            confidence = 0.85  # High confidence for source-based
            method = "source"
        else:
            category = rule_category
            method = "rules"

        # Use LLM for low-confidence or ambiguous cases (only if no source match)
        if use_llm and llm_client and confidence < 0.5 and source_category is None:
            try:
                llm_result = llm_client.classify(full_text)
                if llm_result.get("confidence", 0) > confidence:
                    return {
                        "category": llm_result.get("category", category.value),
                        "domain": llm_result.get("domain", domain.value),
                        "confidence": llm_result.get("confidence", confidence),
                        "method": "llm"
                    }
            except Exception as e:
                # Best-effort: fall back to the rule-based result below.
                print(f"LLM classification failed: {e}")

        return {
            "category": category.value,
            "domain": domain.value,
            "confidence": confidence,
            "method": method
        }