Spaces:

Tremick
/

PIOE

Runtime error

PIOE / backend / intelligence / classifier.py

B1acB1rd

Enhanced classifier for better hackathon/internship/scholarship categorization

54b7ea7 4 months ago

12 kB

	"""
	PIOE Opportunity Classifier

	Classifies opportunities into categories using rules and LLM.
	"""
	from ..models import OpportunityCategory, Domain


	class OpportunityClassifier:
	    """
	    Classifies opportunities into categories and domains.

	    Classification order used by ``classify``:

	    1. Source-based rules (``classify_by_source``) - the source type/name
	       decides the category outright when it is recognised.
	    2. Keyword rules (``classify_by_rules``) - substring matching against
	       ``CATEGORY_PATTERNS`` and ``DOMAIN_PATTERNS``.
	    3. Optional LLM fallback for low-confidence results with no source match.
	    """

	    # Source type to category mapping (high priority).
	    # Keys are compared against the lower-cased ``source_type``.
	    SOURCE_CATEGORY_MAP = {
	        "arxiv": OpportunityCategory.RESEARCH,
	        "github": OpportunityCategory.OPEN_SOURCE,
	        "superteam": OpportunityCategory.BOUNTY,
	        "grant_platform": OpportunityCategory.GRANT,
	        "gov_portal": OpportunityCategory.GRANT,
	    }

	    # Keyword patterns for each category (expanded for better matching).
	    # Patterns are matched as plain substrings of the lower-cased text; the
	    # trailing spaces/commas in entries such as "intern " are deliberate and
	    # act as a crude word boundary.
	    # Dict order matters: on a tie in match count the earlier category wins.
	    CATEGORY_PATTERNS = {
	        OpportunityCategory.SCHOLARSHIP: [
	            "scholarship", "tuition", "financial aid", "merit award", "bursary",
	            "study abroad", "educational grant", "student funding", "tuition waiver",
	            "fully funded", "partial funding", "academic scholarship", "need-based",
	            "scholars4dev", "profellow", "scholars program", "student scholarship",
	            "undergraduate scholarship", "graduate scholarship", "phd funding",
	            "masters scholarship", "study opportunity", "education funding"
	        ],
	        OpportunityCategory.FELLOWSHIP: [
	            "fellowship", "fellow program", "research fellow", "visiting fellow",
	            "postdoctoral fellowship", "predoctoral fellowship", "faculty fellowship",
	            "leadership fellowship", "professional fellowship", "policy fellowship",
	            "mandela rhodes", "chevening", "fulbright", "rhodes scholar", "gates cambridge"
	        ],
	        OpportunityCategory.INTERNSHIP: [
	            "internship", "intern ", "intern,", "interns ", "summer program", "co-op",
	            "summer internship", "fall internship", "spring internship", "winter internship",
	            "student intern", "undergraduate intern", "graduate intern",
	            "internship program", "intern position", "paid internship", "remote internship",
	            "virtual internship", "intern opportunity", "entry level", "early career",
	            "new grad", "new graduate", "recent graduate", "campus hire", "university hire"
	        ],
	        OpportunityCategory.JOB: [
	            "hiring", "job opening", "position available", "career opportunity",
	            "we're looking for", "full-time", "remote job", "we are hiring",
	            "join our team", "senior engineer", "staff engineer", "principal engineer",
	            "software developer", "data scientist", "ml engineer", "ai engineer",
	            "open position", "job posting", "employment", "role available"
	        ],
	        OpportunityCategory.RESEARCH: [
	            "research assistant", "ra position", "research opportunity", "arxiv",
	            "abstract:", "we present", "we propose", "our method", "research paper",
	            "phd position", "postdoc position", "research position", "lab assistant",
	            "research internship", "research program"
	        ],
	        OpportunityCategory.HACKATHON: [
	            "hackathon", "buildathon", "hackers wanted", "hack day", "hackerearth",
	            "devpost", "mlh ", "major league hacking", "eth global", "ethglobal",
	            "hackathon.io", "coding competition", "code sprint", "codeathon",
	            "24 hour", "48 hour", "weekend hack", "virtual hackathon",
	            "prize pool", "grand prize", "first prize", "finalist",
	            "submit your", "build something", "demo day", "pitch day"
	        ],
	        OpportunityCategory.COMPETITION: [
	            "competition", "challenge", "contest", "kaggle", "data challenge",
	            "ai challenge", "ml competition", "coding contest",
	            "programming competition", "algorithm contest", "competitive programming",
	            "topcoder", "codeforces", "leetcode contest"
	        ],
	        OpportunityCategory.GRANT: [
	            "grant program", "grant application", "grant funding", "grant deadline",
	            "grant opportunity", "project grant", "research grant", "innovation grant",
	            "startup grant", "seed grant", "small grant", "micro grant",
	            "grant call", "funding opportunity", "request for proposals", "rfp",
	            "government grant", "foundation grant", "apply for grant"
	        ],
	        OpportunityCategory.ECOSYSTEM_GRANT: [
	            "ecosystem grant", "web3 grant", "blockchain grant", "crypto grant",
	            "solana grant", "ethereum grant", "polygon grant", "near grant",
	            "foundation grant", "protocol grant", "developer grant",
	            "builder grant", "ecosystem fund", "developer fund"
	        ],
	        OpportunityCategory.CONFERENCE: [
	            "conference", "call for papers", "summit", "symposium", "workshop",
	            "speaker application", "paper submission", "abstract submission"
	        ],
	        OpportunityCategory.OPEN_SOURCE: [
	            "open source", "gsoc", "google summer of code", "outreachy",
	            "contributor wanted", "hacktoberfest", "open source contribution",
	            "oss program", "open source internship"
	        ],
	        OpportunityCategory.INVESTMENT: [
	            "funding round", "series a", "series b", "vc funding", "raised $",
	            "pre-seed", "seed round", "angel investment", "startup funding"
	        ],
	        OpportunityCategory.BOUNTY: [
	            "bounty", "bug bounty", "earn reward", "usdc reward", "sol reward",
	            "crypto bounty", "superteam", "earn crypto", "bounty board"
	        ],
	    }

	    # Domain patterns (also plain substring matches).
	    # NOTE(review): very short entries such as "ai" and "ros" also match inside
	    # unrelated words ("available", "across"); kept as-is because switching to
	    # word boundaries would break intentional prefix matches like "robot" in
	    # "robotics". Worth revisiting with real data.
	    DOMAIN_PATTERNS = {
	        Domain.COMPUTER_VISION: [
	            "computer vision", "image", "visual", "object detection", "segmentation", "opencv"
	        ],
	        Domain.ROBOTICS: [
	            "robot", "ros", "autonomous", "manipulation", "navigation"
	        ],
	        Domain.AI: [
	            "ai", "artificial intelligence", "machine learning", "deep learning",
	            "neural network", "llm", "transformer", "gpt"
	        ],
	        Domain.FINANCE: [
	            "finance", "fintech", "trading", "investment", "stock", "quantitative"
	        ],
	        Domain.CRYPTO: [
	            "crypto", "blockchain", "web3", "defi", "solana", "ethereum", "nft"
	        ],
	        Domain.ACADEMIA: [
	            "research", "phd", "postdoc", "university", "academic", "professor"
	        ],
	    }

	    def classify_by_source(
	        self,
	        source_type: str,
	        source_name: str = "",
	    ) -> OpportunityCategory | None:
	        """
	        Classify primarily by source type.

	        ``source_type`` is looked up (case-insensitively) in
	        ``SOURCE_CATEGORY_MAP`` first; if it is not there, ``source_name`` is
	        checked for known substrings in a fixed order. ``None`` / empty values
	        are treated as empty strings.

	        Returns category or None if source doesn't determine category.
	        """
	        source_lower = (source_type or "").lower()
	        source_name_lower = (source_name or "").lower()

	        # Check direct source mapping
	        if source_lower in self.SOURCE_CATEGORY_MAP:
	            return self.SOURCE_CATEGORY_MAP[source_lower]

	        # === Scholarship/Fellowship Sources ===
	        if any(x in source_name_lower for x in ["profellow", "scholars4dev", "opportunity desk"]):
	            # Check if it's specifically a fellowship or scholarship
	            if "fellowship" in source_name_lower:
	                return OpportunityCategory.FELLOWSHIP
	            return OpportunityCategory.SCHOLARSHIP

	        # === Internship Sources ===
	        # Substring match: "intern" also hits names containing e.g. "international".
	        if any(x in source_name_lower for x in ["intern", "entry level", "new grad"]):
	            return OpportunityCategory.INTERNSHIP

	        # === Hackathon Sources ===
	        if any(x in source_name_lower for x in ["devpost", "devfolio", "mlh", "hackathon", "ethglobal"]):
	            return OpportunityCategory.HACKATHON

	        # === Research Sources ===
	        if "arxiv" in source_name_lower:
	            return OpportunityCategory.RESEARCH

	        # === Open Source Sources ===
	        if "github" in source_name_lower:
	            return OpportunityCategory.OPEN_SOURCE

	        # === Job Sources ===
	        if any(x in source_name_lower for x in ["remotive", "arbeitnow", "themuse", "adzuna", "jooble", "linkedin"]):
	            # NOTE(review): a title-based "intern" override was intended here,
	            # but this method never receives the title, so job boards always
	            # map to JOB. Implement in the caller if still wanted.
	            return OpportunityCategory.JOB
	        if "hacker news" in source_name_lower and "jobs" in source_name_lower:
	            return OpportunityCategory.JOB

	        # === Bounty/Ecosystem Sources ===
	        if "superteam" in source_name_lower:
	            return OpportunityCategory.BOUNTY

	        return None

	    def classify_by_rules(self, text: str) -> tuple[OpportunityCategory, Domain, float]:
	        """
	        Classify using keyword matching.

	        The category with the most matching patterns wins (the earliest entry
	        in ``CATEGORY_PATTERNS`` wins a tie). Confidence is 0.3 per matching
	        pattern, capped at 0.9. The domain is the single best-matching entry of
	        ``DOMAIN_PATTERNS``; if nothing matches, or several domains tie for the
	        top count, the domain is ``Domain.MIXED``.

	        Returns (category, domain, confidence). Empty text yields
	        ``(OpportunityCategory.OTHER, Domain.MIXED, 0.0)``.
	        """
	        if not text:
	            return OpportunityCategory.OTHER, Domain.MIXED, 0.0

	        text_lower = text.lower()

	        # Find the best-matching category. The match count is tracked on its
	        # own: comparing counts against the capped confidence (<= 0.9) let any
	        # later category with a single hit override a stronger earlier match.
	        category = OpportunityCategory.OTHER
	        best_matches = 0
	        for cat, patterns in self.CATEGORY_PATTERNS.items():
	            matches = sum(1 for p in patterns if p in text_lower)
	            if matches > best_matches:
	                category = cat
	                best_matches = matches
	        cat_confidence = min(best_matches * 0.3, 0.9)

	        # Count domain hits once, then pick a unique leader.
	        domain_counts = {
	            dom: sum(1 for p in patterns if p in text_lower)
	            for dom, patterns in self.DOMAIN_PATTERNS.items()
	        }
	        top_count = max(domain_counts.values(), default=0)
	        leaders = [dom for dom, count in domain_counts.items() if count == top_count]

	        # No match at all, or several domains tied for the lead -> mixed.
	        if top_count > 0 and len(leaders) == 1:
	            domain = leaders[0]
	        else:
	            domain = Domain.MIXED

	        return category, domain, cat_confidence

	    def classify(
	        self,
	        text: str,
	        title: str = "",
	        source_type: str = "",
	        source_name: str = "",
	        use_llm: bool = False,
	        llm_client=None
	    ) -> dict:
	        """
	        Classify opportunity with optional LLM enhancement.

	        ``title`` and ``text`` are joined and run through the keyword rules;
	        a recognised source overrides the keyword category with a fixed
	        confidence of 0.85. The LLM is consulted only when ``use_llm`` is set,
	        a client is supplied, there is no source match and the rule confidence
	        is below 0.5; its answer is used only if it reports a higher
	        confidence. Any exception from the LLM call is printed and the
	        rule-based result is returned instead.

	        Returns dict with category, domain, confidence, method
	        (method is one of "source", "rules" or "llm").
	        """
	        # Treat None like an empty string so it is not classified as "None".
	        full_text = f"{title or ''} {text or ''}".strip()

	        # PRIORITY 1: Source-based classification (most reliable)
	        source_category = self.classify_by_source(source_type, source_name)

	        # PRIORITY 2: Rule-based keyword matching
	        rule_category, domain, confidence = self.classify_by_rules(full_text)

	        # Use source category if available (overrides keyword matching)
	        if source_category:
	            category = source_category
	            confidence = 0.85  # High confidence for source-based
	            method = "source"
	        else:
	            category = rule_category
	            method = "rules"

	        # Use LLM for low-confidence or ambiguous cases (only if no source match)
	        if use_llm and llm_client and confidence < 0.5 and not source_category:
	            try:
	                llm_result = llm_client.classify(full_text)
	                if llm_result.get("confidence", 0) > confidence:
	                    return {
	                        "category": llm_result.get("category", category.value),
	                        "domain": llm_result.get("domain", domain.value),
	                        "confidence": llm_result.get("confidence", confidence),
	                        "method": "llm"
	                    }
	            except Exception as e:
	                # Best-effort: the LLM is optional, fall back to the rules result.
	                print(f"LLM classification failed: {e}")

	        return {
	            "category": category.value,
	            "domain": domain.value,
	            "confidence": confidence,
	            "method": method
	        }