"""
PIOE Opportunity Classifier

Classifies opportunities into categories and domains using source hints,
keyword rules, and an optional LLM fallback.
"""
import logging
import re

from ..models import OpportunityCategory, Domain
|
|
|
|
class OpportunityClassifier:
    """
    Classify opportunities into a category and a domain.

    Classification is layered:

    1. ``classify_by_source`` - the source type/name alone may determine the
       category (e.g. an arXiv feed is always research).
    2. ``classify_by_rules`` - keyword matching over the title and body text.
    3. Optional LLM fallback in ``classify`` when the rule confidence is low
       and the source gave no answer.
    """

    # Source types whose category is determined purely by where the item
    # came from. Keys are compared against the lower-cased ``source_type``.
    SOURCE_CATEGORY_MAP = {
        "arxiv": OpportunityCategory.RESEARCH,
        "github": OpportunityCategory.OPEN_SOURCE,
        "superteam": OpportunityCategory.BOUNTY,
        "grant_platform": OpportunityCategory.GRANT,
        "gov_portal": OpportunityCategory.GRANT,
    }

    # Lower-case keywords per category. Insertion order matters: when two
    # categories have the same number of hits, the one listed first wins.
    # Entries with a trailing space/comma (e.g. "intern ") are deliberate
    # word-boundary approximations and are matched as plain substrings.
    CATEGORY_PATTERNS = {
        OpportunityCategory.SCHOLARSHIP: [
            "scholarship", "tuition", "financial aid", "merit award", "bursary",
            "study abroad", "educational grant", "student funding", "tuition waiver",
            "fully funded", "partial funding", "academic scholarship", "need-based",
            "scholars4dev", "profellow", "scholars program", "student scholarship",
            "undergraduate scholarship", "graduate scholarship", "phd funding",
            "masters scholarship", "study opportunity", "education funding"
        ],
        OpportunityCategory.FELLOWSHIP: [
            "fellowship", "fellow program", "research fellow", "visiting fellow",
            "postdoctoral fellowship", "predoctoral fellowship", "faculty fellowship",
            "leadership fellowship", "professional fellowship", "policy fellowship",
            "mandela rhodes", "chevening", "fulbright", "rhodes scholar", "gates cambridge"
        ],
        OpportunityCategory.INTERNSHIP: [
            "internship", "intern ", "intern,", "interns ", "summer program", "co-op",
            "summer internship", "fall internship", "spring internship", "winter internship",
            "student intern", "undergraduate intern", "graduate intern",
            "internship program", "intern position", "paid internship", "remote internship",
            "virtual internship", "intern opportunity", "entry level", "early career",
            "new grad", "new graduate", "recent graduate", "campus hire", "university hire"
        ],
        OpportunityCategory.JOB: [
            "hiring", "job opening", "position available", "career opportunity",
            "we're looking for", "full-time", "remote job", "we are hiring",
            "join our team", "senior engineer", "staff engineer", "principal engineer",
            "software developer", "data scientist", "ml engineer", "ai engineer",
            "open position", "job posting", "employment", "role available"
        ],
        OpportunityCategory.RESEARCH: [
            "research assistant", "ra position", "research opportunity", "arxiv",
            "abstract:", "we present", "we propose", "our method", "research paper",
            "phd position", "postdoc position", "research position", "lab assistant",
            "research internship", "research program"
        ],
        OpportunityCategory.HACKATHON: [
            "hackathon", "buildathon", "hackers wanted", "hack day", "hackerearth",
            "devpost", "mlh ", "major league hacking", "eth global", "ethglobal",
            "hackathon.io", "coding competition", "code sprint", "codeathon",
            "24 hour", "48 hour", "weekend hack", "virtual hackathon",
            "prize pool", "grand prize", "first prize", "finalist",
            "submit your", "build something", "demo day", "pitch day"
        ],
        OpportunityCategory.COMPETITION: [
            "competition", "challenge", "contest", "kaggle", "data challenge",
            "ai challenge", "ml competition", "coding contest",
            "programming competition", "algorithm contest", "competitive programming",
            "topcoder", "codeforces", "leetcode contest"
        ],
        OpportunityCategory.GRANT: [
            "grant program", "grant application", "grant funding", "grant deadline",
            "grant opportunity", "project grant", "research grant", "innovation grant",
            "startup grant", "seed grant", "small grant", "micro grant",
            "grant call", "funding opportunity", "request for proposals", "rfp",
            "government grant", "foundation grant", "apply for grant"
        ],
        # NOTE: "foundation grant" is also listed under GRANT above, so that
        # phrase alone never separates the two categories.
        OpportunityCategory.ECOSYSTEM_GRANT: [
            "ecosystem grant", "web3 grant", "blockchain grant", "crypto grant",
            "solana grant", "ethereum grant", "polygon grant", "near grant",
            "foundation grant", "protocol grant", "developer grant",
            "builder grant", "ecosystem fund", "developer fund"
        ],
        OpportunityCategory.CONFERENCE: [
            "conference", "call for papers", "summit", "symposium", "workshop",
            "speaker application", "paper submission", "abstract submission"
        ],
        OpportunityCategory.OPEN_SOURCE: [
            "open source", "gsoc", "google summer of code", "outreachy",
            "contributor wanted", "hacktoberfest", "open source contribution",
            "oss program", "open source internship"
        ],
        OpportunityCategory.INVESTMENT: [
            "funding round", "series a", "series b", "vc funding", "raised $",
            "pre-seed", "seed round", "angel investment", "startup funding"
        ],
        OpportunityCategory.BOUNTY: [
            "bounty", "bug bounty", "earn reward", "usdc reward", "sol reward",
            "crypto bounty", "superteam", "earn crypto", "bounty board"
        ],
    }

    # Lower-case keywords per domain.
    DOMAIN_PATTERNS = {
        Domain.COMPUTER_VISION: [
            "computer vision", "image", "visual", "object detection", "segmentation", "opencv"
        ],
        Domain.ROBOTICS: [
            "robot", "ros", "autonomous", "manipulation", "navigation"
        ],
        Domain.AI: [
            "ai", "artificial intelligence", "machine learning", "deep learning",
            "neural network", "llm", "transformer", "gpt"
        ],
        Domain.FINANCE: [
            "finance", "fintech", "trading", "investment", "stock", "quantitative"
        ],
        Domain.CRYPTO: [
            "crypto", "blockchain", "web3", "defi", "solana", "ethereum", "nft"
        ],
        Domain.ACADEMIA: [
            "research", "phd", "postdoc", "university", "academic", "professor"
        ],
    }

    # Purely alphanumeric keywords up to this length ("ai", "ros", "rfp",
    # "defi", ...) occur inside many unrelated English words ("training",
    # "across", "define"), so they are matched as standalone tokens rather
    # than as raw substrings.
    _WHOLE_WORD_MAX_LEN = 4

    # "intern" inside a source name, excluding the common look-alikes
    # "international", "internal" and "internet".
    _INTERN_SOURCE_RE = re.compile(r"intern(?!ation|al|et)")

    @staticmethod
    def _contains_keyword(text_lower: str, keyword: str) -> bool:
        """
        Return True when ``keyword`` occurs in the lower-cased ``text_lower``.

        Short alphanumeric keywords must not be adjacent to another ASCII
        letter (an optional plural "s" is allowed, and digits or punctuation
        may follow, so "llms" and "gpt-4" still match). Everything else is a
        plain substring test.
        """
        if len(keyword) <= OpportunityClassifier._WHOLE_WORD_MAX_LEN and keyword.isalnum():
            pattern = rf"(?<![a-z]){re.escape(keyword)}s?(?![a-z])"
            return re.search(pattern, text_lower) is not None
        return keyword in text_lower

    @classmethod
    def _count_matches(cls, text_lower: str, keywords: list[str]) -> int:
        """Return how many of ``keywords`` occur in ``text_lower``."""
        return sum(1 for keyword in keywords if cls._contains_keyword(text_lower, keyword))

    def classify_by_source(self, source_type: str, source_name: str = "") -> OpportunityCategory | None:
        """
        Classify primarily by where the opportunity came from.

        ``source_type`` is looked up in ``SOURCE_CATEGORY_MAP`` first; if that
        does not decide, ``source_name`` is checked against known platform
        names in a fixed priority order. Both arguments are compared
        case-insensitively and may be None or empty.

        Returns the category, or None if the source doesn't determine one.
        """
        source_lower = (source_type or "").lower()
        name_lower = (source_name or "").lower()

        mapped = self.SOURCE_CATEGORY_MAP.get(source_lower)
        if mapped is not None:
            return mapped

        # Scholarship aggregators; they also run fellowship-specific feeds.
        if any(x in name_lower for x in ("profellow", "scholars4dev", "opportunity desk")):
            if "fellowship" in name_lower:
                return OpportunityCategory.FELLOWSHIP
            return OpportunityCategory.SCHOLARSHIP

        # Internship / early-career boards. A bare substring test on "intern"
        # would also catch "international", "internal" and "internet".
        if self._INTERN_SOURCE_RE.search(name_lower) or any(
            x in name_lower for x in ("entry level", "new grad")
        ):
            return OpportunityCategory.INTERNSHIP

        if any(x in name_lower for x in ("devpost", "devfolio", "mlh", "hackathon", "ethglobal")):
            return OpportunityCategory.HACKATHON

        if "arxiv" in name_lower:
            return OpportunityCategory.RESEARCH

        if "github" in name_lower:
            return OpportunityCategory.OPEN_SOURCE

        # Job boards.
        if any(
            x in name_lower
            for x in ("remotive", "arbeitnow", "themuse", "adzuna", "jooble", "linkedin")
        ):
            return OpportunityCategory.JOB
        if "hacker news" in name_lower and "jobs" in name_lower:
            return OpportunityCategory.JOB

        if "superteam" in name_lower:
            return OpportunityCategory.BOUNTY

        return None

    def classify_by_rules(self, text: str) -> tuple[OpportunityCategory, Domain, float]:
        """
        Classify using keyword matching.

        The category with the most keyword hits wins (the first one listed in
        ``CATEGORY_PATTERNS`` on a tie). Confidence is 0.3 per hit, capped at
        0.9. The domain is the single domain with the most hits; with no hits
        or a tie for first place it is ``Domain.MIXED``.

        Returns (category, domain, confidence); empty text yields
        (OTHER, MIXED, 0.0).
        """
        if not text:
            return OpportunityCategory.OTHER, Domain.MIXED, 0.0

        text_lower = text.lower()

        # Track the raw hit count separately from the capped confidence:
        # comparing a hit count against the confidence float would let any
        # later category with a single hit displace a stronger earlier one.
        category = OpportunityCategory.OTHER
        best_hits = 0
        for candidate, keywords in self.CATEGORY_PATTERNS.items():
            hits = self._count_matches(text_lower, keywords)
            if hits > best_hits:
                category, best_hits = candidate, hits
        cat_confidence = min(best_hits * 0.3, 0.9)

        domain_hits = {
            dom: self._count_matches(text_lower, keywords)
            for dom, keywords in self.DOMAIN_PATTERNS.items()
        }
        top_hits = max(domain_hits.values(), default=0)
        domain = Domain.MIXED
        if top_hits > 0:
            leaders = [dom for dom, hits in domain_hits.items() if hits == top_hits]
            # Several domains sharing first place means the text is mixed.
            if len(leaders) == 1:
                domain = leaders[0]

        return category, domain, cat_confidence

    def classify(
        self,
        text: str,
        title: str = "",
        source_type: str = "",
        source_name: str = "",
        use_llm: bool = False,
        llm_client=None,
    ) -> dict:
        """
        Classify an opportunity with optional LLM enhancement.

        A category determined by the source always wins and is reported with
        confidence 0.85 and method "source". Otherwise the keyword result is
        used with method "rules". When ``use_llm`` is set, ``llm_client`` is
        given, the source did not decide and the rule confidence is below 0.5,
        ``llm_client.classify(full_text)`` is consulted; its result (expected
        to be a dict with "category", "domain" and "confidence") replaces the
        rule result only if it reports a higher confidence. Any failure of the
        LLM call is logged and the rule result is returned.

        Returns a dict with keys "category", "domain", "confidence", "method".
        """
        # Guard against None so it is not interpolated as the word "None".
        full_text = f"{title or ''} {text or ''}".strip()

        source_category = self.classify_by_source(source_type, source_name)
        rule_category, domain, confidence = self.classify_by_rules(full_text)

        # Compare against None explicitly: enum truthiness depends on the
        # member value when the enum mixes in str or int.
        if source_category is not None:
            category = source_category
            confidence = 0.85
            method = "source"
        else:
            category = rule_category
            method = "rules"

        if use_llm and llm_client is not None and source_category is None and confidence < 0.5:
            try:
                llm_result = llm_client.classify(full_text)
                llm_is_better = llm_result.get("confidence", 0) > confidence
            except Exception as exc:
                # Best effort: an LLM outage or malformed reply must never
                # break classification, so fall through to the rule result.
                logging.getLogger(__name__).warning("LLM classification failed: %s", exc)
            else:
                if llm_is_better:
                    return {
                        "category": llm_result.get("category", category.value),
                        "domain": llm_result.get("domain", domain.value),
                        "confidence": llm_result.get("confidence", confidence),
                        "method": "llm",
                    }

        return {
            "category": category.value,
            "domain": domain.value,
            "confidence": confidence,
            "method": method,
        }
|
|
|
|