File size: 12,017 Bytes
4d92cd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54b7ea7
4d92cd5
 
54b7ea7
 
 
 
 
 
4d92cd5
 
54b7ea7
 
 
 
4d92cd5
 
54b7ea7
 
 
 
 
 
4d92cd5
 
 
54b7ea7
 
 
 
4d92cd5
 
 
54b7ea7
 
 
4d92cd5
 
54b7ea7
 
 
 
 
 
4d92cd5
 
54b7ea7
 
 
 
4d92cd5
 
54b7ea7
 
 
 
 
 
 
 
 
 
 
4d92cd5
 
54b7ea7
 
4d92cd5
 
54b7ea7
 
 
4d92cd5
 
54b7ea7
 
4d92cd5
 
54b7ea7
 
4d92cd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54b7ea7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d92cd5
 
54b7ea7
 
4d92cd5
 
54b7ea7
 
 
 
4d92cd5
54b7ea7
 
 
 
 
 
4d92cd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
"""
PIOE Opportunity Classifier

Classifies opportunities into categories using rules and LLM.
"""
from ..models import OpportunityCategory, Domain


class OpportunityClassifier:
    """
    Classifies opportunities into categories and domains.
    Uses rule-based classification first, LLM for ambiguous cases.
    """
    
    # Source type to category mapping (high priority).
    # classify_by_source() lowercases the incoming source_type and looks it up
    # here by exact key before falling back to source-name heuristics, so keys
    # must be lowercase.
    SOURCE_CATEGORY_MAP = {
        "arxiv": OpportunityCategory.RESEARCH,
        "github": OpportunityCategory.OPEN_SOURCE,
        "superteam": OpportunityCategory.BOUNTY,
        "grant_platform": OpportunityCategory.GRANT,
        "gov_portal": OpportunityCategory.GRANT,
    }
    
    # Keyword patterns for each category (expanded for better matching).
    # classify_by_rules() lowercases the input text and counts how many of a
    # category's patterns occur in it as plain substrings (`pattern in text`),
    # so every entry must be lowercase to ever match.
    # Entries with a trailing space or comma ("intern ", "intern,", "mlh ")
    # presumably exist to avoid hits inside longer words -- TODO confirm.
    # NOTE(review): "foundation grant" is listed under both GRANT and
    # ECOSYSTEM_GRANT, so that phrase counts as a hit for both categories.
    CATEGORY_PATTERNS = {
        OpportunityCategory.SCHOLARSHIP: [
            "scholarship", "tuition", "financial aid", "merit award", "bursary",
            "study abroad", "educational grant", "student funding", "tuition waiver",
            "fully funded", "partial funding", "academic scholarship", "need-based",
            "scholars4dev", "profellow", "scholars program", "student scholarship",
            "undergraduate scholarship", "graduate scholarship", "phd funding",
            "masters scholarship", "study opportunity", "education funding"
        ],
        OpportunityCategory.FELLOWSHIP: [
            "fellowship", "fellow program", "research fellow", "visiting fellow",
            "postdoctoral fellowship", "predoctoral fellowship", "faculty fellowship",
            "leadership fellowship", "professional fellowship", "policy fellowship",
            "mandela rhodes", "chevening", "fulbright", "rhodes scholar", "gates cambridge"
        ],
        OpportunityCategory.INTERNSHIP: [
            "internship", "intern ", "intern,", "interns ", "summer program", "co-op",
            "summer internship", "fall internship", "spring internship", "winter internship",
            "student intern", "undergraduate intern", "graduate intern",
            "internship program", "intern position", "paid internship", "remote internship",
            "virtual internship", "intern opportunity", "entry level", "early career",
            "new grad", "new graduate", "recent graduate", "campus hire", "university hire"
        ],
        OpportunityCategory.JOB: [
            "hiring", "job opening", "position available", "career opportunity", 
            "we're looking for", "full-time", "remote job", "we are hiring",
            "join our team", "senior engineer", "staff engineer", "principal engineer",
            "software developer", "data scientist", "ml engineer", "ai engineer",
            "open position", "job posting", "employment", "role available"
        ],
        OpportunityCategory.RESEARCH: [
            "research assistant", "ra position", "research opportunity", "arxiv",
            "abstract:", "we present", "we propose", "our method", "research paper",
            "phd position", "postdoc position", "research position", "lab assistant",
            "research internship", "research program"
        ],
        OpportunityCategory.HACKATHON: [
            "hackathon", "buildathon", "hackers wanted", "hack day", "hackerearth",
            "devpost", "mlh ", "major league hacking", "eth global", "ethglobal",
            "hackathon.io", "coding competition", "code sprint", "codeathon",
            "24 hour", "48 hour", "weekend hack", "virtual hackathon",
            "prize pool", "grand prize", "first prize", "finalist",
            "submit your", "build something", "demo day", "pitch day"
        ],
        OpportunityCategory.COMPETITION: [
            "competition", "challenge", "contest", "kaggle", "data challenge",
            "ai challenge", "ml competition", "coding contest",
            "programming competition", "algorithm contest", "competitive programming",
            "topcoder", "codeforces", "leetcode contest"
        ],
        OpportunityCategory.GRANT: [
            "grant program", "grant application", "grant funding", "grant deadline",
            "grant opportunity", "project grant", "research grant", "innovation grant",
            "startup grant", "seed grant", "small grant", "micro grant",
            "grant call", "funding opportunity", "request for proposals", "rfp",
            "government grant", "foundation grant", "apply for grant"
        ],
        OpportunityCategory.ECOSYSTEM_GRANT: [
            "ecosystem grant", "web3 grant", "blockchain grant", "crypto grant",
            "solana grant", "ethereum grant", "polygon grant", "near grant",
            "foundation grant", "protocol grant", "developer grant",
            "builder grant", "ecosystem fund", "developer fund"
        ],
        OpportunityCategory.CONFERENCE: [
            "conference", "call for papers", "summit", "symposium", "workshop",
            "speaker application", "paper submission", "abstract submission"
        ],
        OpportunityCategory.OPEN_SOURCE: [
            "open source", "gsoc", "google summer of code", "outreachy", 
            "contributor wanted", "hacktoberfest", "open source contribution",
            "oss program", "open source internship"
        ],
        OpportunityCategory.INVESTMENT: [
            "funding round", "series a", "series b", "vc funding", "raised $",
            "pre-seed", "seed round", "angel investment", "startup funding"
        ],
        OpportunityCategory.BOUNTY: [
            "bounty", "bug bounty", "earn reward", "usdc reward", "sol reward",
            "crypto bounty", "superteam", "earn crypto", "bounty board"
        ],
    }
    
    # Domain patterns.
    # classify_by_rules() counts, per domain, how many of these entries occur
    # as plain substrings of the lowercased input text; entries must therefore
    # be lowercase.
    # NOTE(review): because matching is by substring, short entries such as
    # "ai" and "ros" also hit inside unrelated words (e.g. "email", "across").
    # Confirm whether that is acceptable or whether word-boundary matching is
    # wanted.
    DOMAIN_PATTERNS = {
        Domain.COMPUTER_VISION: [
            "computer vision", "image", "visual", "object detection", "segmentation", "opencv"
        ],
        Domain.ROBOTICS: [
            "robot", "ros", "autonomous", "manipulation", "navigation"
        ],
        Domain.AI: [
            "ai", "artificial intelligence", "machine learning", "deep learning", 
            "neural network", "llm", "transformer", "gpt"
        ],
        Domain.FINANCE: [
            "finance", "fintech", "trading", "investment", "stock", "quantitative"
        ],
        Domain.CRYPTO: [
            "crypto", "blockchain", "web3", "defi", "solana", "ethereum", "nft"
        ],
        Domain.ACADEMIA: [
            "research", "phd", "postdoc", "university", "academic", "professor"
        ],
    }
    
    def classify_by_source(self, source_type: str, source_name: str = "") -> OpportunityCategory | None:
        """
        Derive a category from where the opportunity came from.

        The lowercased ``source_type`` is first looked up as an exact key in
        ``SOURCE_CATEGORY_MAP``. If that misses, the lowercased ``source_name``
        is scanned for known site/keyword substrings in a fixed order, and the
        first rule that matches decides the category.

        Only the source type and source name are inspected here; the item's
        title and body are not available to this method.

        Args:
            source_type: Source type identifier; ``None``/empty is treated as "".
            source_name: Human-readable source name; ``None``/empty is treated as "".

        Returns:
            The matched category, or None if the source doesn't determine one.
        """
        type_key = (source_type or "").lower()
        name = (source_name or "").lower()

        # Exact source-type hit takes precedence over any name heuristics.
        mapped = self.SOURCE_CATEGORY_MAP.get(type_key)
        if mapped is not None:
            return mapped

        def mentions(needles) -> bool:
            """Return True if any needle occurs as a substring of the source name."""
            return any(needle in name for needle in needles)

        # Scholarship aggregators: a name that itself says "fellowship" is
        # treated as a fellowship feed, everything else as scholarships.
        if mentions(("profellow", "scholars4dev", "opportunity desk")):
            if "fellowship" in name:
                return OpportunityCategory.FELLOWSHIP
            return OpportunityCategory.SCHOLARSHIP

        # Ordered substring rules; the order matters because the first match
        # wins (e.g. an "intern" name is decided before the job boards).
        ordered_rules = (
            (("intern", "entry level", "new grad"), OpportunityCategory.INTERNSHIP),
            (("devpost", "devfolio", "mlh", "hackathon", "ethglobal"), OpportunityCategory.HACKATHON),
            (("arxiv",), OpportunityCategory.RESEARCH),
            (("github",), OpportunityCategory.OPEN_SOURCE),
            (
                ("remotive", "arbeitnow", "themuse", "adzuna", "jooble", "linkedin"),
                OpportunityCategory.JOB,
            ),
        )
        for needles, rule_category in ordered_rules:
            if mentions(needles):
                return rule_category

        # Hacker News only counts as a job source when the name also says "jobs".
        if "hacker news" in name and "jobs" in name:
            return OpportunityCategory.JOB

        # Bounty / ecosystem boards.
        if "superteam" in name:
            return OpportunityCategory.BOUNTY

        return None
    
    def classify_by_rules(self, text: str) -> tuple[OpportunityCategory, Domain, float]:
        """
        Classify using keyword matching.

        The text is lowercased and every pattern in ``CATEGORY_PATTERNS`` and
        ``DOMAIN_PATTERNS`` is tested as a plain substring.

        Category: the category with the most pattern hits wins; on a tie the
        category declared first in ``CATEGORY_PATTERNS`` is kept. With no hits
        the category is ``OpportunityCategory.OTHER``.

        Domain: the single domain with the most hits wins. If no domain
        matches, or several domains tie for the top count, the result is
        ``Domain.MIXED``.

        Args:
            text: Free text to classify; empty/None yields the defaults.

        Returns:
            (category, domain, confidence) where confidence is
            ``min(0.3 * hits_of_winning_category, 0.9)``.
        """
        if not text:
            return OpportunityCategory.OTHER, Domain.MIXED, 0.0

        text_lower = text.lower()

        # Find the best-matching category. The hit count is tracked separately
        # from the confidence: comparing raw counts against the capped float
        # confidence would let any later category with a single hit override
        # an earlier category with more hits.
        category = OpportunityCategory.OTHER
        best_hits = 0
        for cat, patterns in self.CATEGORY_PATTERNS.items():
            hits = sum(1 for p in patterns if p in text_lower)
            if hits > best_hits:
                category = cat
                best_hits = hits
        cat_confidence = min(best_hits * 0.3, 0.9)

        # Count domain hits once and reuse them for both the winner and the
        # tie check.
        domain_counts = {
            dom: sum(1 for p in patterns if p in text_lower)
            for dom, patterns in self.DOMAIN_PATTERNS.items()
        }
        top_hits = max(domain_counts.values(), default=0)

        domain = Domain.MIXED
        if top_hits > 0:
            leaders = [dom for dom, count in domain_counts.items() if count == top_hits]
            # Several domains matching equally well -> keep as mixed.
            if len(leaders) == 1:
                domain = leaders[0]

        return category, domain, cat_confidence
    
    def classify(
        self, 
        text: str, 
        title: str = "",
        source_type: str = "",
        source_name: str = "",
        use_llm: bool = False,
        llm_client = None
    ) -> dict:
        """
        Classify opportunity with optional LLM enhancement.

        Precedence:
          1. Source-based classification (``classify_by_source``). When it
             yields a category, that category is used with a fixed confidence
             of 0.85 and method "source".
          2. Rule-based keyword matching (``classify_by_rules``) on the title
             and text combined. The domain always comes from this step.
          3. Optional LLM: consulted only when ``use_llm`` is set, a client is
             given, there is some text to classify, no source category was
             found, and the rule confidence is below 0.5. Its answer is used
             only if it reports a higher confidence than the rules did.

        Args:
            text: Body text of the opportunity; None is treated as "".
            title: Title of the opportunity; None is treated as "".
            source_type: Source type identifier passed to ``classify_by_source``.
            source_name: Source name passed to ``classify_by_source``.
            use_llm: Whether the LLM fallback may be used.
            llm_client: Object exposing ``classify(text)`` that returns a
                mapping with optional "category", "domain" and "confidence"
                keys.

        Returns:
            dict with keys "category", "domain", "confidence", "method"
            (method is one of "source", "rules", "llm").
        """
        # Coalesce None so a missing title/text is not rendered as the literal
        # string "None" and then classified or sent to the LLM.
        full_text = f"{title or ''} {text or ''}".strip()

        # PRIORITY 1: Source-based classification (most reliable)
        source_category = self.classify_by_source(source_type, source_name)

        # PRIORITY 2: Rule-based keyword matching (also supplies the domain)
        rule_category, domain, confidence = self.classify_by_rules(full_text)

        # Use source category if available (overrides keyword matching)
        if source_category is not None:
            category = source_category
            confidence = 0.85  # High confidence for source-based
            method = "source"
        else:
            category = rule_category
            method = "rules"

        # Use LLM for low-confidence or ambiguous cases (only if no source
        # match, and only when there is actually text to send).
        if (
            use_llm
            and llm_client
            and full_text
            and source_category is None
            and confidence < 0.5
        ):
            try:
                llm_result = llm_client.classify(full_text)
                llm_confidence = llm_result.get("confidence", 0)
                if llm_confidence > confidence:
                    return {
                        "category": llm_result.get("category", category.value),
                        "domain": llm_result.get("domain", domain.value),
                        "confidence": llm_confidence,
                        "method": "llm",
                    }
            except Exception as e:
                # Best-effort enhancement: any client failure (network error,
                # malformed response) falls back to the rule-based result.
                print(f"LLM classification failed: {e}")

        return {
            "category": category.value,
            "domain": domain.value,
            "confidence": confidence,
            "method": method,
        }