Spaces:

Tremick
/

PIOE

Runtime error

File size: 12,017 Bytes

"""
PIOE Opportunity Classifier

Classifies opportunities into categories using rules and LLM.
"""
from ..models import OpportunityCategory, Domain


class OpportunityClassifier:
    """
    Classifies opportunities into categories and domains.

    Classification runs in up to three stages (see ``classify``):

    1. Source-based: the source type/name alone determines the category.
    2. Rule-based: keyword matching against ``CATEGORY_PATTERNS`` and
       ``DOMAIN_PATTERNS``.
    3. LLM: optional, consulted only when no source matched and the
       rule-based confidence is low.

    Keywords of ``_WHOLE_WORD_MAX_LEN`` characters or fewer (e.g. "ai",
    "ros", "rfp") are matched as whole words; longer keywords are matched
    as substrings of the lowercased text.
    """

    # Source type to category mapping (high priority)
    SOURCE_CATEGORY_MAP = {
        "arxiv": OpportunityCategory.RESEARCH,
        "github": OpportunityCategory.OPEN_SOURCE,
        "superteam": OpportunityCategory.BOUNTY,
        "grant_platform": OpportunityCategory.GRANT,
        "gov_portal": OpportunityCategory.GRANT,
    }

    # Keyword patterns for each category (expanded for better matching).
    # Dict order matters: on a tie in match count the earlier category wins.
    CATEGORY_PATTERNS = {
        OpportunityCategory.SCHOLARSHIP: [
            "scholarship", "tuition", "financial aid", "merit award", "bursary",
            "study abroad", "educational grant", "student funding", "tuition waiver",
            "fully funded", "partial funding", "academic scholarship", "need-based",
            "scholars4dev", "profellow", "scholars program", "student scholarship",
            "undergraduate scholarship", "graduate scholarship", "phd funding",
            "masters scholarship", "study opportunity", "education funding"
        ],
        OpportunityCategory.FELLOWSHIP: [
            "fellowship", "fellow program", "research fellow", "visiting fellow",
            "postdoctoral fellowship", "predoctoral fellowship", "faculty fellowship",
            "leadership fellowship", "professional fellowship", "policy fellowship",
            "mandela rhodes", "chevening", "fulbright", "rhodes scholar", "gates cambridge"
        ],
        OpportunityCategory.INTERNSHIP: [
            "internship", "intern ", "intern,", "interns ", "summer program", "co-op",
            "summer internship", "fall internship", "spring internship", "winter internship",
            "student intern", "undergraduate intern", "graduate intern",
            "internship program", "intern position", "paid internship", "remote internship",
            "virtual internship", "intern opportunity", "entry level", "early career",
            "new grad", "new graduate", "recent graduate", "campus hire", "university hire"
        ],
        OpportunityCategory.JOB: [
            "hiring", "job opening", "position available", "career opportunity",
            "we're looking for", "full-time", "remote job", "we are hiring",
            "join our team", "senior engineer", "staff engineer", "principal engineer",
            "software developer", "data scientist", "ml engineer", "ai engineer",
            "open position", "job posting", "employment", "role available"
        ],
        OpportunityCategory.RESEARCH: [
            "research assistant", "ra position", "research opportunity", "arxiv",
            "abstract:", "we present", "we propose", "our method", "research paper",
            "phd position", "postdoc position", "research position", "lab assistant",
            "research internship", "research program"
        ],
        OpportunityCategory.HACKATHON: [
            "hackathon", "buildathon", "hackers wanted", "hack day", "hackerearth",
            "devpost", "mlh ", "major league hacking", "eth global", "ethglobal",
            "hackathon.io", "coding competition", "code sprint", "codeathon",
            "24 hour", "48 hour", "weekend hack", "virtual hackathon",
            "prize pool", "grand prize", "first prize", "finalist",
            "submit your", "build something", "demo day", "pitch day"
        ],
        OpportunityCategory.COMPETITION: [
            "competition", "challenge", "contest", "kaggle", "data challenge",
            "ai challenge", "ml competition", "coding contest",
            "programming competition", "algorithm contest", "competitive programming",
            "topcoder", "codeforces", "leetcode contest"
        ],
        OpportunityCategory.GRANT: [
            "grant program", "grant application", "grant funding", "grant deadline",
            "grant opportunity", "project grant", "research grant", "innovation grant",
            "startup grant", "seed grant", "small grant", "micro grant",
            "grant call", "funding opportunity", "request for proposals", "rfp",
            "government grant", "foundation grant", "apply for grant"
        ],
        OpportunityCategory.ECOSYSTEM_GRANT: [
            "ecosystem grant", "web3 grant", "blockchain grant", "crypto grant",
            "solana grant", "ethereum grant", "polygon grant", "near grant",
            "foundation grant", "protocol grant", "developer grant",
            "builder grant", "ecosystem fund", "developer fund"
        ],
        OpportunityCategory.CONFERENCE: [
            "conference", "call for papers", "summit", "symposium", "workshop",
            "speaker application", "paper submission", "abstract submission"
        ],
        OpportunityCategory.OPEN_SOURCE: [
            "open source", "gsoc", "google summer of code", "outreachy",
            "contributor wanted", "hacktoberfest", "open source contribution",
            "oss program", "open source internship"
        ],
        OpportunityCategory.INVESTMENT: [
            "funding round", "series a", "series b", "vc funding", "raised $",
            "pre-seed", "seed round", "angel investment", "startup funding"
        ],
        OpportunityCategory.BOUNTY: [
            "bounty", "bug bounty", "earn reward", "usdc reward", "sol reward",
            "crypto bounty", "superteam", "earn crypto", "bounty board"
        ],
    }

    # Domain patterns
    DOMAIN_PATTERNS = {
        Domain.COMPUTER_VISION: [
            "computer vision", "image", "visual", "object detection", "segmentation", "opencv"
        ],
        Domain.ROBOTICS: [
            "robot", "ros", "autonomous", "manipulation", "navigation"
        ],
        Domain.AI: [
            "ai", "artificial intelligence", "machine learning", "deep learning",
            "neural network", "llm", "transformer", "gpt"
        ],
        Domain.FINANCE: [
            "finance", "fintech", "trading", "investment", "stock", "quantitative"
        ],
        Domain.CRYPTO: [
            "crypto", "blockchain", "web3", "defi", "solana", "ethereum", "nft"
        ],
        Domain.ACADEMIA: [
            "research", "phd", "postdoc", "university", "academic", "professor"
        ],
    }

    # Alphanumeric keywords this short are matched as whole words, because
    # as substrings they fire on unrelated text ("ai" in "available",
    # "ros" in "across").
    _WHOLE_WORD_MAX_LEN = 3

    # Rule-based confidence: a fixed amount per matched keyword, capped.
    _CONFIDENCE_PER_MATCH = 0.3
    _MAX_RULE_CONFIDENCE = 0.9

    # Confidence reported when the source alone determines the category.
    _SOURCE_CONFIDENCE = 0.85

    # Rule confidence below this makes the result eligible for the LLM.
    _LLM_CONFIDENCE_THRESHOLD = 0.5

    @staticmethod
    def _tokenize(text_lower: str) -> set[str]:
        """Return the set of alphanumeric word tokens in lowercased text."""
        spaced = "".join(ch if ch.isalnum() else " " for ch in text_lower)
        return set(spaced.split())

    @classmethod
    def _count_matches(cls, patterns: list[str], text_lower: str, tokens: set[str]) -> int:
        """
        Count how many of ``patterns`` occur in the text.

        Short alphanumeric patterns must appear as a whole token; all other
        patterns (phrases, patterns with punctuation or padding spaces)
        are matched as substrings of ``text_lower``.
        """
        count = 0
        for pattern in patterns:
            if len(pattern) <= cls._WHOLE_WORD_MAX_LEN and pattern.isalnum():
                matched = pattern in tokens
            else:
                matched = pattern in text_lower
            if matched:
                count += 1
        return count

    def classify_by_source(self, source_type: str, source_name: str = "") -> OpportunityCategory | None:
        """
        Classify primarily by source type.

        ``source_type`` is looked up (case-insensitively) in
        ``SOURCE_CATEGORY_MAP``; if it is not there, ``source_name`` is
        checked for known substrings, in the order written below (the first
        matching rule wins). ``None`` values are treated as empty strings.

        Returns category or None if source doesn't determine category.
        """
        source_lower = (source_type or "").lower()
        source_name_lower = (source_name or "").lower()

        # Check direct source mapping
        if source_lower in self.SOURCE_CATEGORY_MAP:
            return self.SOURCE_CATEGORY_MAP[source_lower]

        # === Scholarship/Fellowship Sources ===
        if any(x in source_name_lower for x in ["profellow", "scholars4dev", "opportunity desk"]):
            # Check if it's specifically a fellowship or scholarship
            if "fellowship" in source_name_lower:
                return OpportunityCategory.FELLOWSHIP
            return OpportunityCategory.SCHOLARSHIP

        # === Internship Sources ===
        if any(x in source_name_lower for x in ["intern", "entry level", "new grad"]):
            return OpportunityCategory.INTERNSHIP

        # === Hackathon Sources ===
        if any(x in source_name_lower for x in ["devpost", "devfolio", "mlh", "hackathon", "ethglobal"]):
            return OpportunityCategory.HACKATHON

        # === Research Sources ===
        if "arxiv" in source_name_lower:
            return OpportunityCategory.RESEARCH

        # === Open Source Sources ===
        if "github" in source_name_lower:
            return OpportunityCategory.OPEN_SOURCE

        # === Job Sources ===
        if any(x in source_name_lower for x in ["remotive", "arbeitnow", "themuse", "adzuna", "jooble", "linkedin"]):
            # NOTE: only the source is visible here, not the title, so an
            # internship posted on one of these job boards is still
            # reported as JOB.
            return OpportunityCategory.JOB
        if "hacker news" in source_name_lower and "jobs" in source_name_lower:
            return OpportunityCategory.JOB

        # === Bounty/Ecosystem Sources ===
        if "superteam" in source_name_lower:
            return OpportunityCategory.BOUNTY

        return None

    def classify_by_rules(self, text: str) -> tuple[OpportunityCategory, Domain, float]:
        """
        Classify using keyword matching.

        The category is the one with the most matching keywords (the first
        one in ``CATEGORY_PATTERNS`` order on a tie); ``OTHER`` if nothing
        matches. Confidence grows with the winner's match count and is
        capped. The domain is the single best-matching domain, or ``MIXED``
        when nothing matches or several domains tie for the best count.

        Returns (category, domain, confidence). Empty or None text yields
        (OTHER, MIXED, 0.0).
        """
        if not text:
            return OpportunityCategory.OTHER, Domain.MIXED, 0.0

        text_lower = text.lower()
        tokens = self._tokenize(text_lower)

        # Find matching category. Compare match counts with match counts;
        # the confidence is derived once, from the winning count, so a later
        # category with fewer matches can never displace a stronger one.
        category = OpportunityCategory.OTHER
        best_matches = 0
        for cat, patterns in self.CATEGORY_PATTERNS.items():
            matches = self._count_matches(patterns, text_lower, tokens)
            if matches > best_matches:
                category = cat
                best_matches = matches
        cat_confidence = min(best_matches * self._CONFIDENCE_PER_MATCH, self._MAX_RULE_CONFIDENCE)

        # Find matching domain (single pass over the patterns)
        domain_counts = {
            dom: self._count_matches(patterns, text_lower, tokens)
            for dom, patterns in self.DOMAIN_PATTERNS.items()
        }
        top_count = max(domain_counts.values(), default=0)

        domain = Domain.MIXED
        if top_count > 0:
            leaders = [dom for dom, count in domain_counts.items() if count == top_count]
            # If multiple domains match equally well, keep as mixed
            if len(leaders) == 1:
                domain = leaders[0]

        return category, domain, cat_confidence

    def classify(
        self,
        text: str,
        title: str = "",
        source_type: str = "",
        source_name: str = "",
        use_llm: bool = False,
        llm_client=None
    ) -> dict:
        """
        Classify opportunity with optional LLM enhancement.

        Args:
            text: Body text of the opportunity (None is treated as empty).
            title: Title, prepended to ``text`` for keyword matching.
            source_type: Passed to ``classify_by_source``.
            source_name: Passed to ``classify_by_source``.
            use_llm: Allow the LLM fallback for low-confidence results.
            llm_client: Object with a ``classify(text)`` method returning a
                dict; its "category", "domain" and "confidence" entries
                are read with ``.get`` and passed through unchanged.

        Returns dict with category, domain, confidence, method, where
        method is "source", "rules" or "llm".
        """
        # Coerce None so it is not rendered as the literal string "None".
        full_text = f"{title or ''} {text or ''}".strip()

        # PRIORITY 1: Source-based classification (most reliable)
        source_category = self.classify_by_source(source_type, source_name)

        # PRIORITY 2: Rule-based keyword matching (always run: it also
        # supplies the domain, which the source cannot determine)
        rule_category, domain, confidence = self.classify_by_rules(full_text)

        # Use source category if available (overrides keyword matching)
        if source_category is not None:
            category = source_category
            confidence = self._SOURCE_CONFIDENCE  # High confidence for source-based
            method = "source"
        else:
            category = rule_category
            method = "rules"

        # Use LLM for low-confidence or ambiguous cases (only if no source match)
        llm_eligible = (
            use_llm
            and llm_client
            and source_category is None
            and confidence < self._LLM_CONFIDENCE_THRESHOLD
        )
        if llm_eligible:
            try:
                llm_result = llm_client.classify(full_text)
                # Accept the LLM answer only if it is more confident than the rules.
                if llm_result.get("confidence", 0) > confidence:
                    return {
                        "category": llm_result.get("category", category.value),
                        "domain": llm_result.get("domain", domain.value),
                        "confidence": llm_result.get("confidence", confidence),
                        "method": "llm"
                    }
            except Exception as e:
                # Deliberate best-effort: any LLM failure falls back to the
                # source/rule result below instead of failing the call.
                print(f"LLM classification failed: {e}")

        return {
            "category": category.value,
            "domain": domain.value,
            "confidence": confidence,
            "method": method
        }