Spaces:
Running
Running
"""
Topic Classifier — maps dynamic LLM-extracted topics to predefined UI categories.

Usage:
    from src.summarization.topic_classifier import classify_topic, get_primary_category

    topics = ["Python", "Machine Learning", "Neural Networks"]
    result = classify_topic(topics)
    # => "Technology & AI"

Categories:
    Technology & AI | Business & Finance | Education & Science
    Productivity & Self-Growth | News & Politics
    Entertainment & Lifestyle | Health & Sports
"""
| from typing import List, Set | |
# ─────────────────────────────────────────────────────────────────────────────
# PREDEFINED CATEGORIES
# ─────────────────────────────────────────────────────────────────────────────
# The fixed set of UI categories, in priority order: when topics hit several
# categories, the one listed earliest here is the one reported.
CATEGORIES: list[str] = [
    "Technology & AI",
    "Business & Finance",
    "Education & Science",
    "Productivity & Self-Growth",
    "News & Politics",
    "Entertainment & Lifestyle",
    "Health & Sports",
]
# ─────────────────────────────────────────────────────────────────────────────
# KEYWORD → CATEGORY MAPPING (English + Arabic)
# All keywords are stored lowercase for case-insensitive matching.
# ─────────────────────────────────────────────────────────────────────────────
| _KEYWORD_MAP: dict[str, str] = {} | |
| def _register(category: str, keywords: list[str]): | |
| """Register a list of keywords for a category (lowercase).""" | |
| for kw in keywords: | |
| _KEYWORD_MAP[kw.lower()] = category | |
# ── Technology & AI ──
# Keywords that map a topic to "Technology & AI". The Arabic entries must be
# genuine Arabic text; mis-decoded bytes can never equal an Arabic topic.
_register("Technology & AI", [
    # English
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "neural networks", "nlp", "natural language processing",
    "computer vision", "robotics", "automation", "algorithm", "algorithms",
    "python", "javascript", "typescript", "java", "c++", "rust", "golang", "go",
    "programming", "coding", "software", "software engineering", "web development",
    "frontend", "backend", "full stack", "fullstack", "devops", "cloud",
    "cloud computing", "aws", "azure", "gcp", "docker", "kubernetes",
    "database", "sql", "nosql", "mongodb", "api", "rest api", "graphql",
    "cybersecurity", "security", "hacking", "encryption", "blockchain",
    "cryptocurrency", "bitcoin", "ethereum", "web3", "metaverse",
    "data science", "data analysis", "data engineering", "big data",
    "iot", "internet of things", "5g", "hardware", "semiconductor",
    "gpu", "chip", "processor", "tech", "technology", "computing",
    "linux", "git", "github", "open source", "framework", "react",
    "vue", "angular", "node", "nodejs", "django", "flask", "fastapi",
    "tensorflow", "pytorch", "llm", "large language model", "chatgpt",
    "gpt", "gemini", "copilot", "transformer", "diffusion model",
    "generative ai", "prompt engineering", "fine tuning", "rag",
    "mobile development", "android", "ios", "swift", "kotlin", "flutter", "dart",
    # Arabic: artificial intelligence, machine learning, deep learning,
    # programming, technology (x2), algorithm, computer, neural networks,
    # data, cybersecurity, cloud computing, software development,
    # web development, databases
    "ذكاء اصطناعي", "تعلم آلي", "تعلم عميق", "برمجة", "تقنية", "تكنولوجيا",
    "خوارزمية", "حاسوب", "شبكات عصبية", "بيانات", "أمن سيبراني",
    "حوسبة سحابية", "تطوير برمجيات", "تطوير ويب", "قواعد بيانات",
])
# ── Business & Finance ──
# Keywords that map a topic to "Business & Finance". The Arabic entries must
# be genuine Arabic text; mis-decoded bytes can never equal an Arabic topic.
_register("Business & Finance", [
    # English
    "business", "finance", "economics", "economy", "stock", "stocks",
    "stock market", "trading", "investing", "investment", "real estate",
    "entrepreneurship", "startup", "startups", "marketing", "digital marketing",
    "seo", "branding", "sales", "revenue", "profit", "accounting",
    "budgeting", "money", "wealth", "financial", "banking", "bank",
    "fintech", "venture capital", "vc", "ipo", "merger", "acquisition",
    "management", "leadership", "strategy", "e-commerce", "ecommerce",
    "supply chain", "logistics", "consulting", "mba", "corporate",
    "tax", "taxes", "inflation", "gdp", "interest rate", "forex",
    "commodity", "commodities", "portfolio", "dividend", "bond", "bonds",
    "freelancing", "freelance", "passive income", "side hustle",
    # Arabic: business, trade, economy, finance, investment, stocks,
    # stock exchange, marketing, entrepreneurship, project, funding,
    # accounting, bank, real estate, profit, income, budget
    "أعمال", "تجارة", "اقتصاد", "مالية", "استثمار", "أسهم", "بورصة",
    "تسويق", "ريادة أعمال", "مشروع", "تمويل", "محاسبة", "بنك", "عقارات",
    "ربح", "دخل", "ميزانية",
])
# ── Education & Science ──
# Keywords that map a topic to "Education & Science". The Arabic entries must
# be genuine Arabic text; mis-decoded bytes can never equal an Arabic topic.
_register("Education & Science", [
    # English
    "education", "learning", "teaching", "school", "university", "college",
    "academic", "research", "study", "studying", "exam", "exams", "course",
    "tutorial", "lecture", "scholarship", "degree", "phd", "thesis",
    "science", "physics", "chemistry", "biology", "math", "mathematics",
    "statistics", "calculus", "algebra", "geometry", "astronomy", "space",
    "nasa", "quantum", "quantum physics", "quantum computing",
    "neuroscience", "genetics", "evolution", "ecology", "geology",
    "climate", "climate change", "environment", "engineering",
    "mechanical engineering", "electrical engineering", "civil engineering",
    "experiment", "laboratory", "lab", "hypothesis", "theory",
    "history", "philosophy", "psychology", "sociology", "linguistics",
    "anthropology", "archaeology", "literature", "language", "grammar",
    # Arabic: education, learning, school, university, sciences, physics,
    # chemistry, biology, mathematics, research, study, exam, curriculum,
    # lecture, engineering, history, philosophy, psychology, astronomy,
    # environment
    "تعليم", "تعلم", "مدرسة", "جامعة", "علوم", "فيزياء", "كيمياء",
    "أحياء", "رياضيات", "بحث", "دراسة", "امتحان", "منهج", "محاضرة",
    "هندسة", "تاريخ", "فلسفة", "علم نفس", "فلك", "بيئة",
])
# ── Productivity & Self-Growth ──
# Keywords that map a topic to "Productivity & Self-Growth". The Arabic
# entries must be genuine Arabic text; mis-decoded bytes can never equal an
# Arabic topic.
_register("Productivity & Self-Growth", [
    # English
    "productivity", "self improvement", "self-improvement", "self growth",
    "self-growth", "personal development", "motivation", "discipline",
    "habits", "habit", "time management", "goal setting", "goals",
    "mindset", "focus", "concentration", "efficiency", "organization",
    "planning", "journaling", "morning routine", "routine", "success",
    "self help", "self-help", "life coaching", "coaching", "mentoring",
    "mentor", "stoicism", "minimalism", "mindfulness", "meditation",
    "emotional intelligence", "communication skills", "public speaking",
    "negotiation", "critical thinking", "problem solving", "creativity",
    "decision making", "confidence", "resilience", "work-life balance",
    "burnout", "career", "career development", "skill building",
    # Arabic: productivity, self development, motivation, habits,
    # time management, goals, focus, success, planning, meditation,
    # self-confidence, skills, thinking, creativity
    "إنتاجية", "تطوير ذات", "تحفيز", "عادات", "إدارة الوقت",
    "أهداف", "تركيز", "نجاح", "تخطيط", "تأمل", "ثقة بالنفس",
    "مهارات", "تفكير", "إبداع",
])
# ── News & Politics ──
# Keywords that map a topic to "News & Politics". The Arabic entries must be
# genuine Arabic text; mis-decoded bytes can never equal an Arabic topic.
_register("News & Politics", [
    # English
    "news", "politics", "political", "government", "policy", "election",
    "elections", "democracy", "geopolitics", "diplomacy", "war", "conflict",
    "military", "defense", "law", "legal", "legislation", "regulation",
    "human rights", "immigration", "refugee", "sanctions", "united nations",
    "nato", "eu", "european union", "congress", "parliament", "senate",
    "president", "prime minister", "foreign policy", "domestic policy",
    "protest", "activism", "corruption", "media", "journalism",
    "press", "freedom of speech", "censorship", "propaganda",
    "international relations", "treaty", "nuclear",
    # Arabic: news, politics, government, elections, democracy, war, law,
    # human rights, diplomacy, parliament, president, media, journalism
    "أخبار", "سياسة", "حكومة", "انتخابات", "ديمقراطية", "حرب",
    "قانون", "حقوق إنسان", "دبلوماسية", "برلمان", "رئيس",
    "إعلام", "صحافة",
])
# ── Entertainment & Lifestyle ──
# Keywords that map a topic to "Entertainment & Lifestyle". The Arabic
# entries must be genuine Arabic text; mis-decoded bytes can never equal an
# Arabic topic.
_register("Entertainment & Lifestyle", [
    # English
    "entertainment", "movie", "movies", "film", "films", "cinema",
    "tv", "television", "series", "netflix", "streaming", "anime",
    "manga", "gaming", "video games", "esports", "twitch", "youtube",
    "podcast", "music", "song", "album", "concert", "artist",
    "celebrity", "fashion", "style", "beauty", "makeup", "skincare",
    "travel", "tourism", "food", "cooking", "recipe", "restaurant",
    "cuisine", "vlog", "vlogging", "photography", "art", "design",
    "graphic design", "illustration", "architecture", "interior design",
    "diy", "crafts", "comedy", "humor", "drama", "reality tv",
    "social media", "tiktok", "instagram", "influencer", "content creator",
    "lifestyle", "luxury", "culture", "pop culture",
    # Arabic: entertainment, movies, cinema, series, games, music, travel,
    # cooking, fashion, beauty, photography, art, design, culture, comedy,
    # YouTube
    "ترفيه", "أفلام", "سينما", "مسلسلات", "ألعاب", "موسيقى",
    "سفر", "طبخ", "أزياء", "جمال", "تصوير", "فن", "تصميم",
    "ثقافة", "كوميديا", "يوتيوب",
])
# ── Health & Sports ──
# Keywords that map a topic to "Health & Sports". The Arabic entries must be
# genuine Arabic text; mis-decoded bytes can never equal an Arabic topic.
_register("Health & Sports", [
    # English
    "health", "fitness", "exercise", "workout", "gym", "bodybuilding",
    "weight loss", "diet", "nutrition", "calories", "protein", "vitamins",
    "supplements", "wellness", "mental health", "therapy", "depression",
    "anxiety", "stress", "sleep", "yoga", "pilates", "crossfit",
    "running", "marathon", "swimming", "cycling", "hiking",
    "sports", "football", "soccer", "basketball", "tennis", "baseball",
    "cricket", "rugby", "boxing", "mma", "ufc", "wrestling",
    "olympics", "world cup", "premier league", "nba", "nfl",
    "medicine", "medical", "doctor", "hospital", "surgery", "disease",
    "virus", "vaccine", "pandemic", "covid", "cancer", "diabetes",
    "heart", "cardio", "physical therapy", "rehabilitation",
    "first aid", "pharmacy", "drug", "prescription",
    # Arabic: health, sports, exercises, fitness, nutrition, diet,
    # mental health, treatment, medicine, hospital, football, swimming,
    # yoga, sleep, vitamins
    "صحة", "رياضة", "تمارين", "لياقة", "تغذية", "حمية",
    "صحة نفسية", "علاج", "طب", "مستشفى", "كرة قدم", "سباحة",
    "يوغا", "نوم", "فيتامينات",
])
# ─────────────────────────────────────────────────────────────────────────────
# PUBLIC API
# ─────────────────────────────────────────────────────────────────────────────
# Keywords up to this length ("ai", "go", "eu", "aws") are treated as
# acronym-like: they must stand alone as a word instead of being a word stem.
_SHORT_KEYWORD_LEN = 3


def _is_ascii_word_char(char: str) -> bool:
    """Return True for an ASCII letter or digit."""
    return char.isascii() and char.isalnum()


def _ends_cleanly(text: str, end: int, phrase: str, allow_suffix: bool) -> bool:
    """Return True if the occurrence of *phrase* ending at *end* is not cut mid-word."""
    if end == len(text):
        return True
    # Only an ASCII word character following an ASCII word character glues
    # two pieces into one word ("ai" + "n"); anything else is a boundary.
    if not (_is_ascii_word_char(phrase[-1]) and _is_ascii_word_char(text[end])):
        return True
    if allow_suffix:
        return True
    # Short phrases tolerate nothing but a plain plural "s" ("law" in "laws").
    after_plural = end + 1
    return text[end] == "s" and (
        after_plural == len(text) or not _is_ascii_word_char(text[after_plural])
    )


def _occurs_as_word(text: str, phrase: str) -> bool:
    """Return True if *phrase* occurs in *text* starting at a word boundary.

    Phrases longer than _SHORT_KEYWORD_LEN may be followed by a suffix
    ("tech" in "technologies"); shorter ones must end the word, optionally
    with a plural "s". Boundaries are enforced against ASCII letters and
    digits only, so non-ASCII phrases are matched as plain substrings.
    """
    if not phrase:
        return False
    allow_suffix = len(phrase) > _SHORT_KEYWORD_LEN
    start = text.find(phrase)
    while start != -1:
        glued_left = (
            start > 0
            and _is_ascii_word_char(text[start - 1])
            and _is_ascii_word_char(phrase[0])
        )
        if not glued_left and _ends_cleanly(
            text, start + len(phrase), phrase, allow_suffix
        ):
            return True
        start = text.find(phrase, start + 1)
    return False


def _lookup_partial(topic: str) -> str:
    """Return the category of the keyword that best overlaps *topic*, or "" if none.

    The longest keyword found inside the topic wins. If there is none, the
    shortest keyword that contains the topic is used. Ties go to the keyword
    registered first.
    """
    best_category = ""
    best_length = 0
    for keyword, category in _KEYWORD_MAP.items():
        # Cheap length test first; strict ">" keeps the earliest keyword on ties.
        if len(keyword) > best_length and _occurs_as_word(topic, keyword):
            best_category, best_length = category, len(keyword)
    if best_category:
        return best_category

    for keyword, category in _KEYWORD_MAP.items():
        is_closer = not best_length or len(keyword) < best_length
        if is_closer and _occurs_as_word(keyword, topic):
            best_category, best_length = category, len(keyword)
    return best_category


def classify_topic(topics: List[str]) -> str:
    """
    Map a list of dynamically extracted topics to a SINGLE predefined UI category.

    Each topic is lowercased and stripped, then resolved to a category by the
    first rule that applies:

    1. Exact match against the registered keywords.
    2. The longest registered keyword found inside the topic
       ("AI Stocks" -> "stocks").
    3. The shortest registered keyword that contains the topic.

    For rules 2 and 3 the match must start at a word boundary, and keywords of
    up to three characters must also end there (a plural "s" is allowed), so
    "aws" does not match inside "laws" and "ai" does not match inside
    "training". Boundaries are only enforced against ASCII letters and digits;
    Arabic keywords are matched as plain substrings.

    Of all categories hit by any topic, the one listed first in CATEGORIES is
    returned. Falls back to "Education & Science" if no match is found.

    Args:
        topics: List of topic strings from the LLM (e.g. ["Python", "Deep Learning"]).
            None or an empty list yields the fallback; a bare string is
            treated as a single topic; empty and non-string items are skipped.

    Returns:
        A single category string.

    Example:
        >>> classify_topic(["Python", "Machine Learning", "Neural Networks"])
        'Technology & AI'
        >>> classify_topic(["Investing", "AI Stocks"])
        'Business & Finance'
    """
    fallback = "Education & Science"
    if not topics:
        return fallback
    if isinstance(topics, str):
        # Iterating a bare string would classify it character by character.
        topics = [topics]

    matched: Set[str] = set()
    for topic in topics:
        if not isinstance(topic, str):
            continue
        topic_lower = topic.lower().strip()
        if not topic_lower:
            # An empty string is a substring of every keyword; never match it.
            continue
        category = _KEYWORD_MAP.get(topic_lower) or _lookup_partial(topic_lower)
        if category:
            matched.add(category)

    # Return the first match in CATEGORIES order for consistency.
    for cat in CATEGORIES:
        if cat in matched:
            return cat
    # Nothing matched, or only categories that are not listed in CATEGORIES.
    return fallback
def classify_topics(topics: List[str]) -> List[str]:
    """Backward-compatible wrapper: return classify_topic()'s result as a one-element list."""
    primary = classify_topic(topics)
    return [primary]
def get_primary_category(topics: List[str]) -> str:
    """
    Return the single best-matching category for the given topics.

    Alias for classify_topic(); the call is forwarded unchanged.
    """
    category = classify_topic(topics)
    return category