"""
Topic Classifier — maps dynamic LLM-extracted topics to predefined UI categories.

Usage:
    from src.summarization.topic_classifier import classify_topic, get_primary_category

    topics = ["Python", "Machine Learning", "Neural Networks"]
    result = classify_topic(topics)
    # => "Technology & AI"

Categories:
    Technology & AI | Business & Finance | Education & Science
    Productivity & Self-Growth | News & Politics
    Entertainment & Lifestyle | Health & Sports
"""

from typing import List, Optional, Set

# ─────────────────────────────────────────────────────────────────────────────
# PREDEFINED CATEGORIES
# ─────────────────────────────────────────────────────────────────────────────

# Order matters: when several categories match, the earliest one listed here wins.
CATEGORIES = [
    "Technology & AI",
    "Business & Finance",
    "Education & Science",
    "Productivity & Self-Growth",
    "News & Politics",
    "Entertainment & Lifestyle",
    "Health & Sports",
]

# Category returned when no topic matches any keyword.
_DEFAULT_CATEGORY = "Education & Science"

# Word endings tolerated after an ASCII keyword so simple plurals still match
# ("investment" matches "investments"). The empty suffix is the plain match.
_PLURAL_SUFFIXES = ("", "s", "es")

# ─────────────────────────────────────────────────────────────────────────────
# KEYWORD → CATEGORY MAPPING (English + Arabic)
# All keywords are stored lowercase for case-insensitive matching.
# ─────────────────────────────────────────────────────────────────────────────

_KEYWORD_MAP: dict[str, str] = {}


def _register(category: str, keywords: list[str]) -> None:
    """Register a list of keywords for a category (stored lowercase).

    A keyword registered more than once keeps the category of the last call.
    """
    for kw in keywords:
        _KEYWORD_MAP[kw.lower()] = category


# ── Technology & AI ──
_register("Technology & AI", [
    # English
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "neural networks", "nlp", "natural language processing",
    "computer vision", "robotics", "automation", "algorithm", "algorithms",
    "python", "javascript", "typescript", "java", "c++", "rust", "golang", "go",
    "programming", "coding", "software", "software engineering",
    "web development", "frontend", "backend", "full stack", "fullstack",
    "devops", "cloud", "cloud computing", "aws", "azure", "gcp",
    "docker", "kubernetes", "database", "sql", "nosql", "mongodb",
    "api", "rest api", "graphql", "cybersecurity", "security", "hacking",
    "encryption", "blockchain", "cryptocurrency", "bitcoin", "ethereum",
    "web3", "metaverse", "data science", "data analysis", "data engineering",
    "big data", "iot", "internet of things", "5g", "hardware",
    "semiconductor", "gpu", "chip", "processor", "tech", "technology",
    "computing", "linux", "git", "github", "open source", "framework",
    "react", "vue", "angular", "node", "nodejs", "django", "flask", "fastapi",
    "tensorflow", "pytorch", "llm", "large language model", "chatgpt", "gpt",
    "gemini", "copilot", "transformer", "diffusion model", "generative ai",
    "prompt engineering", "fine tuning", "rag", "mobile development",
    "android", "ios", "swift", "kotlin", "flutter", "dart",
    # Arabic
    "ذكاء اصطناعي", "تعلم آلي", "تعلم عميق", "برمجة", "تقنية", "تكنولوجيا",
    "خوارزمية", "حاسوب", "شبكات عصبية", "بيانات", "أمن سيبراني",
    "حوسبة سحابية", "تطوير برمجيات", "تطوير ويب", "قواعد بيانات",
])

# ── Business & Finance ──
_register("Business & Finance", [
    # English
    "business", "finance", "economics", "economy", "stock", "stocks",
    "stock market", "trading", "investing", "investment", "real estate",
    "entrepreneurship", "startup", "startups", "marketing",
    "digital marketing", "seo", "branding", "sales", "revenue", "profit",
    "accounting", "budgeting", "money", "wealth", "financial", "banking",
    "bank", "fintech", "venture capital", "vc", "ipo", "merger",
    "acquisition", "management", "leadership", "strategy", "e-commerce",
    "ecommerce", "supply chain", "logistics", "consulting", "mba",
    "corporate", "tax", "taxes", "inflation", "gdp", "interest rate",
    "forex", "commodity", "commodities", "portfolio", "dividend", "bond",
    "bonds", "freelancing", "freelance", "passive income", "side hustle",
    # Arabic
    "أعمال", "تجارة", "اقتصاد", "مالية", "استثمار", "أسهم", "بورصة",
    "تسويق", "ريادة أعمال", "مشروع", "تمويل", "محاسبة", "بنك", "عقارات",
    "ربح", "دخل", "ميزانية",
])

# ── Education & Science ──
_register("Education & Science", [
    # English
    "education", "learning", "teaching", "school", "university", "college",
    "academic", "research", "study", "studying", "exam", "exams", "course",
    "tutorial", "lecture", "scholarship", "degree", "phd", "thesis",
    "science", "physics", "chemistry", "biology", "math", "mathematics",
    "statistics", "calculus", "algebra", "geometry", "astronomy", "space",
    "nasa", "quantum", "quantum physics", "quantum computing",
    "neuroscience", "genetics", "evolution", "ecology", "geology",
    "climate", "climate change", "environment", "engineering",
    "mechanical engineering", "electrical engineering", "civil engineering",
    "experiment", "laboratory", "lab", "hypothesis", "theory", "history",
    "philosophy", "psychology", "sociology", "linguistics", "anthropology",
    "archaeology", "literature", "language", "grammar",
    # Arabic
    "تعليم", "تعلم", "مدرسة", "جامعة", "علوم", "فيزياء", "كيمياء", "أحياء",
    "رياضيات", "بحث", "دراسة", "امتحان", "منهج", "محاضرة", "هندسة",
    "تاريخ", "فلسفة", "علم نفس", "فلك", "بيئة",
])

# ── Productivity & Self-Growth ──
_register("Productivity & Self-Growth", [
    # English
    "productivity", "self improvement", "self-improvement", "self growth",
    "self-growth", "personal development", "motivation", "discipline",
    "habits", "habit", "time management", "goal setting", "goals",
    "mindset", "focus", "concentration", "efficiency", "organization",
    "planning", "journaling", "morning routine", "routine", "success",
    "self help", "self-help", "life coaching", "coaching", "mentoring",
    "mentor", "stoicism", "minimalism", "mindfulness", "meditation",
    "emotional intelligence", "communication skills", "public speaking",
    "negotiation", "critical thinking", "problem solving", "creativity",
    "decision making", "confidence", "resilience", "work-life balance",
    "burnout", "career", "career development", "skill building",
    # Arabic
    "إنتاجية", "تطوير ذات", "تحفيز", "عادات", "إدارة الوقت", "أهداف",
    "تركيز", "نجاح", "تخطيط", "تأمل", "ثقة بالنفس", "مهارات", "تفكير",
    "إبداع",
])

# ── News & Politics ──
_register("News & Politics", [
    # English
    "news", "politics", "political", "government", "policy", "election",
    "elections", "democracy", "geopolitics", "diplomacy", "war", "conflict",
    "military", "defense", "law", "legal", "legislation", "regulation",
    "human rights", "immigration", "refugee", "sanctions", "united nations",
    "nato", "eu", "european union", "congress", "parliament", "senate",
    "president", "prime minister", "foreign policy", "domestic policy",
    "protest", "activism", "corruption", "media", "journalism", "press",
    "freedom of speech", "censorship", "propaganda",
    "international relations", "treaty", "nuclear",
    # Arabic
    "أخبار", "سياسة", "حكومة", "انتخابات", "ديمقراطية", "حرب", "قانون",
    "حقوق إنسان", "دبلوماسية", "برلمان", "رئيس", "إعلام", "صحافة",
])

# ── Entertainment & Lifestyle ──
_register("Entertainment & Lifestyle", [
    # English
    "entertainment", "movie", "movies", "film", "films", "cinema", "tv",
    "television", "series", "netflix", "streaming", "anime", "manga",
    "gaming", "video games", "esports", "twitch", "youtube", "podcast",
    "music", "song", "album", "concert", "artist", "celebrity", "fashion",
    "style", "beauty", "makeup", "skincare", "travel", "tourism", "food",
    "cooking", "recipe", "restaurant", "cuisine", "vlog", "vlogging",
    "photography", "art", "design", "graphic design", "illustration",
    "architecture", "interior design", "diy", "crafts", "comedy", "humor",
    "drama", "reality tv", "social media", "tiktok", "instagram",
    "influencer", "content creator", "lifestyle", "luxury", "culture",
    "pop culture",
    # Arabic
    "ترفيه", "أفلام", "سينما", "مسلسلات", "ألعاب", "موسيقى", "سفر", "طبخ",
    "أزياء", "جمال", "تصوير", "فن", "تصميم", "ثقافة", "كوميديا", "يوتيوب",
])

# ── Health & Sports ──
_register("Health & Sports", [
    # English
    "health", "fitness", "exercise", "workout", "gym", "bodybuilding",
    "weight loss", "diet", "nutrition", "calories", "protein", "vitamins",
    "supplements", "wellness", "mental health", "therapy", "depression",
    "anxiety", "stress", "sleep", "yoga", "pilates", "crossfit", "running",
    "marathon", "swimming", "cycling", "hiking", "sports", "football",
    "soccer", "basketball", "tennis", "baseball", "cricket", "rugby",
    "boxing", "mma", "ufc", "wrestling", "olympics", "world cup",
    "premier league", "nba", "nfl", "medicine", "medical", "doctor",
    "hospital", "surgery", "disease", "virus", "vaccine", "pandemic",
    "covid", "cancer", "diabetes", "heart", "cardio", "physical therapy",
    "rehabilitation", "first aid", "pharmacy", "drug", "prescription",
    # Arabic
    "صحة", "رياضة", "تمارين", "لياقة", "تغذية", "حمية", "صحة نفسية",
    "علاج", "طب", "مستشفى", "كرة قدم", "سباحة", "يوغا", "نوم", "فيتامينات",
])


# ─────────────────────────────────────────────────────────────────────────────
# MATCHING HELPERS (private)
# ─────────────────────────────────────────────────────────────────────────────

def _is_boundary(text: str, index: int) -> bool:
    """Return True if *index* lies outside *text* or is not a letter/digit."""
    return index < 0 or index >= len(text) or not text[index].isalnum()


def _contains_term(text: str, term: str) -> bool:
    """Return True if *term* occurs in *text* as a standalone word or phrase.

    ASCII terms must be delimited by non-alphanumeric characters (or the ends
    of the string), optionally followed by a plural "s"/"es". This stops short
    keywords such as "ai" or "api" from firing inside "hair" or "therapist".

    Non-ASCII terms (the Arabic keywords) use plain substring matching, since
    prefixes such as the definite article attach directly to the word and a
    word-boundary test would reject them.
    """
    if not term:
        # The empty string is a substring of everything; never treat it as a hit.
        return False
    if not term.isascii():
        return term in text

    start = text.find(term)
    while start != -1:
        end = start + len(term)
        if _is_boundary(text, start - 1) and any(
            text.startswith(suffix, end) and _is_boundary(text, end + len(suffix))
            for suffix in _PLURAL_SUFFIXES
        ):
            return True
        start = text.find(term, start + 1)
    return False


def _match_topic(topic: str) -> Optional[str]:
    """Return the category for one normalized (lowercase, stripped) topic.

    Resolution order:
      1. Exact keyword match.
      2. The longest keyword contained in the topic, so "ai stocks" resolves
         via "stocks" rather than "ai". Ties keep the earliest-registered one.
      3. The first registered keyword that contains the topic as a whole word
         or phrase (e.g. "neural" inside "neural network").

    Returns None when nothing matches.
    """
    exact = _KEYWORD_MAP.get(topic)
    if exact is not None:
        return exact

    best_category: Optional[str] = None
    best_length = 0
    enclosing_category: Optional[str] = None
    for keyword, category in _KEYWORD_MAP.items():
        if _contains_term(topic, keyword):
            if len(keyword) > best_length:
                best_category, best_length = category, len(keyword)
        elif enclosing_category is None and _contains_term(keyword, topic):
            enclosing_category = category

    # A keyword found inside the topic is stronger evidence than the topic
    # merely being a fragment of some keyword.
    return best_category if best_category is not None else enclosing_category


# ─────────────────────────────────────────────────────────────────────────────
# PUBLIC API
# ─────────────────────────────────────────────────────────────────────────────

def classify_topic(topics: List[str]) -> str:
    """
    Map a list of dynamically extracted topics to a SINGLE predefined UI category.

    Each topic is lowercased, stripped and resolved to at most one category
    (exact keyword, then the longest keyword found inside the topic, then a
    keyword that contains the topic). Among the categories matched by all
    topics, the one that comes first in CATEGORIES order is returned.
    Falls back to "Education & Science" if no match is found.

    Empty or whitespace-only topics and non-string entries are ignored. A bare
    string is treated as a single topic rather than iterated per character.

    Args:
        topics: List of topic strings from the LLM (e.g. ["Python", "Deep Learning"]).

    Returns:
        A single category string.

    Example:
        >>> classify_topic(["Python", "Machine Learning", "Neural Networks"])
        'Technology & AI'
        >>> classify_topic(["Investing", "AI Stocks"])
        'Business & Finance'
    """
    if not topics:
        return _DEFAULT_CATEGORY
    if isinstance(topics, str):
        topics = [topics]

    matched: Set[str] = set()
    for topic in topics:
        if not isinstance(topic, str):
            continue
        normalized = topic.lower().strip()
        if not normalized:
            continue
        category = _match_topic(normalized)
        if category is not None:
            matched.add(category)

    # Return the first match in CATEGORIES order for consistency. The default
    # also covers a category registered under a name missing from CATEGORIES.
    return next((cat for cat in CATEGORIES if cat in matched), _DEFAULT_CATEGORY)


def classify_topics(topics: List[str]) -> List[str]:
    """Backward-compatible wrapper — returns a single-element list."""
    return [classify_topic(topics)]


def get_primary_category(topics: List[str]) -> str:
    """
    Return the single best-matching category for the given topics.
    Alias for classify_topic().
    """
    return classify_topic(topics)