AIdea-Server / src /summarization /topic_classifier.py
Ali Hashhash
1.1
06803fa
"""
Topic Classifier โ€” maps dynamic LLM-extracted topics to predefined UI categories.
Usage:
from src.summarization.topic_classifier import classify_topic, get_primary_category
topics = ["Python", "Machine Learning", "Neural Networks"]
result = classify_topic(topics)
# => "Technology & AI"
Categories:
Technology & AI | Business & Finance | Education & Science
Productivity & Self-Growth | News & Politics
Entertainment & Lifestyle | Health & Sports
"""
from typing import List, Set
# ─────────────────────────────────────────────────────────────────────────────
# PREDEFINED CATEGORIES
# ─────────────────────────────────────────────────────────────────────────────
# UI categories a topic list can be mapped to. The order is significant:
# classify_topic() walks this list and returns the first category that matched.
CATEGORIES: List[str] = [
    "Technology & AI",
    "Business & Finance",
    "Education & Science",
    "Productivity & Self-Growth",
    "News & Politics",
    "Entertainment & Lifestyle",
    "Health & Sports",
]
# ─────────────────────────────────────────────────────────────────────────────
# KEYWORD → CATEGORY MAPPING (English + Arabic)
# All keywords are stored lowercase for case-insensitive matching.
# ─────────────────────────────────────────────────────────────────────────────
# Normalised keyword -> category name. Filled at import time by _register().
_KEYWORD_MAP: dict[str, str] = {}


def _register(category: str, keywords: list[str]) -> None:
    """Register *keywords* under *category* in the module-level keyword map.

    Each keyword is normalised exactly like topics are at lookup time
    (lower-cased and stripped of surrounding whitespace) so that an exact
    lookup can always find it. Blank keywords are ignored: an empty key
    would be a substring of every topic. If the same keyword is registered
    more than once, the most recent registration wins.

    Args:
        category: Category name the keywords map to.
        keywords: Keyword strings in any letter case.
    """
    for kw in keywords:
        normalized = kw.lower().strip()
        if normalized:
            _KEYWORD_MAP[normalized] = category
# ── Technology & AI ──
_register("Technology & AI", [
    # English
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "neural networks", "nlp", "natural language processing",
    "computer vision", "robotics", "automation", "algorithm", "algorithms",
    "python", "javascript", "typescript", "java", "c++", "rust", "golang", "go",
    "programming", "coding", "software", "software engineering", "web development",
    "frontend", "backend", "full stack", "fullstack", "devops", "cloud",
    "cloud computing", "aws", "azure", "gcp", "docker", "kubernetes",
    "database", "sql", "nosql", "mongodb", "api", "rest api", "graphql",
    "cybersecurity", "security", "hacking", "encryption", "blockchain",
    "cryptocurrency", "bitcoin", "ethereum", "web3", "metaverse",
    "data science", "data analysis", "data engineering", "big data",
    "iot", "internet of things", "5g", "hardware", "semiconductor",
    "gpu", "chip", "processor", "tech", "technology", "computing",
    "linux", "git", "github", "open source", "framework", "react",
    "vue", "angular", "node", "nodejs", "django", "flask", "fastapi",
    "tensorflow", "pytorch", "llm", "large language model", "chatgpt",
    "gpt", "gemini", "copilot", "transformer", "diffusion model",
    "generative ai", "prompt engineering", "fine tuning", "rag",
    "mobile development", "android", "ios", "swift", "kotlin", "flutter", "dart",
    # Arabic
    "ذكاء اصطناعي", "تعلم آلي", "تعلم عميق", "برمجة", "تقنية", "تكنولوجيا",
    "خوارزمية", "حاسوب", "شبكات عصبية", "بيانات", "أمن سيبراني",
    "حوسبة سحابية", "تطوير برمجيات", "تطوير ويب", "قواعد بيانات",
])
# ── Business & Finance ──
_register("Business & Finance", [
    # English
    "business", "finance", "economics", "economy", "stock", "stocks",
    "stock market", "trading", "investing", "investment", "real estate",
    "entrepreneurship", "startup", "startups", "marketing", "digital marketing",
    "seo", "branding", "sales", "revenue", "profit", "accounting",
    "budgeting", "money", "wealth", "financial", "banking", "bank",
    "fintech", "venture capital", "vc", "ipo", "merger", "acquisition",
    "management", "leadership", "strategy", "e-commerce", "ecommerce",
    "supply chain", "logistics", "consulting", "mba", "corporate",
    "tax", "taxes", "inflation", "gdp", "interest rate", "forex",
    "commodity", "commodities", "portfolio", "dividend", "bond", "bonds",
    "freelancing", "freelance", "passive income", "side hustle",
    # Arabic
    "أعمال", "تجارة", "اقتصاد", "مالية", "استثمار", "أسهم", "بورصة",
    "تسويق", "ريادة أعمال", "مشروع", "تمويل", "محاسبة", "بنك", "عقارات",
    "ربح", "دخل", "ميزانية",
])
# ── Education & Science ──
_register("Education & Science", [
    # English
    "education", "learning", "teaching", "school", "university", "college",
    "academic", "research", "study", "studying", "exam", "exams", "course",
    "tutorial", "lecture", "scholarship", "degree", "phd", "thesis",
    "science", "physics", "chemistry", "biology", "math", "mathematics",
    "statistics", "calculus", "algebra", "geometry", "astronomy", "space",
    "nasa", "quantum", "quantum physics", "quantum computing",
    "neuroscience", "genetics", "evolution", "ecology", "geology",
    "climate", "climate change", "environment", "engineering",
    "mechanical engineering", "electrical engineering", "civil engineering",
    "experiment", "laboratory", "lab", "hypothesis", "theory",
    "history", "philosophy", "psychology", "sociology", "linguistics",
    "anthropology", "archaeology", "literature", "language", "grammar",
    # Arabic
    "تعليم", "تعلم", "مدرسة", "جامعة", "علوم", "فيزياء", "كيمياء",
    "أحياء", "رياضيات", "بحث", "دراسة", "امتحان", "منهج", "محاضرة",
    "هندسة", "تاريخ", "فلسفة", "علم نفس", "فلك", "بيئة",
])
# ── Productivity & Self-Growth ──
_register("Productivity & Self-Growth", [
    # English
    "productivity", "self improvement", "self-improvement", "self growth",
    "self-growth", "personal development", "motivation", "discipline",
    "habits", "habit", "time management", "goal setting", "goals",
    "mindset", "focus", "concentration", "efficiency", "organization",
    "planning", "journaling", "morning routine", "routine", "success",
    "self help", "self-help", "life coaching", "coaching", "mentoring",
    "mentor", "stoicism", "minimalism", "mindfulness", "meditation",
    "emotional intelligence", "communication skills", "public speaking",
    "negotiation", "critical thinking", "problem solving", "creativity",
    "decision making", "confidence", "resilience", "work-life balance",
    "burnout", "career", "career development", "skill building",
    # Arabic
    "إنتاجية", "تطوير ذات", "تحفيز", "عادات", "إدارة الوقت",
    "أهداف", "تركيز", "نجاح", "تخطيط", "تأمل", "ثقة بالنفس",
    "مهارات", "تفكير", "إبداع",
])
# ── News & Politics ──
_register("News & Politics", [
    # English
    "news", "politics", "political", "government", "policy", "election",
    "elections", "democracy", "geopolitics", "diplomacy", "war", "conflict",
    "military", "defense", "law", "legal", "legislation", "regulation",
    "human rights", "immigration", "refugee", "sanctions", "united nations",
    "nato", "eu", "european union", "congress", "parliament", "senate",
    "president", "prime minister", "foreign policy", "domestic policy",
    "protest", "activism", "corruption", "media", "journalism",
    "press", "freedom of speech", "censorship", "propaganda",
    "international relations", "treaty", "nuclear",
    # Arabic
    "أخبار", "سياسة", "حكومة", "انتخابات", "ديمقراطية", "حرب",
    "قانون", "حقوق إنسان", "دبلوماسية", "برلمان", "رئيس",
    "إعلام", "صحافة",
])
# ── Entertainment & Lifestyle ──
_register("Entertainment & Lifestyle", [
    # English
    "entertainment", "movie", "movies", "film", "films", "cinema",
    "tv", "television", "series", "netflix", "streaming", "anime",
    "manga", "gaming", "video games", "esports", "twitch", "youtube",
    "podcast", "music", "song", "album", "concert", "artist",
    "celebrity", "fashion", "style", "beauty", "makeup", "skincare",
    "travel", "tourism", "food", "cooking", "recipe", "restaurant",
    "cuisine", "vlog", "vlogging", "photography", "art", "design",
    "graphic design", "illustration", "architecture", "interior design",
    "diy", "crafts", "comedy", "humor", "drama", "reality tv",
    "social media", "tiktok", "instagram", "influencer", "content creator",
    "lifestyle", "luxury", "culture", "pop culture",
    # Arabic
    "ترفيه", "أفلام", "سينما", "مسلسلات", "ألعاب", "موسيقى",
    "سفر", "طبخ", "أزياء", "جمال", "تصوير", "فن", "تصميم",
    "ثقافة", "كوميديا", "يوتيوب",
])
# ── Health & Sports ──
_register("Health & Sports", [
    # English
    "health", "fitness", "exercise", "workout", "gym", "bodybuilding",
    "weight loss", "diet", "nutrition", "calories", "protein", "vitamins",
    "supplements", "wellness", "mental health", "therapy", "depression",
    "anxiety", "stress", "sleep", "yoga", "pilates", "crossfit",
    "running", "marathon", "swimming", "cycling", "hiking",
    "sports", "football", "soccer", "basketball", "tennis", "baseball",
    "cricket", "rugby", "boxing", "mma", "ufc", "wrestling",
    "olympics", "world cup", "premier league", "nba", "nfl",
    "medicine", "medical", "doctor", "hospital", "surgery", "disease",
    "virus", "vaccine", "pandemic", "covid", "cancer", "diabetes",
    "heart", "cardio", "physical therapy", "rehabilitation",
    "first aid", "pharmacy", "drug", "prescription",
    # Arabic
    "صحة", "رياضة", "تمارين", "لياقة", "تغذية", "حمية",
    "صحة نفسية", "علاج", "طب", "مستشفى", "كرة قدم", "سباحة",
    "يوغا", "نوم", "فيتامينات",
])
# ─────────────────────────────────────────────────────────────────────────────
# PUBLIC API
# ─────────────────────────────────────────────────────────────────────────────
def _contains_term(text: str, term: str) -> bool:
    """Return True if *term* occurs in *text* as a standalone word or phrase.

    ASCII terms must be delimited on both sides by a non-alphanumeric
    character or an end of *text*; a trailing plural "s"/"es" is tolerated.
    This keeps short keywords such as "ai" or "go" from matching inside
    unrelated words ("painting", "government").

    Non-ASCII terms (the Arabic keywords) use a plain substring test,
    because Arabic attaches clitics such as the definite article directly
    to the word, so strict word boundaries would miss ordinary usage.
    """
    if not term:
        return False
    if not term.isascii():
        return term in text

    start = text.find(term)
    while start != -1:
        if start == 0 or not text[start - 1].isalnum():
            tail = text[start + len(term):]
            for suffix in ("", "s", "es"):
                if tail.startswith(suffix):
                    rest = tail[len(suffix):]
                    if not rest or not rest[0].isalnum():
                        return True
        start = text.find(term, start + 1)
    return False


def classify_topic(topics: List[str]) -> str:
    """
    Map a list of dynamically extracted topics to a SINGLE predefined UI category.

    Each topic is lower-cased and stripped, then resolved to at most one
    category:

    1. an exact keyword match, otherwise
    2. the longest keyword that appears inside the topic as a whole
       word/phrase, otherwise
    3. the shortest keyword that contains the topic as a whole word/phrase.

    Ties within a step go to the keyword registered first. Blank and
    non-string topics are skipped. Among all categories hit by the topics,
    the one that comes first in CATEGORIES is returned.

    Args:
        topics: Topic strings from the LLM (e.g. ["Python", "Deep Learning"]).
            A bare string is treated as a single topic; None or an empty
            list yields the fallback.

    Returns:
        A single category string; "Education & Science" if nothing matched.

    Example:
        >>> classify_topic(["Python", "Machine Learning", "Neural Networks"])
        'Technology & AI'
        >>> classify_topic(["Investing", "AI Stocks"])
        'Business & Finance'
    """
    fallback = "Education & Science"

    if isinstance(topics, str):
        # A bare string would otherwise be iterated character by character.
        topics = [topics]

    matched: Set[str] = set()
    for topic in topics or ():
        if not isinstance(topic, str):
            continue
        topic_lower = topic.lower().strip()
        if not topic_lower:
            # An empty topic is a substring of every keyword; never match it.
            continue

        # 1. Exact match
        exact = _KEYWORD_MAP.get(topic_lower)
        if exact is not None:
            matched.add(exact)
            continue

        # 2./3. Whole-word containment. Track (keyword length, category).
        inside_topic = None      # longest keyword found inside the topic
        around_topic = None      # shortest keyword that contains the topic
        for keyword, category in _KEYWORD_MAP.items():
            size = len(keyword)
            if _contains_term(topic_lower, keyword):
                if inside_topic is None or size > inside_topic[0]:
                    inside_topic = (size, category)
            elif inside_topic is None and _contains_term(keyword, topic_lower):
                if around_topic is None or size < around_topic[0]:
                    around_topic = (size, category)

        hit = inside_topic if inside_topic is not None else around_topic
        if hit is not None:
            matched.add(hit[1])

    # Return the first match in CATEGORIES order for consistency
    return next((cat for cat in CATEGORIES if cat in matched), fallback)
def classify_topics(topics: List[str]) -> List[str]:
    """Backward-compatible wrapper — returns a single-element list.

    Delegates to classify_topic() and wraps the resulting category in a list.
    """
    primary = classify_topic(topics)
    return [primary]
def get_primary_category(topics: List[str]) -> str:
    """
    Return the single best-matching category for the given topics.

    Alias for classify_topic(): the argument is forwarded unchanged and the
    result is returned as-is.
    """
    primary = classify_topic(topics)
    return primary