Spaces:
Sleeping
Sleeping
Commit ·
65b04d6
1
Parent(s): c9146f3
topic classifier
Browse files
src/summarization/topic_classifier.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Topic Classifier — maps dynamic LLM-extracted topics to predefined UI categories.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
from src.summarization.topic_classifier import classify_topics
|
| 6 |
+
|
| 7 |
+
topics = ["Python", "Machine Learning", "Neural Networks"]
|
| 8 |
+
result = classify_topics(topics)
|
| 9 |
+
# => ["Technology & AI"]
|
| 10 |
+
|
| 11 |
+
Categories:
|
| 12 |
+
Technology & AI | Business & Finance | Education & Science
|
| 13 |
+
Productivity & Self-Growth | News & Politics
|
| 14 |
+
Entertainment & Lifestyle | Health & Sports
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import List, Set
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 21 |
+
# PREDEFINED CATEGORIES
|
| 22 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 23 |
+
|
| 24 |
+
# Category names shown in the UI, in display order. classify_topics() returns
# its matches in this same order.
CATEGORIES: list[str] = [
    # Knowledge-oriented categories
    "Technology & AI",
    "Business & Finance",
    "Education & Science",
    "Productivity & Self-Growth",
    # Current affairs and leisure
    "News & Politics",
    "Entertainment & Lifestyle",
    "Health & Sports",
]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 36 |
+
# KEYWORD → CATEGORY MAPPING (English + Arabic)
|
| 37 |
+
# All keywords are stored lowercase for case-insensitive matching.
|
| 38 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
_KEYWORD_MAP: dict[str, str] = {}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _register(category: str, keywords: list[str]):
|
| 44 |
+
"""Register a list of keywords for a category (lowercase)."""
|
| 45 |
+
for kw in keywords:
|
| 46 |
+
_KEYWORD_MAP[kw.lower()] = category
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ── Technology & AI ──
|
| 50 |
+
# Keywords that map a topic to "Technology & AI": English terms first, then
# Arabic terms. Registered first, so these keywords are tried first when
# classify_topics() scans for a partial match.
_register("Technology & AI", [
    # English
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "neural networks", "nlp", "natural language processing",
    "computer vision", "robotics", "automation", "algorithm", "algorithms",
    "python", "javascript", "typescript", "java", "c++", "rust", "golang", "go",
    "programming", "coding", "software", "software engineering", "web development",
    "frontend", "backend", "full stack", "fullstack", "devops", "cloud",
    "cloud computing", "aws", "azure", "gcp", "docker", "kubernetes",
    "database", "sql", "nosql", "mongodb", "api", "rest api", "graphql",
    "cybersecurity", "security", "hacking", "encryption", "blockchain",
    "cryptocurrency", "bitcoin", "ethereum", "web3", "metaverse",
    "data science", "data analysis", "data engineering", "big data",
    "iot", "internet of things", "5g", "hardware", "semiconductor",
    "gpu", "chip", "processor", "tech", "technology", "computing",
    "linux", "git", "github", "open source", "framework", "react",
    "vue", "angular", "node", "nodejs", "django", "flask", "fastapi",
    "tensorflow", "pytorch", "llm", "large language model", "chatgpt",
    "gpt", "gemini", "copilot", "transformer", "diffusion model",
    "generative ai", "prompt engineering", "fine tuning", "rag",
    "mobile development", "android", "ios", "swift", "kotlin", "flutter", "dart",
    # Arabic
    "ذكاء اصطناعي", "تعلم آلي", "تعلم عميق", "برمجة", "تقنية", "تكنولوجيا",
    "خوارزمية", "حاسوب", "شبكات عصبية", "بيانات", "أمن سيبراني",
    "حوسبة سحابية", "تطوير برمجيات", "تطوير ويب", "قواعد بيانات",
])
|
| 76 |
+
|
| 77 |
+
# ── Business & Finance ──
|
| 78 |
+
# Keywords that map a topic to "Business & Finance": English terms first,
# then Arabic terms.
_register("Business & Finance", [
    # English
    "business", "finance", "economics", "economy", "stock", "stocks",
    "stock market", "trading", "investing", "investment", "real estate",
    "entrepreneurship", "startup", "startups", "marketing", "digital marketing",
    "seo", "branding", "sales", "revenue", "profit", "accounting",
    "budgeting", "money", "wealth", "financial", "banking", "bank",
    "fintech", "venture capital", "vc", "ipo", "merger", "acquisition",
    "management", "leadership", "strategy", "e-commerce", "ecommerce",
    "supply chain", "logistics", "consulting", "mba", "corporate",
    "tax", "taxes", "inflation", "gdp", "interest rate", "forex",
    "commodity", "commodities", "portfolio", "dividend", "bond", "bonds",
    "freelancing", "freelance", "passive income", "side hustle",
    # Arabic
    "أعمال", "تجارة", "اقتصاد", "مالية", "استثمار", "أسهم", "بورصة",
    "تسويق", "ريادة أعمال", "مشروع", "تمويل", "محاسبة", "بنك", "عقارات",
    "ربح", "دخل", "ميزانية",
])
|
| 96 |
+
|
| 97 |
+
# ── Education & Science ──
|
| 98 |
+
# Keywords that map a topic to "Education & Science": English terms first,
# then Arabic terms. This category is also the fallback classify_topics()
# uses when no topic matches anything.
_register("Education & Science", [
    # English
    "education", "learning", "teaching", "school", "university", "college",
    "academic", "research", "study", "studying", "exam", "exams", "course",
    "tutorial", "lecture", "scholarship", "degree", "phd", "thesis",
    "science", "physics", "chemistry", "biology", "math", "mathematics",
    "statistics", "calculus", "algebra", "geometry", "astronomy", "space",
    "nasa", "quantum", "quantum physics", "quantum computing",
    "neuroscience", "genetics", "evolution", "ecology", "geology",
    "climate", "climate change", "environment", "engineering",
    "mechanical engineering", "electrical engineering", "civil engineering",
    "experiment", "laboratory", "lab", "hypothesis", "theory",
    "history", "philosophy", "psychology", "sociology", "linguistics",
    "anthropology", "archaeology", "literature", "language", "grammar",
    # Arabic
    "تعليم", "تعلم", "مدرسة", "جامعة", "علوم", "فيزياء", "كيمياء",
    "أحياء", "رياضيات", "بحث", "دراسة", "امتحان", "منهج", "محاضرة",
    "هندسة", "تاريخ", "فلسفة", "علم نفس", "فلك", "بيئة",
])
|
| 117 |
+
|
| 118 |
+
# ── Productivity & Self-Growth ──
|
| 119 |
+
# Keywords that map a topic to "Productivity & Self-Growth": English terms
# first, then Arabic terms.
_register("Productivity & Self-Growth", [
    # English
    "productivity", "self improvement", "self-improvement", "self growth",
    "self-growth", "personal development", "motivation", "discipline",
    "habits", "habit", "time management", "goal setting", "goals",
    "mindset", "focus", "concentration", "efficiency", "organization",
    "planning", "journaling", "morning routine", "routine", "success",
    "self help", "self-help", "life coaching", "coaching", "mentoring",
    "mentor", "stoicism", "minimalism", "mindfulness", "meditation",
    "emotional intelligence", "communication skills", "public speaking",
    "negotiation", "critical thinking", "problem solving", "creativity",
    "decision making", "confidence", "resilience", "work-life balance",
    "burnout", "career", "career development", "skill building",
    # Arabic
    "إنتاجية", "تطوير ذات", "تحفيز", "عادات", "إدارة الوقت",
    "أهداف", "تركيز", "نجاح", "تخطيط", "تأمل", "ثقة بالنفس",
    "مهارات", "تفكير", "إبداع",
])
|
| 137 |
+
|
| 138 |
+
# ── News & Politics ──
|
| 139 |
+
# Keywords that map a topic to "News & Politics": English terms first, then
# Arabic terms.
_register("News & Politics", [
    # English
    "news", "politics", "political", "government", "policy", "election",
    "elections", "democracy", "geopolitics", "diplomacy", "war", "conflict",
    "military", "defense", "law", "legal", "legislation", "regulation",
    "human rights", "immigration", "refugee", "sanctions", "united nations",
    "nato", "eu", "european union", "congress", "parliament", "senate",
    "president", "prime minister", "foreign policy", "domestic policy",
    "protest", "activism", "corruption", "media", "journalism",
    "press", "freedom of speech", "censorship", "propaganda",
    "international relations", "treaty", "nuclear",
    # Arabic
    "أخبار", "سياسة", "حكومة", "انتخابات", "ديمقراطية", "حرب",
    "قانون", "حقوق إنسان", "دبلوماسية", "برلمان", "رئيس",
    "إعلام", "صحافة",
])
|
| 155 |
+
|
| 156 |
+
# ── Entertainment & Lifestyle ──
|
| 157 |
+
# Keywords that map a topic to "Entertainment & Lifestyle": English terms
# first, then Arabic terms.
_register("Entertainment & Lifestyle", [
    # English
    "entertainment", "movie", "movies", "film", "films", "cinema",
    "tv", "television", "series", "netflix", "streaming", "anime",
    "manga", "gaming", "video games", "esports", "twitch", "youtube",
    "podcast", "music", "song", "album", "concert", "artist",
    "celebrity", "fashion", "style", "beauty", "makeup", "skincare",
    "travel", "tourism", "food", "cooking", "recipe", "restaurant",
    "cuisine", "vlog", "vlogging", "photography", "art", "design",
    "graphic design", "illustration", "architecture", "interior design",
    "diy", "crafts", "comedy", "humor", "drama", "reality tv",
    "social media", "tiktok", "instagram", "influencer", "content creator",
    "lifestyle", "luxury", "culture", "pop culture",
    # Arabic
    "ترفيه", "أفلام", "سينما", "مسلسلات", "ألعاب", "موسيقى",
    "سفر", "طبخ", "أزياء", "جمال", "تصوير", "فن", "تصميم",
    "ثقافة", "كوميديا", "يوتيوب",
])
|
| 175 |
+
|
| 176 |
+
# ── Health & Sports ──
|
| 177 |
+
# Keywords that map a topic to "Health & Sports": English terms first, then
# Arabic terms.
_register("Health & Sports", [
    # English
    "health", "fitness", "exercise", "workout", "gym", "bodybuilding",
    "weight loss", "diet", "nutrition", "calories", "protein", "vitamins",
    "supplements", "wellness", "mental health", "therapy", "depression",
    "anxiety", "stress", "sleep", "yoga", "pilates", "crossfit",
    "running", "marathon", "swimming", "cycling", "hiking",
    "sports", "football", "soccer", "basketball", "tennis", "baseball",
    "cricket", "rugby", "boxing", "mma", "ufc", "wrestling",
    "olympics", "world cup", "premier league", "nba", "nfl",
    "medicine", "medical", "doctor", "hospital", "surgery", "disease",
    "virus", "vaccine", "pandemic", "covid", "cancer", "diabetes",
    "heart", "cardio", "physical therapy", "rehabilitation",
    "first aid", "pharmacy", "drug", "prescription",
    # Arabic
    "صحة", "رياضة", "تمارين", "لياقة", "تغذية", "حمية",
    "صحة نفسية", "علاج", "طب", "مستشفى", "كرة قدم", "سباحة",
    "يوغا", "نوم", "فيتامينات",
])
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 199 |
+
# PUBLIC API
|
| 200 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 201 |
+
|
| 202 |
+
def _contains_phrase(text: str, phrase: str) -> bool:
    """Return True if *phrase* occurs in *text* as a whole word or phrase.

    ASCII phrases must start and end on a word boundary, so "ai" matches
    "ai stocks" but not "training". A trailing plural "s"/"es" is tolerated,
    so "database" still matches "databases".

    Non-ASCII phrases (the Arabic keywords) use plain substring matching,
    because Arabic attaches particles such as the definite article directly
    to the word and a boundary check would reject those forms.
    """
    if not phrase:
        # "" is a substring of everything; never treat it as a match.
        return False
    if not phrase.isascii():
        return phrase in text

    start = text.find(phrase)
    while start != -1:
        end = start + len(phrase)
        if start == 0 or not text[start - 1].isalnum():
            for suffix in ("", "s", "es"):
                stop = end + len(suffix)
                if text.startswith(suffix, end) and (
                    stop == len(text) or not text[stop].isalnum()
                ):
                    return True
        start = text.find(phrase, start + 1)
    return False


def _match_category(topic: object) -> "str | None":
    """Return the category matched by a single topic, or None if none matches.

    Non-string and blank topics never match. An exact keyword hit wins;
    otherwise keywords are scanned in registration order and the first one
    that contains, or is contained in, the topic decides the category.
    """
    if not isinstance(topic, str):
        return None
    topic_lower = topic.lower().strip()
    if not topic_lower:
        return None

    # 1. Exact match
    exact = _KEYWORD_MAP.get(topic_lower)
    if exact is not None:
        return exact

    # 2. Partial match, in either direction ("ai stocks" contains "ai";
    #    "neural" is contained in "neural network").
    for keyword, category in _KEYWORD_MAP.items():
        if _contains_phrase(topic_lower, keyword) or _contains_phrase(keyword, topic_lower):
            return category
    return None


def classify_topics(topics: List[str]) -> List[str]:
    """
    Map a list of dynamically extracted topics to predefined UI categories.

    Each topic contributes at most one category. Topics that are blank or
    not strings are ignored.

    Args:
        topics: List of topic strings from the LLM (e.g. ["Python", "Deep Learning"]).

    Returns:
        A deduplicated list of matching category names, ordered as in CATEGORIES.
        Falls back to ["Education & Science"] if no match is found.

    Example:
        >>> classify_topics(["Python", "Machine Learning", "Neural Networks"])
        ['Technology & AI']
        >>> classify_topics(["Investing", "AI Stocks"])
        ['Technology & AI', 'Business & Finance']
    """
    matched: Set[str] = set()

    for topic in topics or ():
        category = _match_category(topic)
        if category is not None:
            matched.add(category)

    if not matched:
        matched.add("Education & Science")

    # Return in the same order as CATEGORIES for consistency
    return [cat for cat in CATEGORIES if cat in matched]


def get_primary_category(topics: List[str]) -> str:
    """
    Return the single best-matching category for the given topics.

    The best match is the category supported by the most topics. Ties go to
    the category listed first in CATEGORIES. Returns "Education & Science"
    when no topic matches.

    Useful when only one category tag is needed (e.g. a badge in the UI).
    """
    votes: dict[str, int] = {}
    for topic in topics or ():
        category = _match_category(topic)
        if category is not None:
            votes[category] = votes.get(category, 0) + 1

    best = "Education & Science"
    best_votes = 0
    # Strict ">" while walking CATEGORIES in order keeps the earliest category on ties.
    for cat in CATEGORIES:
        if votes.get(cat, 0) > best_votes:
            best, best_votes = cat, votes[cat]
    return best
|