amanyelfiky committed on
Commit
65b04d6
·
1 Parent(s): c9146f3

topic classifier

Browse files
Files changed (1) hide show
  1. src/summarization/topic_classifier.py +249 -0
src/summarization/topic_classifier.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Topic Classifier — maps dynamic LLM-extracted topics to predefined UI categories.
3
+
4
+ Usage:
5
+ from src.summarization.topic_classifier import classify_topics
6
+
7
+ topics = ["Python", "Machine Learning", "Neural Networks"]
8
+ result = classify_topics(topics)
9
+ # => ["Technology & AI"]
10
+
11
+ Categories:
12
+ Technology & AI | Business & Finance | Education & Science
13
+ Productivity & Self-Growth | News & Politics
14
+ Entertainment & Lifestyle | Health & Sports
15
+ """
16
+
17
+ from typing import List, Set
18
+
19
+
20
+ # ─────────────────────────────────────────────────────────────────────────────
21
+ # PREDEFINED CATEGORIES
22
+ # ─────────────────────────────────────────────────────────────────────────────
23
+
24
# UI categories a topic can be mapped to. The order matters: classify_topics()
# returns its matches in this order.
CATEGORIES: List[str] = [
    "Technology & AI",
    "Business & Finance",
    "Education & Science",
    "Productivity & Self-Growth",
    "News & Politics",
    "Entertainment & Lifestyle",
    "Health & Sports",
]
33
+
34
+
35
+ # ─────────────────────────────────────────────────────────────────────────────
36
+ # KEYWORD → CATEGORY MAPPING (English + Arabic)
37
+ # All keywords are stored lowercase for case-insensitive matching.
38
+ # ─────────────────────────────────────────────────────────────────────────────
39
+
40
# Maps each normalized keyword (stripped, lowercased) to its category name.
_KEYWORD_MAP: dict[str, str] = {}


def _register(category: str, keywords: list[str]) -> None:
    """Register keywords for a category in ``_KEYWORD_MAP``.

    Each keyword is stripped and lowercased before it is stored, which is the
    same normalization ``classify_topics`` applies to incoming topics, so a
    keyword written with stray whitespace or capitals still matches exactly.

    Args:
        category: Category name the keywords should map to.
        keywords: Keywords (any case) that identify the category.

    Blank keywords are ignored: an empty string is a substring of every topic
    and would otherwise match everything. Registering a keyword that already
    exists overwrites its previous category.
    """
    for keyword in keywords:
        normalized = keyword.strip().lower()
        if not normalized:
            continue
        _KEYWORD_MAP[normalized] = category
47
+
48
+
49
# ── Technology & AI ──
# The Arabic entries are stored as real Arabic text (they had been corrupted
# into mis-decoded bytes) so that Arabic topics can actually match them.
_register("Technology & AI", [
    # English
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "neural networks", "nlp", "natural language processing",
    "computer vision", "robotics", "automation", "algorithm", "algorithms",
    "python", "javascript", "typescript", "java", "c++", "rust", "golang", "go",
    "programming", "coding", "software", "software engineering", "web development",
    "frontend", "backend", "full stack", "fullstack", "devops", "cloud",
    "cloud computing", "aws", "azure", "gcp", "docker", "kubernetes",
    "database", "sql", "nosql", "mongodb", "api", "rest api", "graphql",
    "cybersecurity", "security", "hacking", "encryption", "blockchain",
    "cryptocurrency", "bitcoin", "ethereum", "web3", "metaverse",
    "data science", "data analysis", "data engineering", "big data",
    "iot", "internet of things", "5g", "hardware", "semiconductor",
    "gpu", "chip", "processor", "tech", "technology", "computing",
    "linux", "git", "github", "open source", "framework", "react",
    "vue", "angular", "node", "nodejs", "django", "flask", "fastapi",
    "tensorflow", "pytorch", "llm", "large language model", "chatgpt",
    "gpt", "gemini", "copilot", "transformer", "diffusion model",
    "generative ai", "prompt engineering", "fine tuning", "rag",
    "mobile development", "android", "ios", "swift", "kotlin", "flutter", "dart",
    # Arabic
    "ذكاء اصطناعي", "تعلم آلي", "تعلم عميق", "برمجة", "تقنية", "تكنولوجيا",
    "خوارزمية", "حاسوب", "شبكات عصبية", "بيانات", "أمن سيبراني",
    "حوسبة سحابية", "تطوير برمجيات", "تطوير ويب", "قواعد بيانات",
])
76
+
77
# ── Business & Finance ──
# The Arabic entries are stored as real Arabic text (they had been corrupted
# into mis-decoded bytes) so that Arabic topics can actually match them.
_register("Business & Finance", [
    # English
    "business", "finance", "economics", "economy", "stock", "stocks",
    "stock market", "trading", "investing", "investment", "real estate",
    "entrepreneurship", "startup", "startups", "marketing", "digital marketing",
    "seo", "branding", "sales", "revenue", "profit", "accounting",
    "budgeting", "money", "wealth", "financial", "banking", "bank",
    "fintech", "venture capital", "vc", "ipo", "merger", "acquisition",
    "management", "leadership", "strategy", "e-commerce", "ecommerce",
    "supply chain", "logistics", "consulting", "mba", "corporate",
    "tax", "taxes", "inflation", "gdp", "interest rate", "forex",
    "commodity", "commodities", "portfolio", "dividend", "bond", "bonds",
    "freelancing", "freelance", "passive income", "side hustle",
    # Arabic
    "أعمال", "تجارة", "اقتصاد", "مالية", "استثمار", "أسهم", "بورصة",
    "تسويق", "ريادة أعمال", "مشروع", "تمويل", "محاسبة", "بنك", "عقارات",
    "ربح", "دخل", "ميزانية",
])
96
+
97
# ── Education & Science ──
# The Arabic entries are stored as real Arabic text (they had been corrupted
# into mis-decoded bytes) so that Arabic topics can actually match them.
_register("Education & Science", [
    # English
    "education", "learning", "teaching", "school", "university", "college",
    "academic", "research", "study", "studying", "exam", "exams", "course",
    "tutorial", "lecture", "scholarship", "degree", "phd", "thesis",
    "science", "physics", "chemistry", "biology", "math", "mathematics",
    "statistics", "calculus", "algebra", "geometry", "astronomy", "space",
    "nasa", "quantum", "quantum physics", "quantum computing",
    "neuroscience", "genetics", "evolution", "ecology", "geology",
    "climate", "climate change", "environment", "engineering",
    "mechanical engineering", "electrical engineering", "civil engineering",
    "experiment", "laboratory", "lab", "hypothesis", "theory",
    "history", "philosophy", "psychology", "sociology", "linguistics",
    "anthropology", "archaeology", "literature", "language", "grammar",
    # Arabic
    "تعليم", "تعلم", "مدرسة", "جامعة", "علوم", "فيزياء", "كيمياء",
    "أحياء", "رياضيات", "بحث", "دراسة", "امتحان", "منهج", "محاضرة",
    "هندسة", "تاريخ", "فلسفة", "علم نفس", "فلك", "بيئة",
])
117
+
118
# ── Productivity & Self-Growth ──
# The Arabic entries are stored as real Arabic text (they had been corrupted
# into mis-decoded bytes) so that Arabic topics can actually match them.
_register("Productivity & Self-Growth", [
    # English
    "productivity", "self improvement", "self-improvement", "self growth",
    "self-growth", "personal development", "motivation", "discipline",
    "habits", "habit", "time management", "goal setting", "goals",
    "mindset", "focus", "concentration", "efficiency", "organization",
    "planning", "journaling", "morning routine", "routine", "success",
    "self help", "self-help", "life coaching", "coaching", "mentoring",
    "mentor", "stoicism", "minimalism", "mindfulness", "meditation",
    "emotional intelligence", "communication skills", "public speaking",
    "negotiation", "critical thinking", "problem solving", "creativity",
    "decision making", "confidence", "resilience", "work-life balance",
    "burnout", "career", "career development", "skill building",
    # Arabic
    "إنتاجية", "تطوير ذات", "تحفيز", "عادات", "إدارة الوقت",
    "أهداف", "تركيز", "نجاح", "تخطيط", "تأمل", "ثقة بالنفس",
    "مهارات", "تفكير", "إبداع",
])
137
+
138
# ── News & Politics ──
# The Arabic entries are stored as real Arabic text (they had been corrupted
# into mis-decoded bytes) so that Arabic topics can actually match them.
_register("News & Politics", [
    # English
    "news", "politics", "political", "government", "policy", "election",
    "elections", "democracy", "geopolitics", "diplomacy", "war", "conflict",
    "military", "defense", "law", "legal", "legislation", "regulation",
    "human rights", "immigration", "refugee", "sanctions", "united nations",
    "nato", "eu", "european union", "congress", "parliament", "senate",
    "president", "prime minister", "foreign policy", "domestic policy",
    "protest", "activism", "corruption", "media", "journalism",
    "press", "freedom of speech", "censorship", "propaganda",
    "international relations", "treaty", "nuclear",
    # Arabic
    "أخبار", "سياسة", "حكومة", "انتخابات", "ديمقراطية", "حرب",
    "قانون", "حقوق إنسان", "دبلوماسية", "برلمان", "رئيس",
    "إعلام", "صحافة",
])
155
+
156
# ── Entertainment & Lifestyle ──
# The Arabic entries are stored as real Arabic text (they had been corrupted
# into mis-decoded bytes) so that Arabic topics can actually match them.
_register("Entertainment & Lifestyle", [
    # English
    "entertainment", "movie", "movies", "film", "films", "cinema",
    "tv", "television", "series", "netflix", "streaming", "anime",
    "manga", "gaming", "video games", "esports", "twitch", "youtube",
    "podcast", "music", "song", "album", "concert", "artist",
    "celebrity", "fashion", "style", "beauty", "makeup", "skincare",
    "travel", "tourism", "food", "cooking", "recipe", "restaurant",
    "cuisine", "vlog", "vlogging", "photography", "art", "design",
    "graphic design", "illustration", "architecture", "interior design",
    "diy", "crafts", "comedy", "humor", "drama", "reality tv",
    "social media", "tiktok", "instagram", "influencer", "content creator",
    "lifestyle", "luxury", "culture", "pop culture",
    # Arabic
    "ترفيه", "أفلام", "سينما", "مسلسلات", "ألعاب", "موسيقى",
    "سفر", "طبخ", "أزياء", "جمال", "تصوير", "فن", "تصميم",
    "ثقافة", "كوميديا", "يوتيوب",
])
175
+
176
# ── Health & Sports ──
# The Arabic entries are stored as real Arabic text (they had been corrupted
# into mis-decoded bytes) so that Arabic topics can actually match them.
_register("Health & Sports", [
    # English
    "health", "fitness", "exercise", "workout", "gym", "bodybuilding",
    "weight loss", "diet", "nutrition", "calories", "protein", "vitamins",
    "supplements", "wellness", "mental health", "therapy", "depression",
    "anxiety", "stress", "sleep", "yoga", "pilates", "crossfit",
    "running", "marathon", "swimming", "cycling", "hiking",
    "sports", "football", "soccer", "basketball", "tennis", "baseball",
    "cricket", "rugby", "boxing", "mma", "ufc", "wrestling",
    "olympics", "world cup", "premier league", "nba", "nfl",
    "medicine", "medical", "doctor", "hospital", "surgery", "disease",
    "virus", "vaccine", "pandemic", "covid", "cancer", "diabetes",
    "heart", "cardio", "physical therapy", "rehabilitation",
    "first aid", "pharmacy", "drug", "prescription",
    # Arabic
    "صحة", "رياضة", "تمارين", "لياقة", "تغذية", "حمية",
    "صحة نفسية", "علاج", "طب", "مستشفى", "كرة قدم", "سباحة",
    "يوغا", "نوم", "فيتامينات",
])
196
+
197
+
198
+ # ─────────────────────────────────────────────────────────────────────────────
199
+ # PUBLIC API
200
+ # ─────────────────────────────────────────────────────────────────────────────
201
+
202
# Shortest topic that may match as the start of a word inside a longer keyword
# (e.g. "neural" -> "neural network"). Shorter fragments are too ambiguous.
_MIN_PARTIAL_TOPIC_LENGTH = 3


def _is_word_char(char: str) -> bool:
    """Return True if *char* can be part of a word (letter, digit or underscore)."""
    return char.isalnum() or char == "_"


def _has_phrase(text: str, phrase: str, whole_word: bool = True) -> bool:
    """Return True if *phrase* occurs in *text* at a word boundary.

    An ASCII phrase must start at the beginning of a word. With ``whole_word``
    it must also end at the end of a word; one trailing "s" is tolerated so
    simple plurals ("investments") still match. A non-ASCII phrase is matched
    as a plain substring, because Arabic attaches the article and other
    particles directly to the word.
    """
    if not phrase:
        return False
    if not phrase.isascii():
        return phrase in text

    start = text.find(phrase)
    while start != -1:
        if start == 0 or not _is_word_char(text[start - 1]):
            if not whole_word:
                return True
            end = start + len(phrase)
            if text.startswith("s", end):
                end += 1
            if end == len(text) or not _is_word_char(text[end]):
                return True
        start = text.find(phrase, start + 1)
    return False


def _closest_category(topic: str) -> str:
    """Return the category of the keyword that best matches *topic*.

    The longest keyword found inside the topic wins, so "quantum computing"
    beats "computing". If no keyword occurs in the topic, the shortest keyword
    that contains the topic at the start of a word is used instead. Ties go to
    the keyword registered first. Returns "" when nothing matches.
    """
    best = ""     # longest keyword contained in the topic
    partial = ""  # shortest keyword that contains the topic
    allow_partial = len(topic) >= _MIN_PARTIAL_TOPIC_LENGTH

    for keyword in _KEYWORD_MAP:
        if _has_phrase(topic, keyword):
            if len(keyword) > len(best):
                best = keyword
        elif allow_partial and not best and _has_phrase(keyword, topic, whole_word=False):
            if not partial or len(keyword) < len(partial):
                partial = keyword

    chosen = best or partial
    return _KEYWORD_MAP[chosen] if chosen else ""


def classify_topics(topics: List[str]) -> List[str]:
    """
    Map a list of dynamically extracted topics to predefined UI categories.

    Each topic is stripped and lowercased, then matched exactly against the
    registered keywords; if that fails, the closest keyword on word boundaries
    is used (see ``_closest_category``). Blank and non-string topics are
    skipped.

    Args:
        topics: List of topic strings from the LLM (e.g. ["Python", "Deep Learning"]).
            ``None`` is treated as an empty list.

    Returns:
        A deduplicated list of matching category names, in ``CATEGORIES`` order.
        Falls back to ["Education & Science"] if no match is found.

    Example:
        >>> classify_topics(["Python", "Machine Learning", "Neural Networks"])
        ['Technology & AI']
        >>> classify_topics(["Investing", "AI Ethics"])
        ['Technology & AI', 'Business & Finance']
    """
    matched: Set[str] = set()

    for topic in topics or []:
        if not isinstance(topic, str):
            continue
        normalized = topic.lower().strip()
        if not normalized:
            # An empty string is a substring of every keyword; never match it.
            continue

        category = _KEYWORD_MAP.get(normalized) or _closest_category(normalized)
        if category:
            matched.add(category)

    if not matched:
        matched.add("Education & Science")

    # Return in the same order as CATEGORIES for consistency
    return [cat for cat in CATEGORIES if cat in matched]
240
+
241
+
242
def get_primary_category(topics: List[str]) -> str:
    """
    Pick one category for the given topics.

    Handy when the UI has room for a single tag only (e.g. a badge). The
    result is the first entry returned by ``classify_topics``; if that list
    is empty, "Education & Science" is returned.
    """
    ranked = classify_topics(topics)
    if not ranked:
        return "Education & Science"
    return ranked[0]