"""
Topic Classifier — maps dynamic LLM-extracted topics to predefined UI categories.

Usage:
    from src.categorization.topic_classifier import classify_topic, get_primary_category

    topics = ["Python", "Machine Learning", "Neural Networks"]
    result = classify_topic(topics)
    # => "Technology & AI"

Categories:
    Technology & AI | Business & Finance | Education | Science
    Productivity & Self-Growth | Health & Wellness | Sports & Fitness
    Entertainment | History | Philosophy | Arts & Culture

Entry points defined in this module:
    classify_topic / get_primary_category  keyword lookup, returns one category
    classify_topics                        same result wrapped in a one-element list
    classify_topic_groq                    asks the Groq chat API to pick a category
    classify_topic_zeroshot                zero-shot pipeline over free-form text
    classify_topic_hybrid                  keyword lookup first, zero-shot as fallback

Every classifier returns "Education" when it cannot decide.
"""

from typing import List, Set

from src.utils.logger import setup_logger

logger = setup_logger(__name__)

# ─────────────────────────────────────────────────────────────────────────────
# PREDEFINED CATEGORIES
# ─────────────────────────────────────────────────────────────────────────────

# Order matters: when topics match several categories, classify_topic returns
# the one that appears first in this list.
CATEGORIES: List[str] = [
    "Technology & AI",
    "Business & Finance",
    "Education",
    "Science",
    "Productivity & Self-Growth",
    "Health & Wellness",
    "Sports & Fitness",
    "Entertainment",
    "History",
    "Philosophy",
    "Arts & Culture",
]

# ─────────────────────────────────────────────────────────────────────────────
# KEYWORD → CATEGORY MAPPING (English + Arabic)
# All keywords are stored lowercase for case-insensitive matching.
# ─────────────────────────────────────────────────────────────────────────────

# Lower-cased keyword -> category name. Insertion order is significant: the
# fuzzy matcher in classify_topic() stops at the first keyword that matches.
_KEYWORD_MAP: dict[str, str] = {}


def _register(category: str, keywords: list[str]) -> None:
    """Register ``keywords`` (lower-cased) as belonging to ``category``.

    A keyword registered more than once keeps the category of the last call.
    """
    for kw in keywords:
        _KEYWORD_MAP[kw.lower()] = category


# ── Technology & AI ──
_register("Technology & AI", [
    # English
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "neural networks", "nlp", "natural language processing",
    "computer vision", "robotics", "automation", "algorithm", "algorithms",
    "python", "javascript", "typescript", "java", "c++", "rust", "golang", "go",
    "programming", "coding", "software", "software engineering",
    "web development", "frontend", "backend", "full stack", "fullstack",
    "devops", "cloud", "cloud computing", "aws", "azure", "gcp",
    "docker", "kubernetes", "database", "sql", "nosql", "mongodb",
    "api", "rest api", "graphql", "cybersecurity", "security", "hacking",
    "encryption", "blockchain", "cryptocurrency", "bitcoin", "ethereum",
    "web3", "metaverse", "data science", "data analysis", "data engineering",
    "big data", "iot", "internet of things", "5g", "hardware",
    "semiconductor", "gpu", "chip", "processor", "tech", "technology",
    "computing", "linux", "git", "github", "open source", "framework",
    "react", "vue", "angular", "node", "nodejs", "django", "flask", "fastapi",
    "tensorflow", "pytorch", "llm", "large language model", "chatgpt", "gpt",
    "copilot", "transformer", "diffusion model", "generative ai",
    "prompt engineering", "fine tuning", "rag",
    "mobile development", "android", "ios", "swift", "kotlin", "flutter", "dart",
    # Arabic
    "ذكاء اصطناعي", "تعلم آلي", "تعلم عميق", "برمجة", "تقنية", "تكنولوجيا",
    "خوارزمية", "حاسوب", "شبكات عصبية", "بيانات", "أمن سيبراني",
    "حوسبة سحابية", "تطوير برمجيات", "تطوير ويب", "قواعد بيانات",
])

# ── Business & Finance ──
_register("Business & Finance", [
    # English
    "business", "finance", "economics", "economy", "stock", "stocks",
    "stock market", "trading", "investing", "investment", "real estate",
    "entrepreneurship", "startup", "startups", "marketing", "digital marketing",
    "seo", "branding", "sales", "revenue", "profit", "accounting",
    "budgeting", "money", "wealth", "financial", "banking", "bank",
    "fintech", "venture capital", "vc", "ipo", "merger", "acquisition",
    "management", "leadership", "strategy", "e-commerce", "ecommerce",
    "supply chain", "logistics", "consulting", "mba", "corporate",
    "tax", "taxes", "inflation", "gdp", "interest rate", "forex",
    "commodity", "commodities", "portfolio", "dividend", "bond", "bonds",
    "freelancing", "freelance", "passive income", "side hustle",
    # Arabic
    "أعمال", "تجارة", "اقتصاد", "مالية", "استثمار", "أسهم", "بورصة",
    "تسويق", "ريادة أعمال", "مشروع", "تمويل", "محاسبة", "بنك",
    "عقارات", "ربح", "دخل", "ميزانية",
])

# ── Education ──
_register("Education", [
    # English
    "education", "learning", "teaching", "school", "university", "college",
    "academic", "study", "studying", "exam", "exams", "course", "tutorial",
    "lecture", "scholarship", "degree", "phd", "thesis", "curriculum",
    "pedagogy", "classroom", "student", "teacher",
    "grammar", "language", "linguistics",
    # Arabic
    "تعليم", "تعلم", "مدرسة", "جامعة", "دراسة", "امتحان", "منهج",
    "محاضرة", "طالب", "معلم",
])

# ── Science ──
_register("Science", [
    # English
    "science", "physics", "chemistry", "biology", "math", "mathematics",
    "statistics", "calculus", "algebra", "geometry",
    "astronomy", "space", "nasa", "quantum", "quantum physics",
    "quantum computing", "neuroscience", "genetics", "evolution",
    "ecology", "geology", "climate", "climate change", "environment",
    "engineering", "mechanical engineering", "electrical engineering",
    "civil engineering", "experiment", "laboratory", "lab",
    "hypothesis", "theory", "research",
    "psychology", "sociology", "anthropology",
    # Arabic
    "علوم", "فيزياء", "كيمياء", "أحياء", "رياضيات", "بحث",
    "هندسة", "فلك", "بيئة", "علم نفس",
])

# ── Productivity & Self-Growth ──
_register("Productivity & Self-Growth", [
    # English
    "productivity", "self improvement", "self-improvement", "self growth",
    "self-growth", "personal development", "motivation", "discipline",
    "habits", "habit", "time management", "goal setting", "goals",
    "mindset", "focus", "concentration", "efficiency", "organization",
    "planning", "journaling", "morning routine", "routine",
    "success", "self help", "self-help", "life coaching", "coaching",
    "mentoring", "mentor", "minimalism", "mindfulness",
    "emotional intelligence", "communication skills", "public speaking",
    "negotiation", "critical thinking", "problem solving",
    "creativity", "decision making", "confidence", "resilience",
    "work-life balance", "burnout", "career", "career development",
    "skill building",
    # Arabic
    "إنتاجية", "تطوير ذات", "تحفيز", "عادات", "إدارة الوقت",
    "أهداف", "تركيز", "نجاح", "تخطيط", "ثقة بالنفس",
    "مهارات", "تفكير", "إبداع",
])

# ── Health & Wellness ──
_register("Health & Wellness", [
    # English
    "health", "wellness", "mental health", "therapy", "depression", "anxiety",
    "stress", "sleep", "yoga", "pilates", "meditation",
    "diet", "nutrition", "calories", "protein", "vitamins", "supplements",
    "weight loss", "fitness",
    "medicine", "medical", "doctor", "hospital", "surgery", "disease",
    "virus", "vaccine", "pandemic", "covid", "cancer", "diabetes",
    "heart", "cardio", "physical therapy", "rehabilitation",
    "first aid", "pharmacy", "drug", "prescription",
    # Arabic
    "صحة", "تغذية", "حمية", "صحة نفسية", "علاج", "طب",
    "مستشفى", "نوم", "فيتامينات", "يوغا",
])

# ── Sports & Fitness ──
_register("Sports & Fitness", [
    # English
    "sports", "football", "soccer", "basketball", "tennis", "baseball",
    "cricket", "rugby", "boxing", "mma", "ufc", "wrestling",
    "olympics", "world cup", "premier league", "nba", "nfl",
    "exercise", "workout", "gym", "bodybuilding", "crossfit",
    "running", "marathon", "swimming", "cycling", "hiking",
    # Arabic
    "رياضة", "تمارين", "لياقة", "كرة قدم", "سباحة",
])

# ── Entertainment ──
_register("Entertainment", [
    # English
    "entertainment", "movie", "movies", "film", "films", "cinema",
    "tv", "television", "series", "netflix", "streaming", "anime", "manga",
    "gaming", "video games", "esports", "twitch", "youtube",
    "podcast", "music", "song", "album", "concert",
    "celebrity", "comedy", "humor", "drama", "reality tv",
    "social media", "tiktok", "instagram", "influencer",
    "content creator", "vlog", "vlogging", "pop culture",
    # Arabic
    "ترفيه", "أفلام", "سينما", "مسلسلات", "ألعاب", "موسيقى",
    "كوميديا", "يوتيوب",
])

# ── History ──
_register("History", [
    # English
    "history", "ancient", "medieval", "civilization", "empire",
    "world war", "revolution", "archaeology", "historical",
    "dynasty", "colonialism", "independence", "heritage",
    # Arabic
    "تاريخ", "حضارة", "إمبراطورية", "ثورة", "آثار", "تراث",
])

# ── Philosophy ──
_register("Philosophy", [
    # English
    "philosophy", "ethics", "morality", "existentialism", "stoicism",
    "metaphysics", "epistemology", "logic", "consciousness",
    "free will", "determinism", "nihilism", "virtue",
    # Arabic
    "فلسفة", "أخلاق", "وجودية", "منطق", "وعي",
])

# ── Arts & Culture ──
_register("Arts & Culture", [
    # English
    "art", "artist", "painting", "sculpture", "gallery", "museum",
    "photography", "design", "graphic design", "illustration",
    "architecture", "interior design",
    "fashion", "style", "beauty", "makeup", "skincare",
    "travel", "tourism", "food", "cooking", "recipe", "restaurant",
    "cuisine", "diy", "crafts", "culture", "literature",
    "lifestyle", "luxury",
    # Arabic
    "فن", "تصميم", "تصوير", "سفر", "طبخ", "أزياء", "جمال", "ثقافة",
])


# ─────────────────────────────────────────────────────────────────────────────
# MATCHING HELPERS
# ─────────────────────────────────────────────────────────────────────────────

# Category returned by every classifier in this module when nothing matches.
_FALLBACK_CATEGORY = "Education"

# Strings shorter than this are too ambiguous for plain substring matching:
# "ai" sits inside "rainfall", "git" inside "digital", "api" inside "capital".
_MIN_SUBSTRING_LEN = 4

# Upper bound on the text handed to the zero-shot pipeline (keeps CPU runs fast).
_ZEROSHOT_MAX_CHARS = 500


def _is_boundary(text: str, index: int) -> bool:
    """Return True if ``index`` lies outside ``text`` or is not a letter/digit."""
    return index < 0 or index >= len(text) or not text[index].isalnum()


def _contains_whole_word(text: str, word: str) -> bool:
    """Return True if ``word`` occurs in ``text`` as a word of its own.

    An occurrence counts when it is delimited on both sides by the string edge
    or a non-alphanumeric character. One trailing "s" is tolerated so simple
    plurals ("APIs", "GPUs", "labs") still match.
    """
    start = text.find(word)
    while start != -1:
        end = start + len(word)
        if _is_boundary(text, start - 1) and (
            _is_boundary(text, end)
            # Not a boundary => ``end`` is a valid index, so text[end] is safe.
            or (text[end] == "s" and _is_boundary(text, end + 1))
        ):
            return True
        start = text.find(word, start + 1)
    return False


def _split_camel_case(text: str) -> str:
    """Insert a space at each lowercase→uppercase transition ("OpenAI" → "Open AI")."""
    pieces = []
    previous = ""
    for char in text:
        if previous.islower() and char.isupper():
            pieces.append(" ")
        pieces.append(char)
        previous = char
    return "".join(pieces)


def _keyword_in_topic(keyword: str, forms: tuple) -> bool:
    """Return True if ``keyword`` occurs in the topic described by ``forms``.

    ``forms`` is ``(plain, camel_split)``: the lower-cased topic and the
    lower-cased topic after camelCase splitting.

    Short ASCII keywords (mostly acronyms) must appear as a whole word in
    either form. Everything else uses substring matching so that compounds
    such as "astrophysics" or "healthcare" keep matching. Non-ASCII (Arabic)
    keywords always use substring matching because Arabic attaches the article
    and prepositions directly to the word.
    """
    if len(keyword) < _MIN_SUBSTRING_LEN and keyword.isascii():
        return any(_contains_whole_word(form, keyword) for form in forms)
    return keyword in forms[0]


# ─────────────────────────────────────────────────────────────────────────────
# PUBLIC API
# ─────────────────────────────────────────────────────────────────────────────

def classify_topic(topics: List[str]) -> str:
    """
    Map a list of dynamically extracted topics to a SINGLE predefined UI category.

    Each topic contributes at most one category:

    1. Exact match of the lower-cased, stripped topic against the keyword map.
    2. Otherwise the first registered keyword that either occurs inside the
       topic, or contains the topic (only for topics of at least four
       characters, so fragments like "a" or "pe" never match by accident).

    Empty, whitespace-only and non-string entries are ignored. When several
    categories were collected, the one listed first in CATEGORIES wins.
    Falls back to "Education" if no match is found.

    Args:
        topics: List of topic strings from the LLM (e.g. ["Python", "Deep Learning"]).

    Returns:
        A single category string.

    Example:
        >>> classify_topic(["Python", "Machine Learning", "Neural Networks"])
        'Technology & AI'
        >>> classify_topic(["Investing", "Stock Market"])
        'Business & Finance'
        >>> classify_topic([])
        'Education'
    """
    matched: Set[str] = set()

    for topic in topics or []:
        if not isinstance(topic, str):
            continue  # tolerate nulls / numbers in LLM output
        raw = topic.strip()
        plain = raw.lower()
        if not plain:
            # "" is a substring of every keyword; it must never count as a match.
            continue

        # 1. Exact match
        exact = _KEYWORD_MAP.get(plain)
        if exact is not None:
            matched.add(exact)
            continue

        # 2. Fuzzy match — first registered keyword that fits wins
        forms = (plain, _split_camel_case(raw).lower())
        allow_reverse = len(plain) >= _MIN_SUBSTRING_LEN
        for keyword, category in _KEYWORD_MAP.items():
            if _keyword_in_topic(keyword, forms) or (allow_reverse and plain in keyword):
                matched.add(category)
                break

    if not matched:
        return _FALLBACK_CATEGORY

    # Return the first match in CATEGORIES order for consistency
    for cat in CATEGORIES:
        if cat in matched:
            return cat

    # Only reachable if a keyword was registered under a name missing from CATEGORIES.
    return _FALLBACK_CATEGORY


def classify_topics(topics: List[str]) -> List[str]:
    """Backward-compatible wrapper — returns a single-element list."""
    return [classify_topic(topics)]


def get_primary_category(topics: List[str]) -> str:
    """
    Return the single best-matching category for the given topics.
    Alias for classify_topic().
    """
    return classify_topic(topics)


def classify_topic_groq(title: str, summary: str) -> str:
    """Classify video into one of the predefined categories using the Groq API.

    Bypasses the local Zero-Shot classification model entirely.

    Args:
        title: Video title; may be empty.
        summary: Video summary; may be empty.

    Returns:
        A category from CATEGORIES. Returns "Education" when both inputs are
        blank, when the model's reply names no known category, or when the
        request fails for any reason (the error is logged, never raised).
    """
    if not (title and title.strip()) and not (summary and summary.strip()):
        return _FALLBACK_CATEGORY

    try:
        # Imported lazily so the module loads without the Groq client configured.
        from src.utils.model_loader import get_groq_client

        client = get_groq_client()

        # Build prompt
        categories_str = "\n".join(f"- {cat}" for cat in CATEGORIES)
        prompt = (
            "You are an expert content categorization AI.\n"
            "Your task is to classify a video into exactly ONE of the following categories:\n"
            f"{categories_str}\n\n"
            f"Video Title: {title}\n"
            f"Video Summary:\n{summary}\n\n"
            "Instructions:\n"
            "1. Reply with ONLY the exact name of the category from the list above.\n"
            "2. Do not write any introduction, explanation, quote marks, punctuation, or extra text.\n"
            "3. The output must be exactly one of the listed categories."
        )

        messages = [
            {"role": "user", "content": prompt}
        ]

        logger.info("🟢 Requesting category classification from Groq API...")
        chat_completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            max_tokens=30,
            temperature=0.0,
        )

        reply = (chat_completion.choices[0].message.content or "").strip()
        # Clean quotes if any (and whitespace that was hidden inside them)
        reply = reply.strip("'\"").strip()
        reply_lower = reply.lower()

        # Validate that the reply is in the CATEGORIES list
        for cat in CATEGORIES:
            if reply_lower == cat.lower():
                logger.info("🏷️ Groq classification: %s", cat)
                return cat

        # If not exact match, try substring matching (model added extra words)
        for cat in CATEGORIES:
            if cat.lower() in reply_lower:
                logger.info("🏷️ Groq classification (substring match): %s", cat)
                return cat

        logger.warning("⚠️ Groq returned invalid category: %s — falling back", reply)
        return _FALLBACK_CATEGORY

    except Exception as e:
        # Best-effort boundary: classification must never break the caller.
        logger.error("❌ Groq category classification failed: %s", e, exc_info=True)
        return _FALLBACK_CATEGORY


# ─────────────────────────────────────────────────────────────────────────────
# ZERO-SHOT CLASSIFICATION (mDeBERTa fallback)
# ─────────────────────────────────────────────────────────────────────────────

def classify_topic_zeroshot(text: str) -> str:
    """Classify free-form text into one of the predefined UI categories
    using the mDeBERTa zero-shot classification pipeline.

    Args:
        text: Free-form text (transcript excerpt, note body, etc.)

    Returns:
        The best-matching category string from CATEGORIES, or "Education"
        when ``text`` is blank or the pipeline raises.
    """
    if not text or not text.strip():
        return _FALLBACK_CATEGORY

    try:
        # Imported lazily so the model is only loaded when actually needed.
        from src.utils.model_loader import get_classifier_pipeline

        classifier = get_classifier_pipeline()

        # Truncate to ~500 chars for speed on CPU; strip first so leading
        # whitespace does not eat into the budget.
        result = classifier(
            text.strip()[:_ZEROSHOT_MAX_CHARS],
            candidate_labels=CATEGORIES,
            multi_label=False,
        )

        # The first entry of labels/scores is taken as the winner
        # (assumes the pipeline returns them best-first — confirm against its docs).
        best_label = result["labels"][0]
        best_score = result["scores"][0]

        logger.info(
            "🏷️ Zero-shot classification: %s (score=%.3f)", best_label, best_score
        )
        return best_label

    except Exception as e:
        logger.warning("⚠️ Zero-shot classification failed: %s — falling back", e)
        return _FALLBACK_CATEGORY


def classify_topic_hybrid(topics: List[str], text: str = "") -> str:
    """Best-of-both-worlds classifier.

    1. First tries fast keyword matching via ``classify_topic(topics)``.
    2. If the result is "Education" AND ``text`` is provided, runs the
       mDeBERTa zero-shot classifier on the text for a more nuanced result.

    "Education" is both a real category and the no-match fallback; the keyword
    matcher cannot tell the two apart, so either case is treated as
    low-confidence and handed to the zero-shot model.

    Args:
        topics: List of topic strings (from the summarization pipeline).
        text: Optional free-form text for zero-shot fallback.

    Returns:
        A single category string from CATEGORIES.
    """
    keyword_result = classify_topic(topics)

    # If keyword matching gave a confident answer, use it
    if keyword_result != _FALLBACK_CATEGORY:
        return keyword_result

    # If we have text, try zero-shot as a fallback
    if text and text.strip():
        return classify_topic_zeroshot(text)

    return keyword_result