Spaces:
Running
Running
Ali Hashhash
feat: implement summarization engine with Pydantic schemas and map-reduce processing pipeline
4db2bb6 | """ | |
| Topic Classifier — maps dynamic LLM-extracted topics to predefined UI categories. | |
| Usage: | |
| from src.categorization.topic_classifier import classify_topic, get_primary_category | |
| topics = ["Python", "Machine Learning", "Neural Networks"] | |
| result = classify_topic(topics) | |
| # => "Technology & AI" | |
| Categories: | |
| Technology & AI | Business & Finance | Education | Science | |
| Productivity & Self-Growth | Health & Wellness | Sports & Fitness | |
| Entertainment | History | Philosophy | Arts & Culture | |
| """ | |
| from typing import List, Set | |
| from src.utils.logger import setup_logger | |
| logger = setup_logger(__name__) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # PREDEFINED CATEGORIES | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# The closed set of UI categories every classifier in this module returns.
# Order is significant: classify_topic() walks this list and returns the first
# entry that matched, so earlier categories win when several match. The list is
# also passed verbatim as the candidate labels for the zero-shot classifier and
# rendered into the Groq prompt.
CATEGORIES: list[str] = [
    "Technology & AI",
    "Business & Finance",
    "Education",  # also used throughout this module as the generic fallback
    "Science",
    "Productivity & Self-Growth",
    "Health & Wellness",
    "Sports & Fitness",
    "Entertainment",
    "History",
    "Philosophy",
    "Arts & Culture",
]
| # ───────────────────────────────────────────────────────────────────────────── | |
| # KEYWORD → CATEGORY MAPPING (English + Arabic) | |
| # All keywords are stored lowercase for case-insensitive matching. | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Lowercased keyword -> category name. Filled in at import time by _register().
_KEYWORD_MAP: dict[str, str] = {}


def _register(category: str, keywords: list[str]):
    """Associate every keyword (lowercased) with ``category`` in _KEYWORD_MAP.

    A keyword that is already present is re-pointed at the new category.
    """
    # Feed (key, value) pairs lazily so entries are written one at a time,
    # in the order the keywords were given.
    _KEYWORD_MAP.update((word.lower(), category) for word in keywords)
# Keyword tables, one _register() call per category.
#
# NOTE: the order of these calls (and of the keywords inside each list) is
# part of the behavior:
#   * classify_topic() falls back to substring matching by iterating
#     _KEYWORD_MAP in insertion order and stopping at the first hit, so
#     keywords registered earlier take precedence over later ones.
#   * _register() overwrites existing keys, so a keyword listed under two
#     categories ends up mapped to whichever category registers it last.
#   * Very short keywords (e.g. "ai", "go", "art", "tv") take part in the
#     substring pass too and will match inside longer, unrelated words.

# ── Technology & AI ──
_register("Technology & AI", [
    # English
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "neural networks", "nlp", "natural language processing",
    "computer vision", "robotics", "automation", "algorithm", "algorithms",
    "python", "javascript", "typescript", "java", "c++", "rust", "golang", "go",
    "programming", "coding", "software", "software engineering", "web development",
    "frontend", "backend", "full stack", "fullstack", "devops", "cloud",
    "cloud computing", "aws", "azure", "gcp", "docker", "kubernetes",
    "database", "sql", "nosql", "mongodb", "api", "rest api", "graphql",
    "cybersecurity", "security", "hacking", "encryption", "blockchain",
    "cryptocurrency", "bitcoin", "ethereum", "web3", "metaverse",
    "data science", "data analysis", "data engineering", "big data",
    "iot", "internet of things", "5g", "hardware", "semiconductor",
    "gpu", "chip", "processor", "tech", "technology", "computing",
    "linux", "git", "github", "open source", "framework", "react",
    "vue", "angular", "node", "nodejs", "django", "flask", "fastapi",
    "tensorflow", "pytorch", "llm", "large language model", "chatgpt",
    "gpt", "copilot", "transformer", "diffusion model",
    "generative ai", "prompt engineering", "fine tuning", "rag",
    "mobile development", "android", "ios", "swift", "kotlin", "flutter", "dart",
    # Arabic
    "ذكاء اصطناعي", "تعلم آلي", "تعلم عميق", "برمجة", "تقنية", "تكنولوجيا",
    "خوارزمية", "حاسوب", "شبكات عصبية", "بيانات", "أمن سيبراني",
    "حوسبة سحابية", "تطوير برمجيات", "تطوير ويب", "قواعد بيانات",
])
# ── Business & Finance ──
_register("Business & Finance", [
    # English
    "business", "finance", "economics", "economy", "stock", "stocks",
    "stock market", "trading", "investing", "investment", "real estate",
    "entrepreneurship", "startup", "startups", "marketing", "digital marketing",
    "seo", "branding", "sales", "revenue", "profit", "accounting",
    "budgeting", "money", "wealth", "financial", "banking", "bank",
    "fintech", "venture capital", "vc", "ipo", "merger", "acquisition",
    "management", "leadership", "strategy", "e-commerce", "ecommerce",
    "supply chain", "logistics", "consulting", "mba", "corporate",
    "tax", "taxes", "inflation", "gdp", "interest rate", "forex",
    "commodity", "commodities", "portfolio", "dividend", "bond", "bonds",
    "freelancing", "freelance", "passive income", "side hustle",
    # Arabic
    "أعمال", "تجارة", "اقتصاد", "مالية", "استثمار", "أسهم", "بورصة",
    "تسويق", "ريادة أعمال", "مشروع", "تمويل", "محاسبة", "بنك", "عقارات",
    "ربح", "دخل", "ميزانية",
])
# ── Education ──
_register("Education", [
    # English
    "education", "learning", "teaching", "school", "university", "college",
    "academic", "study", "studying", "exam", "exams", "course",
    "tutorial", "lecture", "scholarship", "degree", "phd", "thesis",
    "curriculum", "pedagogy", "classroom", "student", "teacher",
    "grammar", "language", "linguistics",
    # Arabic
    "تعليم", "تعلم", "مدرسة", "جامعة", "دراسة", "امتحان", "منهج",
    "محاضرة", "طالب", "معلم",
])
# ── Science ──
_register("Science", [
    # English
    "science", "physics", "chemistry", "biology", "math", "mathematics",
    "statistics", "calculus", "algebra", "geometry", "astronomy", "space",
    "nasa", "quantum", "quantum physics", "quantum computing",
    "neuroscience", "genetics", "evolution", "ecology", "geology",
    "climate", "climate change", "environment", "engineering",
    "mechanical engineering", "electrical engineering", "civil engineering",
    "experiment", "laboratory", "lab", "hypothesis", "theory",
    "research", "psychology", "sociology", "anthropology",
    # Arabic
    "علوم", "فيزياء", "كيمياء", "أحياء", "رياضيات", "بحث",
    "هندسة", "فلك", "بيئة", "علم نفس",
])
# ── Productivity & Self-Growth ──
_register("Productivity & Self-Growth", [
    # English
    "productivity", "self improvement", "self-improvement", "self growth",
    "self-growth", "personal development", "motivation", "discipline",
    "habits", "habit", "time management", "goal setting", "goals",
    "mindset", "focus", "concentration", "efficiency", "organization",
    "planning", "journaling", "morning routine", "routine", "success",
    "self help", "self-help", "life coaching", "coaching", "mentoring",
    "mentor", "minimalism", "mindfulness",
    "emotional intelligence", "communication skills", "public speaking",
    "negotiation", "critical thinking", "problem solving", "creativity",
    "decision making", "confidence", "resilience", "work-life balance",
    "burnout", "career", "career development", "skill building",
    # Arabic
    "إنتاجية", "تطوير ذات", "تحفيز", "عادات", "إدارة الوقت",
    "أهداف", "تركيز", "نجاح", "تخطيط", "ثقة بالنفس",
    "مهارات", "تفكير", "إبداع",
])
# ── Health & Wellness ──
_register("Health & Wellness", [
    # English
    "health", "wellness", "mental health", "therapy", "depression",
    "anxiety", "stress", "sleep", "yoga", "pilates", "meditation",
    "diet", "nutrition", "calories", "protein", "vitamins",
    "supplements", "weight loss", "fitness",
    "medicine", "medical", "doctor", "hospital", "surgery", "disease",
    "virus", "vaccine", "pandemic", "covid", "cancer", "diabetes",
    "heart", "cardio", "physical therapy", "rehabilitation",
    "first aid", "pharmacy", "drug", "prescription",
    # Arabic
    "صحة", "تغذية", "حمية", "صحة نفسية", "علاج", "طب",
    "مستشفى", "نوم", "فيتامينات", "يوغا",
])
# ── Sports & Fitness ──
_register("Sports & Fitness", [
    # English
    "sports", "football", "soccer", "basketball", "tennis", "baseball",
    "cricket", "rugby", "boxing", "mma", "ufc", "wrestling",
    "olympics", "world cup", "premier league", "nba", "nfl",
    "exercise", "workout", "gym", "bodybuilding", "crossfit",
    "running", "marathon", "swimming", "cycling", "hiking",
    # Arabic
    "رياضة", "تمارين", "لياقة", "كرة قدم", "سباحة",
])
# ── Entertainment ──
_register("Entertainment", [
    # English
    "entertainment", "movie", "movies", "film", "films", "cinema",
    "tv", "television", "series", "netflix", "streaming", "anime",
    "manga", "gaming", "video games", "esports", "twitch", "youtube",
    "podcast", "music", "song", "album", "concert",
    "celebrity", "comedy", "humor", "drama", "reality tv",
    "social media", "tiktok", "instagram", "influencer", "content creator",
    "vlog", "vlogging", "pop culture",
    # Arabic
    "ترفيه", "أفلام", "سينما", "مسلسلات", "ألعاب", "موسيقى",
    "كوميديا", "يوتيوب",
])
# ── History ──
_register("History", [
    # English
    "history", "ancient", "medieval", "civilization", "empire",
    "world war", "revolution", "archaeology", "historical",
    "dynasty", "colonialism", "independence", "heritage",
    # Arabic
    "تاريخ", "حضارة", "إمبراطورية", "ثورة", "آثار", "تراث",
])
# ── Philosophy ──
_register("Philosophy", [
    # English
    "philosophy", "ethics", "morality", "existentialism", "stoicism",
    "metaphysics", "epistemology", "logic", "consciousness",
    "free will", "determinism", "nihilism", "virtue",
    # Arabic
    "فلسفة", "أخلاق", "وجودية", "منطق", "وعي",
])
# ── Arts & Culture ──
_register("Arts & Culture", [
    # English
    "art", "artist", "painting", "sculpture", "gallery", "museum",
    "photography", "design", "graphic design", "illustration",
    "architecture", "interior design", "fashion", "style", "beauty",
    "makeup", "skincare", "travel", "tourism", "food", "cooking",
    "recipe", "restaurant", "cuisine", "diy", "crafts",
    "culture", "literature", "lifestyle", "luxury",
    # Arabic
    "فن", "تصميم", "تصوير", "سفر", "طبخ", "أزياء",
    "جمال", "ثقافة",
])
| # ───────────────────────────────────────────────────────────────────────────── | |
| # PUBLIC API | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def classify_topic(topics: List[str]) -> str:
    """
    Map a list of dynamically extracted topics to a SINGLE predefined UI category.

    Each topic is lowercased and stripped, then looked up in the keyword map:
    first as an exact key, otherwise by substring containment in either
    direction (a keyword inside the topic, or the topic inside a keyword),
    taking the first keyword that hits in registration order. When the topics
    hit several categories, the one listed first in CATEGORIES is returned.

    Blank topics and non-string entries are ignored. Falls back to
    "Education" if nothing matches.

    Args:
        topics: List of topic strings from the LLM (e.g. ["Python", "Deep Learning"]).
    Returns:
        A single category string.
    Example:
        >>> classify_topic(["Python", "Machine Learning", "Neural Networks"])
        'Technology & AI'
        >>> classify_topic(["Investing", "AI Stocks"])  # both categories hit; earlier one wins
        'Technology & AI'
        >>> classify_topic([])
        'Education'
    """
    fallback = "Education"
    matched: Set[str] = set()

    for topic in topics or ():
        if not isinstance(topic, str):
            continue
        needle = topic.lower().strip()
        # An empty string is a substring of every keyword, so a blank topic
        # would otherwise "match" the very first registered keyword.
        if not needle:
            continue

        # 1. Exact match
        category = _KEYWORD_MAP.get(needle)

        # 2. Substring match — keyword inside the topic, or topic inside a
        #    keyword; the first hit in registration order wins.
        if category is None:
            category = next(
                (
                    cat
                    for keyword, cat in _KEYWORD_MAP.items()
                    if keyword in needle or needle in keyword
                ),
                None,
            )

        if category is not None:
            matched.add(category)

    if not matched:
        return fallback

    # Return the first match in CATEGORIES order for consistency
    for cat in CATEGORIES:
        if cat in matched:
            return cat

    # Only reachable if a keyword was registered under a category that is
    # not listed in CATEGORIES.
    return fallback
def classify_topics(topics: List[str]) -> List[str]:
    """Legacy list-returning entry point: wraps classify_topic()'s result in a one-item list."""
    primary = classify_topic(topics)
    return [primary]
def get_primary_category(topics: List[str]) -> str:
    """
    Alias for classify_topic(): return the one category that best matches
    the given topics.
    """
    category = classify_topic(topics)
    return category
def classify_topic_groq(title: str, summary: str) -> str:
    """Ask the Groq chat API to pick one predefined category for a video.

    The model is shown the CATEGORIES list plus the title and summary and is
    told to answer with a category name only. The answer is accepted if it
    equals a category (case-insensitively) or, failing that, contains one.

    Args:
        title: Video title.
        summary: Video summary text.
    Returns:
        A category from CATEGORIES. "Education" is returned when both inputs
        are empty, when the answer cannot be mapped to a category, or when
        anything in the request path raises.
    """
    if not (title or summary):
        return "Education"

    try:
        # Imported lazily so the client is only loaded when this path is used.
        from src.utils.model_loader import get_groq_client

        groq = get_groq_client()

        # Assemble the prompt line by line; blank entries produce the empty
        # lines separating the sections.
        bullet_list = "\n".join(f"- {cat}" for cat in CATEGORIES)
        prompt = "\n".join([
            "You are an expert content categorization AI.",
            "Your task is to classify a video into exactly ONE of the following categories:",
            bullet_list,
            "",
            f"Video Title: {title}",
            "Video Summary:",
            f"{summary}",
            "",
            "Instructions:",
            "1. Reply with ONLY the exact name of the category from the list above.",
            "2. Do not write any introduction, explanation, quote marks, punctuation, or extra text.",
            "3. The output must be exactly one of the listed categories.",
        ])

        logger.info("🟢 Requesting category classification from Groq API...")
        response = groq.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=30,
            temperature=0.0,
        )

        # Trim whitespace first, then any quote characters wrapping the answer.
        answer = (response.choices[0].message.content or "").strip().strip("'\"")
        answer_lc = answer.lower()

        # Pass 1: the answer is exactly a category name (ignoring case).
        exact = next((cat for cat in CATEGORIES if cat.lower() == answer_lc), None)
        if exact is not None:
            logger.info("🏷️ Groq classification: %s", exact)
            return exact

        # Pass 2: a category name appears somewhere inside the answer.
        embedded = next((cat for cat in CATEGORIES if cat.lower() in answer_lc), None)
        if embedded is not None:
            logger.info("🏷️ Groq classification (substring match): %s", embedded)
            return embedded

        logger.warning("⚠️ Groq returned invalid category: %s — falling back", answer)
        return "Education"
    except Exception as exc:
        logger.error("❌ Groq category classification failed: %s", exc, exc_info=True)
        return "Education"
| # ───────────────────────────────────────────────────────────────────────────── | |
| # ZERO-SHOT CLASSIFICATION (mDeBERTa fallback) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def classify_topic_zeroshot(text: str) -> str:
    """Pick a category for free-form text with the zero-shot classifier.

    Runs the pipeline returned by ``get_classifier_pipeline()`` (described in
    this file as mDeBERTa) with CATEGORIES as the candidate labels and
    returns the top-ranked label.

    Args:
        text: Free-form text (transcript excerpt, note body, etc.)
    Returns:
        The top label from the classifier, or "Education" when the text is
        empty/blank or the classifier raises.
    """
    if not (text and text.strip()):
        return "Education"

    try:
        from src.utils.model_loader import get_classifier_pipeline

        zero_shot = get_classifier_pipeline()

        # Only the first ~500 characters are classified, for speed on CPU.
        excerpt = text[:500]
        prediction = zero_shot(
            excerpt,
            candidate_labels=CATEGORIES,
            multi_label=False,
        )

        # The first label/score pair is taken as the winner.
        top_label, top_score = prediction["labels"][0], prediction["scores"][0]
        logger.info(
            "🏷️ Zero-shot classification: %s (score=%.3f)", top_label, top_score
        )
        return top_label
    except Exception as exc:
        logger.warning("⚠️ Zero-shot classification failed: %s — falling back", exc)
        return "Education"
def classify_topic_hybrid(topics: List[str], text: str = "") -> str:
    """Keyword classification with a zero-shot fallback.

    ``classify_topic(topics)`` runs first. Its answer is returned unless it is
    "Education" — the value that function also uses when nothing matched — and
    non-blank ``text`` is available, in which case
    ``classify_topic_zeroshot(text)`` decides instead.

    Args:
        topics: List of topic strings (from the summarization pipeline).
        text: Optional free-form text for zero-shot fallback.
    Returns:
        A single category string from CATEGORIES.
    """
    keyword_category = classify_topic(topics)

    # "Education" may be a real match or the no-match fallback; either way it
    # is treated as low confidence and handed to the zero-shot model if possible.
    low_confidence = keyword_category == "Education"
    if low_confidence and text and text.strip():
        return classify_topic_zeroshot(text)

    return keyword_category