"""Keyword-based categorisation tools for cached AI bookmarks.

The module defines a fixed taxonomy (``AI_CATEGORIES``), JSON cache helpers,
a set of smolagents tools that read and update the bookmark cache, and the
``categoriser_agent`` that exposes those tools.
"""

from typing import List, Dict, Any
from smolagents import CodeAgent, InferenceClientModel, tool
import os
import json
from pathlib import Path
from datetime import datetime

# Taxonomy used for categorisation. Each entry maps a stable category key to
# its display name, a human-readable description, and the lowercase keywords
# that are substring-matched against a bookmark's title and URL.
AI_CATEGORIES = {
    "research_breakthroughs": {
        "name": "Research & Breakthroughs",
        "description": "Novel papers, theoretical advances, new architectures, state-of-the-art results.",
        "keywords": [
            "paper",
            "arxiv",
            "research",
            "breakthrough",
            "novel",
            "theory",
            "architecture",
            "state-of-the-art",
            "sota",
            "academic",
            "study",
            "findings",
            "discovery",
        ],
    },
    "model_releases": {
        "name": "Model Releases & Updates",
        "description": "Launches of new large-language or vision models, version upgrades, open-source checkpoints.",
        "keywords": [
            "model",
            "release",
            "launch",
            "gpt",
            "llm",
            "vision",
            "checkpoint",
            "open-source",
            "version",
            "update",
            "huggingface",
            "anthropic",
            "openai",
            "google",
            "meta",
        ],
    },
    "tools_frameworks": {
        "name": "Tools, Frameworks & Platforms",
        "description": "SDKs, libraries, cloud services, developer toolkits, hosting/serving solutions.",
        "keywords": [
            "sdk",
            "library",
            "framework",
            "platform",
            "toolkit",
            "api",
            "cloud",
            "hosting",
            "serving",
            "deployment",
            "infrastructure",
            "docker",
            "kubernetes",
            "aws",
            "azure",
            "gcp",
        ],
    },
    "applications_industry": {
        "name": "Applications & Industry Use Cases",
        "description": "AI in healthcare, finance, manufacturing, marketing, robotics—real-world deployments.",
        "keywords": [
            "healthcare",
            "finance",
            "manufacturing",
            "marketing",
            "robotics",
            "deployment",
            "use-case",
            "industry",
            "application",
            "real-world",
            "production",
            "enterprise",
            "business",
        ],
    },
    "regulation_ethics": {
        "name": "Regulation, Ethics & Policy",
        "description": "Government guidelines, ethical debates, bias/fairness studies, compliance news.",
        "keywords": [
            "regulation",
            "ethics",
            "policy",
            "government",
            "guidelines",
            "bias",
            "fairness",
            "compliance",
            "law",
            "legal",
            "governance",
            "responsible",
            "ai-safety",
            "alignment",
        ],
    },
    "investment_funding": {
        "name": "Investment, Funding & M&A",
        "description": "Venture rounds, strategic investments, acquisitions, startup valuations.",
        "keywords": [
            "investment",
            "funding",
            "venture",
            "acquisition",
            "m&a",
            "startup",
            "valuation",
            "series",
            "round",
            "investor",
            "vc",
            "private-equity",
            "ipo",
            "financing",
        ],
    },
    "benchmarks_leaderboards": {
        "name": "Benchmarks & Leaderboards",
        "description": "Performance comparisons, academic/industry challenges, leaderboard standings.",
        "keywords": [
            "benchmark",
            "leaderboard",
            "performance",
            "comparison",
            "evaluation",
            "metric",
            "score",
            "ranking",
            "competition",
            "challenge",
            "test",
            "dataset",
        ],
    },
    "community_events": {
        "name": "Community, Events & Education",
        "description": "Conferences, workshops, hackathons, courses, tutorials, webinars.",
        "keywords": [
            "conference",
            "workshop",
            "hackathon",
            "course",
            "tutorial",
            "webinar",
            "education",
            "community",
            "event",
            "meetup",
            "training",
            "learning",
            "certification",
        ],
    },
    "security_privacy": {
        "name": "Security, Privacy & Safety",
        "description": "Adversarial attacks, defensive techniques, data-privacy breakthroughs, AI safety research.",
        "keywords": [
            "security",
            "privacy",
            "safety",
            "adversarial",
            "attack",
            "defense",
            "vulnerability",
            "protection",
            "encryption",
            "data-privacy",
            "gdpr",
            "cybersecurity",
        ],
    },
    "market_trends": {
        "name": "Market Trends & Analysis",
        "description": "Adoption rates, market forecasts, analyst reports, surveys on AI usage.",
        "keywords": [
            "market",
            "trends",
            "analysis",
            "forecast",
            "survey",
            "adoption",
            "report",
            "analyst",
            "growth",
            "statistics",
            "usage",
            "metrics",
            "insights",
        ],
    },
}


def _empty_cache() -> Dict[str, Any]:
    """Return a fresh, empty cache structure (never a shared instance)."""
    return {"bookmarks": [], "last_updated": None}


def _text_field(bookmark: Dict[str, Any], key: str) -> str:
    """Return ``bookmark[key]`` as lowercase text, treating missing/None as ''."""
    # ``dict.get(key, "")`` only covers a missing key; a stored ``None``
    # would otherwise crash on ``.lower()``.
    value = bookmark.get(key) or ""
    return str(value).lower()


def _effective_category(bookmark: Dict[str, Any]) -> str:
    """Return the bookmark's category key, or 'uncategorized' if unknown.

    A stored category that is missing, ``None``, not a string, or not a
    current ``AI_CATEGORIES`` key is reported as ``'uncategorized'`` so that
    read-only tools never fail on stale cache entries.
    """
    category = bookmark.get("category", "uncategorized")
    if isinstance(category, str) and category in AI_CATEGORIES:
        return category
    return "uncategorized"


def get_cache_file_path() -> str:
    """Returns the path for the bookmark cache file.

    Creates the relative ``data`` directory if it does not exist yet.
    """
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)
    return str(data_dir / "ai_bookmarks_cache.json")


def load_cache() -> Dict[str, Any]:
    """Loads the bookmark cache from JSON file.

    Returns:
        The decoded cache dictionary. An empty cache
        (``{"bookmarks": [], "last_updated": None}``) is returned when the
        file does not exist, cannot be read or parsed, or does not contain a
        JSON object at the top level.
    """
    cache_file = get_cache_file_path()
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # No cache yet: not an error, start from an empty one.
        return _empty_cache()
    except Exception as e:
        # Deliberate best-effort: an unreadable cache must not crash a tool.
        print(f"Error loading cache: {e}")
        return _empty_cache()

    if not isinstance(data, dict):
        # Every caller uses ``cache.get(...)``; anything but an object is unusable.
        print(f"Error loading cache: expected a JSON object, got {type(data).__name__}")
        return _empty_cache()
    return data


def save_cache(cache_data: Dict[str, Any]) -> bool:
    """Saves the bookmark cache to JSON file.

    The data is written to a temporary sibling file first and then moved over
    the real cache, so a failed serialisation never truncates an existing
    cache file.

    Args:
        cache_data: JSON-serialisable cache dictionary to persist.

    Returns:
        True when the cache was written, False when writing failed.
    """
    cache_file = get_cache_file_path()
    temp_file = f"{cache_file}.tmp"
    try:
        with open(temp_file, "w", encoding="utf-8") as f:
            json.dump(cache_data, f, indent=2, ensure_ascii=False)
        os.replace(temp_file, cache_file)
        return True
    except Exception as e:
        print(f"Error saving cache: {e}")
        # Do not leave a partial temp file behind.
        try:
            os.remove(temp_file)
        except OSError:
            pass
        return False


def categorize_bookmark(bookmark: Dict[str, Any]) -> str:
    """
    Categorizes a single bookmark based on title and URL using keyword matching.

    Each keyword contributes one point per (substring) occurrence in the
    combined title and URL, plus a bonus of two points when it occurs in the
    title. Ties are resolved in favour of the category defined first in
    ``AI_CATEGORIES``.

    Args:
        bookmark: Dictionary containing bookmark data with title and url fields.
            Missing or ``None`` fields are treated as empty text.

    Returns:
        String key of the most likely category, or 'uncategorized' if no match found.
    """
    title = _text_field(bookmark, "title")
    url = _text_field(bookmark, "url")
    text_to_analyze = f"{title} {url}"

    best_category = "uncategorized"
    best_score = 0
    for category_key, category_data in AI_CATEGORIES.items():
        score = 0
        for keyword in category_data["keywords"]:
            needle = keyword.lower()
            score += text_to_analyze.count(needle)
            # Bonus for keywords that appear in the title
            if needle in title:
                score += 2
        # Strict comparison keeps the earliest category on ties.
        if score > best_score:
            best_category, best_score = category_key, score
    return best_category


@tool
def categorize_all_bookmarks() -> Dict[str, Any]:
    """
    Categorizes all bookmarks in the cache and adds category information to each bookmark.
    Updates the cache file with categorized bookmarks.

    Returns:
        Dictionary with categorization results and statistics.
    """
    try:
        cache = load_cache()
        bookmarks = cache.get("bookmarks", [])
        if not bookmarks:
            return {"status": "error", "message": "No bookmarks found in cache"}

        category_stats = dict.fromkeys(AI_CATEGORIES, 0)
        category_stats["uncategorized"] = 0

        # NOTE(review): this also overwrites bookmarks flagged
        # ``manually_categorized`` by recategorize_bookmark - confirm that
        # discarding manual overrides here is intended.
        for bookmark in bookmarks:
            category = categorize_bookmark(bookmark)
            bookmark["category"] = category
            bookmark["category_name"] = AI_CATEGORIES.get(category, {}).get("name", "Uncategorized")
            category_stats[category] += 1

        categorized_count = len(bookmarks) - category_stats["uncategorized"]

        # Update cache with categorized bookmarks
        cache["bookmarks"] = bookmarks
        cache["last_categorized"] = datetime.now().isoformat()
        cache["categorization_stats"] = category_stats

        if not save_cache(cache):
            return {"status": "error", "message": "Failed to save categorized bookmarks to cache"}

        return {
            "status": "success",
            "message": f"Successfully categorized {categorized_count} out of {len(bookmarks)} bookmarks",
            "total_bookmarks": len(bookmarks),
            "categorized_bookmarks": categorized_count,
            "uncategorized_bookmarks": category_stats["uncategorized"],
            "category_breakdown": category_stats,
        }
    except Exception as e:
        return {"status": "error", "message": f"Error categorizing bookmarks: {str(e)}"}


@tool
def get_bookmarks_by_category(category: str) -> List[Dict[str, Any]]:
    """
    Gets all bookmarks belonging to a specific category.

    Args:
        category: Category key (e.g., 'research_breakthroughs') or category name (e.g., 'Research & Breakthroughs')

    Returns:
        List of bookmarks in the specified category.
    """
    cache = load_cache()
    bookmarks = cache.get("bookmarks", [])
    if not bookmarks:
        return []

    # Resolve the request to a category key: exact key first, then the
    # special 'uncategorized' bucket, then a case-insensitive display name.
    requested = category.lower()
    if category in AI_CATEGORIES:
        target = category
    elif requested == "uncategorized":
        target = "uncategorized"
    else:
        target = next(
            (key for key, data in AI_CATEGORIES.items() if data["name"].lower() == requested),
            None,
        )
        if target is None:
            return []

    return [bookmark for bookmark in bookmarks if _effective_category(bookmark) == target]


@tool
def get_category_statistics() -> Dict[str, Any]:
    """
    Gets statistics about bookmark categorization.

    Bookmarks whose stored category is missing or is not a known category key are counted as uncategorized.

    Returns:
        Dictionary with categorization statistics and category information.
    """
    cache = load_cache()
    bookmarks = cache.get("bookmarks", [])
    if not bookmarks:
        return {"error": "No bookmarks found in cache"}

    total = len(bookmarks)

    # Calculate current category distribution
    category_counts = dict.fromkeys(AI_CATEGORIES, 0)
    category_counts["uncategorized"] = 0
    for bookmark in bookmarks:
        category_counts[_effective_category(bookmark)] += 1

    categorized_bookmarks = total - category_counts["uncategorized"]

    # Prepare detailed category info (total > 0 is guaranteed by the guard above)
    category_details = {
        key: {
            "name": data["name"],
            "description": data["description"],
            "count": category_counts[key],
            "percentage": round((category_counts[key] / total) * 100, 2),
        }
        for key, data in AI_CATEGORIES.items()
    }

    return {
        "total_bookmarks": total,
        "categorized_bookmarks": categorized_bookmarks,
        "uncategorized_bookmarks": category_counts["uncategorized"],
        "categorization_rate": round((categorized_bookmarks / total) * 100, 2),
        "last_categorized": cache.get("last_categorized"),
        "category_details": category_details,
        "available_categories": list(AI_CATEGORIES.keys()),
    }


@tool
def recategorize_bookmark(bookmark_id: str, new_category: str) -> Dict[str, Any]:
    """
    Manually recategorizes a specific bookmark.

    Args:
        bookmark_id: ID of the bookmark to recategorize
        new_category: New category key (e.g., 'research_breakthroughs') or 'uncategorized'

    Returns:
        Dictionary with recategorization result.
    """
    try:
        cache = load_cache()
        bookmarks = cache.get("bookmarks", [])

        # Only the first bookmark with a matching ID is considered.
        bookmark = next((item for item in bookmarks if item.get("id") == bookmark_id), None)
        if bookmark is None:
            return {"status": "error", "message": f"Bookmark with ID '{bookmark_id}' not found"}

        # Validate new category
        if new_category != "uncategorized" and new_category not in AI_CATEGORIES:
            return {"status": "error", "message": f"Invalid category: {new_category}"}

        old_category = bookmark.get("category", "uncategorized")
        bookmark["category"] = new_category
        bookmark["category_name"] = AI_CATEGORIES.get(new_category, {}).get("name", "Uncategorized")
        bookmark["manually_categorized"] = True
        bookmark["recategorized_at"] = datetime.now().isoformat()

        # Save updated cache
        if not save_cache(cache):
            return {"status": "error", "message": "Failed to save recategorized bookmark"}

        return {
            "status": "success",
            "message": f"Bookmark '{bookmark.get('title', 'Unknown')}' recategorized from '{old_category}' to '{new_category}'",
            "bookmark_title": bookmark.get("title"),
            "old_category": old_category,
            "new_category": new_category,
        }
    except Exception as e:
        return {"status": "error", "message": f"Error recategorizing bookmark: {str(e)}"}


@tool
def get_uncategorized_bookmarks() -> List[Dict[str, Any]]:
    """
    Gets all bookmarks that are currently uncategorized.

    Bookmarks whose stored category is missing or is not a known category key are included.

    Returns:
        List of uncategorized bookmarks.
    """
    cache = load_cache()
    bookmarks = cache.get("bookmarks", [])
    return [bookmark for bookmark in bookmarks if _effective_category(bookmark) == "uncategorized"]


@tool
def search_bookmarks_by_category_and_query(category: str, query: str) -> List[Dict[str, Any]]:
    """
    Search bookmarks within a specific category using a query.

    Args:
        category: Category key or name to search within
        query: Search term to find in bookmark titles or URLs

    Returns:
        List of matching bookmarks within the specified category.
    """
    # First get bookmarks by category
    category_bookmarks = get_bookmarks_by_category(category)
    if not category_bookmarks:
        return []

    # Then do a case-insensitive substring search within those bookmarks
    query_lower = query.lower()
    return [
        bookmark
        for bookmark in category_bookmarks
        if query_lower in _text_field(bookmark, "title") or query_lower in _text_field(bookmark, "url")
    ]


# Instantiate the Categoriser CodeAgent
# NOTE: requires the HF_TOKEN environment variable at import time.
categoriser_agent = CodeAgent(
    model=InferenceClientModel(
        provider="nebius",
        token=os.environ["HF_TOKEN"],
    ),
    tools=[
        categorize_all_bookmarks,
        get_bookmarks_by_category,
        get_category_statistics,
        recategorize_bookmark,
        get_uncategorized_bookmarks,
        search_bookmarks_by_category_and_query,
    ],
    name="categoriser_agent",
    description="Specializes in categorizing AI news and bookmarks into 10 predefined categories: Research & Breakthroughs, Model Releases & Updates, Tools/Frameworks/Platforms, Applications & Industry Use Cases, Regulation/Ethics/Policy, Investment/Funding/M&A, Benchmarks & Leaderboards, Community/Events/Education, Security/Privacy/Safety, and Market Trends & Analysis. Uses keyword-based categorization and provides tools for managing and searching categorized content.",
    max_steps=10,
    additional_authorized_imports=["json", "datetime", "re", "pathlib"],
    # Reduce verbosity
    stream_outputs=False,
    max_print_outputs_length=300,
)