# Spaces: Agents-MCP-Hackathon / ReMind (Sleeping)
# File size: 17,360 Bytes

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from smolagents import CodeAgent, InferenceClientModel, tool


# Category taxonomy used by the keyword categoriser.
# Each entry maps a category key to its display name, a short description and
# the lowercase keywords searched for in bookmark titles/URLs.
# Insertion order is significant: when two categories score equally,
# categorize_bookmark returns the one listed first.
AI_CATEGORIES = {
    "research_breakthroughs": {
        "name": "Research & Breakthroughs",
        "description": "Novel papers, theoretical advances, new architectures, state-of-the-art results.",
        "keywords": [
            "paper", "arxiv", "research", "breakthrough", "novel", "theory",
            "architecture", "state-of-the-art", "sota", "academic", "study",
            "findings", "discovery",
        ],
    },
    "model_releases": {
        "name": "Model Releases & Updates",
        "description": "Launches of new large-language or vision models, version upgrades, open-source checkpoints.",
        "keywords": [
            "model", "release", "launch", "gpt", "llm", "vision", "checkpoint",
            "open-source", "version", "update", "huggingface", "anthropic",
            "openai", "google", "meta",
        ],
    },
    "tools_frameworks": {
        "name": "Tools, Frameworks & Platforms",
        "description": "SDKs, libraries, cloud services, developer toolkits, hosting/serving solutions.",
        "keywords": [
            "sdk", "library", "framework", "platform", "toolkit", "api",
            "cloud", "hosting", "serving", "deployment", "infrastructure",
            "docker", "kubernetes", "aws", "azure", "gcp",
        ],
    },
    "applications_industry": {
        "name": "Applications & Industry Use Cases",
        "description": "AI in healthcare, finance, manufacturing, marketing, robotics—real-world deployments.",
        "keywords": [
            "healthcare", "finance", "manufacturing", "marketing", "robotics",
            "deployment", "use-case", "industry", "application", "real-world",
            "production", "enterprise", "business",
        ],
    },
    "regulation_ethics": {
        "name": "Regulation, Ethics & Policy",
        "description": "Government guidelines, ethical debates, bias/fairness studies, compliance news.",
        "keywords": [
            "regulation", "ethics", "policy", "government", "guidelines",
            "bias", "fairness", "compliance", "law", "legal", "governance",
            "responsible", "ai-safety", "alignment",
        ],
    },
    "investment_funding": {
        "name": "Investment, Funding & M&A",
        "description": "Venture rounds, strategic investments, acquisitions, startup valuations.",
        "keywords": [
            "investment", "funding", "venture", "acquisition", "m&a",
            "startup", "valuation", "series", "round", "investor", "vc",
            "private-equity", "ipo", "financing",
        ],
    },
    "benchmarks_leaderboards": {
        "name": "Benchmarks & Leaderboards",
        "description": "Performance comparisons, academic/industry challenges, leaderboard standings.",
        "keywords": [
            "benchmark", "leaderboard", "performance", "comparison",
            "evaluation", "metric", "score", "ranking", "competition",
            "challenge", "test", "dataset",
        ],
    },
    "community_events": {
        "name": "Community, Events & Education",
        "description": "Conferences, workshops, hackathons, courses, tutorials, webinars.",
        "keywords": [
            "conference", "workshop", "hackathon", "course", "tutorial",
            "webinar", "education", "community", "event", "meetup",
            "training", "learning", "certification",
        ],
    },
    "security_privacy": {
        "name": "Security, Privacy & Safety",
        "description": "Adversarial attacks, defensive techniques, data-privacy breakthroughs, AI safety research.",
        "keywords": [
            "security", "privacy", "safety", "adversarial", "attack",
            "defense", "vulnerability", "protection", "encryption",
            "data-privacy", "gdpr", "cybersecurity",
        ],
    },
    "market_trends": {
        "name": "Market Trends & Analysis",
        "description": "Adoption rates, market forecasts, analyst reports, surveys on AI usage.",
        "keywords": [
            "market", "trends", "analysis", "forecast", "survey", "adoption",
            "report", "analyst", "growth", "statistics", "usage", "metrics",
            "insights",
        ],
    },
}


def get_cache_file_path():
    """Return the bookmark cache file path as a string.

    The ``data`` directory (relative to the current working directory) is
    created on demand, so callers can open the returned path right away.
    """
    cache_dir = Path("data")
    cache_dir.mkdir(exist_ok=True)
    return str(cache_dir.joinpath("ai_bookmarks_cache.json"))


def load_cache():
    """Read the bookmark cache from its JSON file.

    Returns:
        The parsed JSON content when the file exists and can be read;
        otherwise a fresh empty cache ``{"bookmarks": [], "last_updated": None}``.
        Read/parse failures are printed and not raised.
    """
    empty_cache = {"bookmarks": [], "last_updated": None}

    path = get_cache_file_path()
    if not os.path.exists(path):
        return empty_cache

    try:
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception as e:
        # Best effort: an unreadable or corrupt cache is treated as empty.
        print(f"Error loading cache: {e}")
        return empty_cache


def save_cache(cache_data):
    """Saves the bookmark cache to JSON file.

    Args:
        cache_data: JSON-serializable object to persist.

    Returns:
        True when the file was written, False when serialization or the
        write failed (the error is printed, not raised).
    """
    cache_file = get_cache_file_path()
    try:
        # Serialize BEFORE opening the file: mode "w" truncates immediately,
        # so a serialization error raised mid-dump would otherwise leave the
        # existing cache empty or half-written.
        payload = json.dumps(cache_data, indent=2, ensure_ascii=False)
        with open(cache_file, "w", encoding="utf-8") as f:
            f.write(payload)
        return True
    except Exception as e:
        print(f"Error saving cache: {e}")
        return False


def categorize_bookmark(bookmark: Dict[str, Any], categories: Optional[Dict[str, Any]] = None) -> str:
    """
    Categorizes a single bookmark based on title and URL using keyword matching.

    Each keyword adds its number of (case-insensitive, substring) occurrences
    in "title url" to the category score, plus a bonus of 2 when it occurs in
    the title. The highest-scoring category wins; ties go to the category
    listed first.

    Args:
        bookmark: Dictionary containing bookmark data with title and url fields.
            Missing or null fields are treated as empty strings.
        categories: Optional mapping of category key -> dict with a "keywords"
            list. Defaults to AI_CATEGORIES.

    Returns:
        String key of the most likely category, or 'uncategorized' if no match found.
    """
    if categories is None:
        categories = AI_CATEGORIES

    # dict.get's default only applies to a missing key; "or ''" also covers a
    # key that is present with a null value, which would break .lower().
    title = str(bookmark.get("title") or "").lower()
    url = str(bookmark.get("url") or "").lower()
    text_to_analyze = f"{title} {url}"

    best_category = "uncategorized"
    best_score = 0

    for category_key, category_data in categories.items():
        score = 0
        for keyword in category_data["keywords"]:
            needle = keyword.lower()
            # Occurrences anywhere in title + URL
            score += text_to_analyze.count(needle)
            # Bonus when the keyword appears in the title
            if needle in title:
                score += 2

        # Strict comparison keeps the earliest category on ties and leaves
        # 'uncategorized' in place when nothing scores above zero.
        if score > best_score:
            best_category, best_score = category_key, score

    return best_category


@tool
def categorize_all_bookmarks() -> Dict[str, Any]:
    """
    Categorizes all bookmarks in the cache and adds category information to each bookmark.
    Bookmarks that were manually recategorized keep their manually assigned category.
    Updates the cache file with categorized bookmarks.

    Returns:
        Dictionary with categorization results and statistics.
    """
    try:
        cache = load_cache()
        bookmarks = cache.get("bookmarks", [])

        if not bookmarks:
            return {"status": "error", "message": "No bookmarks found in cache"}

        # One counter per known category plus the fallback bucket
        category_stats = {category_key: 0 for category_key in AI_CATEGORIES}
        category_stats["uncategorized"] = 0

        for bookmark in bookmarks:
            # recategorize_bookmark marks manual choices with
            # "manually_categorized"; re-running the keyword matcher must not
            # overwrite them. A manual flag with an unknown category falls
            # back to automatic categorization.
            keep_manual = bool(bookmark.get("manually_categorized")) and bookmark.get("category") in category_stats
            if keep_manual:
                category = bookmark["category"]
            else:
                category = categorize_bookmark(bookmark)

            bookmark["category"] = category
            bookmark["category_name"] = AI_CATEGORIES.get(category, {}).get("name", "Uncategorized")
            category_stats[category] += 1

        categorized_count = len(bookmarks) - category_stats["uncategorized"]

        # Update cache with categorized bookmarks
        cache["bookmarks"] = bookmarks
        cache["last_categorized"] = datetime.now().isoformat()
        cache["categorization_stats"] = category_stats

        if not save_cache(cache):
            return {"status": "error", "message": "Failed to save categorized bookmarks to cache"}

        return {
            "status": "success",
            "message": f"Successfully categorized {categorized_count} out of {len(bookmarks)} bookmarks",
            "total_bookmarks": len(bookmarks),
            "categorized_bookmarks": categorized_count,
            "uncategorized_bookmarks": category_stats["uncategorized"],
            "category_breakdown": category_stats,
        }

    except Exception as e:
        # Tool boundary: report failures as a result dict instead of raising.
        return {"status": "error", "message": f"Error categorizing bookmarks: {str(e)}"}


@tool
def get_bookmarks_by_category(category: str) -> List[Dict[str, Any]]:
    """
    Gets all bookmarks belonging to a specific category.

    Args:
        category: Category key (e.g., 'research_breakthroughs') or category name (e.g., 'Research & Breakthroughs')

    Returns:
        List of bookmarks in the specified category.
    """
    stored = load_cache().get("bookmarks", [])
    if not stored:
        return []

    # Resolve the argument to the value stored in each bookmark's "category"
    # field: an exact key, a case-insensitive display name, or 'uncategorized'.
    if category in AI_CATEGORIES:
        wanted = category
    else:
        requested = category.lower()
        wanted = next(
            (key for key, data in AI_CATEGORIES.items() if data["name"].lower() == requested),
            None,
        )
        if wanted is None:
            if requested != "uncategorized":
                return []
            wanted = "uncategorized"

    # Bookmarks without a "category" field count as uncategorized.
    return [entry for entry in stored if entry.get("category", "uncategorized") == wanted]


@tool
def get_category_statistics() -> Dict[str, Any]:
    """
    Gets statistics about bookmark categorization.
    Bookmarks whose stored category is missing or unknown are counted as uncategorized.

    Returns:
        Dictionary with categorization statistics and category information.
    """
    cache = load_cache()
    bookmarks = cache.get("bookmarks", [])

    if not bookmarks:
        return {"error": "No bookmarks found in cache"}

    total = len(bookmarks)  # > 0 here, so the divisions below are safe

    # Calculate current category distribution
    category_counts = {category_key: 0 for category_key in AI_CATEGORIES}
    category_counts["uncategorized"] = 0

    for bookmark in bookmarks:
        category = bookmark.get("category", "uncategorized")
        # A stale key or a null/non-string value would raise KeyError/TypeError
        # on the counter lookup; fold those into 'uncategorized' instead.
        if not isinstance(category, str) or category not in category_counts:
            category = "uncategorized"
        category_counts[category] += 1

    categorized_bookmarks = total - category_counts["uncategorized"]

    # Prepare detailed category info
    category_details = {
        key: {
            "name": data["name"],
            "description": data["description"],
            "count": category_counts[key],
            "percentage": round((category_counts[key] / total) * 100, 2),
        }
        for key, data in AI_CATEGORIES.items()
    }

    return {
        "total_bookmarks": total,
        "categorized_bookmarks": categorized_bookmarks,
        "uncategorized_bookmarks": category_counts["uncategorized"],
        "categorization_rate": round((categorized_bookmarks / total) * 100, 2),
        "last_categorized": cache.get("last_categorized"),
        "category_details": category_details,
        "available_categories": list(AI_CATEGORIES.keys()),
    }


@tool
def recategorize_bookmark(bookmark_id: str, new_category: str) -> Dict[str, Any]:
    """
    Manually recategorizes a specific bookmark.

    Args:
        bookmark_id: ID of the bookmark to recategorize
        new_category: New category key (e.g., 'research_breakthroughs') or 'uncategorized'

    Returns:
        Dictionary with recategorization result.
    """
    try:
        cache = load_cache()

        # First bookmark whose "id" equals bookmark_id, if any
        target = next(
            (entry for entry in cache.get("bookmarks", []) if entry.get("id") == bookmark_id),
            None,
        )
        if target is None:
            return {"status": "error", "message": f"Bookmark with ID '{bookmark_id}' not found"}

        # The category is only validated once the bookmark has been found
        if new_category != "uncategorized" and new_category not in AI_CATEGORIES:
            return {"status": "error", "message": f"Invalid category: {new_category}"}

        previous = target.get("category", "uncategorized")
        target["category"] = new_category
        target["category_name"] = AI_CATEGORIES.get(new_category, {}).get("name", "Uncategorized")
        target["manually_categorized"] = True
        target["recategorized_at"] = datetime.now().isoformat()

        # Persist the whole cache; target is one of its entries
        if not save_cache(cache):
            return {"status": "error", "message": "Failed to save recategorized bookmark"}

        return {
            "status": "success",
            "message": f"Bookmark '{target.get('title', 'Unknown')}' recategorized from '{previous}' to '{new_category}'",
            "bookmark_title": target.get("title"),
            "old_category": previous,
            "new_category": new_category,
        }

    except Exception as e:
        return {"status": "error", "message": f"Error recategorizing bookmark: {str(e)}"}


@tool
def get_uncategorized_bookmarks() -> List[Dict[str, Any]]:
    """
    Gets all bookmarks that are currently uncategorized.

    Returns:
        List of uncategorized bookmarks.
    """
    stored = load_cache().get("bookmarks", [])
    # A bookmark with no "category" field is treated as uncategorized.
    return [entry for entry in stored if entry.get("category", "uncategorized") == "uncategorized"]


@tool
def search_bookmarks_by_category_and_query(category: str, query: str) -> List[Dict[str, Any]]:
    """
    Search bookmarks within a specific category using a query.

    Args:
        category: Category key or name to search within
        query: Search term to find in bookmark titles or URLs

    Returns:
        List of matching bookmarks within the specified category.
    """
    # First get bookmarks by category
    category_bookmarks = get_bookmarks_by_category(category)

    if not category_bookmarks:
        return []

    # Then search within those bookmarks (case-insensitive substring match)
    query_lower = query.lower()

    def _field(bookmark: Dict[str, Any], key: str) -> str:
        # dict.get's default only covers a missing key; a key stored with a
        # null value would otherwise break .lower().
        return str(bookmark.get(key) or "").lower()

    return [
        bookmark
        for bookmark in category_bookmarks
        if query_lower in _field(bookmark, "title") or query_lower in _field(bookmark, "url")
    ]


# Model backend for the categoriser agent. The token is read with
# os.environ[...], so importing this module raises KeyError when HF_TOKEN
# is not set.
_categoriser_model = InferenceClientModel(
    provider="nebius",
    token=os.environ["HF_TOKEN"],
)

# Tools handed to the agent, in registration order.
_categoriser_tools = [
    categorize_all_bookmarks,
    get_bookmarks_by_category,
    get_category_statistics,
    recategorize_bookmark,
    get_uncategorized_bookmarks,
    search_bookmarks_by_category_and_query,
]

# Instantiate the Categoriser CodeAgent
categoriser_agent = CodeAgent(
    model=_categoriser_model,
    tools=_categoriser_tools,
    name="categoriser_agent",
    description=(
        "Specializes in categorizing AI news and bookmarks into 10 predefined categories: "
        "Research & Breakthroughs, Model Releases & Updates, Tools/Frameworks/Platforms, "
        "Applications & Industry Use Cases, Regulation/Ethics/Policy, Investment/Funding/M&A, "
        "Benchmarks & Leaderboards, Community/Events/Education, Security/Privacy/Safety, "
        "and Market Trends & Analysis. "
        "Uses keyword-based categorization and provides tools for managing and searching categorized content."
    ),
    max_steps=10,
    additional_authorized_imports=["json", "datetime", "re", "pathlib"],
    # Reduce verbosity
    stream_outputs=False,
    max_print_outputs_length=300,
)