Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

File size: 17,683 Bytes

from __future__ import annotations

from collections import Counter, defaultdict
from collections.abc import Mapping, Sequence
import math
from typing import Any

from hackathon_advisor.data import (
    Project,
    ProjectIndex,
    normalize_project_tags,
    public_project_summary,
    public_project_title,
    tokenize,
)
from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
from hackathon_advisor._text import utc_now


# Version stamp written into every payload and checked by validate_dashboard_payload.
DASHBOARD_SCHEMA_VERSION = 1
# Seed passed as random_state to both the t-SNE layout and the KMeans clustering.
TSNE_RANDOM_STATE = 42
# Smallest project count for which a payload is built or accepted by validation.
TSNE_MIN_PROJECTS = 3
# How many most-similar neighbors each project contributes to the link list.
LINKS_PER_PROJECT = 2
# Identifier emitted in the payload under "cluster_label_algorithm".
CLUSTER_LABEL_ALGORITHM = "distinctive-keywords-v1"

# Tokens that never qualify as cluster keywords: common English function
# words plus vocabulary generic to this corpus. Kept in alphabetical order,
# grouped by initial letter.
STOPWORDS = {
    "about", "agent", "ai", "all", "an", "and", "app", "apps", "are", "as", "assistant", "at",
    "be", "been", "before", "being", "build", "build-small", "build-small-hackathon", "built", "by",
    "demo",
    "face", "first", "for", "from",
    "gradio",
    "hackathon", "hugging", "huggingface",
    "in", "is", "it", "its",
    "local",
    "made", "make", "makes", "me", "model", "models", "my",
    "of", "on", "one", "or", "our",
    "pro", "project", "projects",
    "region", "run", "runs",
    "small", "space", "spaces", "submission",
    "the", "their", "them", "these", "they", "this", "those", "to", "tool", "tools", "try",
    "us", "use", "used", "uses", "using",
    "we", "with",
    "you", "your",
}


class DashboardError(ValueError):
    """Raised when dashboard inputs or a dashboard payload are invalid."""


def build_dashboard_payload(
    index: ProjectIndex,
    *,
    quest_matches: Mapping[str, Sequence[Mapping[str, Any]]] | None = None,
    quest_source: str = "",
    generated_at: str | None = None,
) -> dict[str, Any]:
    """Assemble and validate the dashboard payload for a project index.

    Args:
        index: Project index supplying the projects, their vectors and the
            provenance attributes copied into the payload.
        quest_matches: Optional per-project quest matches keyed by project id.
        quest_source: Label recorded as the quest report source; an empty
            string marks the report as not analyzed.
        generated_at: Timestamp to record; ``utc_now()`` is used when falsy.

    Returns:
        The payload dictionary, already passed through
        ``validate_dashboard_payload``.

    Raises:
        DashboardError: If the index holds fewer than ``TSNE_MIN_PROJECTS``
            projects, or if a later step or the final validation rejects
            the data.
    """
    projects = list(index.projects)
    project_total = len(projects)
    if project_total < TSNE_MIN_PROJECTS:
        raise DashboardError(f"dashboard atlas requires at least {TSNE_MIN_PROJECTS} projects")

    # Layout and clustering are both derived from the same embedding matrix.
    matrix = _embedding_matrix(index)
    coordinates = _tsne_coordinates(matrix)
    raw_cluster_labels = _cluster_labels(matrix)
    cluster_id_by_raw, clusters = _cluster_payloads(projects, coordinates, raw_cluster_labels)

    normalized_quest_matches = _normalize_quest_matches(projects, quest_matches)
    points = _point_payloads(
        projects,
        coordinates,
        raw_cluster_labels,
        cluster_id_by_raw,
        normalized_quest_matches,
    )
    links = _nearest_links(projects, matrix)
    quest_report = _quest_report(points, normalized_quest_matches, quest_source)

    timestamp = generated_at or utc_now()
    provenance = {
        "snapshot_generated_at": index.generated_at,
        "snapshot_source": index.source,
        "index_generated_at": index.index_generated_at,
        "index_algorithm": index.index_algorithm,
        "snapshot_digest": index.snapshot_digest,
        "embedding": index.embedding_metadata,
    }
    layout = {
        "algorithm": "tsne",
        "metric": "cosine",
        "init": "pca",
        "random_state": TSNE_RANDOM_STATE,
        "perplexity": _tsne_perplexity(project_total),
    }
    payload = {
        "schema_version": DASHBOARD_SCHEMA_VERSION,
        "generated_at": timestamp,
        "project_count": project_total,
        "provenance": provenance,
        "layout": layout,
        "cluster_label_algorithm": CLUSTER_LABEL_ALGORITHM,
        "points": points,
        "links": links,
        "clusters": clusters,
        "quest_report": quest_report,
    }
    validate_dashboard_payload(payload)
    return payload


def validate_dashboard_payload(payload: Mapping[str, Any]) -> None:
    """Check that a dashboard payload is structurally consistent.

    Verifies the schema version, that the number of points equals
    ``project_count``, that every point has a unique id, percentage
    coordinates and a declared cluster, that every link joins two known
    points, and that the quest report carries a recognized status.

    Args:
        payload: The dashboard payload mapping to validate.

    Raises:
        DashboardError: If any check fails. Malformed values (a non-numeric
            project count or coordinate, a link that is not an object) are
            reported as ``DashboardError`` as well, instead of escaping as
            ``TypeError``, a bare ``ValueError`` or ``AttributeError``.
    """
    if payload.get("schema_version") != DASHBOARD_SCHEMA_VERSION:
        raise DashboardError("unsupported dashboard schema version")
    try:
        project_count = int(payload.get("project_count") or 0)
    except (TypeError, ValueError) as error:
        raise DashboardError("dashboard project count must be an integer") from error
    if project_count < TSNE_MIN_PROJECTS:
        raise DashboardError("dashboard project count is too small")
    points = payload.get("points")
    if not isinstance(points, list) or len(points) != project_count:
        raise DashboardError("dashboard point count does not match project count")
    ids: set[str] = set()
    cluster_ids: set[str] = set()
    for point in points:
        if not isinstance(point, dict):
            raise DashboardError("dashboard points must be objects")
        project_id = str(point.get("id") or "")
        if not project_id or project_id in ids:
            raise DashboardError("dashboard points must have unique project ids")
        ids.add(project_id)
        try:
            # float(None) raises TypeError and float("abc") raises ValueError;
            # both mean the coordinate is unusable.
            x = float(point.get("x"))
            y = float(point.get("y"))
        except (TypeError, ValueError) as error:
            raise DashboardError("dashboard point coordinates must be percentages") from error
        # NaN fails both comparisons, so it is rejected here as well.
        if not 0.0 <= x <= 100.0 or not 0.0 <= y <= 100.0:
            raise DashboardError("dashboard point coordinates must be percentages")
        cluster_id = str(point.get("cluster_id") or "")
        if not cluster_id:
            raise DashboardError("dashboard point cluster id is missing")
        cluster_ids.add(cluster_id)

    clusters = payload.get("clusters")
    if not isinstance(clusters, list) or not clusters:
        raise DashboardError("dashboard clusters are missing")
    declared_cluster_ids = {str(cluster.get("id") or "") for cluster in clusters if isinstance(cluster, dict)}
    if cluster_ids - declared_cluster_ids:
        raise DashboardError("dashboard points reference missing clusters")

    links = payload.get("links")
    if not isinstance(links, list):
        raise DashboardError("dashboard links must be a list")
    for link in links:
        # Without this guard a string or None entry would fail on ``.get``.
        if not isinstance(link, Mapping):
            raise DashboardError("dashboard links must be objects")
        if str(link.get("source") or "") not in ids or str(link.get("target") or "") not in ids:
            raise DashboardError("dashboard link references an unknown project")

    quest_report = payload.get("quest_report")
    if not isinstance(quest_report, dict):
        raise DashboardError("dashboard quest report is missing")
    if quest_report.get("status") not in {"analyzed", "not_analyzed"}:
        raise DashboardError("dashboard quest report status is invalid")


def _embedding_matrix(index: ProjectIndex) -> Any:
    import numpy as np

    return np.asarray(index.project_vectors(), dtype=np.float32)


def _tsne_coordinates(matrix: Any) -> list[tuple[float, float]]:
    """Project the embedding matrix to 2-D with t-SNE and rescale the result.

    The perplexity is derived from the number of rows in ``matrix``; the
    raw t-SNE output is passed through ``_scale_points`` before returning.
    """
    from sklearn.manifold import TSNE

    row_count = int(matrix.shape[0])
    reducer = TSNE(
        n_components=2,
        perplexity=_tsne_perplexity(row_count),
        init="pca",
        learning_rate="auto",
        max_iter=1000,
        metric="cosine",
        random_state=TSNE_RANDOM_STATE,
    )
    embedded = reducer.fit_transform(matrix)
    return _scale_points(embedded)


def _tsne_perplexity(count: int) -> int:
    return max(2, min(30, count // 4))


def _cluster_labels(matrix: Any) -> list[int]:
    """Assign each row of ``matrix`` a KMeans cluster label.

    The cluster count is ``round(sqrt(rows))`` raised to at least
    ``min(6, rows)`` and capped at 10.
    """
    from sklearn.cluster import KMeans

    row_count = int(matrix.shape[0])
    lower_bound = min(6, row_count)
    wanted = max(lower_bound, round(math.sqrt(row_count)))
    cluster_count = min(10, wanted)

    model = KMeans(
        n_clusters=cluster_count,
        random_state=TSNE_RANDOM_STATE,
        n_init=20,
    )
    assignments = model.fit_predict(matrix)
    return [int(assignment) for assignment in assignments]


def _scale_points(points: Any, low: float = 3.0, high: float = 97.0) -> list[tuple[float, float]]:
    import numpy as np

    scaled = np.empty_like(points, dtype=np.float64)
    for axis in range(points.shape[1]):
        column = points[:, axis]
        minimum = float(column.min())
        maximum = float(column.max())
        span = maximum - minimum
        if span <= 1e-9:
            scaled[:, axis] = (low + high) / 2.0
        else:
            scaled[:, axis] = low + (column - minimum) / span * (high - low)
    return [(round(float(x), 4), round(float(y), 4)) for x, y in scaled]


def _cluster_payloads(
    projects: Sequence[Project],
    coordinates: Sequence[tuple[float, float]],
    raw_labels: Sequence[int],
) -> tuple[dict[int, str], list[dict[str, Any]]]:
    """Build the public cluster descriptions and the raw-label to id mapping.

    Clusters are ordered by descending size, ties broken by their center
    coordinates, and numbered ``cluster-1``, ``cluster-2``, ... in that
    order. Each cluster carries a label, keywords, its size, its center and
    up to four representative projects.

    Returns:
        ``(cluster_id_by_raw, clusters)`` where the first item maps each raw
        label to its public cluster id and the second is the ordered list of
        cluster payload dictionaries.
    """
    members_by_label: dict[int, list[int]] = defaultdict(list)
    for position, raw in enumerate(raw_labels):
        members_by_label[int(raw)].append(position)

    def ordering(raw: int) -> tuple[int, tuple[float, float]]:
        members = members_by_label[raw]
        return (-len(members), _cluster_center(coordinates, members))

    ranked_labels = sorted(members_by_label, key=ordering)
    cluster_id_by_raw = {raw: f"cluster-{rank}" for rank, raw in enumerate(ranked_labels, start=1)}

    document_frequency = _corpus_document_frequency(projects)
    corpus_size = len(projects)
    clusters: list[dict[str, Any]] = []
    for raw in ranked_labels:
        members = members_by_label[raw]
        member_projects = [projects[position] for position in members]

        # Highest likes first, then latest last_modified, then title descending.
        by_prominence = sorted(
            member_projects,
            key=lambda project: (project.likes, project.last_modified, project.title.lower()),
            reverse=True,
        )
        representatives = by_prominence[:4]

        keywords = _cluster_keywords(
            member_projects,
            corpus_document_frequency=document_frequency,
            corpus_project_count=corpus_size,
        )
        if keywords:
            label = " / ".join(word.title() for word in keywords[:2])
        else:
            label = _representative_cluster_label(representatives)

        center_x, center_y = _cluster_center(coordinates, members)
        clusters.append(
            {
                "id": cluster_id_by_raw[raw],
                "label": label,
                "keywords": keywords,
                "project_count": len(members),
                "center": {"x": round(center_x, 4), "y": round(center_y, 4)},
                "representative_projects": [project.to_public_dict() for project in representatives],
            }
        )
    return cluster_id_by_raw, clusters


def _cluster_center(coordinates: Sequence[tuple[float, float]], indexes: Sequence[int]) -> tuple[float, float]:
    return (
        sum(coordinates[index][0] for index in indexes) / len(indexes),
        sum(coordinates[index][1] for index in indexes) / len(indexes),
    )


def _corpus_document_frequency(projects: Sequence[Project]) -> Counter[str]:
    """Count, for each keyword token, how many projects contain it."""
    return Counter(
        token
        for project in projects
        # De-duplicate within a project so each project counts once per token.
        for token in set(_project_keyword_tokens(project))
    )


def _cluster_keywords(
    projects: Sequence[Project],
    *,
    corpus_document_frequency: Mapping[str, int],
    corpus_project_count: int,
) -> list[str]:
    """Pick up to five keywords that best distinguish a cluster.

    Each candidate token is scored as the product of a log-damped term
    count, its corpus inverse document frequency, an exclusivity weight
    (the share of the token's corpus documents that are in this cluster)
    and a coverage weight (the share of this cluster's projects that
    contain the token). Tokens are dropped when they appear in too few
    cluster projects, are absent from the corpus frequencies, or have a
    non-positive inverse document frequency.

    Returns:
        The top tokens ordered by score, then by cluster document count,
        then by raw count, then alphabetically.
    """
    members = list(projects)
    if not members:
        return []

    term_counts: Counter[str] = Counter()
    cluster_frequency: Counter[str] = Counter()
    for project in members:
        tokens = _project_keyword_tokens(project)
        term_counts.update(tokens)
        cluster_frequency.update(set(tokens))

    member_count = len(members)
    # Tiny clusters may be labelled by a token seen in a single project.
    required_documents = 2 if member_count > 3 else 1

    candidates: list[tuple[float, int, int, str]] = []
    for token, occurrences in term_counts.items():
        in_cluster = cluster_frequency[token]
        if in_cluster < required_documents:
            continue
        in_corpus = int(corpus_document_frequency.get(token) or 0)
        if in_corpus <= 0:
            continue
        idf = math.log((1 + corpus_project_count) / (1 + in_corpus))
        if idf <= 0.0:
            continue
        term_weight = 1.0 + math.log(occurrences)
        exclusivity_weight = 0.35 + 0.65 * (in_cluster / in_corpus)
        coverage_weight = 0.35 + 0.65 * (in_cluster / member_count)
        score = term_weight * idf * exclusivity_weight * coverage_weight
        candidates.append((score, in_cluster, occurrences, token))

    ranked = sorted(candidates, key=lambda entry: (-entry[0], -entry[1], -entry[2], entry[3]))
    return [entry[3] for entry in ranked[:5]]


def _project_keyword_tokens(project: Project) -> list[str]:
    """Tokenize a project's descriptive text and keep only keyword candidates.

    The text is the title, the slug with ``-`` and ``_`` turned into spaces,
    the summary, the normalized tags and the model names, joined by spaces.
    """
    slug_words = project.slug.replace("-", " ").replace("_", " ")
    tag_words = " ".join(normalize_project_tags(project.tags))
    model_words = " ".join(project.models)
    text = " ".join((project.title, slug_words, project.summary, tag_words, model_words))
    return list(filter(_is_cluster_keyword, tokenize(text)))


def _is_cluster_keyword(token: str) -> bool:
    """Return True unless the token is a stopword, starts with "region", or is all digits."""
    rejected = token in STOPWORDS or token.startswith("region") or token.isdigit()
    return not rejected


def _representative_cluster_label(projects: Sequence[Project]) -> str:
    """Build a fallback cluster label from the first two titled projects.

    Projects whose public title is "Untitled project" are skipped; when no
    usable title remains the label is "Mixed projects".
    """
    public_titles = (public_project_title(project.title) for project in projects)
    usable_titles = (title for title in public_titles if title != "Untitled project")
    # zip stops once range(2) is exhausted, so no further titles are read
    # after two have been collected.
    chosen = [title for _slot, title in zip(range(2), usable_titles)]
    if not chosen:
        return "Mixed projects"
    return " / ".join(chosen)


def _normalize_quest_matches(
    projects: Sequence[Project],
    quest_matches: Mapping[str, Sequence[Mapping[str, Any]]] | None,
) -> dict[str, list[dict[str, Any]]]:
    """Return normalized quest matches keyed by project id.

    With ``quest_matches`` set to ``None`` every project maps to an empty
    list. Otherwise the supplied keys must be exactly the project ids.

    Raises:
        DashboardError: If the supplied keys miss a project or name an
            unknown one, or if an individual match fails normalization.
    """
    normalized: dict[str, list[dict[str, Any]]] = {project.id: [] for project in projects}
    if quest_matches is None:
        return normalized

    expected_ids = set(normalized)
    supplied_ids = set(quest_matches)
    if supplied_ids != expected_ids:
        missing = sorted(expected_ids - supplied_ids)
        extra = sorted(supplied_ids - expected_ids)
        problems = []
        if missing:
            problems.append(f"missing {len(missing)} projects")
        if extra:
            problems.append(f"unknown {len(extra)} projects")
        raise DashboardError("quest analysis project coverage is invalid: " + ", ".join(problems))

    for project_id, raw_matches in quest_matches.items():
        normalized[project_id] = list(map(_normalize_quest_match, raw_matches))
    return normalized


def _normalize_quest_match(match: Mapping[str, Any]) -> dict[str, Any]:
    """Normalize one quest match, re-raising ``ValueError`` as ``DashboardError``."""
    try:
        normalized = normalize_match(match)
    except ValueError as error:
        raise DashboardError(f"invalid quest match: {error}") from error
    return normalized


def _point_payloads(
    projects: Sequence[Project],
    coordinates: Sequence[tuple[float, float]],
    raw_labels: Sequence[int],
    cluster_id_by_raw: Mapping[int, str],
    quest_matches: Mapping[str, Sequence[Mapping[str, Any]]],
) -> list[dict[str, Any]]:
    """Build one public point dictionary per project.

    ``projects``, ``coordinates`` and ``raw_labels`` are consumed in
    lockstep and must have equal lengths (``zip(..., strict=True)``).
    """
    points: list[dict[str, Any]] = []
    for project, position, raw_label in zip(projects, coordinates, raw_labels, strict=True):
        x, y = position
        matches = list(quest_matches.get(project.id) or [])
        point: dict[str, Any] = {
            "id": project.id,
            "title": public_project_title(project.title),
            "summary": public_project_summary(project.summary),
            "url": project.url,
            "host": project.host,
            "likes": project.likes,
            "sdk": project.sdk,
            "models": list(project.models),
            "tags": list(normalize_project_tags(project.tags)),
            "last_modified": project.last_modified,
            "x": x,
            "y": y,
            "cluster_id": cluster_id_by_raw[int(raw_label)],
            "quest_matches": matches,
            "quest_ids": [str(match["quest"]) for match in matches],
        }
        points.append(point)
    return points


def _nearest_links(projects: Sequence[Project], matrix: Any) -> list[dict[str, Any]]:
    """Link each project to its most similar neighbors.

    Similarity is the dot product of the rows of ``matrix``.
    NOTE(review): this equals cosine similarity only if the rows are
    unit-normalized; confirm against the index that produces them.

    Each project contributes its ``LINKS_PER_PROJECT`` highest-scoring
    neighbors; a pair chosen from both sides is emitted once. Links are
    ordered by descending score, then by the pair's row positions, and the
    reported score is clamped to ``[0, 1]`` and rounded to 4 decimals.
    """
    import numpy as np

    similarity = matrix @ matrix.T
    best_by_pair: dict[tuple[int, int], float] = {}
    for row in range(len(projects)):
        ranked = np.argsort(similarity[row])[::-1]
        others = [int(candidate) for candidate in ranked if int(candidate) != row]
        for other in others[:LINKS_PER_PROJECT]:
            # Key by the ordered pair so A-B and B-A collapse into one link.
            pair = (row, other) if row < other else (other, row)
            previous = best_by_pair.get(pair, -1.0)
            best_by_pair[pair] = max(float(similarity[pair]), previous)

    ordered = sorted(best_by_pair.items(), key=lambda item: (-item[1], item[0]))
    links: list[dict[str, Any]] = []
    for (left, right), score in ordered:
        clamped = max(0.0, min(1.0, score))
        links.append(
            {
                "source": projects[left].id,
                "target": projects[right].id,
                "score": round(clamped, 4),
            }
        )
    return links


def _quest_report(
    points: Sequence[Mapping[str, Any]],
    quest_matches: Mapping[str, Sequence[Mapping[str, Any]]],
    quest_source: str,
) -> dict[str, Any]:
    """Summarize, per quest, how many points match it and list top examples.

    For every quest in ``QUESTS`` the report gives the label and description
    from ``quest_profiles()`` (falling back to the quest id and an empty
    description), the number of matching points, and up to four examples
    ranked by highest match confidence and then by likes. The status is
    "analyzed" when ``quest_source`` is non-empty, otherwise "not_analyzed".
    """
    profile_by_id = {profile["id"]: profile for profile in quest_profiles()}

    def matches_of(point: Mapping[str, Any]) -> Sequence[Mapping[str, Any]]:
        return quest_matches.get(str(point["id"]), [])

    def has_quest(point: Mapping[str, Any], quest: Any) -> bool:
        return any(match["quest"] == quest for match in matches_of(point))

    def ranking(point: Mapping[str, Any], quest: Any) -> tuple[float, int]:
        confidences = (
            float(match["confidence"])
            for match in matches_of(point)
            if match["quest"] == quest
        )
        return (max(confidences, default=0.0), int(point.get("likes") or 0))

    quests = []
    for quest in QUESTS:
        matched_points = [point for point in points if has_quest(point, quest)]
        ranked = sorted(matched_points, key=lambda point: ranking(point, quest), reverse=True)
        examples = [
            {
                "id": point["id"],
                "title": point["title"],
                "url": point["url"],
            }
            for point in ranked[:4]
        ]
        profile = profile_by_id.get(quest, {"label": quest, "description": ""})
        quests.append(
            {
                "id": quest,
                "label": profile["label"],
                "description": profile["description"],
                "project_count": len(matched_points),
                "examples": examples,
            }
        )

    return {
        "status": "analyzed" if quest_source else "not_analyzed",
        "source": quest_source,
        "quests": quests,
    }