#!/usr/bin/env python3
"""
modeldna Stage 1 HF Scanner - core logic.
Given a HuggingFace model_id, validates architectural claims against the
ModelAtlas reference database. No weight download needed - uses config.json only.

This is the heart of the modeldna 'test before you download' feature.
"""
from __future__ import annotations
import json, hashlib, os, time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import requests

HF_API = "https://huggingface.co"
HF_DATASET = "RadicalNotionAI/modelatlas-reference"
DB = "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"

# In-process cache - loaded once per worker, reloaded after _REF_TTL expires
_REF_DF = None
_REF_LOADED_AT: float = 0.0
_REF_TTL = 3600  # reload at most once per hour


def _load_reference_df():
    """Load ModelAtlas reference parquet. Tries local snapshot first, then HF dataset."""
    global _REF_DF, _REF_LOADED_AT
    now = time.time()
    if _REF_DF is not None and (now - _REF_LOADED_AT) < _REF_TTL:
        return _REF_DF

    import pandas as pd

    # 1. Local snapshot (fast, used in dev / on local server)
    local_path = Path(__file__).parent.parent / "snapshots" / "modeldna_reference.parquet"
    if local_path.exists():
        try:
            _REF_DF = pd.read_parquet(local_path)
            _REF_LOADED_AT = now
            return _REF_DF
        except Exception:
            pass

    # 2. HF dataset (used on HF Space - downloaded and cached by huggingface_hub)
    try:
        from huggingface_hub import hf_hub_download
        path = hf_hub_download(
            repo_id=HF_DATASET,
            filename="modeldna_reference.parquet",
            repo_type="dataset",
        )
        _REF_DF = pd.read_parquet(path)
        _REF_LOADED_AT = now
        return _REF_DF
    except Exception:
        pass

    return None
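
# Columns the reference parquet is expected to expose, as consumed by the
# lookup in stage1_screen() below: model_id, org_display, hf_downloads,
# total_params, technique_signature, num_layers, hidden_size, vocab_size.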

# Known base model reference configs (canonical identifiers)
KNOWN_BASES = {
    "qwen3_5_text": {
        "name": "Qwen3.5 (dense)",
        "vocab_size": 248320,
        "model_type_patterns": ["qwen3_5_text", "qwen3_5"],
    },
    "qwen3_5_moe_text": {
        "name": "Qwen3.5 MoE",
        "vocab_size": 248320,
        "model_type_patterns": ["qwen3_5_moe_text", "qwen3_5_moe"],
    },
    "qwen3": {
        "name": "Qwen3",
        "vocab_size": [151936, 152064, 151851, 151670],
        "model_type_patterns": ["qwen3"],
        # 151936/152064 = standard Qwen3; 151851 = BAAI OpenSeek (domain token swap);
        # 151670 = OpenBMB SciCore-Mol (chemistry tokenizer variant)
    },
    "qwen2": {
        "name": "Qwen2.5 (incl. VL)",
        "vocab_size": [151936, 152064, 151680],
        "model_type_patterns": ["qwen2"],
        # 151680 = MiMo-Embodied-7B uses Qwen2.5-VL backbone with this vocab
    },
    "llama3": {
        "name": "Llama 3.x",
        "vocab_size": 128256,
        "model_type_patterns": ["llama"],
        "num_key_value_heads_hint": [8, 32],
    },
    "llama2": {
        "name": "Llama 2",
        "vocab_size": 32000,
        "model_type_patterns": ["llama"],
    },
    "mistral": {
        "name": "Mistral 7B family",
        "vocab_size": 32000,
        "model_type_patterns": ["mistral", "mixtral"],
    },
    "deepseek_v3": {
        "name": "DeepSeek V3/R1",
        "vocab_size": 129280,
        "model_type_patterns": ["deepseek_v3", "deepseek_v2"],
        "kv_lora_rank": 512,
    },
    "gemma": {
        "name": "Gemma family",
        "vocab_size": [256000, 262144],
        "model_type_patterns": ["gemma"],
    },
    "nemotron_h": {
        "name": "NemotronH (NVIDIA Mamba+MoE hybrid)",
        "vocab_size": 131072,
        "model_type_patterns": ["nemotron_h", "nemotronh"],
    },
    "ministral3": {
        "name": "Mistral 3.x (medium/large dense)",
        "vocab_size": 131072,
        "model_type_patterns": ["ministral3", "mistral3"],
        # Mistral Medium 3.5, hidden=12288, 88 layers - dense ~128B
        # Multimodal wrapper uses model_type=mistral3; LLM backbone is ministral3
        # vocab 131072 overlaps with NemotronH - exact model_type match scores higher
    },
    "glm4": {
        "name": "ZhipuAI GLM-4.x (4.5 / 4.6 / 4.7 / 4.6V text backbone)",
        "vocab_size": [151552, 151936, 154880],
        "model_type_patterns": ["glm4v_moe_text", "glm4v_moe", "glm4_moe_lite", "glm4_moe", "glm4", "chatglm"],
        # 151552 = GLM-4.5/4.6 dense+MoE and 4.6V multimodal text backbone
        # 154880 = GLM-4.7 series (including 4.7-Flash, glm4_moe_lite)
    },
    "seed_oss": {
        "name": "ByteDance Seed-OSS (dense)",
        "vocab_size": 155136,
        "model_type_patterns": ["seed_oss"],
        # Dense GQA, RoPE θ=1e7, 512K context, 80→8 KV heads
    },
    "bailing_v2": {
        "name": "AntGroup Bailing-V2 / V2.5 (inclusionAI Ling)",
        "vocab_size": 157184,
        "model_type_patterns": ["bailing_hybrid", "bailing_moe", "bailingmm_moe_v2_lite"],
        # V2 = bailing_moe; V2.5 = bailing_hybrid (MLA + linear-attn + MTP)
        # bailingmm_moe_v2_lite = Ming-flash-omni multimodal lite variant
    },
    "llada2": {
        "name": "inclusionAI LLaDA2 (discrete-diffusion MoE)",
        "vocab_size": [157184, 173568],
        "model_type_patterns": ["llada2_moe", "llada2"],
        # 157184 = text-only discrete diffusion (flash, base)
        # 173568 = Uni any-to-any variant - adds ~16K image codebook tokens to vocab
        # Non-autoregressive masked LM; separate family from Bailing-V2 by training paradigm
    },
    "kimi": {
        "name": "Moonshot Kimi (K2, Kimi-Linear)",
        "vocab_size": 163840,
        "model_type_patterns": ["kimi_linear", "kimi"],
        # Kimi-Linear adds linear_attn_config + MLA + MTP on Kimi MoE backbone
    },
    "ernie4_5_vl": {
        "name": "Baidu ERNIE 4.5 VL (MoE multimodal)",
        "vocab_size": 103424,
        "model_type_patterns": ["ernie4_5_moe_vl", "ernie4_5_vl"],
    },
    "qianfan_vl": {
        "name": "Baidu Qianfan-VL (dense multimodal)",
        "vocab_size": 182025,
        "model_type_patterns": ["qianfanvl_chat", "qianfan"],
        # Distinct Baidu tokenizer from ERNIE - two separate VLM lineages
        # model_type is qianfanvl_chat; qianfan prefix catches future variants
    },
    "interns1": {
        "name": "InternLM S1 (dense, long-chain reasoning)",
        "vocab_size": 153216,
        "model_type_patterns": ["interns1"],
    },
    "pangu_pro_moe": {
        "name": "FreedomIntelligence Pangu-R (Huawei Pangu-Pro-MoE)",
        "vocab_size": 153600,
        "model_type_patterns": ["pangupromoe"],
        # model_type in config is "PanguProMoE" - lowercased to pangupromoe for matching
        # MoE 80/8, first_k_dense_replace=4, hidden=4608, layers=50
    },
    "iquest_coder": {
        "name": "IQuest-Coder",
        "vocab_size": 76800,
        "model_type_patterns": ["iquestcoder"],
        # Code-specialized tokenizer (76800 = code-token-dense). Dense GQA 32→2.
        # Same family across 7B (14 layers) and 40B (80 layers).
    },
    "minicpm": {
        "name": "OpenBMB MiniCPM",
        "vocab_size": 73448,
        "model_type_patterns": ["minicpm"],
        # MiniCPM family (AgentCPM-Report etc.). Heavy GQA 32→2.
    },
    "step3_5": {
        "name": "StepFun Step-3.5 Flash",
        "vocab_size": [128815, 128896],
        "model_type_patterns": ["step3p5"],
        # Per-layer RoPE schedule: every 4th layer gets long-context theta (1e6/5e6),
        # others get 1e4. Sliding-window=512. First StepFun entry with multi-freq RoPE.
    },
    "mimo_v2": {
        "name": "Xiaomi MiMo V2.x",
        "vocab_size": 152576,
        "model_type_patterns": ["mimo_v2"],
        # V2.5: hidden=4096, 48 layers; V2.5-Pro: hidden=6144, 70 layers
    },
    "emu3": {
        "name": "BAAI Emu3 family (unified vision+text)",
        "vocab_size": [184622, 282926],
        "model_type_patterns": ["emu3"],
        # Emu3-Stage1 vocab=184622; Emu3.5 vocab=282926 (expanded vision codebook)
        # Emu3.5 also adds hidden 4096→5120, layers 32→64, sliding_window=4096
    },
    "hunyuan_v1": {
        "name": "Tencent Hunyuan V1 (dense + MoT multimodal)",
        "vocab_size": 120818,
        "model_type_patterns": ["hunyuan_v1_dense", "hunyuan_vl_mot", "hunyuan"],
        # Catches HY-Embodied-0.5 and HY-1.8B variants; MoT = Mixture of Tokens
    },
    "gpt_oss": {
        "name": "OpenAI gpt-oss (via InternVL3.5 wrapper)",
        "vocab_size": 200028,
        "model_type_patterns": ["gpt_oss"],
        # Caught via lifted text_config; InternVL3.5-GPT-OSS-20B uses this backbone
    },
    "valley": {
        "name": "ByteDance Valley (video-language)",
        "vocab_size": [151675, 151679],
        "model_type_patterns": ["valley"],
        # Valley-Eagle-7B (151675) and Valley2.5 (151679) - close but distinct vocabs
    },
    "starcoder2": {
        "name": "BigCode StarCoder2",
        "vocab_size": 49152,
        "model_type_patterns": ["starcoder2", "gpt_bigcode"],
        # 3B: hidden=3072/30L (97K dl), 7B: hidden=4608/32L, 15B: hidden=6144/40L
        # gpt_bigcode = tiny_starcoder_py and early StarCoder variants (234K dl)
        # Code-specialized tokenizer (49152 tokens)
    },
    "zaya": {
        "name": "Zyphra ZAYA1 (deep hybrid SSM+Attention MoE)",
        "vocab_size": 262272,
        "model_type_patterns": ["zaya"],
        # Zyphra's hybrid SSM+attention MoE lineage (follows Zamba2 design philosophy)
        # 8B:  hidden=2048, 80 layers,  16 experts, no explicit KV heads → SSM-dominant
        # 74B: hidden=4096, 120 layers, 24 experts, GQA 16→2 (extreme compression)
        # vocab ≈ Gemma tokenizer + 128 extra tokens (262144→262272)
        # Very deep-and-narrow: 120 layers at 74B vs Llama-3-70B's 80 layers
    },
}
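
# Worked example of how KNOWN_BASES drives the scoring in stage1_screen()
# (illustrative values, not fetched from HF): a config with
#   model_type="qwen3", vocab_size=151936
# hits the "qwen3" entry twice: vocab in the list (+3) and an exact
# model_type match (+3), for a score of 6, reported as confidence HIGH
# (score >= 5). A prefix-only model_type match scores +2 instead, and any
# candidate totalling below 3 is dropped.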


def fetch_config(model_id: str) -> Optional[dict]:
    """Fetch config.json from HuggingFace. Returns None on failure."""
    url = f"{HF_API}/{model_id}/resolve/main/config.json"
    try:
        headers = {}
        token = os.environ.get("HF_TOKEN", "")
        if token:
            headers["Authorization"] = f"Bearer {token}"
        r = requests.get(url, headers=headers, timeout=20)
        r.raise_for_status()
        return r.json()
    except Exception:
        return None


def fetch_model_metadata(model_id: str) -> dict:
    """Fetch HF model metadata (downloads, likes, author, tags)."""
    try:
        r = requests.get(f"{HF_API}/api/models/{model_id}", timeout=10)
        r.raise_for_status()
        d = r.json()
        return {
            "downloads": d.get("downloads", 0),
            "likes": d.get("likes", 0),
            "author": d.get("author", ""),
            "tags": d.get("tags", []),
            "pipeline_tag": d.get("pipeline_tag", ""),
            "base_model": d.get("cardData", {}).get("base_model", ""),
            "license": d.get("cardData", {}).get("license", ""),
            "created_at": d.get("createdAt", ""),
            "last_modified": d.get("lastModified", ""),
        }
    except Exception:
        return {}


def detect_claimed_base(model_id: str, config: dict, metadata: dict) -> dict:
    """Detect what base model a model claims to be derived from."""
    claims = {}
    name = model_id.split("/")[-1].lower()
    # Explicit base_model field
    if metadata.get("base_model"):
        claims["explicit_base"] = metadata["base_model"]
    # Name-based detection
    name_signals = []
    for term, base_key in [
        ("qwen3.5", "qwen3_5"), ("qwen3-5", "qwen3_5"), ("qwen35", "qwen3_5"),
        ("qwen3", "qwen3"), ("qwen2.5", "qwen2"), ("qwen2", "qwen2"),
        ("llama-3", "llama3"), ("llama3", "llama3"), ("llama-2", "llama2"),
        ("mistral", "mistral"), ("mixtral", "mistral"),
        ("deepseek", "deepseek_v3"), ("gemma", "gemma"),
    ]:
        if term in name:
            name_signals.append(base_key)
    if name_signals:
        claims["name_implies"] = name_signals
    # Suspicious claims in name
    suspicious = []
    for term in ["claude", "gpt", "chatgpt", "openai", "gemini", "anthropic"]:
        if term in name:
            suspicious.append(term)
    if suspicious:
        claims["suspicious_name_terms"] = suspicious
    return claims
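
# Example: for a (hypothetical) model id "someorg/claude-llama3-8b-distill"
# with no explicit base_model metadata, this returns
#   {"name_implies": ["llama3"], "suspicious_name_terms": ["claude"]}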


def stage1_screen(model_id: str, config: dict) -> dict:
    """
    Stage 1: Architecture screening against ModelAtlas reference.
    Returns a structured verdict without downloading any weights.
    Handles nested text_config (Qwen3.5/3.6, Mistral3, MiMo-V2.5 pattern).
    """
    # Lift nested LLM config into top-level when top-level vocab/hidden is absent.
    # Recurse up to 2 levels deep to handle models like Logics-MLLM where LLM backbone
    # is at thinker_config.text_config (two levels: thinker_config → text_config).
    _NESTED_KEYS = ("text_config", "llm_config", "thinker_config", "language_model")
    _SKIP_KEYS = ("text_config", "llm_config", "thinker_config", "language_model",
                  "vision_config", "audio_config", "sound_config")
    if not config.get("vocab_size"):
        for nested_key in _NESTED_KEYS:
            candidate = config.get(nested_key, {})
            if candidate:
                # One level deep
                if candidate.get("vocab_size"):
                    # Let nested model_type win over top-level wrapper type
                    outer = {k: v for k, v in config.items()
                             if k not in _SKIP_KEYS and k != "model_type"}
                    config = {**outer, **candidate}
                    break
                # Two levels deep (e.g. thinker_config.text_config)
                for inner_key in _NESTED_KEYS:
                    inner = candidate.get(inner_key, {})
                    if inner and inner.get("vocab_size"):
                        outer = {k: v for k, v in config.items()
                                 if k not in _SKIP_KEYS and k != "model_type"}
                        config = {**outer, **inner}
                        break
                else:
                    # for/else: inner loop found no vocab - try the next nested key
                    continue
                break  # inner loop broke: a two-level nested config was lifted
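
    # Illustrative config shapes this lifting handles (keys per _NESTED_KEYS):
    #   {"model_type": "mistral3",
    #    "text_config": {"model_type": "ministral3", "vocab_size": 131072, ...}}
    # lifts one level, and
    #   {"thinker_config": {"text_config": {"vocab_size": ..., ...}}}
    # lifts two levels. Either way the nested model_type replaces the wrapper's.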

    vocab = config.get("vocab_size")
    model_type = (config.get("model_type") or "").lower()
    hidden = config.get("hidden_size")
    layers = config.get("num_hidden_layers")
    kv_lora = config.get("kv_lora_rank")  # MLA signal
    base_model_field = config.get("base_model") or config.get("_name_or_path", "")

    # Compute architecture signature
    key_fields = sorted([
        f"vocab={vocab}", f"type={model_type}", f"hidden={hidden}",
        f"layers={layers}", f"kv_lora={kv_lora}",
    ])
    arch_sig = hashlib.md5("|".join(key_fields).encode()).hexdigest()[:12]

    # Match against known bases
    base_matches = []
    for base_key, base_info in KNOWN_BASES.items():
        score = 0
        reasons = []
        # Vocab match
        expected_vocab = base_info.get("vocab_size")
        if isinstance(expected_vocab, list):
            if vocab in expected_vocab:
                score += 3
                reasons.append(f"vocab matches ({vocab})")
        elif vocab == expected_vocab:
            score += 3
            reasons.append(f"vocab matches ({vocab})")
        # Model type match
        for pat in base_info.get("model_type_patterns", []):
            if model_type == pat:
                score += 3
                reasons.append(f"model_type '{model_type}' exact")
                break
            elif model_type.startswith(pat):
                score += 2
                reasons.append(f"model_type '{model_type}' matches {pat}")
                break
        # MLA signal
        if base_key == "deepseek_v3" and kv_lora and kv_lora > 0:
            score += 2
            reasons.append(f"MLA kv_lora_rank={kv_lora}")
        if score >= 3:
            base_matches.append({
                "base": base_key,
                "name": base_info["name"],
                "confidence": "HIGH" if score >= 5 else "MODERATE",
                "score": score,
                "evidence": reasons,
            })

    # Query ModelAtlas reference parquet for architecturally similar models
    db_matches = []
    try:
        ref = _load_reference_df()
        if ref is not None and vocab and hidden:
            # Exclude toy models ("tiny") and malformed ids with a leading slash,
            # mirroring the SQL filter below. Note: the pattern "tiny|/" would
            # match the "/" in every org/name id and reject all rows.
            hit = ref[
                (ref["vocab_size"] == vocab) &
                (ref["hidden_size"] == hidden) &
                (~ref["model_id"].str.contains(r"tiny|^/", case=False, na=False))
            ].sort_values("hf_downloads", ascending=False).head(5)
            db_matches = hit[
                ["model_id", "org_display", "hf_downloads", "total_params",
                 "technique_signature", "num_layers", "hidden_size", "vocab_size"]
            ].rename(columns={"org_display": "lab"}).to_dict("records")
    except Exception:
        pass

    # Also try local DB if available (dev / local server)
    if not db_matches:
        try:
            import psycopg2, psycopg2.extras
            conn = psycopg2.connect(DB)
            cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
            cur.execute("""
                SELECT m.model_id, o.name AS lab, m.hf_downloads, m.release_date,
                       a.technique_signature, a.total_params, a.num_layers, a.hidden_size, a.vocab_size
                FROM analyses a JOIN models m ON m.id=a.model_id
                JOIN organizations o ON m.org_id=o.id
                WHERE a.is_current=true AND a.vocab_size=%s AND a.hidden_size=%s
                  AND m.model_id NOT ILIKE '%%tiny%%' AND m.model_id NOT ILIKE '/%%'
                ORDER BY m.hf_downloads DESC NULLS LAST
                LIMIT 5
            """, (vocab, hidden))
            db_matches = [dict(r) for r in cur.fetchall()]
            cur.close()
            conn.close()
        except Exception:
            pass

    return {
        "arch_signature": arch_sig,
        "config_signals": {
            "model_type": model_type,
            "vocab_size": vocab,
            "hidden_size": hidden,
            "num_layers": layers,
            "has_mla": bool(kv_lora and kv_lora > 0),
            "kv_lora_rank": kv_lora,
        },
        "base_matches": sorted(base_matches, key=lambda x: -x["score"]),
        "modelatlas_similar": db_matches,
    }


def generate_verdict(
    model_id: str,
    config: dict,
    metadata: dict,
    claims: dict,
    stage1: dict,
) -> dict:
    """Synthesize all signals into a human-readable verdict."""
    now = datetime.now(timezone.utc).isoformat()
    base_matches = stage1["base_matches"]
    suspicious = claims.get("suspicious_name_terms", [])

    # Headline verdict
    if base_matches:
        top = base_matches[0]
        if top["confidence"] == "HIGH":
            architecture_verdict = f"CONFIRMED β€” architecture matches {top['name']}"
        else:
            architecture_verdict = f"LIKELY β€” architecture consistent with {top['name']}"
    else:
        architecture_verdict = "UNRECOGNIZED β€” architecture does not match any known base model"

    # Claim accuracy flags
    flags = []
    if "claude" in suspicious or "anthropic" in suspicious:
        flags.append({
            "type": "UNVERIFIABLE_CLAIM",
            "term": "claude/anthropic",
            "explanation": (
                "Claude weights are not publicly available β€” no weight transfer from Claude "
                "is possible. If this model used Claude-generated reasoning traces as training "
                "data (distillation), that is a post-training technique that leaves no "
                "architectural trace and cannot be verified from weights alone. "
                "The base architecture claim can be checked; the Claude claim cannot."
            ),
        })
    if "gpt" in suspicious or "openai" in suspicious or "chatgpt" in suspicious:
        flags.append({
            "type": "UNVERIFIABLE_CLAIM",
            "term": "gpt/openai",
            "explanation": "GPT-4/OpenAI weights are closed. Any weight transfer claim is false. Distillation via outputs is possible but unverifiable from architecture.",
        })
    if "gemini" in suspicious:
        flags.append({
            "type": "UNVERIFIABLE_CLAIM",
            "term": "gemini",
            "explanation": "Gemini weights are closed. Architecture shows no Gemini structure.",
        })

    # Name vs architecture consistency
    name_implied = claims.get("name_implies", [])
    if name_implied and base_matches:
        top_base = base_matches[0]["base"]
        if not any(n in top_base or top_base in n for n in name_implied):
            flags.append({
                "type": "NAME_MISMATCH",
                "explanation": f"Model name implies {name_implied} but architecture suggests {top_base}. Possible mislabeling.",
            })

    return {
        "model_id": model_id,
        "scanned_at": now,
        "verdict": {
            "architecture": architecture_verdict,
            "base_model_confirmed": base_matches[0]["name"] if base_matches else "Unknown",
            "confidence": base_matches[0]["confidence"] if base_matches else "NONE",
            "flags": flags,
            "flag_count": len(flags),
            "stage": "Stage 1 (config-only β€” no weight download)",
        },
        "evidence": {
            "config_signals": stage1["config_signals"],
            "base_matches": stage1["base_matches"][:3],
            "modelatlas_similar": stage1["modelatlas_similar"][:3],
            "claimed_base": claims.get("explicit_base"),
            "name_implies": name_implied,
        },
        "metadata": {
            "downloads": metadata.get("downloads", 0),
            "likes": metadata.get("likes", 0),
            "license": metadata.get("license", ""),
            "created_at": metadata.get("created_at", ""),
        },
        "note": (
            "Stage 1 validates architecture from config.json only (~2KB). "
            "Stage 2 weight analysis (requires model download) provides stronger confirmation. "
            "Powered by ModelAtlas β€” modeldna.ai Β· a RadicalNotion product."
        ),
    }


def scan(model_id: str) -> dict:
    """Full Stage 1 scan. Entry point."""
    t0 = time.time()

    # Detect unsupported formats before attempting config fetch
    name_lower = model_id.lower()
    if "gguf" in name_lower:
        return {
            "model_id": model_id,
            "error": (
                "GGUF models pack weights into a single file and don't have a standard config.json. "
                "Stage 1 scanning works with standard HuggingFace checkpoints (safetensors/PyTorch). "
                "Try the original (non-quantized) model instead β€” e.g. the unsloth/Qwen3.6-35B-A3B "
                "base would be Qwen/Qwen2.5-... or the upstream source. "
                "GGUF support is on the roadmap."
            ),
            "scanned_at": datetime.now(timezone.utc).isoformat(),
        }

    config = fetch_config(model_id)
    if not config:
        return {
            "model_id": model_id,
            "error": "Could not fetch config.json β€” model may be private, gated, or not exist on HuggingFace.",
            "scanned_at": datetime.now(timezone.utc).isoformat(),
        }
    metadata = fetch_model_metadata(model_id)
    claims = detect_claimed_base(model_id, config, metadata)
    stage1 = stage1_screen(model_id, config)
    verdict = generate_verdict(model_id, config, metadata, claims, stage1)
    verdict["elapsed_s"] = round(time.time() - t0, 2)
    return verdict


if __name__ == "__main__":
    import sys
    model_id = sys.argv[1] if len(sys.argv) > 1 else "Qwen/Qwen3.5-27B"
    result = scan(model_id)
    print(json.dumps(result, indent=2, default=str))