miosipov committed
Commit cf5d751 · verified · 1 parent: d4219ed

Upload folder using huggingface_hub

Files changed (39)
  1. README.md +41 -0
  2. config.json +2026 -0
  3. core/.ipynb_checkpoints/distill-checkpoint.py +184 -0
  4. core/.ipynb_checkpoints/finetune-checkpoint.py +267 -0
  5. core/.ipynb_checkpoints/profiler-checkpoint.py +236 -0
  6. core/.ipynb_checkpoints/proxy_cost-checkpoint.py +771 -0
  7. core/.ipynb_checkpoints/train-checkpoint.py +327 -0
  8. core/.ipynb_checkpoints/utils-checkpoint.py +190 -0
  9. core/__init__.py +0 -0
  10. core/__pycache__/__init__.cpython-310.pyc +0 -0
  11. core/__pycache__/distill.cpython-310.pyc +0 -0
  12. core/__pycache__/export.cpython-310.pyc +0 -0
  13. core/__pycache__/finetune.cpython-310.pyc +0 -0
  14. core/__pycache__/gates.cpython-310.pyc +0 -0
  15. core/__pycache__/profiler.cpython-310.pyc +0 -0
  16. core/__pycache__/proxy_cost.cpython-310.pyc +0 -0
  17. core/__pycache__/search_export.cpython-310.pyc +0 -0
  18. core/__pycache__/train.cpython-310.pyc +0 -0
  19. core/__pycache__/utils.cpython-310.pyc +0 -0
  20. core/distill.py +183 -0
  21. core/export.py +220 -0
  22. core/finetune.py +267 -0
  23. core/gates.py +389 -0
  24. core/profiler.py +236 -0
  25. core/proxy_cost.py +771 -0
  26. core/search_export.py +76 -0
  27. core/train.py +327 -0
  28. core/utils.py +190 -0
  29. custom_code.py +1 -0
  30. huggingface/.ipynb_checkpoints/llama-checkpoint.py +607 -0
  31. huggingface/.ipynb_checkpoints/vit-checkpoint.py +383 -0
  32. huggingface/__init__.py +0 -0
  33. huggingface/__pycache__/__init__.cpython-310.pyc +0 -0
  34. huggingface/__pycache__/vit.cpython-310.pyc +0 -0
  35. huggingface/llama.py +607 -0
  36. huggingface/registry.py +0 -0
  37. huggingface/vit.py +383 -0
  38. model_index.json +5 -0
  39. pytorch_model.bin +3 -0
README.md ADDED
@@ -0,0 +1,41 @@
---
library_name: pytorch
tags:
- resnet
- pruning
- knowledge-distillation
- speedup
license: apache-2.0
dataset: imagenet-1k
pipeline_tag: image-classification
---

# hawada/vit-base-patch16-224-rtx4090-gated

This repository contains two variants:

- **Gated student** (with learned pruning gates) – requires custom code.
- **Slim student** (post-prune/export) – loads with standard code (LLM) or bundled code (ResNet).

## Inference (LLM, slim)

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('hawada/vit-base-patch16-224-rtx4090-slim')
mdl = AutoModelForCausalLM.from_pretrained('hawada/vit-base-patch16-224-rtx4090-slim', torch_dtype='auto').eval()
x = tok('Hello', return_tensors='pt')
print(tok.decode(mdl.generate(**x, max_new_tokens=16)[0]))
```

## Notes

- The **gated** repo includes lightweight custom code (adapters/…, core/…) needed to attach/load gates.
- The **slim** LLM is exported to standard HF architecture for out-of-the-box loading.
- For ResNet, both repos include minimal custom code to define the module.

## Training metadata

```json
{
  "base_id": "google/vit-base-patch16-224",
  "variant": "gated-student",
  "repo_slim": "hawada/vit-base-patch16-224-rtx4090-slim"
}
```
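The card's snippet above covers only the LLM path, while this repo's `pipeline_tag` is `image-classification`. Below is a minimal offline sketch of the corresponding ViT inference pattern. The tiny `ViTConfig` is a hypothetical stand-in (made-up dimensions, random weights) so the example runs without downloading anything; for the real slim student one would presumably call `AutoModelForImageClassification.from_pretrained('hawada/vit-base-patch16-224-rtx4090-slim')`, assuming it exports to a standard checkpoint as the notes suggest.

```python
import torch
from transformers import ViTConfig, ViTForImageClassification

# Tiny stand-in config so the sketch runs offline (assumed, not the real model);
# swap in AutoModelForImageClassification.from_pretrained(<slim repo id>)
# to load the actual exported student.
cfg = ViTConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                intermediate_size=64, image_size=32, patch_size=16, num_labels=10)
mdl = ViTForImageClassification(cfg).eval()

pixels = torch.rand(1, 3, 32, 32)  # stands in for a preprocessed RGB image batch
with torch.no_grad():
    logits = mdl(pixel_values=pixels).logits
pred = logits.argmax(-1).item()
print(cfg.id2label[pred])  # prints one of 'LABEL_0'..'LABEL_9' (random weights)
```

With the real checkpoint, the matching `AutoImageProcessor` would handle resizing and normalization, and `id2label` would carry the ImageNet-1k class names from `config.json`.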
config.json ADDED
@@ -0,0 +1,2026 @@
{
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "tench, Tinca tinca",
    "1": "goldfish, Carassius auratus",
    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
    "3": "tiger shark, Galeocerdo cuvieri",
    "4": "hammerhead, hammerhead shark",
    "5": "electric ray, crampfish, numbfish, torpedo",
    "6": "stingray",
    "7": "cock",
    "8": "hen",
    "9": "ostrich, Struthio camelus",
    "10": "brambling, Fringilla montifringilla",
    "11": "goldfinch, Carduelis carduelis",
    "12": "house finch, linnet, Carpodacus mexicanus",
    "13": "junco, snowbird",
    "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
    "15": "robin, American robin, Turdus migratorius",
    "16": "bulbul",
    "17": "jay",
    "18": "magpie",
    "19": "chickadee",
    "20": "water ouzel, dipper",
    "21": "kite",
    "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
    "23": "vulture",
    "24": "great grey owl, great gray owl, Strix nebulosa",
    "25": "European fire salamander, Salamandra salamandra",
    "26": "common newt, Triturus vulgaris",
    "27": "eft",
    "28": "spotted salamander, Ambystoma maculatum",
    "29": "axolotl, mud puppy, Ambystoma mexicanum",
    "30": "bullfrog, Rana catesbeiana",
    "31": "tree frog, tree-frog",
    "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
    "33": "loggerhead, loggerhead turtle, Caretta caretta",
    "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
    "35": "mud turtle",
    "36": "terrapin",
    "37": "box turtle, box tortoise",
    "38": "banded gecko",
    "39": "common iguana, iguana, Iguana iguana",
    "40": "American chameleon, anole, Anolis carolinensis",
    "41": "whiptail, whiptail lizard",
    "42": "agama",
    "43": "frilled lizard, Chlamydosaurus kingi",
    "44": "alligator lizard",
    "45": "Gila monster, Heloderma suspectum",
    "46": "green lizard, Lacerta viridis",
    "47": "African chameleon, Chamaeleo chamaeleon",
    "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
    "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
    "50": "American alligator, Alligator mississipiensis",
    "51": "triceratops",
    "52": "thunder snake, worm snake, Carphophis amoenus",
    "53": "ringneck snake, ring-necked snake, ring snake",
    "54": "hognose snake, puff adder, sand viper",
    "55": "green snake, grass snake",
    "56": "king snake, kingsnake",
    "57": "garter snake, grass snake",
    "58": "water snake",
    "59": "vine snake",
    "60": "night snake, Hypsiglena torquata",
    "61": "boa constrictor, Constrictor constrictor",
    "62": "rock python, rock snake, Python sebae",
    "63": "Indian cobra, Naja naja",
    "64": "green mamba",
    "65": "sea snake",
    "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
    "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
    "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
    "69": "trilobite",
    "70": "harvestman, daddy longlegs, Phalangium opilio",
    "71": "scorpion",
    "72": "black and gold garden spider, Argiope aurantia",
    "73": "barn spider, Araneus cavaticus",
    "74": "garden spider, Aranea diademata",
    "75": "black widow, Latrodectus mactans",
    "76": "tarantula",
    "77": "wolf spider, hunting spider",
    "78": "tick",
    "79": "centipede",
    "80": "black grouse",
    "81": "ptarmigan",
    "82": "ruffed grouse, partridge, Bonasa umbellus",
    "83": "prairie chicken, prairie grouse, prairie fowl",
    "84": "peacock",
    "85": "quail",
    "86": "partridge",
    "87": "African grey, African gray, Psittacus erithacus",
    "88": "macaw",
    "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
    "90": "lorikeet",
    "91": "coucal",
    "92": "bee eater",
    "93": "hornbill",
    "94": "hummingbird",
    "95": "jacamar",
    "96": "toucan",
    "97": "drake",
    "98": "red-breasted merganser, Mergus serrator",
    "99": "goose",
    "100": "black swan, Cygnus atratus",
    "101": "tusker",
    "102": "echidna, spiny anteater, anteater",
    "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
    "104": "wallaby, brush kangaroo",
    "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
    "106": "wombat",
    "107": "jellyfish",
    "108": "sea anemone, anemone",
    "109": "brain coral",
    "110": "flatworm, platyhelminth",
    "111": "nematode, nematode worm, roundworm",
    "112": "conch",
    "113": "snail",
    "114": "slug",
    "115": "sea slug, nudibranch",
    "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
    "117": "chambered nautilus, pearly nautilus, nautilus",
    "118": "Dungeness crab, Cancer magister",
    "119": "rock crab, Cancer irroratus",
    "120": "fiddler crab",
    "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
    "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
    "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
    "124": "crayfish, crawfish, crawdad, crawdaddy",
    "125": "hermit crab",
    "126": "isopod",
    "127": "white stork, Ciconia ciconia",
    "128": "black stork, Ciconia nigra",
    "129": "spoonbill",
    "130": "flamingo",
    "131": "little blue heron, Egretta caerulea",
    "132": "American egret, great white heron, Egretta albus",
    "133": "bittern",
    "134": "crane",
    "135": "limpkin, Aramus pictus",
    "136": "European gallinule, Porphyrio porphyrio",
    "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
    "138": "bustard",
    "139": "ruddy turnstone, Arenaria interpres",
    "140": "red-backed sandpiper, dunlin, Erolia alpina",
    "141": "redshank, Tringa totanus",
    "142": "dowitcher",
    "143": "oystercatcher, oyster catcher",
    "144": "pelican",
    "145": "king penguin, Aptenodytes patagonica",
    "146": "albatross, mollymawk",
    "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
    "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
    "149": "dugong, Dugong dugon",
    "150": "sea lion",
    "151": "Chihuahua",
    "152": "Japanese spaniel",
    "153": "Maltese dog, Maltese terrier, Maltese",
    "154": "Pekinese, Pekingese, Peke",
    "155": "Shih-Tzu",
    "156": "Blenheim spaniel",
    "157": "papillon",
    "158": "toy terrier",
    "159": "Rhodesian ridgeback",
    "160": "Afghan hound, Afghan",
    "161": "basset, basset hound",
    "162": "beagle",
    "163": "bloodhound, sleuthhound",
    "164": "bluetick",
    "165": "black-and-tan coonhound",
    "166": "Walker hound, Walker foxhound",
    "167": "English foxhound",
    "168": "redbone",
    "169": "borzoi, Russian wolfhound",
    "170": "Irish wolfhound",
    "171": "Italian greyhound",
    "172": "whippet",
    "173": "Ibizan hound, Ibizan Podenco",
    "174": "Norwegian elkhound, elkhound",
    "175": "otterhound, otter hound",
    "176": "Saluki, gazelle hound",
    "177": "Scottish deerhound, deerhound",
    "178": "Weimaraner",
    "179": "Staffordshire bullterrier, Staffordshire bull terrier",
    "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
    "181": "Bedlington terrier",
    "182": "Border terrier",
    "183": "Kerry blue terrier",
    "184": "Irish terrier",
    "185": "Norfolk terrier",
    "186": "Norwich terrier",
    "187": "Yorkshire terrier",
    "188": "wire-haired fox terrier",
    "189": "Lakeland terrier",
    "190": "Sealyham terrier, Sealyham",
    "191": "Airedale, Airedale terrier",
    "192": "cairn, cairn terrier",
    "193": "Australian terrier",
    "194": "Dandie Dinmont, Dandie Dinmont terrier",
    "195": "Boston bull, Boston terrier",
    "196": "miniature schnauzer",
    "197": "giant schnauzer",
    "198": "standard schnauzer",
    "199": "Scotch terrier, Scottish terrier, Scottie",
    "200": "Tibetan terrier, chrysanthemum dog",
    "201": "silky terrier, Sydney silky",
    "202": "soft-coated wheaten terrier",
    "203": "West Highland white terrier",
    "204": "Lhasa, Lhasa apso",
    "205": "flat-coated retriever",
    "206": "curly-coated retriever",
    "207": "golden retriever",
    "208": "Labrador retriever",
    "209": "Chesapeake Bay retriever",
    "210": "German short-haired pointer",
    "211": "vizsla, Hungarian pointer",
    "212": "English setter",
    "213": "Irish setter, red setter",
    "214": "Gordon setter",
    "215": "Brittany spaniel",
    "216": "clumber, clumber spaniel",
    "217": "English springer, English springer spaniel",
    "218": "Welsh springer spaniel",
    "219": "cocker spaniel, English cocker spaniel, cocker",
    "220": "Sussex spaniel",
    "221": "Irish water spaniel",
    "222": "kuvasz",
    "223": "schipperke",
    "224": "groenendael",
    "225": "malinois",
    "226": "briard",
    "227": "kelpie",
    "228": "komondor",
    "229": "Old English sheepdog, bobtail",
    "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
    "231": "collie",
    "232": "Border collie",
    "233": "Bouvier des Flandres, Bouviers des Flandres",
    "234": "Rottweiler",
    "235": "German shepherd, German shepherd dog, German police dog, alsatian",
    "236": "Doberman, Doberman pinscher",
    "237": "miniature pinscher",
    "238": "Greater Swiss Mountain dog",
    "239": "Bernese mountain dog",
    "240": "Appenzeller",
    "241": "EntleBucher",
    "242": "boxer",
    "243": "bull mastiff",
    "244": "Tibetan mastiff",
    "245": "French bulldog",
    "246": "Great Dane",
    "247": "Saint Bernard, St Bernard",
    "248": "Eskimo dog, husky",
    "249": "malamute, malemute, Alaskan malamute",
    "250": "Siberian husky",
    "251": "dalmatian, coach dog, carriage dog",
    "252": "affenpinscher, monkey pinscher, monkey dog",
    "253": "basenji",
    "254": "pug, pug-dog",
    "255": "Leonberg",
    "256": "Newfoundland, Newfoundland dog",
    "257": "Great Pyrenees",
    "258": "Samoyed, Samoyede",
    "259": "Pomeranian",
    "260": "chow, chow chow",
    "261": "keeshond",
    "262": "Brabancon griffon",
    "263": "Pembroke, Pembroke Welsh corgi",
    "264": "Cardigan, Cardigan Welsh corgi",
    "265": "toy poodle",
    "266": "miniature poodle",
    "267": "standard poodle",
    "268": "Mexican hairless",
    "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
    "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
    "271": "red wolf, maned wolf, Canis rufus, Canis niger",
    "272": "coyote, prairie wolf, brush wolf, Canis latrans",
    "273": "dingo, warrigal, warragal, Canis dingo",
    "274": "dhole, Cuon alpinus",
    "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
    "276": "hyena, hyaena",
    "277": "red fox, Vulpes vulpes",
    "278": "kit fox, Vulpes macrotis",
    "279": "Arctic fox, white fox, Alopex lagopus",
    "280": "grey fox, gray fox, Urocyon cinereoargenteus",
    "281": "tabby, tabby cat",
    "282": "tiger cat",
    "283": "Persian cat",
    "284": "Siamese cat, Siamese",
    "285": "Egyptian cat",
    "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
    "287": "lynx, catamount",
    "288": "leopard, Panthera pardus",
    "289": "snow leopard, ounce, Panthera uncia",
    "290": "jaguar, panther, Panthera onca, Felis onca",
    "291": "lion, king of beasts, Panthera leo",
    "292": "tiger, Panthera tigris",
    "293": "cheetah, chetah, Acinonyx jubatus",
    "294": "brown bear, bruin, Ursus arctos",
    "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
    "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
    "297": "sloth bear, Melursus ursinus, Ursus ursinus",
    "298": "mongoose",
    "299": "meerkat, mierkat",
    "300": "tiger beetle",
    "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
    "302": "ground beetle, carabid beetle",
    "303": "long-horned beetle, longicorn, longicorn beetle",
    "304": "leaf beetle, chrysomelid",
    "305": "dung beetle",
    "306": "rhinoceros beetle",
    "307": "weevil",
    "308": "fly",
    "309": "bee",
    "310": "ant, emmet, pismire",
    "311": "grasshopper, hopper",
    "312": "cricket",
    "313": "walking stick, walkingstick, stick insect",
    "314": "cockroach, roach",
    "315": "mantis, mantid",
    "316": "cicada, cicala",
    "317": "leafhopper",
    "318": "lacewing, lacewing fly",
    "319": "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
    "320": "damselfly",
    "321": "admiral",
    "322": "ringlet, ringlet butterfly",
    "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
    "324": "cabbage butterfly",
    "325": "sulphur butterfly, sulfur butterfly",
    "326": "lycaenid, lycaenid butterfly",
    "327": "starfish, sea star",
    "328": "sea urchin",
    "329": "sea cucumber, holothurian",
    "330": "wood rabbit, cottontail, cottontail rabbit",
    "331": "hare",
    "332": "Angora, Angora rabbit",
    "333": "hamster",
    "334": "porcupine, hedgehog",
    "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
    "336": "marmot",
    "337": "beaver",
    "338": "guinea pig, Cavia cobaya",
    "339": "sorrel",
    "340": "zebra",
    "341": "hog, pig, grunter, squealer, Sus scrofa",
    "342": "wild boar, boar, Sus scrofa",
    "343": "warthog",
    "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
    "345": "ox",
    "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
    "347": "bison",
    "348": "ram, tup",
    "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
    "350": "ibex, Capra ibex",
    "351": "hartebeest",
    "352": "impala, Aepyceros melampus",
    "353": "gazelle",
    "354": "Arabian camel, dromedary, Camelus dromedarius",
    "355": "llama",
    "356": "weasel",
    "357": "mink",
    "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
    "359": "black-footed ferret, ferret, Mustela nigripes",
    "360": "otter",
    "361": "skunk, polecat, wood pussy",
    "362": "badger",
    "363": "armadillo",
    "364": "three-toed sloth, ai, Bradypus tridactylus",
    "365": "orangutan, orang, orangutang, Pongo pygmaeus",
    "366": "gorilla, Gorilla gorilla",
    "367": "chimpanzee, chimp, Pan troglodytes",
    "368": "gibbon, Hylobates lar",
    "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
    "370": "guenon, guenon monkey",
    "371": "patas, hussar monkey, Erythrocebus patas",
    "372": "baboon",
    "373": "macaque",
    "374": "langur",
    "375": "colobus, colobus monkey",
    "376": "proboscis monkey, Nasalis larvatus",
    "377": "marmoset",
    "378": "capuchin, ringtail, Cebus capucinus",
    "379": "howler monkey, howler",
    "380": "titi, titi monkey",
    "381": "spider monkey, Ateles geoffroyi",
    "382": "squirrel monkey, Saimiri sciureus",
    "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
    "384": "indri, indris, Indri indri, Indri brevicaudatus",
    "385": "Indian elephant, Elephas maximus",
    "386": "African elephant, Loxodonta africana",
    "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
    "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
    "389": "barracouta, snoek",
    "390": "eel",
    "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
    "392": "rock beauty, Holocanthus tricolor",
    "393": "anemone fish",
    "394": "sturgeon",
    "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
    "396": "lionfish",
    "397": "puffer, pufferfish, blowfish, globefish",
    "398": "abacus",
    "399": "abaya",
    "400": "academic gown, academic robe, judge's robe",
    "401": "accordion, piano accordion, squeeze box",
    "402": "acoustic guitar",
    "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
    "404": "airliner",
    "405": "airship, dirigible",
    "406": "altar",
    "407": "ambulance",
    "408": "amphibian, amphibious vehicle",
    "409": "analog clock",
    "410": "apiary, bee house",
    "411": "apron",
    "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
    "413": "assault rifle, assault gun",
    "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
    "415": "bakery, bakeshop, bakehouse",
    "416": "balance beam, beam",
    "417": "balloon",
    "418": "ballpoint, ballpoint pen, ballpen, Biro",
    "419": "Band Aid",
    "420": "banjo",
    "421": "bannister, banister, balustrade, balusters, handrail",
    "422": "barbell",
    "423": "barber chair",
    "424": "barbershop",
    "425": "barn",
    "426": "barometer",
    "427": "barrel, cask",
    "428": "barrow, garden cart, lawn cart, wheelbarrow",
    "429": "baseball",
    "430": "basketball",
    "431": "bassinet",
    "432": "bassoon",
    "433": "bathing cap, swimming cap",
    "434": "bath towel",
    "435": "bathtub, bathing tub, bath, tub",
    "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
    "437": "beacon, lighthouse, beacon light, pharos",
    "438": "beaker",
    "439": "bearskin, busby, shako",
    "440": "beer bottle",
    "441": "beer glass",
    "442": "bell cote, bell cot",
    "443": "bib",
    "444": "bicycle-built-for-two, tandem bicycle, tandem",
    "445": "bikini, two-piece",
    "446": "binder, ring-binder",
    "447": "binoculars, field glasses, opera glasses",
    "448": "birdhouse",
    "449": "boathouse",
    "450": "bobsled, bobsleigh, bob",
    "451": "bolo tie, bolo, bola tie, bola",
    "452": "bonnet, poke bonnet",
    "453": "bookcase",
    "454": "bookshop, bookstore, bookstall",
    "455": "bottlecap",
    "456": "bow",
    "457": "bow tie, bow-tie, bowtie",
    "458": "brass, memorial tablet, plaque",
    "459": "brassiere, bra, bandeau",
    "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
    "461": "breastplate, aegis, egis",
    "462": "broom",
    "463": "bucket, pail",
    "464": "buckle",
    "465": "bulletproof vest",
    "466": "bullet train, bullet",
    "467": "butcher shop, meat market",
    "468": "cab, hack, taxi, taxicab",
    "469": "caldron, cauldron",
    "470": "candle, taper, wax light",
    "471": "cannon",
    "472": "canoe",
    "473": "can opener, tin opener",
    "474": "cardigan",
    "475": "car mirror",
    "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
    "477": "carpenter's kit, tool kit",
    "478": "carton",
    "479": "car wheel",
    "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
    "481": "cassette",
    "482": "cassette player",
    "483": "castle",
    "484": "catamaran",
    "485": "CD player",
    "486": "cello, violoncello",
    "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
    "488": "chain",
    "489": "chainlink fence",
    "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
    "491": "chain saw, chainsaw",
    "492": "chest",
    "493": "chiffonier, commode",
    "494": "chime, bell, gong",
    "495": "china cabinet, china closet",
    "496": "Christmas stocking",
    "497": "church, church building",
    "498": "cinema, movie theater, movie theatre, movie house, picture palace",
    "499": "cleaver, meat cleaver, chopper",
    "500": "cliff dwelling",
    "501": "cloak",
    "502": "clog, geta, patten, sabot",
    "503": "cocktail shaker",
    "504": "coffee mug",
    "505": "coffeepot",
    "506": "coil, spiral, volute, whorl, helix",
    "507": "combination lock",
    "508": "computer keyboard, keypad",
    "509": "confectionery, confectionary, candy store",
    "510": "container ship, containership, container vessel",
    "511": "convertible",
    "512": "corkscrew, bottle screw",
    "513": "cornet, horn, trumpet, trump",
    "514": "cowboy boot",
    "515": "cowboy hat, ten-gallon hat",
    "516": "cradle",
    "517": "crane",
    "518": "crash helmet",
    "519": "crate",
    "520": "crib, cot",
    "521": "Crock Pot",
    "522": "croquet ball",
    "523": "crutch",
    "524": "cuirass",
    "525": "dam, dike, dyke",
    "526": "desk",
    "527": "desktop computer",
    "528": "dial telephone, dial phone",
    "529": "diaper, nappy, napkin",
    "530": "digital clock",
    "531": "digital watch",
    "532": "dining table, board",
    "533": "dishrag, dishcloth",
    "534": "dishwasher, dish washer, dishwashing machine",
    "535": "disk brake, disc brake",
    "536": "dock, dockage, docking facility",
    "537": "dogsled, dog sled, dog sleigh",
    "538": "dome",
    "539": "doormat, welcome mat",
    "540": "drilling platform, offshore rig",
    "541": "drum, membranophone, tympan",
    "542": "drumstick",
    "543": "dumbbell",
    "544": "Dutch oven",
    "545": "electric fan, blower",
    "546": "electric guitar",
    "547": "electric locomotive",
    "548": "entertainment center",
    "549": "envelope",
    "550": "espresso maker",
    "551": "face powder",
    "552": "feather boa, boa",
    "553": "file, file cabinet, filing cabinet",
    "554": "fireboat",
    "555": "fire engine, fire truck",
    "556": "fire screen, fireguard",
    "557": "flagpole, flagstaff",
    "558": "flute, transverse flute",
    "559": "folding chair",
    "560": "football helmet",
    "561": "forklift",
    "562": "fountain",
    "563": "fountain pen",
    "564": "four-poster",
    "565": "freight car",
    "566": "French horn, horn",
    "567": "frying pan, frypan, skillet",
    "568": "fur coat",
    "569": "garbage truck, dustcart",
    "570": "gasmask, respirator, gas helmet",
    "571": "gas pump, gasoline pump, petrol pump, island dispenser",
    "572": "goblet",
    "573": "go-kart",
    "574": "golf ball",
    "575": "golfcart, golf cart",
    "576": "gondola",
    "577": "gong, tam-tam",
    "578": "gown",
    "579": "grand piano, grand",
    "580": "greenhouse, nursery, glasshouse",
    "581": "grille, radiator grille",
    "582": "grocery store, grocery, food market, market",
    "583": "guillotine",
    "584": "hair slide",
    "585": "hair spray",
    "586": "half track",
    "587": "hammer",
    "588": "hamper",
    "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
    "590": "hand-held computer, hand-held microcomputer",
    "591": "handkerchief, hankie, hanky, hankey",
    "592": "hard disc, hard disk, fixed disk",
    "593": "harmonica, mouth organ, harp, mouth harp",
    "594": "harp",
    "595": "harvester, reaper",
    "596": "hatchet",
    "597": "holster",
    "598": "home theater, home theatre",
    "599": "honeycomb",
    "600": "hook, claw",
    "601": "hoopskirt, crinoline",
    "602": "horizontal bar, high bar",
    "603": "horse cart, horse-cart",
    "604": "hourglass",
    "605": "iPod",
    "606": "iron, smoothing iron",
    "607": "jack-o'-lantern",
    "608": "jean, blue jean, denim",
    "609": "jeep, landrover",
    "610": "jersey, T-shirt, tee shirt",
    "611": "jigsaw puzzle",
    "612": "jinrikisha, ricksha, rickshaw",
    "613": "joystick",
    "614": "kimono",
    "615": "knee pad",
    "616": "knot",
    "617": "lab coat, laboratory coat",
    "618": "ladle",
    "619": "lampshade, lamp shade",
    "620": "laptop, laptop computer",
    "621": "lawn mower, mower",
    "622": "lens cap, lens cover",
    "623": "letter opener, paper knife, paperknife",
    "624": "library",
    "625": "lifeboat",
    "626": "lighter, light, igniter, ignitor",
    "627": "limousine, limo",
    "628": "liner, ocean liner",
    "629": "lipstick, lip rouge",
    "630": "Loafer",
    "631": "lotion",
    "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
    "633": "loupe, jeweler's loupe",
    "634": "lumbermill, sawmill",
    "635": "magnetic compass",
    "636": "mailbag, postbag",
    "637": "mailbox, letter box",
    "638": "maillot",
    "639": "maillot, tank suit",
    "640": "manhole cover",
    "641": "maraca",
    "642": "marimba, xylophone",
    "643": "mask",
    "644": "matchstick",
    "645": "maypole",
    "646": "maze, labyrinth",
    "647": "measuring cup",
    "648": "medicine chest, medicine cabinet",
    "649": "megalith, megalithic structure",
    "650": "microphone, mike",
    "651": "microwave, microwave oven",
    "652": "military uniform",
    "653": "milk can",
    "654": "minibus",
    "655": "miniskirt, mini",
    "656": "minivan",
    "657": "missile",
    "658": "mitten",
    "659": "mixing bowl",
    "660": "mobile home, manufactured home",
672
+ "661": "Model T",
673
+ "662": "modem",
674
+ "663": "monastery",
675
+ "664": "monitor",
676
+ "665": "moped",
677
+ "666": "mortar",
678
+ "667": "mortarboard",
679
+ "668": "mosque",
680
+ "669": "mosquito net",
681
+ "670": "motor scooter, scooter",
682
+ "671": "mountain bike, all-terrain bike, off-roader",
683
+ "672": "mountain tent",
684
+ "673": "mouse, computer mouse",
685
+ "674": "mousetrap",
686
+ "675": "moving van",
687
+ "676": "muzzle",
688
+ "677": "nail",
689
+ "678": "neck brace",
690
+ "679": "necklace",
691
+ "680": "nipple",
692
+ "681": "notebook, notebook computer",
693
+ "682": "obelisk",
694
+ "683": "oboe, hautboy, hautbois",
695
+ "684": "ocarina, sweet potato",
696
+ "685": "odometer, hodometer, mileometer, milometer",
697
+ "686": "oil filter",
698
+ "687": "organ, pipe organ",
699
+ "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
700
+ "689": "overskirt",
701
+ "690": "oxcart",
702
+ "691": "oxygen mask",
703
+ "692": "packet",
704
+ "693": "paddle, boat paddle",
705
+ "694": "paddlewheel, paddle wheel",
706
+ "695": "padlock",
707
+ "696": "paintbrush",
708
+ "697": "pajama, pyjama, pj's, jammies",
709
+ "698": "palace",
710
+ "699": "panpipe, pandean pipe, syrinx",
711
+ "700": "paper towel",
712
+ "701": "parachute, chute",
713
+ "702": "parallel bars, bars",
714
+ "703": "park bench",
715
+ "704": "parking meter",
716
+ "705": "passenger car, coach, carriage",
717
+ "706": "patio, terrace",
718
+ "707": "pay-phone, pay-station",
719
+ "708": "pedestal, plinth, footstall",
720
+ "709": "pencil box, pencil case",
721
+ "710": "pencil sharpener",
722
+ "711": "perfume, essence",
723
+ "712": "Petri dish",
724
+ "713": "photocopier",
725
+ "714": "pick, plectrum, plectron",
726
+ "715": "pickelhaube",
727
+ "716": "picket fence, paling",
728
+ "717": "pickup, pickup truck",
729
+ "718": "pier",
730
+ "719": "piggy bank, penny bank",
731
+ "720": "pill bottle",
732
+ "721": "pillow",
733
+ "722": "ping-pong ball",
734
+ "723": "pinwheel",
735
+ "724": "pirate, pirate ship",
736
+ "725": "pitcher, ewer",
737
+ "726": "plane, carpenter's plane, woodworking plane",
738
+ "727": "planetarium",
739
+ "728": "plastic bag",
740
+ "729": "plate rack",
741
+ "730": "plow, plough",
742
+ "731": "plunger, plumber's helper",
743
+ "732": "Polaroid camera, Polaroid Land camera",
744
+ "733": "pole",
745
+ "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
746
+ "735": "poncho",
747
+ "736": "pool table, billiard table, snooker table",
748
+ "737": "pop bottle, soda bottle",
749
+ "738": "pot, flowerpot",
750
+ "739": "potter's wheel",
751
+ "740": "power drill",
752
+ "741": "prayer rug, prayer mat",
753
+ "742": "printer",
754
+ "743": "prison, prison house",
755
+ "744": "projectile, missile",
756
+ "745": "projector",
757
+ "746": "puck, hockey puck",
758
+ "747": "punching bag, punch bag, punching ball, punchball",
759
+ "748": "purse",
760
+ "749": "quill, quill pen",
761
+ "750": "quilt, comforter, comfort, puff",
762
+ "751": "racer, race car, racing car",
763
+ "752": "racket, racquet",
764
+ "753": "radiator",
765
+ "754": "radio, wireless",
766
+ "755": "radio telescope, radio reflector",
767
+ "756": "rain barrel",
768
+ "757": "recreational vehicle, RV, R.V.",
769
+ "758": "reel",
770
+ "759": "reflex camera",
771
+ "760": "refrigerator, icebox",
772
+ "761": "remote control, remote",
773
+ "762": "restaurant, eating house, eating place, eatery",
774
+ "763": "revolver, six-gun, six-shooter",
775
+ "764": "rifle",
776
+ "765": "rocking chair, rocker",
777
+ "766": "rotisserie",
778
+ "767": "rubber eraser, rubber, pencil eraser",
779
+ "768": "rugby ball",
780
+ "769": "rule, ruler",
781
+ "770": "running shoe",
782
+ "771": "safe",
783
+ "772": "safety pin",
784
+ "773": "saltshaker, salt shaker",
785
+ "774": "sandal",
786
+ "775": "sarong",
787
+ "776": "sax, saxophone",
788
+ "777": "scabbard",
789
+ "778": "scale, weighing machine",
790
+ "779": "school bus",
791
+ "780": "schooner",
792
+ "781": "scoreboard",
793
+ "782": "screen, CRT screen",
794
+ "783": "screw",
795
+ "784": "screwdriver",
796
+ "785": "seat belt, seatbelt",
797
+ "786": "sewing machine",
798
+ "787": "shield, buckler",
799
+ "788": "shoe shop, shoe-shop, shoe store",
800
+ "789": "shoji",
801
+ "790": "shopping basket",
802
+ "791": "shopping cart",
803
+ "792": "shovel",
804
+ "793": "shower cap",
805
+ "794": "shower curtain",
806
+ "795": "ski",
807
+ "796": "ski mask",
808
+ "797": "sleeping bag",
809
+ "798": "slide rule, slipstick",
810
+ "799": "sliding door",
811
+ "800": "slot, one-armed bandit",
812
+ "801": "snorkel",
813
+ "802": "snowmobile",
814
+ "803": "snowplow, snowplough",
815
+ "804": "soap dispenser",
816
+ "805": "soccer ball",
817
+ "806": "sock",
818
+ "807": "solar dish, solar collector, solar furnace",
819
+ "808": "sombrero",
820
+ "809": "soup bowl",
821
+ "810": "space bar",
822
+ "811": "space heater",
823
+ "812": "space shuttle",
824
+ "813": "spatula",
825
+ "814": "speedboat",
826
+ "815": "spider web, spider's web",
827
+ "816": "spindle",
828
+ "817": "sports car, sport car",
829
+ "818": "spotlight, spot",
830
+ "819": "stage",
831
+ "820": "steam locomotive",
832
+ "821": "steel arch bridge",
833
+ "822": "steel drum",
834
+ "823": "stethoscope",
835
+ "824": "stole",
836
+ "825": "stone wall",
837
+ "826": "stopwatch, stop watch",
838
+ "827": "stove",
839
+ "828": "strainer",
840
+ "829": "streetcar, tram, tramcar, trolley, trolley car",
841
+ "830": "stretcher",
842
+ "831": "studio couch, day bed",
843
+ "832": "stupa, tope",
844
+ "833": "submarine, pigboat, sub, U-boat",
845
+ "834": "suit, suit of clothes",
846
+ "835": "sundial",
847
+ "836": "sunglass",
848
+ "837": "sunglasses, dark glasses, shades",
849
+ "838": "sunscreen, sunblock, sun blocker",
850
+ "839": "suspension bridge",
851
+ "840": "swab, swob, mop",
852
+ "841": "sweatshirt",
853
+ "842": "swimming trunks, bathing trunks",
854
+ "843": "swing",
855
+ "844": "switch, electric switch, electrical switch",
856
+ "845": "syringe",
857
+ "846": "table lamp",
858
+ "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
859
+ "848": "tape player",
860
+ "849": "teapot",
861
+ "850": "teddy, teddy bear",
862
+ "851": "television, television system",
863
+ "852": "tennis ball",
864
+ "853": "thatch, thatched roof",
865
+ "854": "theater curtain, theatre curtain",
866
+ "855": "thimble",
867
+ "856": "thresher, thrasher, threshing machine",
868
+ "857": "throne",
869
+ "858": "tile roof",
870
+ "859": "toaster",
871
+ "860": "tobacco shop, tobacconist shop, tobacconist",
872
+ "861": "toilet seat",
873
+ "862": "torch",
874
+ "863": "totem pole",
875
+ "864": "tow truck, tow car, wrecker",
876
+ "865": "toyshop",
877
+ "866": "tractor",
878
+ "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
879
+ "868": "tray",
880
+ "869": "trench coat",
881
+ "870": "tricycle, trike, velocipede",
882
+ "871": "trimaran",
883
+ "872": "tripod",
884
+ "873": "triumphal arch",
885
+ "874": "trolleybus, trolley coach, trackless trolley",
886
+ "875": "trombone",
887
+ "876": "tub, vat",
888
+ "877": "turnstile",
889
+ "878": "typewriter keyboard",
890
+ "879": "umbrella",
891
+ "880": "unicycle, monocycle",
892
+ "881": "upright, upright piano",
893
+ "882": "vacuum, vacuum cleaner",
894
+ "883": "vase",
895
+ "884": "vault",
896
+ "885": "velvet",
897
+ "886": "vending machine",
898
+ "887": "vestment",
899
+ "888": "viaduct",
900
+ "889": "violin, fiddle",
901
+ "890": "volleyball",
902
+ "891": "waffle iron",
903
+ "892": "wall clock",
904
+ "893": "wallet, billfold, notecase, pocketbook",
905
+ "894": "wardrobe, closet, press",
906
+ "895": "warplane, military plane",
907
+ "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
908
+ "897": "washer, automatic washer, washing machine",
909
+ "898": "water bottle",
910
+ "899": "water jug",
911
+ "900": "water tower",
912
+ "901": "whiskey jug",
913
+ "902": "whistle",
914
+ "903": "wig",
915
+ "904": "window screen",
916
+ "905": "window shade",
917
+ "906": "Windsor tie",
918
+ "907": "wine bottle",
919
+ "908": "wing",
920
+ "909": "wok",
921
+ "910": "wooden spoon",
922
+ "911": "wool, woolen, woollen",
923
+ "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
924
+ "913": "wreck",
925
+ "914": "yawl",
926
+ "915": "yurt",
927
+ "916": "web site, website, internet site, site",
928
+ "917": "comic book",
929
+ "918": "crossword puzzle, crossword",
930
+ "919": "street sign",
931
+ "920": "traffic light, traffic signal, stoplight",
932
+ "921": "book jacket, dust cover, dust jacket, dust wrapper",
933
+ "922": "menu",
934
+ "923": "plate",
935
+ "924": "guacamole",
936
+ "925": "consomme",
937
+ "926": "hot pot, hotpot",
938
+ "927": "trifle",
939
+ "928": "ice cream, icecream",
940
+ "929": "ice lolly, lolly, lollipop, popsicle",
941
+ "930": "French loaf",
942
+ "931": "bagel, beigel",
943
+ "932": "pretzel",
944
+ "933": "cheeseburger",
945
+ "934": "hotdog, hot dog, red hot",
946
+ "935": "mashed potato",
947
+ "936": "head cabbage",
948
+ "937": "broccoli",
949
+ "938": "cauliflower",
950
+ "939": "zucchini, courgette",
951
+ "940": "spaghetti squash",
952
+ "941": "acorn squash",
953
+ "942": "butternut squash",
954
+ "943": "cucumber, cuke",
955
+ "944": "artichoke, globe artichoke",
956
+ "945": "bell pepper",
957
+ "946": "cardoon",
958
+ "947": "mushroom",
959
+ "948": "Granny Smith",
960
+ "949": "strawberry",
961
+ "950": "orange",
962
+ "951": "lemon",
963
+ "952": "fig",
964
+ "953": "pineapple, ananas",
965
+ "954": "banana",
966
+ "955": "jackfruit, jak, jack",
967
+ "956": "custard apple",
968
+ "957": "pomegranate",
969
+ "958": "hay",
970
+ "959": "carbonara",
971
+ "960": "chocolate sauce, chocolate syrup",
972
+ "961": "dough",
973
+ "962": "meat loaf, meatloaf",
974
+ "963": "pizza, pizza pie",
975
+ "964": "potpie",
976
+ "965": "burrito",
977
+ "966": "red wine",
978
+ "967": "espresso",
979
+ "968": "cup",
980
+ "969": "eggnog",
981
+ "970": "alp",
982
+ "971": "bubble",
983
+ "972": "cliff, drop, drop-off",
984
+ "973": "coral reef",
985
+ "974": "geyser",
986
+ "975": "lakeside, lakeshore",
987
+ "976": "promontory, headland, head, foreland",
988
+ "977": "sandbar, sand bar",
989
+ "978": "seashore, coast, seacoast, sea-coast",
990
+ "979": "valley, vale",
991
+ "980": "volcano",
992
+ "981": "ballplayer, baseball player",
993
+ "982": "groom, bridegroom",
994
+ "983": "scuba diver",
995
+ "984": "rapeseed",
996
+ "985": "daisy",
997
+ "986": "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
998
+ "987": "corn",
999
+ "988": "acorn",
1000
+ "989": "hip, rose hip, rosehip",
1001
+ "990": "buckeye, horse chestnut, conker",
1002
+ "991": "coral fungus",
1003
+ "992": "agaric",
1004
+ "993": "gyromitra",
1005
+ "994": "stinkhorn, carrion fungus",
1006
+ "995": "earthstar",
1007
+ "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
1008
+ "997": "bolete",
1009
+ "998": "ear, spike, capitulum",
1010
+ "999": "toilet tissue, toilet paper, bathroom tissue"
1011
+ },
1012
+ "image_size": 224,
1013
+ "initializer_range": 0.02,
1014
+ "intermediate_size": 3072,
1015
+ "label2id": {
1016
+ "Afghan hound, Afghan": 160,
1017
+ "African chameleon, Chamaeleo chamaeleon": 47,
1018
+ "African crocodile, Nile crocodile, Crocodylus niloticus": 49,
1019
+ "African elephant, Loxodonta africana": 386,
1020
+ "African grey, African gray, Psittacus erithacus": 87,
1021
+ "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus": 275,
1022
+ "Airedale, Airedale terrier": 191,
1023
+ "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier": 180,
1024
+ "American alligator, Alligator mississipiensis": 50,
1025
+ "American black bear, black bear, Ursus americanus, Euarctos americanus": 295,
1026
+ "American chameleon, anole, Anolis carolinensis": 40,
1027
+ "American coot, marsh hen, mud hen, water hen, Fulica americana": 137,
1028
+ "American egret, great white heron, Egretta albus": 132,
1029
+ "American lobster, Northern lobster, Maine lobster, Homarus americanus": 122,
1030
+ "Angora, Angora rabbit": 332,
1031
+ "Appenzeller": 240,
1032
+ "Arabian camel, dromedary, Camelus dromedarius": 354,
1033
+ "Arctic fox, white fox, Alopex lagopus": 279,
1034
+ "Australian terrier": 193,
1035
+ "Band Aid": 419,
1036
+ "Bedlington terrier": 181,
1037
+ "Bernese mountain dog": 239,
1038
+ "Blenheim spaniel": 156,
1039
+ "Border collie": 232,
1040
+ "Border terrier": 182,
1041
+ "Boston bull, Boston terrier": 195,
1042
+ "Bouvier des Flandres, Bouviers des Flandres": 233,
1043
+ "Brabancon griffon": 262,
1044
+ "Brittany spaniel": 215,
1045
+ "CD player": 485,
1046
+ "Cardigan, Cardigan Welsh corgi": 264,
1047
+ "Chesapeake Bay retriever": 209,
1048
+ "Chihuahua": 151,
1049
+ "Christmas stocking": 496,
1050
+ "Crock Pot": 521,
1051
+ "Dandie Dinmont, Dandie Dinmont terrier": 194,
1052
+ "Doberman, Doberman pinscher": 236,
1053
+ "Dungeness crab, Cancer magister": 118,
1054
+ "Dutch oven": 544,
1055
+ "Egyptian cat": 285,
1056
+ "English foxhound": 167,
1057
+ "English setter": 212,
1058
+ "English springer, English springer spaniel": 217,
1059
+ "EntleBucher": 241,
1060
+ "Eskimo dog, husky": 248,
1061
+ "European fire salamander, Salamandra salamandra": 25,
1062
+ "European gallinule, Porphyrio porphyrio": 136,
1063
+ "French bulldog": 245,
1064
+ "French horn, horn": 566,
1065
+ "French loaf": 930,
1066
+ "German shepherd, German shepherd dog, German police dog, alsatian": 235,
1067
+ "German short-haired pointer": 210,
1068
+ "Gila monster, Heloderma suspectum": 45,
1069
+ "Gordon setter": 214,
1070
+ "Granny Smith": 948,
1071
+ "Great Dane": 246,
1072
+ "Great Pyrenees": 257,
1073
+ "Greater Swiss Mountain dog": 238,
1074
+ "Ibizan hound, Ibizan Podenco": 173,
1075
+ "Indian cobra, Naja naja": 63,
1076
+ "Indian elephant, Elephas maximus": 385,
1077
+ "Irish setter, red setter": 213,
1078
+ "Irish terrier": 184,
1079
+ "Irish water spaniel": 221,
1080
+ "Irish wolfhound": 170,
1081
+ "Italian greyhound": 171,
1082
+ "Japanese spaniel": 152,
1083
+ "Kerry blue terrier": 183,
1084
+ "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis": 48,
1085
+ "Labrador retriever": 208,
1086
+ "Lakeland terrier": 189,
1087
+ "Leonberg": 255,
1088
+ "Lhasa, Lhasa apso": 204,
1089
+ "Loafer": 630,
1090
+ "Madagascar cat, ring-tailed lemur, Lemur catta": 383,
1091
+ "Maltese dog, Maltese terrier, Maltese": 153,
1092
+ "Mexican hairless": 268,
1093
+ "Model T": 661,
1094
+ "Newfoundland, Newfoundland dog": 256,
1095
+ "Norfolk terrier": 185,
1096
+ "Norwegian elkhound, elkhound": 174,
1097
+ "Norwich terrier": 186,
1098
+ "Old English sheepdog, bobtail": 229,
1099
+ "Pekinese, Pekingese, Peke": 154,
1100
+ "Pembroke, Pembroke Welsh corgi": 263,
1101
+ "Persian cat": 283,
1102
+ "Petri dish": 712,
1103
+ "Polaroid camera, Polaroid Land camera": 732,
1104
+ "Pomeranian": 259,
1105
+ "Rhodesian ridgeback": 159,
1106
+ "Rottweiler": 234,
1107
+ "Saint Bernard, St Bernard": 247,
1108
+ "Saluki, gazelle hound": 176,
1109
+ "Samoyed, Samoyede": 258,
1110
+ "Scotch terrier, Scottish terrier, Scottie": 199,
1111
+ "Scottish deerhound, deerhound": 177,
1112
+ "Sealyham terrier, Sealyham": 190,
1113
+ "Shetland sheepdog, Shetland sheep dog, Shetland": 230,
1114
+ "Shih-Tzu": 155,
1115
+ "Siamese cat, Siamese": 284,
1116
+ "Siberian husky": 250,
1117
+ "Staffordshire bullterrier, Staffordshire bull terrier": 179,
1118
+ "Sussex spaniel": 220,
1119
+ "Tibetan mastiff": 244,
1120
+ "Tibetan terrier, chrysanthemum dog": 200,
1121
+ "Walker hound, Walker foxhound": 166,
1122
+ "Weimaraner": 178,
1123
+ "Welsh springer spaniel": 218,
1124
+ "West Highland white terrier": 203,
1125
+ "Windsor tie": 906,
1126
+ "Yorkshire terrier": 187,
1127
+ "abacus": 398,
1128
+ "abaya": 399,
1129
+ "academic gown, academic robe, judge's robe": 400,
1130
+ "accordion, piano accordion, squeeze box": 401,
1131
+ "acorn": 988,
1132
+ "acorn squash": 941,
1133
+ "acoustic guitar": 402,
1134
+ "admiral": 321,
1135
+ "affenpinscher, monkey pinscher, monkey dog": 252,
1136
+ "agama": 42,
1137
+ "agaric": 992,
1138
+ "aircraft carrier, carrier, flattop, attack aircraft carrier": 403,
1139
+ "airliner": 404,
1140
+ "airship, dirigible": 405,
1141
+ "albatross, mollymawk": 146,
1142
+ "alligator lizard": 44,
1143
+ "alp": 970,
1144
+ "altar": 406,
1145
+ "ambulance": 407,
1146
+ "amphibian, amphibious vehicle": 408,
1147
+ "analog clock": 409,
1148
+ "anemone fish": 393,
1149
+ "ant, emmet, pismire": 310,
1150
+ "apiary, bee house": 410,
1151
+ "apron": 411,
1152
+ "armadillo": 363,
1153
+ "artichoke, globe artichoke": 944,
1154
+ "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin": 412,
1155
+ "assault rifle, assault gun": 413,
1156
+ "axolotl, mud puppy, Ambystoma mexicanum": 29,
1157
+ "baboon": 372,
1158
+ "backpack, back pack, knapsack, packsack, rucksack, haversack": 414,
1159
+ "badger": 362,
1160
+ "bagel, beigel": 931,
1161
+ "bakery, bakeshop, bakehouse": 415,
1162
+ "balance beam, beam": 416,
1163
+ "bald eagle, American eagle, Haliaeetus leucocephalus": 22,
1164
+ "balloon": 417,
1165
+ "ballplayer, baseball player": 981,
1166
+ "ballpoint, ballpoint pen, ballpen, Biro": 418,
1167
+ "banana": 954,
1168
+ "banded gecko": 38,
1169
+ "banjo": 420,
1170
+ "bannister, banister, balustrade, balusters, handrail": 421,
1171
+ "barbell": 422,
1172
+ "barber chair": 423,
1173
+ "barbershop": 424,
1174
+ "barn": 425,
1175
+ "barn spider, Araneus cavaticus": 73,
1176
+ "barometer": 426,
1177
+ "barracouta, snoek": 389,
1178
+ "barrel, cask": 427,
1179
+ "barrow, garden cart, lawn cart, wheelbarrow": 428,
1180
+ "baseball": 429,
1181
+ "basenji": 253,
1182
+ "basketball": 430,
1183
+ "basset, basset hound": 161,
1184
+ "bassinet": 431,
1185
+ "bassoon": 432,
1186
+ "bath towel": 434,
1187
+ "bathing cap, swimming cap": 433,
1188
+ "bathtub, bathing tub, bath, tub": 435,
1189
+ "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon": 436,
1190
+ "beacon, lighthouse, beacon light, pharos": 437,
1191
+ "beagle": 162,
1192
+ "beaker": 438,
1193
+ "bearskin, busby, shako": 439,
1194
+ "beaver": 337,
1195
+ "bee": 309,
1196
+ "bee eater": 92,
1197
+ "beer bottle": 440,
1198
+ "beer glass": 441,
1199
+ "bell cote, bell cot": 442,
1200
+ "bell pepper": 945,
1201
+ "bib": 443,
1202
+ "bicycle-built-for-two, tandem bicycle, tandem": 444,
1203
+ "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis": 349,
1204
+ "bikini, two-piece": 445,
1205
+ "binder, ring-binder": 446,
1206
+ "binoculars, field glasses, opera glasses": 447,
1207
+ "birdhouse": 448,
1208
+ "bison": 347,
1209
+ "bittern": 133,
1210
+ "black and gold garden spider, Argiope aurantia": 72,
1211
+ "black grouse": 80,
1212
+ "black stork, Ciconia nigra": 128,
1213
+ "black swan, Cygnus atratus": 100,
1214
+ "black widow, Latrodectus mactans": 75,
1215
+ "black-and-tan coonhound": 165,
1216
+ "black-footed ferret, ferret, Mustela nigripes": 359,
1217
+ "bloodhound, sleuthhound": 163,
1218
+ "bluetick": 164,
1219
+ "boa constrictor, Constrictor constrictor": 61,
1220
+ "boathouse": 449,
1221
+ "bobsled, bobsleigh, bob": 450,
1222
+ "bolete": 997,
1223
+ "bolo tie, bolo, bola tie, bola": 451,
1224
+ "bonnet, poke bonnet": 452,
1225
+ "book jacket, dust cover, dust jacket, dust wrapper": 921,
1226
+ "bookcase": 453,
1227
+ "bookshop, bookstore, bookstall": 454,
1228
+ "borzoi, Russian wolfhound": 169,
1229
+ "bottlecap": 455,
1230
+ "bow": 456,
1231
+ "bow tie, bow-tie, bowtie": 457,
1232
+ "box turtle, box tortoise": 37,
1233
+ "boxer": 242,
1234
+ "brain coral": 109,
1235
+ "brambling, Fringilla montifringilla": 10,
1236
+ "brass, memorial tablet, plaque": 458,
1237
+ "brassiere, bra, bandeau": 459,
1238
+ "breakwater, groin, groyne, mole, bulwark, seawall, jetty": 460,
1239
+ "breastplate, aegis, egis": 461,
1240
+ "briard": 226,
1241
+ "broccoli": 937,
1242
+ "broom": 462,
1243
+ "brown bear, bruin, Ursus arctos": 294,
1244
+ "bubble": 971,
1245
+ "bucket, pail": 463,
1246
+ "buckeye, horse chestnut, conker": 990,
1247
+ "buckle": 464,
1248
+ "bulbul": 16,
1249
+ "bull mastiff": 243,
1250
+ "bullet train, bullet": 466,
1251
+ "bulletproof vest": 465,
1252
+ "bullfrog, Rana catesbeiana": 30,
1253
+ "burrito": 965,
1254
+ "bustard": 138,
1255
+ "butcher shop, meat market": 467,
1256
+ "butternut squash": 942,
1257
+ "cab, hack, taxi, taxicab": 468,
1258
+ "cabbage butterfly": 324,
1259
+ "cairn, cairn terrier": 192,
1260
+ "caldron, cauldron": 469,
1261
+ "can opener, tin opener": 473,
1262
+ "candle, taper, wax light": 470,
1263
+ "cannon": 471,
1264
+ "canoe": 472,
1265
+ "capuchin, ringtail, Cebus capucinus": 378,
1266
+ "car mirror": 475,
1267
+ "car wheel": 479,
1268
+ "carbonara": 959,
1269
+ "cardigan": 474,
1270
+ "cardoon": 946,
1271
+ "carousel, carrousel, merry-go-round, roundabout, whirligig": 476,
1272
+ "carpenter's kit, tool kit": 477,
1273
+ "carton": 478,
1274
+ "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM": 480,
1275
+ "cassette": 481,
1276
+ "cassette player": 482,
1277
+ "castle": 483,
1278
+ "catamaran": 484,
1279
+ "cauliflower": 938,
1280
+ "cello, violoncello": 486,
1281
+ "cellular telephone, cellular phone, cellphone, cell, mobile phone": 487,
1282
+ "centipede": 79,
1283
+ "chain": 488,
1284
+ "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour": 490,
1285
+ "chain saw, chainsaw": 491,
1286
+ "chainlink fence": 489,
1287
+ "chambered nautilus, pearly nautilus, nautilus": 117,
1288
+ "cheeseburger": 933,
1289
+ "cheetah, chetah, Acinonyx jubatus": 293,
1290
+ "chest": 492,
1291
+ "chickadee": 19,
1292
+ "chiffonier, commode": 493,
1293
+ "chime, bell, gong": 494,
1294
+ "chimpanzee, chimp, Pan troglodytes": 367,
1295
+ "china cabinet, china closet": 495,
1296
+ "chiton, coat-of-mail shell, sea cradle, polyplacophore": 116,
1297
+ "chocolate sauce, chocolate syrup": 960,
1298
+ "chow, chow chow": 260,
1299
+ "church, church building": 497,
1300
+ "cicada, cicala": 316,
1301
+ "cinema, movie theater, movie theatre, movie house, picture palace": 498,
1302
+ "cleaver, meat cleaver, chopper": 499,
1303
+ "cliff dwelling": 500,
1304
+ "cliff, drop, drop-off": 972,
1305
+ "cloak": 501,
1306
+ "clog, geta, patten, sabot": 502,
1307
+ "clumber, clumber spaniel": 216,
1308
+ "cock": 7,
1309
+ "cocker spaniel, English cocker spaniel, cocker": 219,
1310
+ "cockroach, roach": 314,
1311
+ "cocktail shaker": 503,
1312
+ "coffee mug": 504,
1313
+ "coffeepot": 505,
1314
+ "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch": 391,
1315
+ "coil, spiral, volute, whorl, helix": 506,
1316
+ "collie": 231,
1317
+ "colobus, colobus monkey": 375,
1318
+ "combination lock": 507,
1319
+ "comic book": 917,
1320
+ "common iguana, iguana, Iguana iguana": 39,
1321
+ "common newt, Triturus vulgaris": 26,
1322
+ "computer keyboard, keypad": 508,
1323
+ "conch": 112,
1324
+ "confectionery, confectionary, candy store": 509,
1325
+ "consomme": 925,
1326
+ "container ship, containership, container vessel": 510,
1327
+ "convertible": 511,
1328
+ "coral fungus": 991,
1329
+ "coral reef": 973,
1330
+ "corkscrew, bottle screw": 512,
1331
+ "corn": 987,
1332
+ "cornet, horn, trumpet, trump": 513,
1333
+ "coucal": 91,
1334
+ "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor": 286,
1335
+ "cowboy boot": 514,
1336
+ "cowboy hat, ten-gallon hat": 515,
1337
+ "coyote, prairie wolf, brush wolf, Canis latrans": 272,
1338
+ "cradle": 516,
1339
+ "crane": 517,
1340
+ "crash helmet": 518,
1341
+ "crate": 519,
1342
+ "crayfish, crawfish, crawdad, crawdaddy": 124,
1343
+ "crib, cot": 520,
1344
+ "cricket": 312,
1345
+ "croquet ball": 522,
1346
+ "crossword puzzle, crossword": 918,
1347
+ "crutch": 523,
1348
+ "cucumber, cuke": 943,
1349
+ "cuirass": 524,
1350
+ "cup": 968,
1351
+ "curly-coated retriever": 206,
1352
+ "custard apple": 956,
1353
+ "daisy": 985,
1354
+ "dalmatian, coach dog, carriage dog": 251,
1355
+ "dam, dike, dyke": 525,
1356
+ "damselfly": 320,
1357
+ "desk": 526,
1358
+ "desktop computer": 527,
1359
+ "dhole, Cuon alpinus": 274,
1360
+ "dial telephone, dial phone": 528,
1361
+ "diamondback, diamondback rattlesnake, Crotalus adamanteus": 67,
1362
+ "diaper, nappy, napkin": 529,
1363
+ "digital clock": 530,
1364
+ "digital watch": 531,
1365
+ "dingo, warrigal, warragal, Canis dingo": 273,
1366
+ "dining table, board": 532,
1367
+ "dishrag, dishcloth": 533,
1368
+ "dishwasher, dish washer, dishwashing machine": 534,
1369
+ "disk brake, disc brake": 535,
1370
+ "dock, dockage, docking facility": 536,
1371
+ "dogsled, dog sled, dog sleigh": 537,
1372
+ "dome": 538,
1373
+ "doormat, welcome mat": 539,
1374
+ "dough": 961,
1375
+ "dowitcher": 142,
1376
+ "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk": 319,
1377
+ "drake": 97,
1378
+ "drilling platform, offshore rig": 540,
1379
+ "drum, membranophone, tympan": 541,
1380
+ "drumstick": 542,
1381
+ "dugong, Dugong dugon": 149,
1382
+ "dumbbell": 543,
1383
+ "dung beetle": 305,
1384
+ "ear, spike, capitulum": 998,
1385
+ "earthstar": 995,
1386
+ "echidna, spiny anteater, anteater": 102,
1387
+ "eel": 390,
1388
+ "eft": 27,
1389
+ "eggnog": 969,
1390
+ "electric fan, blower": 545,
1391
+ "electric guitar": 546,
1392
+ "electric locomotive": 547,
1393
+ "electric ray, crampfish, numbfish, torpedo": 5,
1394
+ "entertainment center": 548,
1395
+ "envelope": 549,
1396
+ "espresso": 967,
1397
+ "espresso maker": 550,
1398
+ "face powder": 551,
1399
+ "feather boa, boa": 552,
1400
+ "fiddler crab": 120,
1401
+ "fig": 952,
1402
+ "file, file cabinet, filing cabinet": 553,
1403
+ "fire engine, fire truck": 555,
1404
+ "fire screen, fireguard": 556,
1405
+ "fireboat": 554,
1406
+ "flagpole, flagstaff": 557,
1407
+ "flamingo": 130,
1408
+ "flat-coated retriever": 205,
1409
+ "flatworm, platyhelminth": 110,
1410
+ "flute, transverse flute": 558,
1411
+ "fly": 308,
1412
+ "folding chair": 559,
1413
+ "football helmet": 560,
1414
+ "forklift": 561,
1415
+ "fountain": 562,
1416
+ "fountain pen": 563,
1417
+ "four-poster": 564,
1418
+ "fox squirrel, eastern fox squirrel, Sciurus niger": 335,
1419
+ "freight car": 565,
1420
+ "frilled lizard, Chlamydosaurus kingi": 43,
1421
+ "frying pan, frypan, skillet": 567,
1422
+ "fur coat": 568,
1423
+ "gar, garfish, garpike, billfish, Lepisosteus osseus": 395,
1424
+ "garbage truck, dustcart": 569,
1425
+ "garden spider, Aranea diademata": 74,
1426
+ "garter snake, grass snake": 57,
1427
+ "gas pump, gasoline pump, petrol pump, island dispenser": 571,
1428
+ "gasmask, respirator, gas helmet": 570,
1429
+ "gazelle": 353,
1430
+ "geyser": 974,
1431
+ "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca": 388,
1432
+ "giant schnauzer": 197,
1433
+ "gibbon, Hylobates lar": 368,
1434
+ "go-kart": 573,
1435
+ "goblet": 572,
1436
+ "golden retriever": 207,
1437
+ "goldfinch, Carduelis carduelis": 11,
1438
+ "goldfish, Carassius auratus": 1,
1439
+ "golf ball": 574,
1440
+ "golfcart, golf cart": 575,
1441
+ "gondola": 576,
1442
+ "gong, tam-tam": 577,
1443
+ "goose": 99,
1444
+ "gorilla, Gorilla gorilla": 366,
1445
+ "gown": 578,
1446
+ "grand piano, grand": 579,
1447
+ "grasshopper, hopper": 311,
1448
+ "great grey owl, great gray owl, Strix nebulosa": 24,
1449
+ "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias": 2,
1450
+ "green lizard, Lacerta viridis": 46,
1451
+ "green mamba": 64,
1452
+ "green snake, grass snake": 55,
+ "greenhouse, nursery, glasshouse": 580,
+ "grey fox, gray fox, Urocyon cinereoargenteus": 280,
+ "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus": 147,
+ "grille, radiator grille": 581,
+ "grocery store, grocery, food market, market": 582,
+ "groenendael": 224,
+ "groom, bridegroom": 982,
+ "ground beetle, carabid beetle": 302,
+ "guacamole": 924,
+ "guenon, guenon monkey": 370,
+ "guillotine": 583,
+ "guinea pig, Cavia cobaya": 338,
+ "gyromitra": 993,
+ "hair slide": 584,
+ "hair spray": 585,
+ "half track": 586,
+ "hammer": 587,
+ "hammerhead, hammerhead shark": 4,
+ "hamper": 588,
+ "hamster": 333,
+ "hand blower, blow dryer, blow drier, hair dryer, hair drier": 589,
+ "hand-held computer, hand-held microcomputer": 590,
+ "handkerchief, hankie, hanky, hankey": 591,
+ "hard disc, hard disk, fixed disk": 592,
+ "hare": 331,
+ "harmonica, mouth organ, harp, mouth harp": 593,
+ "harp": 594,
+ "hartebeest": 351,
+ "harvester, reaper": 595,
+ "harvestman, daddy longlegs, Phalangium opilio": 70,
+ "hatchet": 596,
+ "hay": 958,
+ "head cabbage": 936,
+ "hen": 8,
+ "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa": 996,
+ "hermit crab": 125,
+ "hip, rose hip, rosehip": 989,
+ "hippopotamus, hippo, river horse, Hippopotamus amphibius": 344,
+ "hog, pig, grunter, squealer, Sus scrofa": 341,
+ "hognose snake, puff adder, sand viper": 54,
+ "holster": 597,
+ "home theater, home theatre": 598,
+ "honeycomb": 599,
+ "hook, claw": 600,
+ "hoopskirt, crinoline": 601,
+ "horizontal bar, high bar": 602,
+ "hornbill": 93,
+ "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus": 66,
+ "horse cart, horse-cart": 603,
+ "hot pot, hotpot": 926,
+ "hotdog, hot dog, red hot": 934,
+ "hourglass": 604,
+ "house finch, linnet, Carpodacus mexicanus": 12,
+ "howler monkey, howler": 379,
+ "hummingbird": 94,
+ "hyena, hyaena": 276,
+ "iPod": 605,
+ "ibex, Capra ibex": 350,
+ "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus": 296,
+ "ice cream, icecream": 928,
+ "ice lolly, lolly, lollipop, popsicle": 929,
+ "impala, Aepyceros melampus": 352,
+ "indigo bunting, indigo finch, indigo bird, Passerina cyanea": 14,
+ "indri, indris, Indri indri, Indri brevicaudatus": 384,
+ "iron, smoothing iron": 606,
+ "isopod": 126,
+ "jacamar": 95,
+ "jack-o'-lantern": 607,
+ "jackfruit, jak, jack": 955,
+ "jaguar, panther, Panthera onca, Felis onca": 290,
+ "jay": 17,
+ "jean, blue jean, denim": 608,
+ "jeep, landrover": 609,
+ "jellyfish": 107,
+ "jersey, T-shirt, tee shirt": 610,
+ "jigsaw puzzle": 611,
+ "jinrikisha, ricksha, rickshaw": 612,
+ "joystick": 613,
+ "junco, snowbird": 13,
+ "keeshond": 261,
+ "kelpie": 227,
+ "killer whale, killer, orca, grampus, sea wolf, Orcinus orca": 148,
+ "kimono": 614,
+ "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica": 121,
+ "king penguin, Aptenodytes patagonica": 145,
+ "king snake, kingsnake": 56,
+ "kit fox, Vulpes macrotis": 278,
+ "kite": 21,
+ "knee pad": 615,
+ "knot": 616,
+ "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus": 105,
+ "komondor": 228,
+ "kuvasz": 222,
+ "lab coat, laboratory coat": 617,
+ "lacewing, lacewing fly": 318,
+ "ladle": 618,
+ "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle": 301,
+ "lakeside, lakeshore": 975,
+ "lampshade, lamp shade": 619,
+ "langur": 374,
+ "laptop, laptop computer": 620,
+ "lawn mower, mower": 621,
+ "leaf beetle, chrysomelid": 304,
+ "leafhopper": 317,
+ "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea": 34,
+ "lemon": 951,
+ "lens cap, lens cover": 622,
+ "leopard, Panthera pardus": 288,
+ "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens": 387,
+ "letter opener, paper knife, paperknife": 623,
+ "library": 624,
+ "lifeboat": 625,
+ "lighter, light, igniter, ignitor": 626,
+ "limousine, limo": 627,
+ "limpkin, Aramus pictus": 135,
+ "liner, ocean liner": 628,
+ "lion, king of beasts, Panthera leo": 291,
+ "lionfish": 396,
+ "lipstick, lip rouge": 629,
+ "little blue heron, Egretta caerulea": 131,
+ "llama": 355,
+ "loggerhead, loggerhead turtle, Caretta caretta": 33,
+ "long-horned beetle, longicorn, longicorn beetle": 303,
+ "lorikeet": 90,
+ "lotion": 631,
+ "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system": 632,
+ "loupe, jeweler's loupe": 633,
+ "lumbermill, sawmill": 634,
+ "lycaenid, lycaenid butterfly": 326,
+ "lynx, catamount": 287,
+ "macaque": 373,
+ "macaw": 88,
+ "magnetic compass": 635,
+ "magpie": 18,
+ "mailbag, postbag": 636,
+ "mailbox, letter box": 637,
+ "maillot": 638,
+ "maillot, tank suit": 639,
+ "malamute, malemute, Alaskan malamute": 249,
+ "malinois": 225,
+ "manhole cover": 640,
+ "mantis, mantid": 315,
+ "maraca": 641,
+ "marimba, xylophone": 642,
+ "marmoset": 377,
+ "marmot": 336,
+ "mashed potato": 935,
+ "mask": 643,
+ "matchstick": 644,
+ "maypole": 645,
+ "maze, labyrinth": 646,
+ "measuring cup": 647,
+ "meat loaf, meatloaf": 962,
+ "medicine chest, medicine cabinet": 648,
+ "meerkat, mierkat": 299,
+ "megalith, megalithic structure": 649,
+ "menu": 922,
+ "microphone, mike": 650,
+ "microwave, microwave oven": 651,
+ "military uniform": 652,
+ "milk can": 653,
+ "miniature pinscher": 237,
+ "miniature poodle": 266,
+ "miniature schnauzer": 196,
+ "minibus": 654,
+ "miniskirt, mini": 655,
+ "minivan": 656,
+ "mink": 357,
+ "missile": 657,
+ "mitten": 658,
+ "mixing bowl": 659,
+ "mobile home, manufactured home": 660,
+ "modem": 662,
+ "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus": 323,
+ "monastery": 663,
+ "mongoose": 298,
+ "monitor": 664,
+ "moped": 665,
+ "mortar": 666,
+ "mortarboard": 667,
+ "mosque": 668,
+ "mosquito net": 669,
+ "motor scooter, scooter": 670,
+ "mountain bike, all-terrain bike, off-roader": 671,
+ "mountain tent": 672,
+ "mouse, computer mouse": 673,
+ "mousetrap": 674,
+ "moving van": 675,
+ "mud turtle": 35,
+ "mushroom": 947,
+ "muzzle": 676,
+ "nail": 677,
+ "neck brace": 678,
+ "necklace": 679,
+ "nematode, nematode worm, roundworm": 111,
+ "night snake, Hypsiglena torquata": 60,
+ "nipple": 680,
+ "notebook, notebook computer": 681,
+ "obelisk": 682,
+ "oboe, hautboy, hautbois": 683,
+ "ocarina, sweet potato": 684,
+ "odometer, hodometer, mileometer, milometer": 685,
+ "oil filter": 686,
+ "orange": 950,
+ "orangutan, orang, orangutang, Pongo pygmaeus": 365,
+ "organ, pipe organ": 687,
+ "oscilloscope, scope, cathode-ray oscilloscope, CRO": 688,
+ "ostrich, Struthio camelus": 9,
+ "otter": 360,
+ "otterhound, otter hound": 175,
+ "overskirt": 689,
+ "ox": 345,
+ "oxcart": 690,
+ "oxygen mask": 691,
+ "oystercatcher, oyster catcher": 143,
+ "packet": 692,
+ "paddle, boat paddle": 693,
+ "paddlewheel, paddle wheel": 694,
+ "padlock": 695,
+ "paintbrush": 696,
+ "pajama, pyjama, pj's, jammies": 697,
+ "palace": 698,
+ "panpipe, pandean pipe, syrinx": 699,
+ "paper towel": 700,
+ "papillon": 157,
+ "parachute, chute": 701,
+ "parallel bars, bars": 702,
+ "park bench": 703,
+ "parking meter": 704,
+ "partridge": 86,
+ "passenger car, coach, carriage": 705,
+ "patas, hussar monkey, Erythrocebus patas": 371,
+ "patio, terrace": 706,
+ "pay-phone, pay-station": 707,
+ "peacock": 84,
+ "pedestal, plinth, footstall": 708,
+ "pelican": 144,
+ "pencil box, pencil case": 709,
+ "pencil sharpener": 710,
+ "perfume, essence": 711,
+ "photocopier": 713,
+ "pick, plectrum, plectron": 714,
+ "pickelhaube": 715,
+ "picket fence, paling": 716,
+ "pickup, pickup truck": 717,
+ "pier": 718,
+ "piggy bank, penny bank": 719,
+ "pill bottle": 720,
+ "pillow": 721,
+ "pineapple, ananas": 953,
+ "ping-pong ball": 722,
+ "pinwheel": 723,
+ "pirate, pirate ship": 724,
+ "pitcher, ewer": 725,
+ "pizza, pizza pie": 963,
+ "plane, carpenter's plane, woodworking plane": 726,
+ "planetarium": 727,
+ "plastic bag": 728,
+ "plate": 923,
+ "plate rack": 729,
+ "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus": 103,
+ "plow, plough": 730,
+ "plunger, plumber's helper": 731,
+ "pole": 733,
+ "polecat, fitch, foulmart, foumart, Mustela putorius": 358,
+ "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria": 734,
+ "pomegranate": 957,
+ "poncho": 735,
+ "pool table, billiard table, snooker table": 736,
+ "pop bottle, soda bottle": 737,
+ "porcupine, hedgehog": 334,
+ "pot, flowerpot": 738,
+ "potpie": 964,
+ "potter's wheel": 739,
+ "power drill": 740,
+ "prairie chicken, prairie grouse, prairie fowl": 83,
+ "prayer rug, prayer mat": 741,
+ "pretzel": 932,
+ "printer": 742,
+ "prison, prison house": 743,
+ "proboscis monkey, Nasalis larvatus": 376,
+ "projectile, missile": 744,
+ "projector": 745,
+ "promontory, headland, head, foreland": 976,
+ "ptarmigan": 81,
+ "puck, hockey puck": 746,
+ "puffer, pufferfish, blowfish, globefish": 397,
+ "pug, pug-dog": 254,
+ "punching bag, punch bag, punching ball, punchball": 747,
+ "purse": 748,
+ "quail": 85,
+ "quill, quill pen": 749,
+ "quilt, comforter, comfort, puff": 750,
+ "racer, race car, racing car": 751,
+ "racket, racquet": 752,
+ "radiator": 753,
+ "radio telescope, radio reflector": 755,
+ "radio, wireless": 754,
+ "rain barrel": 756,
+ "ram, tup": 348,
+ "rapeseed": 984,
+ "recreational vehicle, RV, R.V.": 757,
+ "red fox, Vulpes vulpes": 277,
+ "red wine": 966,
+ "red wolf, maned wolf, Canis rufus, Canis niger": 271,
+ "red-backed sandpiper, dunlin, Erolia alpina": 140,
+ "red-breasted merganser, Mergus serrator": 98,
+ "redbone": 168,
+ "redshank, Tringa totanus": 141,
+ "reel": 758,
+ "reflex camera": 759,
+ "refrigerator, icebox": 760,
+ "remote control, remote": 761,
+ "restaurant, eating house, eating place, eatery": 762,
+ "revolver, six-gun, six-shooter": 763,
+ "rhinoceros beetle": 306,
+ "rifle": 764,
+ "ringlet, ringlet butterfly": 322,
+ "ringneck snake, ring-necked snake, ring snake": 53,
+ "robin, American robin, Turdus migratorius": 15,
+ "rock beauty, Holocanthus tricolor": 392,
+ "rock crab, Cancer irroratus": 119,
+ "rock python, rock snake, Python sebae": 62,
+ "rocking chair, rocker": 765,
+ "rotisserie": 766,
+ "rubber eraser, rubber, pencil eraser": 767,
+ "ruddy turnstone, Arenaria interpres": 139,
+ "ruffed grouse, partridge, Bonasa umbellus": 82,
+ "rugby ball": 768,
+ "rule, ruler": 769,
+ "running shoe": 770,
+ "safe": 771,
+ "safety pin": 772,
+ "saltshaker, salt shaker": 773,
+ "sandal": 774,
+ "sandbar, sand bar": 977,
+ "sarong": 775,
+ "sax, saxophone": 776,
+ "scabbard": 777,
+ "scale, weighing machine": 778,
+ "schipperke": 223,
+ "school bus": 779,
+ "schooner": 780,
+ "scoreboard": 781,
+ "scorpion": 71,
+ "screen, CRT screen": 782,
+ "screw": 783,
+ "screwdriver": 784,
+ "scuba diver": 983,
+ "sea anemone, anemone": 108,
+ "sea cucumber, holothurian": 329,
+ "sea lion": 150,
+ "sea slug, nudibranch": 115,
+ "sea snake": 65,
+ "sea urchin": 328,
+ "seashore, coast, seacoast, sea-coast": 978,
+ "seat belt, seatbelt": 785,
+ "sewing machine": 786,
+ "shield, buckler": 787,
+ "shoe shop, shoe-shop, shoe store": 788,
+ "shoji": 789,
+ "shopping basket": 790,
+ "shopping cart": 791,
+ "shovel": 792,
+ "shower cap": 793,
+ "shower curtain": 794,
+ "siamang, Hylobates syndactylus, Symphalangus syndactylus": 369,
+ "sidewinder, horned rattlesnake, Crotalus cerastes": 68,
+ "silky terrier, Sydney silky": 201,
+ "ski": 795,
+ "ski mask": 796,
+ "skunk, polecat, wood pussy": 361,
+ "sleeping bag": 797,
+ "slide rule, slipstick": 798,
+ "sliding door": 799,
+ "slot, one-armed bandit": 800,
+ "sloth bear, Melursus ursinus, Ursus ursinus": 297,
+ "slug": 114,
+ "snail": 113,
+ "snorkel": 801,
+ "snow leopard, ounce, Panthera uncia": 289,
+ "snowmobile": 802,
+ "snowplow, snowplough": 803,
+ "soap dispenser": 804,
+ "soccer ball": 805,
+ "sock": 806,
+ "soft-coated wheaten terrier": 202,
+ "solar dish, solar collector, solar furnace": 807,
+ "sombrero": 808,
+ "sorrel": 339,
+ "soup bowl": 809,
+ "space bar": 810,
+ "space heater": 811,
+ "space shuttle": 812,
+ "spaghetti squash": 940,
+ "spatula": 813,
+ "speedboat": 814,
+ "spider monkey, Ateles geoffroyi": 381,
+ "spider web, spider's web": 815,
+ "spindle": 816,
+ "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish": 123,
+ "spoonbill": 129,
+ "sports car, sport car": 817,
+ "spotlight, spot": 818,
+ "spotted salamander, Ambystoma maculatum": 28,
+ "squirrel monkey, Saimiri sciureus": 382,
+ "stage": 819,
+ "standard poodle": 267,
+ "standard schnauzer": 198,
+ "starfish, sea star": 327,
+ "steam locomotive": 820,
+ "steel arch bridge": 821,
+ "steel drum": 822,
+ "stethoscope": 823,
+ "stingray": 6,
+ "stinkhorn, carrion fungus": 994,
+ "stole": 824,
+ "stone wall": 825,
+ "stopwatch, stop watch": 826,
+ "stove": 827,
+ "strainer": 828,
+ "strawberry": 949,
+ "street sign": 919,
+ "streetcar, tram, tramcar, trolley, trolley car": 829,
+ "stretcher": 830,
+ "studio couch, day bed": 831,
+ "stupa, tope": 832,
+ "sturgeon": 394,
+ "submarine, pigboat, sub, U-boat": 833,
+ "suit, suit of clothes": 834,
+ "sulphur butterfly, sulfur butterfly": 325,
+ "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita": 89,
+ "sundial": 835,
+ "sunglass": 836,
+ "sunglasses, dark glasses, shades": 837,
+ "sunscreen, sunblock, sun blocker": 838,
+ "suspension bridge": 839,
+ "swab, swob, mop": 840,
+ "sweatshirt": 841,
+ "swimming trunks, bathing trunks": 842,
+ "swing": 843,
+ "switch, electric switch, electrical switch": 844,
+ "syringe": 845,
+ "tabby, tabby cat": 281,
+ "table lamp": 846,
+ "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui": 32,
+ "tank, army tank, armored combat vehicle, armoured combat vehicle": 847,
+ "tape player": 848,
+ "tarantula": 76,
+ "teapot": 849,
+ "teddy, teddy bear": 850,
+ "television, television system": 851,
+ "tench, Tinca tinca": 0,
+ "tennis ball": 852,
+ "terrapin": 36,
+ "thatch, thatched roof": 853,
+ "theater curtain, theatre curtain": 854,
+ "thimble": 855,
+ "three-toed sloth, ai, Bradypus tridactylus": 364,
+ "thresher, thrasher, threshing machine": 856,
+ "throne": 857,
+ "thunder snake, worm snake, Carphophis amoenus": 52,
+ "tick": 78,
+ "tiger beetle": 300,
+ "tiger cat": 282,
+ "tiger shark, Galeocerdo cuvieri": 3,
+ "tiger, Panthera tigris": 292,
+ "tile roof": 858,
+ "timber wolf, grey wolf, gray wolf, Canis lupus": 269,
+ "titi, titi monkey": 380,
+ "toaster": 859,
+ "tobacco shop, tobacconist shop, tobacconist": 860,
+ "toilet seat": 861,
+ "toilet tissue, toilet paper, bathroom tissue": 999,
+ "torch": 862,
+ "totem pole": 863,
+ "toucan": 96,
+ "tow truck, tow car, wrecker": 864,
+ "toy poodle": 265,
+ "toy terrier": 158,
+ "toyshop": 865,
+ "tractor": 866,
+ "traffic light, traffic signal, stoplight": 920,
+ "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi": 867,
+ "tray": 868,
+ "tree frog, tree-frog": 31,
+ "trench coat": 869,
+ "triceratops": 51,
+ "tricycle, trike, velocipede": 870,
+ "trifle": 927,
+ "trilobite": 69,
+ "trimaran": 871,
+ "tripod": 872,
+ "triumphal arch": 873,
+ "trolleybus, trolley coach, trackless trolley": 874,
+ "trombone": 875,
+ "tub, vat": 876,
+ "turnstile": 877,
+ "tusker": 101,
+ "typewriter keyboard": 878,
+ "umbrella": 879,
+ "unicycle, monocycle": 880,
+ "upright, upright piano": 881,
+ "vacuum, vacuum cleaner": 882,
+ "valley, vale": 979,
+ "vase": 883,
+ "vault": 884,
+ "velvet": 885,
+ "vending machine": 886,
+ "vestment": 887,
+ "viaduct": 888,
+ "vine snake": 59,
+ "violin, fiddle": 889,
+ "vizsla, Hungarian pointer": 211,
+ "volcano": 980,
+ "volleyball": 890,
+ "vulture": 23,
+ "waffle iron": 891,
+ "walking stick, walkingstick, stick insect": 313,
+ "wall clock": 892,
+ "wallaby, brush kangaroo": 104,
+ "wallet, billfold, notecase, pocketbook": 893,
+ "wardrobe, closet, press": 894,
+ "warplane, military plane": 895,
+ "warthog": 343,
+ "washbasin, handbasin, washbowl, lavabo, wash-hand basin": 896,
+ "washer, automatic washer, washing machine": 897,
+ "water bottle": 898,
+ "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis": 346,
+ "water jug": 899,
+ "water ouzel, dipper": 20,
+ "water snake": 58,
+ "water tower": 900,
+ "weasel": 356,
+ "web site, website, internet site, site": 916,
+ "weevil": 307,
+ "whippet": 172,
+ "whiptail, whiptail lizard": 41,
+ "whiskey jug": 901,
+ "whistle": 902,
+ "white stork, Ciconia ciconia": 127,
+ "white wolf, Arctic wolf, Canis lupus tundrarum": 270,
+ "wig": 903,
+ "wild boar, boar, Sus scrofa": 342,
+ "window screen": 904,
+ "window shade": 905,
+ "wine bottle": 907,
+ "wing": 908,
+ "wire-haired fox terrier": 188,
+ "wok": 909,
+ "wolf spider, hunting spider": 77,
+ "wombat": 106,
+ "wood rabbit, cottontail, cottontail rabbit": 330,
+ "wooden spoon": 910,
+ "wool, woolen, woollen": 911,
+ "worm fence, snake fence, snake-rail fence, Virginia fence": 912,
+ "wreck": 913,
+ "yawl": 914,
+ "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum": 986,
+ "yurt": 915,
+ "zebra": 340,
+ "zucchini, courgette": 939
+ },
+ "layer_norm_eps": 1e-12,
+ "model_type": "vit",
+ "num_attention_heads": 12,
+ "num_channels": 3,
+ "num_hidden_layers": 12,
+ "patch_size": 16,
+ "pooler_act": "tanh",
+ "pooler_output_size": 768,
+ "qkv_bias": true,
+ "transformers_version": "4.57.1"
+ }
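The mapping above is the `label2id` side of the config: class name to class index. At inference time you usually need the inverse direction, `id2label`, to turn an argmax index back into a name. A minimal sketch, using a hypothetical three-entry excerpt of the full 1000-class map:

```python
# Hypothetical excerpt of the label2id mapping shown above; the real config
# carries all 1000 ImageNet classes in the same format.
label2id = {
    "tench, Tinca tinca": 0,
    "zebra": 340,
    "zucchini, courgette": 939,
}

# Invert once to id2label so a predicted index can be mapped back to a name.
id2label = {v: k for k, v in label2id.items()}

predicted_index = 340  # e.g., logits.argmax(-1) for one image
print(id2label[predicted_index])  # -> zebra
```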
core/.ipynb_checkpoints/distill-checkpoint.py ADDED
@@ -0,0 +1,184 @@
+ """Knowledge-distillation utilities (model-family agnostic).
+
+ This module provides:
+ - Losses: KL distillation, soft cross-entropy, cosine feature loss
+ - Helper to obtain logits from models with/without built-in heads
+ - Lightweight classification head for backbone models (e.g., ViTModel)
+ - Simple evaluators (agreement %, KL) and diagnostics
+
+ Adapters may override `adapter_get_logits(model, x)` if a family needs a
+ custom extraction (e.g., language models with past_key_values).
+ """
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Callable, Optional, Protocol, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ # -----------------------------------------------------------------------------
+ # Config
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class KDConfig:
+     temperature: float = 2.0
+     alpha: float = 1.0  # multiplier for KL term; task loss handled outside
+
+
+ # -----------------------------------------------------------------------------
+ # Losses
+ # -----------------------------------------------------------------------------
+
+ def kl_divergence(student_logits: torch.Tensor, teacher_logits: torch.Tensor, T: float = 2.0) -> torch.Tensor:
+     """Batchmean KL(teacher / T || student / T), scaled by T^2 (Hinton-style).
+
+     Note: F.kl_div(input, target) computes KL(target || input), so the
+     teacher distribution is the target here.
+     """
+     p_s = F.log_softmax(student_logits / T, dim=-1)
+     p_t = F.softmax(teacher_logits / T, dim=-1)
+     return F.kl_div(p_s, p_t, reduction="batchmean") * (T * T)
+
+
+ def kd_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor, cfg: KDConfig) -> torch.Tensor:
+     return cfg.alpha * kl_divergence(student_logits, teacher_logits, T=cfg.temperature)
+
+
+ def soft_ce(student_logits: torch.Tensor, soft_targets: torch.Tensor) -> torch.Tensor:
+     """Soft cross-entropy: expects `soft_targets` already normalized."""
+     logp = F.log_softmax(student_logits, dim=-1)
+     return -(soft_targets * logp).sum(dim=-1).mean()
+
+
+ def cosine_feature_loss(student_feats: torch.Tensor, teacher_feats: torch.Tensor) -> torch.Tensor:
+     """1 - cosine similarity averaged over batch and time/patch dims."""
+     s = F.normalize(student_feats, dim=-1)
+     t = F.normalize(teacher_feats, dim=-1)
+     return (1.0 - (s * t).sum(dim=-1)).mean()
+
+ def mse_reg(student_logits: torch.Tensor, teacher_logits: torch.Tensor, T: float = 2.0) -> torch.Tensor:
+     """MSE between raw logits, scaled by T^2 so it is comparable to the KD term."""
+     mse = F.mse_loss(student_logits, teacher_logits, reduction="mean")
+     return mse * (T * T)
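The temperature-scaled KL above can be sanity-checked without torch. The following is a dependency-free sketch of the same Hinton formula for a single logit vector (the `softmax`/`kl_hinton` names are illustrative, not part of the module):

```python
import math

def softmax(xs, T=1.0):
    # Temperature-scaled softmax over one logit vector (max-shifted for stability).
    zs = [x / T for x in xs]
    m = max(zs)
    es = [math.exp(z - m) for z in zs]
    s = sum(es)
    return [e / s for e in es]

def kl_hinton(student_logits, teacher_logits, T=2.0):
    # KL(teacher_T || student_T) * T^2, mirroring kl_divergence() above.
    p_s = softmax(student_logits, T)
    p_t = softmax(teacher_logits, T)
    return T * T * sum(pt * (math.log(pt) - math.log(ps)) for pt, ps in zip(p_t, p_s))

print(round(kl_hinton([2.0, 0.5, -1.0], [2.0, 0.5, -1.0]), 6))  # identical logits -> 0.0
```

The `T * T` factor keeps the gradient magnitude of the KD term roughly independent of the temperature, which is why `mse_reg` applies the same scaling.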
+
+ # -----------------------------------------------------------------------------
+ # Logit extraction
+ # -----------------------------------------------------------------------------
+
+ class LogitsProvider(Protocol):
+     def __call__(self, model: nn.Module, x: torch.Tensor) -> torch.Tensor: ...
+
+
+ class ClsHead(nn.Module):
+     """Minimal classification head: LN + Linear.
+
+     Useful when the backbone outputs hidden states (e.g., ViTModel) and you
+     want logits comparable to a teacher with a classification head.
+     """
+
+     def __init__(self, hidden_size: int, num_classes: int = 1000, base_head: Optional[nn.Module] = None):
+         super().__init__()
+         self.norm = nn.LayerNorm(hidden_size)
+         self.fc = nn.Linear(hidden_size, num_classes)
+         if base_head is not None:
+             # Try to load weights if shapes match (e.g., from HF classifier)
+             try:
+                 self.load_state_dict(base_head.state_dict(), strict=False)
+             except Exception:
+                 pass
+
+     def forward(self, cls_token: torch.Tensor) -> torch.Tensor:
+         return self.fc(self.norm(cls_token))
+
+
+ @torch.no_grad()
+ def infer_hidden_size(model: nn.Module, sample: torch.Tensor) -> int:
+     # Run a tiny forward to inspect hidden size when unknown
+     model.eval()
+     out = model(pixel_values=sample)
+     if hasattr(out, "last_hidden_state"):
+         return int(out.last_hidden_state.shape[-1])
+     if hasattr(out, "logits"):
+         return int(out.logits.shape[-1])
+     raise RuntimeError("Cannot infer hidden size; provide explicitly.")
+
+
+ def default_get_logits(model: nn.Module, x: torch.Tensor, *, head: Optional[nn.Module] = None) -> torch.Tensor:
+     """Family-agnostic logits extractor.
+
+     - If model output has `.logits`, return it.
+     - Else expects `.last_hidden_state` and uses [CLS] via provided `head`.
+     """
+     out = model(pixel_values=x)
+     if hasattr(out, "logits"):
+         return out.logits
+     if hasattr(out, "last_hidden_state"):
+         if head is None:
+             raise ValueError("Backbone returned hidden states; supply a classification head.")
+         cls_tok = out.last_hidden_state[:, 0, :]
+         return head(cls_tok)
+     raise ValueError("Model output lacks logits and last_hidden_state.")
+
+
+ # -----------------------------------------------------------------------------
+ # Evaluators & diagnostics
+ # -----------------------------------------------------------------------------
+
+ @torch.inference_mode()
+ def logits_std(model: nn.Module, loader, *, get_logits: LogitsProvider, batches: int = 10, device: str = "cuda") -> Tuple[float, int]:
+     s = 0.0
+     k = 0
+     for x in loader:
+         if k >= batches:
+             break
+         x = x.to(device)
+         y = get_logits(model, x)
+         s += y.std().item()
+         k += 1
+     return (s / max(1, k), k)
+
+
+ @torch.inference_mode()
+ def agreement_metrics(
+     student: nn.Module,
+     teacher: nn.Module,
+     loader,
+     *,
+     get_student_logits: LogitsProvider,
+     get_teacher_logits: LogitsProvider,
+     batches: int = 20,
+     T: float = 1.0,
+     device: str = "cuda",
+ ) -> dict:
+     kl_sum = 0.0
+     n = 0
+     top1 = 0
+     tot = 0
+     for i, x in enumerate(loader):
+         if i >= batches:
+             break
+         x = x.to(device)
+         t = get_teacher_logits(teacher, x)
+         s = get_student_logits(student, x)
+         p_s = F.log_softmax(s / T, dim=-1)
+         p_t = F.softmax(t / T, dim=-1)
+         kl_sum += (F.kl_div(p_s, p_t, reduction="batchmean") * (T * T)).item()
+         top1 += (s.argmax(-1) == t.argmax(-1)).sum().item()
+         tot += x.size(0)
+         n += 1
+     return {"kl_TT": kl_sum / max(1, n), "top1_agreement": top1 / max(1, tot)}
+
+
+ # -----------------------------------------------------------------------------
+ # Small trainer helpers
+ # -----------------------------------------------------------------------------
+
+ class DualEMA:
+     """Simple exponential moving average for a scalar (e.g., lambda or latency)."""
+
+     def __init__(self, beta: float = 0.9, value: float = 0.0):
+         self.beta = float(beta)
+         self.value = float(value)
+
+     def update(self, x: float) -> float:
+         self.value = self.beta * self.value + (1 - self.beta) * float(x)
+         return self.value
core/.ipynb_checkpoints/finetune-checkpoint.py ADDED
@@ -0,0 +1,267 @@
+ # core/finetune.py
+ """Post-pruning fine-tuning utilities (distillation)."""
+
+ from __future__ import annotations
+ from dataclasses import dataclass, field
+ from typing import Callable, Optional, Tuple, Iterable
+
+ import torch
+ import torch.nn as nn
+
+ from core.distill import KDConfig, kd_loss, mse_reg
+ from core.utils import ensure_trainable_parameters
+
+ import copy
+
+
+ @dataclass
+ class FinetuneConfig:
+     epochs: int = 5
+     lr: float = 3e-4
+     wd: float = 0.0
+     # default_factory avoids sharing one mutable KDConfig across instances
+     # (a plain KDConfig(...) default is rejected by dataclasses on Python 3.11+)
+     kd: KDConfig = field(default_factory=lambda: KDConfig(temperature=2.0, alpha=1.0))
+     amp: bool = True
+     # "auto" -> bf16 if supported else fp16; "bf16" | "fp16" | "off" also allowed
+     amp_dtype: str = "auto"
+     device: str = "cuda"
+     log_every: int = 200
+     # diagnostics
+     grad_check_every: int = 50
+     grad_warn_if_zero_steps: int = 2  # consecutive checks with zero grad -> warn
+     mse_weight: float = 0.0
+
+ def _autocast_and_scaler(amp: bool, amp_dtype: str) -> Tuple[torch.autocast, Optional[torch.amp.GradScaler], bool, str]:
+     """
+     Returns (autocast_ctx, scaler_or_None, use_scaler_bool, amp_mode_str)
+     - BF16 -> autocast(bfloat16), NO GradScaler
+     - FP16 -> autocast(float16), GradScaler ENABLED
+     - OFF  -> disabled autocast, NO GradScaler
+     """
+     if not amp or amp_dtype == "off":
+         ctx = torch.amp.autocast(device_type="cuda", enabled=False)
+         return ctx, None, False, "OFF"
+
+     if amp_dtype == "auto":
+         use_bf16 = torch.cuda.is_bf16_supported()
+     elif amp_dtype == "bf16":
+         use_bf16 = True
+     elif amp_dtype == "fp16":
+         use_bf16 = False
+     else:
+         raise ValueError(f"Unknown amp_dtype={amp_dtype!r}")
+
+     if use_bf16:
+         ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True)
+         return ctx, None, False, "BF16"
+     else:
+         ctx = torch.amp.autocast(device_type="cuda", dtype=torch.float16, enabled=True)
+         try:
+             scaler = torch.amp.GradScaler("cuda", enabled=True)
+         except TypeError:
+             scaler = torch.cuda.amp.GradScaler(enabled=True)
+         return ctx, scaler, True, "FP16"
+
+
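The mode/scaler decision above can be restated without torch so it is testable on a machine with no GPU. This is an illustrative sketch that mirrors the branch structure only (`resolve_amp_mode` and `bf16_supported` are not part of the module):

```python
def resolve_amp_mode(amp, amp_dtype, bf16_supported):
    # Mirrors _autocast_and_scaler's branching; returns (mode, use_scaler).
    if not amp or amp_dtype == "off":
        return "OFF", False
    if amp_dtype == "auto":
        use_bf16 = bf16_supported
    elif amp_dtype == "bf16":
        use_bf16 = True
    elif amp_dtype == "fp16":
        use_bf16 = False
    else:
        raise ValueError(f"Unknown amp_dtype={amp_dtype!r}")
    # BF16 has the dynamic range of FP32, so no loss scaling is needed;
    # FP16 underflows easily, hence the GradScaler.
    return ("BF16", False) if use_bf16 else ("FP16", True)

print(resolve_amp_mode(True, "auto", True))   # -> ('BF16', False)
print(resolve_amp_mode(True, "auto", False))  # -> ('FP16', True)
```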
+ def _images_from_batch(batch):
+     if isinstance(batch, dict):
+         # `or` on tensors raises "Boolean value of Tensor is ambiguous",
+         # so test key membership instead of chaining .get() with `or`.
+         for key in ("pixel_values", "input"):
+             if key in batch:
+                 return batch[key]
+         return None
+     if isinstance(batch, (tuple, list)):
+         return batch[0]
+     return batch
+
+
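A quick pure-Python check of the batch-unpacking rules (dict keys, then first tuple element, else pass-through), with tensors replaced by plain strings since only the container logic is exercised; the standalone `images_from_batch` name here is illustrative:

```python
def images_from_batch(batch):
    # Same container logic as _images_from_batch above, torch-free.
    if isinstance(batch, dict):
        for key in ("pixel_values", "input"):
            if key in batch:
                return batch[key]
        return None
    if isinstance(batch, (tuple, list)):
        return batch[0]
    return batch

print(images_from_batch({"pixel_values": "imgs"}))  # -> imgs
print(images_from_batch(("imgs", "labels")))        # -> imgs
print(images_from_batch("imgs"))                    # -> imgs
```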
+ def _param_iter_trainable(model: nn.Module) -> Iterable[torch.nn.Parameter]:
+     for p in model.parameters():
+         if p.requires_grad:
+             yield p
+
+
+ def _grad_norm_and_nonzero(params: Iterable[torch.nn.Parameter]) -> Tuple[float, int]:
+     total_sq, nonzero = 0.0, 0
+     for p in params:
+         g = p.grad
+         if g is None:
+             continue
+         if g.is_sparse:
+             g = g.coalesce().values()
+         gn = float(g.detach().norm().cpu())
+         if gn > 0.0:
+             nonzero += 1
+         total_sq += gn * gn
+     return (total_sq ** 0.5), nonzero
+
+
+ @torch.no_grad()
+ def recalibrate_bn_stats(model, loader, max_batches=200, device="cuda"):
+     model.train()  # use training mode to update running stats
+     seen = 0
+     for i, batch in enumerate(loader):
+         if i >= max_batches:
+             break
+         x = batch[0] if isinstance(batch, (tuple, list)) else batch
+         if not torch.is_tensor(x):
+             continue
+         x = x.to(device, non_blocking=True)
+         model(x)
+         seen += x.size(0)
+     return seen
+
+
+ def finetune_student(
+     student: nn.Module,
+     teacher: nn.Module,
+     train_loader,
+     *,
+     get_student_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
+     get_teacher_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
+     cfg: FinetuneConfig = FinetuneConfig(),
+     val_loader=None,
+     on_step: Optional[Callable[[int, float], None]] = None,
+     save_best=False,
+ ) -> nn.Module:
+     """Fine-tune a pruned student against a frozen teacher using KD."""
+     dev = cfg.device
+     student = student.to(dev)
+     teacher = teacher.to(dev).eval()
+     for p in teacher.parameters():
+         p.requires_grad_(False)
+     for p in student.parameters():
+         p.requires_grad_(True)
+
+     # Make sure we can actually train
+     ensure_trainable_parameters(student, requires_grad=True)
+     trainable = sum(p.numel() for p in student.parameters() if p.requires_grad)
+     if trainable == 0:
+         raise RuntimeError("No trainable parameters in student — cannot finetune.")
+
+     opt = torch.optim.AdamW(
+         _param_iter_trainable(student),
+         lr=cfg.lr,
+         weight_decay=cfg.wd,
+     )
+     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=cfg.epochs * len(train_loader), eta_min=3e-5)
+
+
+     autocast_ctx, scaler, use_scaler, amp_mode = _autocast_and_scaler(cfg.amp, cfg.amp_dtype)
+     print(f"[AMP] Mode={amp_mode} | GradScaler={'ON' if use_scaler else 'OFF'} | "
+           f"KD: T={cfg.kd.temperature} alpha={cfg.kd.alpha} | LR={cfg.lr} WD={cfg.wd} | Trainable params={trainable:,}")
+
+     zero_grad_streak = 0
+     global_step = 0
+
+     T_max = cfg.kd.temperature
+     T_min = 2.0
+     # Copy so the temperature schedule below does not mutate the caller's config
+     kd_conf = copy.copy(cfg.kd)
+
+     best_state = None
+     best_val = float("inf")
+
+     for ep in range(cfg.epochs):
+         student.train()
+         running, seen = 0.0, 0
+
+         for i, batch in enumerate(train_loader):
+
+             step = ep * len(train_loader) + i  # global step for T scheduling
+             max_steps = cfg.epochs * len(train_loader)
+             kd_conf.temperature = T_max - (step / max_steps) * (T_max - T_min)
+
+             # print(f"Step {step}/{max_steps}, T_min={T_min}, T={kd_conf.temperature}, T_max={T_max}")
+
+             x = _images_from_batch(batch)
+             if not torch.is_tensor(x):
+                 raise ValueError("Train loader must yield tensors or (tensor, target) tuples.")
+             x = x.to(dev, non_blocking=True)
+
+             with torch.no_grad():
+                 t = get_teacher_logits(teacher, x)
+                 # Force numerically stable dtype for the loss
+                 t = t.float()
+
+             # ---- forward student under autocast
+             with autocast_ctx:
+                 s = get_student_logits(student, x)
+
+             # ---- compute KD loss in FP32 (outside autocast) for stability
+             s32 = s.float()
+             mse = cfg.mse_weight * mse_reg(s32, t, kd_conf.temperature)
+             loss = kd_loss(s32, t, kd_conf) + mse
+
+             opt.zero_grad(set_to_none=True)
+             if use_scaler:
+                 scaler.scale(loss).backward()
+                 scaler.step(opt)
+                 scaler.update()
+             else:
+                 loss.backward()
+                 opt.step()
+
+             # ---- diagnostics
+             bs = x.size(0)
+             running += float(loss.detach()) * bs
+             seen += bs
+             global_step += 1
+
+             if cfg.grad_check_every and (global_step % cfg.grad_check_every == 0):
+                 gnorm, n_nonzero = _grad_norm_and_nonzero(_param_iter_trainable(student))
+                 if n_nonzero == 0 or gnorm == 0.0:
+                     zero_grad_streak += 1
+                     if zero_grad_streak >= cfg.grad_warn_if_zero_steps:
+                         print(f"[WARN] Step {global_step}: zero gradients detected "
+                               f"(nonzero={n_nonzero}, grad_norm={gnorm:.3e}). "
+                               f"Check get_student_logits, requires_grad, AMP settings, and data pipeline.")
+                 else:
+                     zero_grad_streak = 0
+
+             if cfg.log_every and (i + 1) % cfg.log_every == 0:
+                 print(f"Step {i+1}/{len(train_loader)} (ep {ep+1}/{cfg.epochs}): "
+                       f"running loss = {running / max(1, seen):.4f}")
+
+             if on_step is not None:
+                 on_step(global_step, float(loss.detach()))
+
+             # free ASAP
+             del s, s32, t, loss
+
+         # ---- validation
+         if val_loader is not None:
+             _ = recalibrate_bn_stats(student, train_loader, max_batches=1000, device=cfg.device)
+             student.eval()
+             val_loss, vseen = 0.0, 0
229
+ with torch.no_grad():
230
+ for vbatch in val_loader:
231
+ vx = _images_from_batch(vbatch)
232
+ if not torch.is_tensor(vx):
233
+ raise ValueError("Val loader must yield tensors or (tensor, target) tuples.")
234
+ vx = vx.to(dev, non_blocking=True)
235
+
236
+ vt = get_teacher_logits(teacher, vx).float()
237
+ with autocast_ctx:
238
+ vs = get_student_logits(student, vx)
239
+
240
+ vs32 = vs.float()
241
+ vmse = cfg.mse_weight*mse_reg(vs32, vt, kd_conf.temperature)
242
+ vloss = kd_loss(vs32, vt, kd_conf) + vmse
243
+ val_loss += float(vloss.detach()) * vx.size(0)
244
+ vseen += vx.size(0)
245
+
246
+ mean_val = val_loss / max(1, vseen)
247
+ print("\n------------------------------------------------")
248
+ print(f"Epoch {ep+1}/{cfg.epochs}: T={kd_conf.temperature:.2f}, train={running / max(1, seen):.6f}, "
249
+ f"val={mean_val:.6f}")
250
+
251
+ if save_best and (mean_val < best_val):
252
+ best_val = mean_val
253
+ best_state = copy.deepcopy(student.state_dict())
254
+
255
+ print("------------------------------------------------\n")
256
+
257
+ else:
258
+ print(f"Epoch {ep+1}/{cfg.epochs}: train={running / max(1, seen):.6f}")
259
+
260
+ scheduler.step()
261
+
262
+ if save_best and val_loader is not None and best_state is not None:
263
+ student.load_state_dict(best_state)
264
+
265
+ student.eval()
266
+ return student
267
+
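The KD temperature schedule in the training loop above is a plain linear ramp from the configured starting temperature down to `T_min = 2.0` over all steps. A minimal standalone sketch (the function name is illustrative, not part of the module):

```python
def kd_temperature(step: int, max_steps: int, t_max: float = 4.0, t_min: float = 2.0) -> float:
    """Linearly anneal the distillation temperature from t_max at step 0 to t_min at max_steps."""
    frac = step / max_steps
    return t_max - frac * (t_max - t_min)
```

A higher early temperature softens the teacher distribution while the student is far off; the ramp sharpens targets as training converges.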
core/.ipynb_checkpoints/profiler-checkpoint.py ADDED
@@ -0,0 +1,236 @@
"""Simple, robust latency measurement utilities.

This module provides GPU-friendly profilers with warmup, multiple repeats,
median/percentile reporting, and optional outlier rejection via MAD.

Design goals:
- Family-agnostic: take a callable `forward(model, x)` or rely on HF `.forward`
- Deterministic when desired; avoids autograd by default
- Works with CUDA or CPU; uses `torch.cuda.Event` for accurate GPU timing

Key APIs:
- measure_latency_ms(model, input_shape | input_tensor, ...)
- profile(model, sample, settings) -> {mean, p50, p90, p95, p99}
- LatencyProfiler(settings).measure(...)
- profile_many_shapes(model, shapes, settings)
"""
from __future__ import annotations

from dataclasses import dataclass
from statistics import median
from typing import Callable, Dict, Iterable, Optional, Sequence, Tuple

import contextlib
import math
import time

import torch
import torch.nn as nn


# -----------------------------------------------------------------------------
# Settings
# -----------------------------------------------------------------------------

@dataclass
class ProfileSettings:
    warmup: int = 10
    iters: int = 50
    percentile: Sequence[int] = (50, 90, 95, 99)
    sync_each_iter: bool = True
    use_inference_mode: bool = True
    cuda_graph: bool = False  # advanced users can enable with static shapes
    reject_outliers_mad: float = 0.0  # e.g., 3.5 to drop extreme spikes
    cudnn_benchmark: bool = True
    deterministic: bool = False  # sets cudnn.deterministic


# -----------------------------------------------------------------------------
# Context helpers
# -----------------------------------------------------------------------------

@contextlib.contextmanager
def _torch_backend_ctx(settings: ProfileSettings):
    prev_bench = torch.backends.cudnn.benchmark
    prev_det = torch.backends.cudnn.deterministic
    try:
        torch.backends.cudnn.benchmark = bool(settings.cudnn_benchmark)
        torch.backends.cudnn.deterministic = bool(settings.deterministic)
        yield
    finally:
        torch.backends.cudnn.benchmark = prev_bench
        torch.backends.cudnn.deterministic = prev_det


def _percentiles(sorted_vals: Sequence[float], qs: Sequence[int]) -> Dict[int, float]:
    n = len(sorted_vals)
    if n == 0:
        return {q: float("nan") for q in qs}
    out = {}
    for q in qs:
        if n == 1:
            out[q] = sorted_vals[0]
            continue
        k = (q / 100.0) * (n - 1)
        f = math.floor(k)
        c = min(n - 1, f + 1)
        if f == c:
            out[q] = sorted_vals[int(k)]
        else:
            d0 = sorted_vals[f] * (c - k)
            d1 = sorted_vals[c] * (k - f)
            out[q] = d0 + d1
    return out


def _apply_mad_filter(vals: Sequence[float], thresh: float) -> Sequence[float]:
    if thresh <= 0 or len(vals) < 5:
        return vals
    med = median(vals)
    dev = [abs(v - med) for v in vals]
    mad = median(dev) or 1e-12
    keep = [v for v, d in zip(vals, dev) if (d / mad) <= thresh]
    return keep if keep else vals


# -----------------------------------------------------------------------------
# Core measurement
# -----------------------------------------------------------------------------

@torch.inference_mode()
def measure_latency_ms(
    model: nn.Module,
    sample: torch.Tensor | Tuple[int, ...],
    *,
    settings: Optional[ProfileSettings] = None,
    device: str = "cuda",
    forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
) -> Tuple[float, float]:
    """Return (mean_ms, p95_ms) over `iters` measurements.

    If `sample` is a shape tuple, a random tensor is created on-device.
    The default forward calls `model(pixel_values=x)` if available, else `model(x)`.
    """
    cfg = settings or ProfileSettings()

    with _torch_backend_ctx(cfg):
        m = model.to(device).eval()
        if isinstance(sample, torch.Tensor):
            x = sample.to(device)
        else:
            x = torch.randn(*sample, device=device)

        # Default forward: try the HF keyword signature first, fall back to positional.
        def _fwd(mod, inp):
            try:
                return mod(pixel_values=inp)
            except TypeError:
                return mod(inp)

        fn = forward_fn or _fwd

        # Warmup
        on_cuda = torch.cuda.is_available() and device.startswith("cuda")
        for _ in range(cfg.warmup):
            _ = fn(m, x)
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        times: list[float] = []
        if on_cuda:
            for _ in range(cfg.iters):
                t0 = torch.cuda.Event(enable_timing=True)
                t1 = torch.cuda.Event(enable_timing=True)
                t0.record()
                _ = fn(m, x)
                t1.record()
                if cfg.sync_each_iter:
                    torch.cuda.synchronize()
                times.append(t0.elapsed_time(t1))  # milliseconds
        else:
            for _ in range(cfg.iters):
                t0 = time.perf_counter()
                _ = fn(m, x)
                if cfg.sync_each_iter and torch.cuda.is_available():
                    torch.cuda.synchronize()
                t1 = time.perf_counter()
                times.append((t1 - t0) * 1000.0)

        times = sorted(_apply_mad_filter(times, cfg.reject_outliers_mad))
        mean_ms = sum(times) / max(1, len(times))
        p = _percentiles(times, cfg.percentile)
        p95 = p.get(95, times[int(0.95 * (len(times) - 1))] if times else float("nan"))
        return mean_ms, p95


# Higher level wrapper returning multiple percentiles
@torch.inference_mode()
def profile(
    model: nn.Module,
    sample: torch.Tensor | Tuple[int, ...],
    *,
    settings: Optional[ProfileSettings] = None,
    device: str = "cuda",
    forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
) -> Dict[str, float]:
    cfg = settings or ProfileSettings()
    # Warm up (and sanity-check settings) via the core routine, then re-measure
    # on the same settings for the full percentile set.
    _ = measure_latency_ms(model, sample, settings=cfg, device=device, forward_fn=forward_fn)
    m = model.to(device).eval()
    if isinstance(sample, torch.Tensor):
        x = sample.to(device)
    else:
        x = torch.randn(*sample, device=device)

    # Same default forward as measure_latency_ms (keyword first, positional fallback).
    def _fwd(mod, inp):
        try:
            return mod(pixel_values=inp)
        except TypeError:
            return mod(inp)

    fn = forward_fn or _fwd

    times: list[float] = []
    if torch.cuda.is_available() and device.startswith("cuda"):
        for _ in range(cfg.iters):
            t0 = torch.cuda.Event(enable_timing=True)
            t1 = torch.cuda.Event(enable_timing=True)
            t0.record()
            _ = fn(m, x)
            t1.record()
            if cfg.sync_each_iter:
                torch.cuda.synchronize()
            times.append(t0.elapsed_time(t1))
    else:
        for _ in range(cfg.iters):
            t0 = time.perf_counter()
            _ = fn(m, x)
            t1 = time.perf_counter()
            times.append((t1 - t0) * 1000.0)

    times = sorted(_apply_mad_filter(times, cfg.reject_outliers_mad))
    percs = _percentiles(times, cfg.percentile)
    out = {"mean": sum(times) / max(1, len(times))}
    out.update({f"p{q}": v for q, v in percs.items()})
    return out


class LatencyProfiler:
    """Reusable profiler with fixed settings."""

    def __init__(self, settings: Optional[ProfileSettings] = None, device: str = "cuda"):
        self.settings = settings or ProfileSettings()
        self.device = device

    def measure(self, model: nn.Module, sample: torch.Tensor | Tuple[int, ...], *, forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None) -> Tuple[float, float]:
        return measure_latency_ms(model, sample, settings=self.settings, device=self.device, forward_fn=forward_fn)

    def profile(self, model: nn.Module, sample: torch.Tensor | Tuple[int, ...], *, forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None) -> Dict[str, float]:
        return profile(model, sample, settings=self.settings, device=self.device, forward_fn=forward_fn)


@torch.inference_mode()
def profile_many_shapes(
    model: nn.Module,
    shapes: Iterable[Tuple[int, ...]],
    *,
    settings: Optional[ProfileSettings] = None,
    device: str = "cuda",
    forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
) -> Dict[Tuple[int, ...], Dict[str, float]]:
    out: Dict[Tuple[int, ...], Dict[str, float]] = {}
    for shp in shapes:
        out[tuple(shp)] = profile(model, shp, settings=settings, device=device, forward_fn=forward_fn)
    return out
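The linear-interpolation percentile rule used by `_percentiles` above can be exercised on its own. A dependency-free restatement (same logic, no torch required):

```python
import math

def percentiles(sorted_vals, qs):
    """Linear-interpolation percentiles over an ascending list, as in _percentiles."""
    n = len(sorted_vals)
    if n == 0:
        return {q: float("nan") for q in qs}
    out = {}
    for q in qs:
        if n == 1:
            out[q] = sorted_vals[0]
            continue
        k = (q / 100.0) * (n - 1)       # fractional rank
        f = math.floor(k)               # lower neighbor index
        c = min(n - 1, f + 1)           # upper neighbor index
        if f == c:
            out[q] = sorted_vals[int(k)]
        else:
            # weight each neighbor by its distance to the fractional rank
            out[q] = sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f)
    return out
```

For five samples `[10, 20, 30, 40, 50]`, p50 lands exactly on the middle sample (30.0) and p90 interpolates between the last two samples (46.0).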
core/.ipynb_checkpoints/proxy_cost-checkpoint.py ADDED
@@ -0,0 +1,771 @@
# core/proxy_cost.py
"""Latency proxy models and a tiny LUT for hardware correction.

This file defines a family-agnostic interface plus concrete proxies (ViT, ResNet, LLM)
that estimate latency from *soft structure* (gates) and input size. All proxies accept
the trainer's `(model, batch) -> ms` call signature directly (batches may be dict/tuple/tensor).
A small, in-memory LUT can be populated from real measurements during training to correct
analytic estimates.
"""
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union, List

import torch
import torch.nn as nn

from .gates import iter_gates, _as_like  # _as_like is used by ViT proxy


# -----------------------------------------------------------------------------
# Small batch helpers (shared)
# -----------------------------------------------------------------------------

TensorOrBatch = Union[torch.Tensor, Tuple, List, Dict[str, Any]]

def _first_tensor(batch: TensorOrBatch) -> torch.Tensor:
    """Find the first tensor inside a batch-like structure."""
    if torch.is_tensor(batch):
        return batch
    if isinstance(batch, dict):
        # Common keys across tasks
        for k in ("input_ids", "pixel_values", "images", "x"):
            v = batch.get(k, None)
            if torch.is_tensor(v):
                return v
        # fallback: first tensor value
        for v in batch.values():
            if torch.is_tensor(v):
                return v
        raise ValueError("Batch dict has no tensor field I recognize.")
    if isinstance(batch, (list, tuple)):
        for v in batch:
            if torch.is_tensor(v):
                return v
        # torchvision pattern: ([aug1, aug2], label)
        if len(batch) and isinstance(batch[0], (list, tuple)):
            for v in batch[0]:
                if torch.is_tensor(v):
                    return v
    raise ValueError("Cannot find a tensor in the provided batch.")

def _ids_from_batch(batch: TensorOrBatch) -> torch.Tensor:
    """Return a 2D [B,S] tensor representing token ids for LLMs."""
    if isinstance(batch, dict) and "input_ids" in batch and torch.is_tensor(batch["input_ids"]):
        return batch["input_ids"]
    t = _first_tensor(batch)
    if t.dim() >= 2:
        return t
    raise ValueError("Cannot infer [B,S] from batch; need 'input_ids' or a 2D tensor.")

def _nchw_from_batch(batch: TensorOrBatch) -> Tuple[int, int, int, int]:
    """Return NCHW shape from a batch or an explicit (N,C,H,W) tuple/list/tensor."""
    if isinstance(batch, (tuple, list)) and len(batch) == 4 and all(isinstance(x, int) for x in batch):
        return tuple(batch)  # type: ignore[return-value]
    x = _first_tensor(batch)
    if x.dim() != 4:
        raise ValueError(f"Expected NCHW tensor for CNN proxy; got tensor with shape {tuple(x.shape)}")
    N, C, H, W = map(int, x.shape)
    return (N, C, H, W)


# -----------------------------------------------------------------------------
# Base proxy + LUT
# -----------------------------------------------------------------------------

class LatencyProxy(nn.Module):
    """Abstract proxy producing a scalar latency-like value (ms).

    Subclasses implement `_predict_raw` and may define `_signature` keys used by
    a LUT to refine estimates with real measurements. Proxies accept either a
    batch-like object (dict/tuple/tensor) or an explicit shape tuple.
    """

    def __init__(self):
        super().__init__()

    def predict(
        self,
        model: nn.Module,
        sample: TensorOrBatch,
        *,
        policy=None,
        step: Optional[int] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Batch-friendly entry point. `sample` may be a batch or explicit shape."""
        return self._predict_raw(model, sample, policy=policy, step=step, **kwargs)

    def _predict_raw(
        self,
        model: nn.Module,
        sample: TensorOrBatch,
        *,
        policy=None,
        step: Optional[int] = None,
        **kwargs,
    ) -> torch.Tensor:  # pragma: no cover - abstract
        raise NotImplementedError

    def signature(
        self,
        model: nn.Module,
        sample: TensorOrBatch,
        *,
        policy=None,
        step: Optional[int] = None,
    ) -> Tuple:
        """Return a hashable signature describing the workload shape."""
        if torch.is_tensor(sample):
            shp = tuple(sample.shape)
        elif isinstance(sample, (tuple, list)):
            shp = tuple(sample)
        elif isinstance(sample, dict):
            # summarize the shapes of any tensors in dict
            shp = tuple((k, tuple(v.shape)) for k, v in sample.items() if torch.is_tensor(v))
        else:
            shp = (str(type(sample)),)
        return (type(self).__name__, shp)


class LatencyLUT:
    """Tiny LUT mapping `(signature) -> measured_ms`."""

    def __init__(self):
        self._table: Dict[Tuple[Any, ...], float] = {}

    def update(self, signature: Tuple[Any, ...], measured_ms: float) -> None:
        self._table[signature] = float(measured_ms)

    def get(self, signature: Tuple[Any, ...]) -> Optional[float]:
        return self._table.get(signature)

    def blend(self, raw_estimate: torch.Tensor, signature: Tuple[Any, ...]) -> torch.Tensor:
        val = self.get(signature)
        if val is None:
            return raw_estimate
        # Put on same device/dtype as raw_estimate
        return _as_like(raw_estimate, val)


# -----------------------------------------------------------------------------
# ViT proxy (analytic + gates), with scale and per-term weights
# -----------------------------------------------------------------------------

@dataclass
class ViTProxyConfig:
    scale_ms: float = 1.0
    alpha_qkv: float = 1.0
    alpha_scores: float = 1.0
    alpha_out: float = 1.0
    alpha_mlp: float = 1.0


def _vit_layers(m):
    enc = getattr(m, "encoder", None)
    if enc is not None and hasattr(enc, "layer"):
        return enc.layer
    vit = getattr(m, "vit", None)
    if vit is not None and hasattr(vit, "encoder") and hasattr(vit.encoder, "layer"):
        return vit.encoder.layer
    raise TypeError("Expected a HF ViT with *.encoder.layer (ViTModel or ViTForImageClassification).")


class ViTLatencyProxy(LatencyProxy):
    """Latency proxy for ViT models. Accepts batches or (N,C,H,W) tuples."""

    def __init__(self, cfg: Optional[ViTProxyConfig] = None, lut: Optional[LatencyLUT] = None):
        super().__init__()
        self.cfg = cfg or ViTProxyConfig()
        self.lut = lut or LatencyLUT()

    # ---- helpers -------------------------------------------------------------
    @staticmethod
    def _input_spec(sample: TensorOrBatch) -> Tuple[int, int, int]:
        if isinstance(sample, (tuple, list)) and len(sample) == 4 and all(isinstance(x, int) for x in sample):
            B, C, H, W = sample
            return int(B), int(H), int(W)
        x = _first_tensor(sample)
        if x.dim() != 4:
            raise ValueError("ViTLatencyProxy expects a tensor [B,3,H,W] or a 4-tuple (B,3,H,W)")
        B, C, H, W = x.shape
        return int(B), int(H), int(W)

    @staticmethod
    def _patch_hw(cfg) -> Tuple[int, int]:
        patch = getattr(cfg, "patch_size", 16)
        if isinstance(patch, (tuple, list)):
            return int(patch[0]), int(patch[1])
        return int(patch), int(patch)

    @staticmethod
    def _soft_heads_from_block(blk) -> Optional[torch.Tensor]:
        # Prefer a nested attention with kept_heads_soft()
        attn = getattr(getattr(blk, "attention", None), "attention", None)
        if attn is not None and hasattr(attn, "kept_heads_soft"):
            return attn.kept_heads_soft()
        return None

    @staticmethod
    def _find_ffn_gate(blk):
        inter = getattr(blk, "intermediate", None)
        if inter is None:
            return None
        # Common attribute names
        for nm in ("neuron_gate", "gate", "ffn_gate"):
            g = getattr(inter, nm, None)
            if g is not None and hasattr(g, "logits") and hasattr(g, "tau"):
                return g
        # Last resort: scan children
        for m in blk.modules():
            if hasattr(m, "logits") and hasattr(m, "tau"):
                return m
        return None

    # ---- proxy ---------------------------------------------------------------
    def _predict_raw(
        self,
        model: nn.Module,
        sample: TensorOrBatch,
        *,
        policy=None,
        step: Optional[int] = None,
    ) -> torch.Tensor:
        anchor = next((p for p in model.parameters()), torch.tensor(0.0))

        B, H_img, W_img = self._input_spec(sample)
        cfg = getattr(model, "config", None)
        if cfg is None:
            raise ValueError("Model must expose a HuggingFace-like .config for ViT proxy")
        ph, pw = self._patch_hw(cfg)

        S = _as_like(anchor, 1 + (H_img // ph) * (W_img // pw))
        D = _as_like(anchor, int(getattr(cfg, "hidden_size", 768)))
        Hh = _as_like(anchor, int(getattr(cfg, "num_attention_heads", 12)))
        Dh = D // Hh

        warm = False
        if policy is not None and step is not None:
            warm = (step < int(getattr(policy, "warmup_steps", 0)))

        total_qkv = _as_like(anchor, 0.0)
        total_scores = _as_like(anchor, 0.0)
        total_out = _as_like(anchor, 0.0)
        total_mlp = _as_like(anchor, 0.0)

        default_hidden = _as_like(anchor, int(getattr(cfg, "intermediate_size", 4 * int(D))))

        layers = _vit_layers(model)
        for blk in layers:
            heads_soft = Hh if warm else (self._soft_heads_from_block(blk) or Hh)

            # FFN hidden expectation
            if warm:
                hidden_soft = default_hidden
            else:
                g = self._find_ffn_gate(blk)
                if g is None:
                    hidden_soft = default_hidden
                else:
                    probs = torch.sigmoid(g.logits / g.tau)
                    group = int(getattr(g, "group", getattr(g, "group_size", 16)))
                    hidden_soft = probs.sum() * _as_like(anchor, group)

            D_kept = heads_soft * Dh

            total_qkv += 3 * S * D * D_kept
            total_scores += (S * S) * heads_soft * Dh
            total_out += S * D_kept * D
            total_mlp += 2 * S * D * hidden_soft

        raw = (
            self.cfg.alpha_qkv * total_qkv
            + self.cfg.alpha_scores * total_scores
            + self.cfg.alpha_out * total_out
            + self.cfg.alpha_mlp * total_mlp
        )
        raw_ms = raw * _as_like(anchor, float(self.cfg.scale_ms))

        # optional LUT correction
        sig = self.signature(model, sample, policy=policy, step=step)
        return self.lut.blend(raw_ms, sig)

    # A reasonable default signature for ViT workloads
    def signature(self, model: nn.Module, sample, *, policy=None, step: Optional[int] = None) -> Tuple:
        if torch.is_tensor(sample):
            shp = tuple(sample.shape)
        elif isinstance(sample, (tuple, list)):
            shp = tuple(sample)
        elif isinstance(sample, dict):
            shp = tuple((k, tuple(v.shape)) for k, v in sample.items() if torch.is_tensor(v))
        else:
            shp = (str(type(sample)),)
        cfg = getattr(model, "config", None)
        heads = int(getattr(cfg, "num_attention_heads", 12))
        hidden = int(getattr(cfg, "hidden_size", 768))
        inter = int(getattr(cfg, "intermediate_size", 3072))
        return ("ViT", shp, heads, hidden, inter)

    @torch.no_grad()
    def calibrate(self, model: nn.Module, shape: tuple, measure_fn, *, device: str = "cuda") -> float:
        """Set proxy scale so that keep-all student matches measured ms.

        `measure_fn(model, shape_or_tensor)` should return `(mean_ms, p95_ms)`.
        """
        sample_t = torch.randn(shape, device=device)
        model = model.to(device).eval()
        mean_ms, _ = measure_fn(model, shape, device=device)
        soft_ms = self.predict(model, sample_t).item()
        self.cfg.scale_ms = float(mean_ms / max(soft_ms, 1e-9))
        return self.cfg.scale_ms


# ------------------------------ ResNet Proxy ------------------------------

@dataclass
class ResNetProxyConfig:
    scale_ms: float = 1.0
    alpha_conv: float = 1.0  # weight for conv FLOPs term


def _as_const_like_resnet(x_like: torch.Tensor, val):
    return torch.as_tensor(val, device=x_like.device, dtype=x_like.dtype)


def _find_anchor_param(model: nn.Module) -> torch.Tensor:
    # Prefer any gate-like parameter; otherwise any parameter; else cpu scalar
    for m in model.modules():
        for nm in ("logits", "head_gate"):
            t = getattr(m, nm, None)
            if isinstance(t, torch.Tensor):
                return t
    for p in model.parameters():
        return p
    return torch.tensor(0.0)


def _kept_from_gate(module, anchor: torch.Tensor) -> Optional[torch.Tensor]:
    """Return expected kept channels for a BN gate: probs.sum() * group_size.
    If no gate is found, return None.
    """
    g = None
    for nm in ("gate", "neuron_gate", "channel_gate", "bn_gate"):
        if hasattr(module, nm):
            g = getattr(module, nm)
            break
    if g is None and hasattr(module, "logits") and hasattr(module, "tau"):
        g = module

    if g is None or not hasattr(g, "logits"):
        return None
    logits = g.logits
    tau = float(getattr(g, "tau", 1.5))
    group = int(getattr(g, "group", getattr(g, "group_size", 1)))
    if group <= 0:
        group = 1
    probs = torch.sigmoid(logits / tau)
    return probs.sum() * _as_const_like_resnet(anchor, group)


class ResNetLatencyProxy(LatencyProxy):
    """Latency proxy for ResNet-like backbones with BN gates.

    Approximates latency with a FLOPs-style sum over convs, using the *expected*
    kept channels after each BN gate (probs.sum()*group_size). Falls back to the
    full channel count when a gate is not found.

    Accepts a batch or an explicit (N,C,H,W) shape.
    """

    def __init__(self, cfg: Optional[ResNetProxyConfig] = None):
        super().__init__()
        self.cfg = cfg or ResNetProxyConfig()

    def _add_cost(self, cost_like: torch.Tensor, oc, ic, k, stride, H, W):
        alpha = _as_const_like_resnet(cost_like, self.cfg.alpha_conv)
        # update spatial dims with conv stride (roughly, ignoring padding effects)
        H = (H + stride - 1) // stride
        W = (W + stride - 1) // stride
        flops = (
            _as_const_like_resnet(cost_like, oc)
            * _as_const_like_resnet(cost_like, ic)
            * (k * k)
            * _as_const_like_resnet(cost_like, H)
            * _as_const_like_resnet(cost_like, W)
        )
        return cost_like + alpha * flops, H, W

    def _predict_raw(self, model: nn.Module, sample: TensorOrBatch, **_) -> torch.Tensor:
        N, C_in, H0, W0 = _nchw_from_batch(sample)
        anchor = _find_anchor_param(model)
        cost = _as_const_like_resnet(anchor, 0.0)
        H = _as_const_like_resnet(anchor, int(H0))
        W = _as_const_like_resnet(anchor, int(W0))

        # Stem
        conv1 = model.conv1
        bn1 = getattr(model, "bn1", None)
        k = conv1.kernel_size[0]
        s = conv1.stride[0]
        kept_out = None
        if bn1 is not None:
            kept = _kept_from_gate(bn1, anchor)
            if kept is not None:
                kept_out = kept
        oc_eff = kept_out if kept_out is not None else _as_const_like_resnet(anchor, conv1.out_channels)
        cost, H, W = self._add_cost(cost, oc_eff, _as_const_like_resnet(anchor, C_in), k, s, H, W)
        in_ch = oc_eff

        def _block_cost(block, in_ch, H, W, cost):
            # conv1 -> bn1 (explicit None checks: `or` on a tensor is fragile and
            # would discard a legitimately near-zero expected width)
            c1 = block.conv1
            b1 = block.bn1 if hasattr(block, "bn1") else None
            k1, s1 = c1.kernel_size[0], c1.stride[0]
            kept1 = _kept_from_gate(b1, anchor) if b1 is not None else None
            oc1_eff = kept1 if kept1 is not None else _as_const_like_resnet(anchor, c1.out_channels)
            cost, H, W = self._add_cost(cost, oc1_eff, in_ch, k1, s1, H, W)

            # conv2 -> bn2
            c2 = block.conv2
            b2 = block.bn2 if hasattr(block, "bn2") else None
            k2, s2 = c2.kernel_size[0], c2.stride[0]
            kept2 = _kept_from_gate(b2, anchor) if b2 is not None else None
            oc2_eff = kept2 if kept2 is not None else _as_const_like_resnet(anchor, c2.out_channels)
            cost, H, W = self._add_cost(cost, oc2_eff, oc1_eff, k2, s2, H, W)

            return oc2_eff, H, W, cost

        # Layers
        for lname in ("layer1", "layer2", "layer3", "layer4"):
            layer = getattr(model, lname, None)
            if layer is None:
                continue
            for blk in layer:
                in_ch, H, W, cost = _block_cost(blk, in_ch, H, W, cost)

        scale = _as_const_like_resnet(anchor, self.cfg.scale_ms)
        return cost * scale

    @torch.no_grad()
    def calibrate(self, model: nn.Module, keepall_export_fn, profiler_fn, sample: TensorOrBatch, device: str = "cuda") -> float:
        """Calibrate `scale_ms` so proxy(model_keepall) ~= real latency in ms."""
        keep = keepall_export_fn(model)
        sample_shape = _nchw_from_batch(sample)
        mean_ms, _ = profiler_fn(keep, sample_shape, device=device)
        soft = float(self.predict(model, sample).detach().cpu())
        self.cfg.scale_ms = mean_ms / max(soft, 1e-9)
        return mean_ms


# -----------------------------------------------------------------------------
# LLM proxy
# -----------------------------------------------------------------------------

"""
LatencyProxyLLM
---------------
A lightweight latency proxy for decoder-only HF LLMs (LLaMA/Mistral style).

- Estimates end-to-end latency (ms-like scalar) for a given (B, S, T):
  * Prefill on S tokens (build KV cache)
  * Cached decode for T steps
- Uses soft gate expectations:
  * Attention heads (HeadGate on GatedSelfAttentionLLM)
  * FFN hidden (SwiGLUWidthGate via .mlp.neuron_gate)
- Calibrate .scale_ms so proxy ≈ real latency of a keep-all model.

Public API
----------
- LatencyProxyLLM(...).predict(model, batch_or_shape)   # trainer entry
- LatencyProxyLLM(...).predict(model, B=?, S=?, T=?)    # explicit entry
- LatencyProxyLLM(...).debug_layer_view(...)
- calibrate_proxy_llm(...), calibrate_proxy_llm_from_batch(...)
"""

# ------------------------------------------------------------
# Shared tiny utils (device/dtype-safe constants)
# ------------------------------------------------------------
def _find_gate_param_or_fallback(model: nn.Module) -> torch.Tensor:
    """
    Return a tensor to anchor device/dtype for proxy constants.
    Prefer gate logits; else any parameter; else CPU fp32 scalar.
    """
    for m in model.modules():
        if hasattr(m, "head_gate") and hasattr(getattr(m, "head_gate"), "logits"):
            return m.head_gate.logits
        if hasattr(m, "neuron_gate") and hasattr(m.neuron_gate, "logits"):
            return m.neuron_gate.logits
        if hasattr(m, "logits") and isinstance(getattr(m, "logits"), torch.Tensor):
            return m.logits
    for p in model.parameters():
        return p
    return torch.tensor(0.0)

def _as_const_like(x_like: torch.Tensor, val):
    return torch.as_tensor(val, device=x_like.device, dtype=x_like.dtype)


# ------------------------------------------------------------
# Proxy
# ------------------------------------------------------------
@dataclass
class _WarmupOnlyPolicy:
    """Tiny policy shim so you can pass warmup_steps to .predict()."""
    warmup_steps: int = 0

class LatencyProxyLLM(LatencyProxy):
    """
    LLM latency proxy (ms ~ weighted FLOPs/bandwidth terms) for prefill + cached decode.
512
+ Accepts either a batch or explicit B,S,T.
513
+ """
514
+
515
+ def __init__(
516
+ self,
517
+ *,
518
+ scale_ms: float = 1.0,
519
+ alpha_qkv: float = 1.0,
520
+ alpha_scores: float = 1.0,
521
+ alpha_out: float = 1.0,
522
+ alpha_mlp: float = 1.0,
523
+ gate_kv_in_proxy: bool = False,
524
+ default_T: int = 128,
525
+ ):
526
+ super().__init__()
527
+ self.scale_ms = float(scale_ms)
528
+ self.alpha_qkv = float(alpha_qkv)
529
+ self.alpha_scores = float(alpha_scores)
530
+ self.alpha_out = float(alpha_out)
531
+ self.alpha_mlp = float(alpha_mlp)
532
+ self.gate_kv_in_proxy = bool(gate_kv_in_proxy)
533
+ self.default_T = int(default_T)
534
+
535
+ # ---------- gate discovery ----------
536
+ @staticmethod
537
+ def _soft_heads_from_block_llm(blk) -> Optional[torch.Tensor]:
538
+ attn = getattr(blk, "self_attn", None)
539
+ if attn is None:
540
+ return None
541
+ if hasattr(attn, "kept_heads_soft") and callable(attn.kept_heads_soft):
542
+ return attn.kept_heads_soft()
543
+ logits, tau = None, None
544
+ if hasattr(attn, "head_gate") and hasattr(attn.head_gate, "logits"):
545
+ logits = attn.head_gate.logits
546
+ tau = float(getattr(attn.head_gate, "tau", getattr(attn, "tau", 1.5)))
547
+ elif hasattr(attn, "logits"):
548
+ logits = attn.logits
549
+ tau = float(getattr(attn, "tau", 1.5))
550
+ if logits is None:
551
+ return None
552
+ return torch.sigmoid(logits / tau).sum()
553
+
554
+ @staticmethod
555
+ def _find_ffn_gate_llm(blk):
556
+ mlp = getattr(blk, "mlp", None)
557
+ g = getattr(mlp, "neuron_gate", None) if mlp is not None else None
558
+ if g is not None and hasattr(g, "logits") and hasattr(g, "tau"):
559
+ return g
560
+ return None
561
+
562
+ def _soft_hidden_from_block_llm(self, blk, default_hidden, anchor, warm=False):
563
+ if warm:
564
+ return default_hidden
565
+ g = self._find_ffn_gate_llm(blk)
566
+ if g is None:
567
+ return default_hidden
568
+ probs = torch.sigmoid(g.logits / float(g.tau)) # [#groups]
569
+ group = int(getattr(g, "group", getattr(g, "group_size", 128)))
570
+ kept_hidden = probs.sum() * _as_const_like(anchor, group)
571
+ return kept_hidden
572
+
573
+ # ---------- main ----------
574
+ def predict( # trainer entry and explicit-shape entry unified
575
+ self,
576
+ model: nn.Module,
577
+ sample: Optional[TensorOrBatch] = None,
578
+ *,
579
+ B: Optional[int] = None,
580
+ S: Optional[int] = None,
581
+ T: Optional[int] = None,
582
+ policy: Optional[object] = None,
583
+ step: Optional[int] = None,
584
+ return_terms: bool = False,
585
+ ):
586
+ # Allow explicit B,S,(T) path
587
+ if B is not None and S is not None:
588
+ ids_B, ids_S = int(B), int(S)
589
+ ids_T = int(T) if T is not None else int(self.default_T)
590
+ else:
591
+ if sample is None:
592
+ raise ValueError("LatencyProxyLLM.predict needs either a batch sample or explicit B,S.")
593
+ if isinstance(sample, (tuple, list)) and len(sample) in (2, 3) and all(isinstance(x, int) for x in sample):
594
+ # explicit (B,S) or (B,S,T)
595
+ ids_B, ids_S = int(sample[0]), int(sample[1])
596
+ ids_T = int(sample[2]) if len(sample) == 3 else int(self.default_T)
597
+ else:
598
+ ids = _ids_from_batch(sample)
599
+ ids_B, ids_S = int(ids.size(0)), int(ids.size(1))
600
+ ids_T = int(self.default_T) if T is None else int(T)
601
+
602
+ anchor = _find_gate_param_or_fallback(model)
603
+
604
+ # scalar tensors (same device/dtype)
605
+ B_t = _as_const_like(anchor, ids_B)
606
+ S_t = _as_const_like(anchor, ids_S)
607
+ T_t = _as_const_like(anchor, ids_T)
608
+
609
+ cfg = model.config
610
+ D = _as_const_like(anchor, int(cfg.hidden_size))
611
+ Hh = _as_const_like(anchor, int(cfg.num_attention_heads))
612
+ Hkv = _as_const_like(anchor, int(getattr(cfg, "num_key_value_heads", int(Hh))))
613
+ Dh = D // Hh
614
+
615
+ warmup_steps = int(getattr(policy, "warmup_steps", 0)) if policy is not None else 0
616
+ warm = bool(step is not None and step < warmup_steps)
617
+
618
+ total_qkv = anchor.new_zeros(())
619
+ total_scores = anchor.new_zeros(())
620
+ total_out = anchor.new_zeros(())
621
+ total_mlp = anchor.new_zeros(())
622
+
623
+ default_hidden = _as_const_like(anchor, int(getattr(cfg, "intermediate_size", 4 * int(D))))
624
+
625
+ layers = getattr(getattr(model, "model", model), "layers", [])
626
+ for blk in layers:
627
+ heads_soft = Hh if warm else (self._soft_heads_from_block_llm(blk) or Hh)
628
+ Dq = heads_soft * Dh
629
+ # K/V effective width
630
+ if self.gate_kv_in_proxy:
631
+ Dkv = heads_soft * Dh
632
+ else:
633
+ Dkv = Hkv * Dh
634
+ hidden_soft = self._soft_hidden_from_block_llm(blk, default_hidden, anchor, warm=warm)
635
+
636
+ # Prefill + decode (simplified aggregation)
637
+ Seff = S_t + T_t
638
+
639
+ # q/k/v linear FLOP-like terms
640
+ total_qkv = total_qkv + (
641
+ # q
642
+ B_t * Seff * D * Dq +
643
+ # k + v
644
+ 2 * B_t * Seff * D * Dkv
645
+ )
646
+ # attention scores (prefill SxS + decode triangular)
647
+ total_scores = total_scores + (
648
+ B_t * (S_t * S_t) * heads_soft * Dh +
649
+ B_t * heads_soft * Dh * (T_t * S_t + (T_t * (T_t + 1)) // 2)
650
+ )
651
+ # out proj
652
+ total_out = total_out + B_t * Seff * Dq * D
653
+ # mlp
654
+ total_mlp = total_mlp + B_t * Seff * 2 * D * hidden_soft
655
+
656
+ flops_like = (
657
+ self.alpha_qkv * total_qkv
658
+ + self.alpha_scores * total_scores
659
+ + self.alpha_out * total_out
660
+ + self.alpha_mlp * total_mlp
661
+ )
662
+
663
+ ms = flops_like * _as_const_like(anchor, self.scale_ms)
664
+ if return_terms:
665
+ return ms, {
666
+ "qkv": float((self.alpha_qkv * total_qkv).detach().cpu()),
667
+ "scores": float((self.alpha_scores * total_scores).detach().cpu()),
668
+ "out": float((self.alpha_out * total_out).detach().cpu()),
669
+ "mlp": float((self.alpha_mlp * total_mlp).detach().cpu()),
670
+ }
671
+ return ms
672
+
673
+ # ---------- per-layer debug ----------
674
+ @torch.no_grad()
675
+ def debug_layer_view(
676
+ self,
677
+ model: nn.Module,
678
+ *,
679
+ B: int,
680
+ S: int,
681
+ T: int,
682
+ policy: Optional[object] = None,
683
+ step: Optional[int] = None,
684
+ ) -> list:
685
+ anchor = _find_gate_param_or_fallback(model)
686
+ cfg = getattr(model, "config", None)
687
+ D = _as_const_like(anchor, int(getattr(cfg, "hidden_size", 0)))
688
+ Hq = _as_const_like(anchor, int(getattr(cfg, "num_attention_heads", 0)))
689
+ Hkv = _as_const_like(anchor, int(getattr(cfg, "num_key_value_heads", int(Hq))))
690
+ Dh = D // Hq
691
+
692
+ warm = False
693
+ if policy is not None and step is not None:
694
+ warm = (int(step) < int(getattr(policy, "warmup_steps", 0)))
695
+
696
+ rows = []
697
+ layers = getattr(getattr(model, "model", model), "layers", None) or []
698
+ for i, blk in enumerate(layers):
699
+ heads_soft = Hq if warm else (self._soft_heads_from_block_llm(blk) or Hq)
700
+ Dq = heads_soft * Dh
701
+ Dkv = (heads_soft * Dh) if self.gate_kv_in_proxy else (Hkv * Dh)
702
+ hidden_soft = self._soft_hidden_from_block_llm(
703
+ blk, _as_const_like(anchor, int(getattr(cfg, "intermediate_size", 4 * int(D)))), anchor, warm=warm
704
+ )
705
+ rows.append({
706
+ "layer": i,
707
+ "heads_soft": float(heads_soft.detach().cpu()),
708
+ "Dq≈heads*Dh": float(Dq.detach().cpu()),
709
+ "Dkv_used": float(Dkv.detach().cpu()),
710
+ "ffn_hidden_soft": float(hidden_soft.detach().cpu()),
711
+ })
712
+ return rows
713
+
714
+
715
+ # ------------------------------------------------------------
716
+ # Calibration helpers for LLM
717
+ # ------------------------------------------------------------
718
+ @torch.inference_mode()
719
+ def calibrate_proxy_llm(
720
+ proxy: LatencyProxyLLM,
721
+ model: nn.Module,
722
+ *,
723
+ B: int,
724
+ S: int,
725
+ T: int,
726
+ export_keepall_fn,
727
+ device: str = "cuda",
728
+ warmup: int = 10,
729
+ iters: int = 30,
730
+ ) -> float:
731
+ """
732
+ Calibrate proxy.scale_ms so proxy.predict(...) matches real keep-all latency for (B,S,T).
733
+ Returns the measured real mean latency in ms.
734
+ """
735
+ keepall = export_keepall_fn(model).to(device).eval()
736
+
737
+ # Measure real latency (prefill + decode)
738
+ from core.measure import measure_latency_text_ms as _measure # adjust if your path differs
739
+ real_ms, _ = _measure(keepall, B=B, S=S, T=T, warmup=warmup, iters=iters, device=device)
740
+
741
+ # Soft/proxy latency on *gated* model
742
+ ms_like = proxy.predict(model, B=B, S=S, T=T)
743
+ soft_ms = float(ms_like.detach().item()) if torch.is_tensor(ms_like) else float(ms_like)
744
+
745
+ proxy.scale_ms = float(real_ms / max(soft_ms, 1e-9))
746
+ return real_ms
747
+
748
+
749
+ @torch.inference_mode()
750
+ def calibrate_proxy_llm_from_batch(
751
+ proxy: LatencyProxyLLM,
752
+ model: nn.Module,
753
+ batch: Dict[str, torch.Tensor],
754
+ *,
755
+ T: int,
756
+ export_keepall_fn,
757
+ device: str = "cuda",
758
+ warmup: int = 10,
759
+ iters: int = 30,
760
+ ) -> Tuple[int, int, int, float]:
761
+ """
762
+ Infers (B,S) from a batch like {'input_ids': [B,S], ...},
763
+ calibrates for (B,S,T), and returns (B,S,T, real_ms).
764
+ """
765
+ input_ids = batch["input_ids"]
766
+ B, S = int(input_ids.size(0)), int(input_ids.size(1))
767
+ ms = calibrate_proxy_llm(
768
+ proxy, model, B=B, S=S, T=T, export_keepall_fn=export_keepall_fn,
769
+ device=device, warmup=warmup, iters=iters
770
+ )
771
+ return B, S, T, ms
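For intuition, the proxy's attention-score term above combines an S×S prefill cost with a cached-decode cost of T·S + T(T+1)/2 query-key interactions (decode step t attends to the S prompt tokens plus the t tokens generated so far). A minimal pure-Python sketch of that count — function and argument names here are illustrative, not part of the repo:

```python
def attn_score_term(B, S, T, heads, head_dim):
    """FLOP-like count for attention scores: S*S prefill plus cached decode.

    Decode step t (1-indexed) attends to S prompt tokens plus t generated
    tokens, so summing over t = 1..T gives T*S + T*(T+1)/2 interactions.
    """
    prefill = B * S * S * heads * head_dim
    decode = B * heads * head_dim * (T * S + T * (T + 1) // 2)
    return prefill + decode

print(attn_score_term(1, 2, 2, 1, 1))  # 4 prefill + 7 decode = 11
```

This matches the `total_scores` accumulation in `LatencyProxyLLM.predict` for a single layer, before the `alpha_scores` weighting and `scale_ms` calibration are applied.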
core/.ipynb_checkpoints/train-checkpoint.py ADDED
@@ -0,0 +1,327 @@
+"""Generic Lagrangian trainer (family-agnostic).
+
+This module provides a light framework to optimize *gated* students against
+teachers with a latency target enforced via a proxy + optional real probes.
+
+It does not assume ViT/ResNet/LLM specifics; adapters provide tiny callables.
+
+Key ingredients:
+- Two-phase update per step: (A) weights w.r.t. KD/task, (B) gates w.r.t. KD +
+  sparsity + latency penalty with a dual variable λ.
+- Optional periodic export + real-latency probe to correct λ.
+- Constraint projection for gates after each step.
+
+Adapters must provide:
+- get_student_logits(model, x) -> Tensor
+- get_teacher_logits(model, x) -> Tensor
+- export_keepall(model) -> nn.Module (clean copy without gates)
+- export_pruned(model, policy, step) -> nn.Module (transient copy for profiling)
+
+Core modules used:
+- `distill.KDConfig`, `distill.kd_loss`
+- `gates.combined_penalty`, `gates.PenaltyWeights`, `gates.project_gates_into_constraints`
+- `proxy_cost.LatencyProxy`
+- `profiler.measure_latency_ms`
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Callable, Optional
+import gc
+
+import torch
+import torch.nn as nn
+
+from .distill import KDConfig, kd_loss, mse_reg
+from .gates import PenaltyWeights, Constraints, combined_penalty, project_gates_into_constraints, collect_param_groups
+from .proxy_cost import LatencyProxy
+from .profiler import measure_latency_ms
+
+# -----------------------------------------------------------------------------
+# Config
+# -----------------------------------------------------------------------------
+
+@dataclass
+class DualConfig:
+    lr: float = 0.05       # step for λ update
+    ema_beta: float = 0.5  # blend proxy-driven λ and real-probe λ
+    clip: float = 10.0
+
+
+@dataclass
+class TrainerConfig:
+    kd: KDConfig = KDConfig()
+    penalties: PenaltyWeights = PenaltyWeights(l0=0.0, keep_floor_ratio=0.0, bimodality=0.0)
+    constraints: Constraints = Constraints(min_keep_ratio=0.0, min_groups=1, max_groups_drop=None)
+
+    latency_target_ms: float = 30.0
+    real_probe_every: int = 0  # steps; 0 disables real probes
+    probe_batch_override: Optional[int] = None
+    gate_warmup_steps: int = 0  # freeze gates for early steps
+    mse_weight: float = 0.0
+
+    early_stopping_patience: int = 0
+    early_stopping_lambda: float = 1e-4
+
+    amp: bool = True
+    device: str = "cuda"
+
+    # Optimizers
+    lr_gate: float = 1e-2
+    lr_linear: float = 1e-4
+    lr_affine: float = 3e-4
+    wd_linear: float = 1e-4
+
+    # Mixed precision scaler
+    use_grad_scaler: bool = True
+
+    # Dual update
+    dual: DualConfig = DualConfig()
+
+
+# -----------------------------------------------------------------------------
+# Trainer
+# -----------------------------------------------------------------------------
+
+class LagrangeTrainer:
+    def __init__(
+        self,
+        student: nn.Module,
+        teacher: nn.Module,
+        proxy: LatencyProxy,
+        *,
+        adapter_get_student_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
+        adapter_get_teacher_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
+        adapter_export_keepall: Callable[[nn.Module], nn.Module],
+        adapter_export_pruned: Callable[[nn.Module, object, int], nn.Module],
+        export_policy: object,
+        cfg: TrainerConfig,
+    ) -> None:
+        self.student = student
+        self.teacher = teacher.eval()
+        for p in self.teacher.parameters():
+            p.requires_grad_(False)
+        self.proxy = proxy
+        self.get_s = adapter_get_student_logits
+        self.get_t = adapter_get_teacher_logits
+        self.export_keepall = adapter_export_keepall
+        self.export_pruned = adapter_export_pruned
+        self.export_policy = export_policy
+        self.cfg = cfg
+
+        # Build optimizers (grouped)
+        param_groups = collect_param_groups(
+            student,
+            lr_gate=cfg.lr_gate,
+            lr_linear=cfg.lr_linear,
+            lr_affine=cfg.lr_affine,
+            wd_linear=cfg.wd_linear,
+        )
+        # gates-only optimizer uses first group
+        self.opt_g = torch.optim.Adam([param_groups[0]], lr=param_groups[0]["lr"])  # type: ignore[arg-type]
+        # weights optimizer for the rest
+        self.opt_w = torch.optim.Adam(param_groups[1:])
+
+        self.scaler = torch.amp.GradScaler('cuda', enabled=(cfg.amp and cfg.use_grad_scaler))
+        self.lambda_: float = 0.0
+        self.mse_weight = cfg.mse_weight
+
+    # ---- internal helpers -----------------------------------------------------
+    def _zero_grads(self, params):
+        for p in params:
+            if p.grad is not None:
+                p.grad = None
+
+    def _has_grad(self, params) -> bool:
+        for p in params:
+            if p.grad is not None:
+                return True
+        return False
+
+    # ---- training -------------------------------------------------------------
+    def train_epoch(self, loader, *, real_policy=None, verbose_every: int = 50):
+        device = self.cfg.device
+        self.student.train().to(device)
+        self.teacher.to(device).eval()
+
+        running = 0.0
+        seen = 0
+        lam_real = self.lambda_
+        mean_ms = p95_ms = float("nan")  # defined once a real probe has run
+
+        total_steps = len(loader)
+
+        for step, batch in enumerate(loader, 1):
+            # Move batch to device in a type-safe way
+            batch = _move_batch_to_device(batch, device)
+
+            with torch.no_grad():
+                t_logits = self.get_t(self.teacher, batch)  # [B,1,V]
+                # match AMP compute dtype to avoid upcasting later
+                if self.cfg.amp:
+                    # infer autocast dtype from student params (bf16 or fp16)
+                    sparam = next(self.student.parameters())
+                    t_logits = t_logits.to(dtype=sparam.dtype, non_blocking=True)
+
+            # -------- Pass A: WEIGHTS (KD only) --------
+            self.opt_w.zero_grad(set_to_none=True)
+
+            with torch.amp.autocast('cuda', enabled=self.cfg.amp):
+                # Adapters receive the batch object (dict/tuple/tensor)
+                s_logits = self.get_s(self.student, batch)
+                mse = self.mse_weight * mse_reg(s_logits, t_logits, self.cfg.kd.temperature)
+                loss_w = kd_loss(s_logits, t_logits, self.cfg.kd) + mse
+
+            self.scaler.scale(loss_w).backward()
+            # Prevent gate params from changing in pass A
+            gate_params = self.opt_g.param_groups[0]["params"]
+            self._zero_grads(gate_params)
+
+            if any(p.grad is not None for pg in self.opt_w.param_groups for p in pg["params"]):
+                self.scaler.step(self.opt_w)
+                self.scaler.update()
+            else:
+                self.opt_w.zero_grad(set_to_none=True)
+
+            del s_logits
+            gc.collect()
+            torch.cuda.empty_cache()
+
+            if step > int(self.cfg.gate_warmup_steps):
+                # -------- Pass B: GATES (KD + sparsity + λ * gap) --------
+                self.opt_g.zero_grad(set_to_none=True)
+                with torch.amp.autocast('cuda', enabled=self.cfg.amp):
+                    s_logits = self.get_s(self.student, batch)
+                    kd_g = kd_loss(s_logits, t_logits, self.cfg.kd)
+
+                    # Proxy gets the batch object too; family-specific proxy can read (B,S) etc.
+                    o1_ms = self.proxy.predict(self.student, batch)
+                    gap = torch.relu(o1_ms - float(self.cfg.latency_target_ms))
+                    reg = combined_penalty(self.student, self.cfg.penalties)
+                    mse = self.mse_weight * mse_reg(s_logits, t_logits, self.cfg.kd.temperature)
+                    loss_g = kd_g + _to_tensor(self.lambda_, o1_ms) * gap + reg + mse
+
+                self.scaler.scale(loss_g).backward()
+                # Prevent non-gate params from changing in pass B
+                for pg in self.opt_w.param_groups:
+                    self._zero_grads(pg["params"])
+
+                if self._has_grad(self.opt_g.param_groups[0]["params"]):
+                    self.scaler.step(self.opt_g)
+                    self.scaler.update()
+                else:
+                    self.opt_g.zero_grad(set_to_none=True)
+            else:
+                o1_ms = self.proxy.predict(self.student, batch)
+                s_logits = loss_g = kd_g = reg = torch.tensor(0.0, device=device)
+
+            # -------- Dual (λ) update using proxy --------
+            with torch.no_grad():
+                lam_proxy = max(0.0, self.lambda_ + self.cfg.dual.lr * (float(o1_ms.detach()) - self.cfg.latency_target_ms))
+                self.lambda_ = 0.5 * (lam_real + lam_proxy)
+
+            # -------- Constraint projection, optional real probe --------
+            project_gates_into_constraints(self.student, self.cfg.constraints)
+
+            if self.cfg.real_probe_every and (step % int(self.cfg.real_probe_every) == 0):
+                # Build a probe shape for latency func if needed
+                try:
+                    from core.measure import measure_latency_text_ms  # text-friendly
+                    if isinstance(batch, dict) and "input_ids" in batch and torch.is_tensor(batch["input_ids"]):
+                        B, S = int(batch["input_ids"].size(0)), int(batch["input_ids"].size(1))
+                    else:
+                        # Fallback: try tensor-like batch
+                        x0 = batch["input_ids"] if isinstance(batch, dict) else (batch[0] if isinstance(batch, (tuple, list)) else batch)
+                        B = int(x0.size(0)); S = int(x0.size(1))
+                    slim = self.export_pruned(self.student, real_policy or self.export_policy, step)
+                    mean_ms, p95_ms = measure_latency_text_ms(slim, B=B, S=S, T=128, device=device)
+                except Exception:
+                    # If the project has a different profiler, retain compatibility:
+                    from .profiler import measure_latency_ms
+                    x0 = batch["input_ids"] if isinstance(batch, dict) else (batch[0] if isinstance(batch, (tuple, list)) else batch)
+                    shape = (int(x0.size(0)), *list(x0.shape[1:]))
+                    slim = self.export_pruned(self.student, real_policy or self.export_policy, step)
+                    mean_ms, p95_ms = measure_latency_ms(slim, shape, device=device)
+
+                with torch.no_grad():
+                    lam_real = max(0.0, self.lambda_ + self.cfg.dual.lr * (mean_ms - self.cfg.latency_target_ms))
+
+            if (step % verbose_every) == 0:
+                print(
+                    f"Step {step}/{total_steps} | KL={float(loss_w.item()):.6f} | MSE={float(mse.item()):.6f} | "
+                    f"Gate={float(loss_g.item()):.6f} | "
+                    f"proxy={float(o1_ms.detach()):.3f}ms | real_mean={mean_ms:.3f}ms p95={p95_ms:.3f}ms | λ={self.lambda_:.6f}"
+                )
+
+            running += float(loss_g.detach())
+            seen += _batch_size(batch)
+
+            del s_logits, t_logits, o1_ms, kd_g, reg, loss_g, loss_w
+            torch.cuda.empty_cache()
+            gc.collect()
+
+        print(f"Epoch loss {running / max(1, seen):.6f}")
+        return self.lambda_
+
+
+# -----------------------------------------------------------------------------
+# Helpers
+# -----------------------------------------------------------------------------
+
+def _to_tensor(val: float, like: torch.Tensor) -> torch.Tensor:
+    return torch.as_tensor(val, device=like.device, dtype=like.dtype)
+
+def _move_batch_to_device(batch, device: str):
+    """
+    Supports:
+    - dict with keys 'input_ids' and optional 'attention_mask'
+    - (x,) or (x, y) tuples/lists -> move each tensor-like to device
+    - single Tensor
+    Converts attention_mask to bool (preferred by HF SDPA).
+    """
+    if isinstance(batch, dict):
+        out = {}
+        for k, v in batch.items():
+            if torch.is_tensor(v):
+                v = v.to(device, non_blocking=True)
+                if k == "attention_mask" and v.dtype != torch.bool:
+                    v = v.to(torch.bool)
+            out[k] = v
+        return out
+
+    if isinstance(batch, (tuple, list)):
+        moved = []
+        for v in batch:
+            if torch.is_tensor(v):
+                v = v.to(device, non_blocking=True)
+            moved.append(v)
+        return type(batch)(moved)
+
+    if torch.is_tensor(batch):
+        return batch.to(device, non_blocking=True)
+
+    # Unknown type: return as-is (adapters/proxy should handle it)
+    return batch
+
+
+def _batch_size(batch) -> int:
+    """Best-effort batch size for logging/averages."""
+    if isinstance(batch, dict) and "input_ids" in batch and torch.is_tensor(batch["input_ids"]):
+        return int(batch["input_ids"].size(0))
+    if torch.is_tensor(batch):
+        return int(batch.size(0))
+    if isinstance(batch, (tuple, list)) and len(batch) and torch.is_tensor(batch[0]):
+        return int(batch[0].size(0))
+    return 1
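The dual (λ) update in `train_epoch` is projected gradient ascent: λ grows while the predicted latency exceeds the target and shrinks (floored at zero) once the constraint is met. A standalone sketch of just that rule — the function name and the example latency trace are illustrative, not from the repo:

```python
def dual_update(lam, proxy_ms, target_ms, lr=0.05):
    """Projected ascent on the Lagrange multiplier: lambda rises while the
    latency constraint is violated, decays toward 0 once it is satisfied."""
    return max(0.0, lam + lr * (proxy_ms - target_ms))

lam = 0.0
for ms in [40.0, 36.0, 31.0, 29.0, 24.0]:  # proxy latency drifting toward target
    lam = dual_update(lam, ms, target_ms=30.0)
```

The trainer additionally averages this proxy-driven update with a probe-driven one (`lam_real`), so occasional real measurements correct systematic proxy error without dominating every step.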
core/.ipynb_checkpoints/utils-checkpoint.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared utilities used across core and adapters.
2
+
3
+ Consolidates helpers that are generic (device/dtype, seeding, shapes, rounding,
4
+ parameter grouping, model copying, etc.). Keep this file dependency-light.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Iterable, Iterator, List, Optional, Sequence, Tuple
10
+
11
+ import copy
12
+ import random
13
+
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn as nn
17
+
18
+
19
+ # -----------------------------------------------------------------------------
20
+ # Device / dtype helpers
21
+ # -----------------------------------------------------------------------------
22
+
23
+ def as_like(x: torch.Tensor, val) -> torch.Tensor:
24
+ """Create a scalar/tensor constant on same device/dtype as `x`."""
25
+ return torch.as_tensor(val, device=x.device, dtype=x.dtype)
26
+
27
+
28
+ def first_param(module: nn.Module) -> torch.Tensor:
29
+ for p in module.parameters(recurse=True):
30
+ return p
31
+ return torch.tensor(0.0)
32
+
33
+
34
+ def to_device_dtype(x: torch.Tensor, ref: torch.Tensor) -> torch.Tensor:
35
+ return x.to(device=ref.device, dtype=ref.dtype)
36
+
37
+
38
+ # -----------------------------------------------------------------------------
39
+ # Seeding & determinism
40
+ # -----------------------------------------------------------------------------
41
+
42
+ def set_seed(seed: int = 42, deterministic: bool = False) -> None:
43
+ random.seed(seed)
44
+ np.random.seed(seed)
45
+ torch.manual_seed(seed)
46
+ torch.cuda.manual_seed_all(seed)
47
+ if deterministic:
48
+ torch.backends.cudnn.deterministic = True
49
+ torch.backends.cudnn.benchmark = False
50
+
51
+
52
+ # -----------------------------------------------------------------------------
53
+ # Model parameter helpers
54
+ # -----------------------------------------------------------------------------
55
+
56
+ def freeze(module: nn.Module) -> None:
57
+ for p in module.parameters():
58
+ p.requires_grad_(False)
59
+
60
+
61
+ def unfreeze(module: nn.Module) -> None:
62
+ for p in module.parameters():
63
+ p.requires_grad_(True)
64
+
65
+
66
+ def count_parameters(module: nn.Module, *, trainable_only: bool = False) -> int:
67
+ if trainable_only:
68
+ return sum(p.numel() for p in module.parameters() if p.requires_grad)
69
+ return sum(p.numel() for p in module.parameters())
70
+
71
+
72
+ # -----------------------------------------------------------------------------
73
+ # Shape/signature helpers
74
+ # -----------------------------------------------------------------------------
75
+
76
+ def input_spec_vision(sample) -> Tuple[int, int, int]:
77
+ """Accept either a 4D tensor [B,3,H,W] or a 4-tuple (B,3,H,W). Returns (B,H,W)."""
78
+ if isinstance(sample, torch.Tensor):
79
+ B, C, H, W = sample.shape
80
+ return int(B), int(H), int(W)
81
+ if isinstance(sample, (tuple, list)) and len(sample) == 4:
82
+ B, C, H, W = sample
83
+ return int(B), int(H), int(W)
84
+ raise ValueError("sample must be a tensor [B,3,H,W] or a 4-tuple (B,3,H,W)")
85
+
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # Rounding / multiples
89
+ # -----------------------------------------------------------------------------
90
+
91
+ def round_down_multiple(n: int, m: int) -> int:
92
+ if m is None or m <= 1:
93
+ return max(1, int(n))
94
+ n = int(n)
95
+ return max(m, (n // m) * m)
96
+
97
+
98
+ def clamp_int(v: int, lo: int, hi: int) -> int:
99
+ return max(lo, min(int(v), hi))
100
+
101
+
102
+ # -----------------------------------------------------------------------------
103
+ # Slicing helpers
104
+ # -----------------------------------------------------------------------------
105
+
106
+ @torch.no_grad()
107
+ def slice_linear(mat: nn.Linear, keep_in: Optional[Sequence[int]] = None, keep_out: Optional[Sequence[int]] = None) -> nn.Linear:
108
+ W = mat.weight.detach()
109
+ b = mat.bias.detach() if mat.bias is not None else None
110
+ if keep_out is not None:
111
+ idx_out = torch.as_tensor(keep_out, device=W.device)
112
+ W = W.index_select(0, idx_out)
113
+ if b is not None:
114
+ b = b.index_select(0, idx_out)
115
+ if keep_in is not None:
116
+ idx_in = torch.as_tensor(keep_in, device=W.device)
117
+ W = W.index_select(1, idx_in)
118
+ out_f, in_f = W.shape
119
+ new = nn.Linear(in_f, out_f, bias=(b is not None)).to(W.device)
120
+ new.weight.copy_(W)
121
+ if b is not None:
122
+ new.bias.copy_(b)
123
+ return new
124
+
125
+
126
+ # -----------------------------------------------------------------------------
127
+ # Copying & detaching models
128
+ # -----------------------------------------------------------------------------
129
+
130
+ def deepcopy_eval_cpu(module: nn.Module) -> nn.Module:
131
+     m = copy.deepcopy(module).cpu().eval()
+     return m
+
+
+ # -----------------------------------------------------------------------------
+ # Gradient utilities
+ # -----------------------------------------------------------------------------
+
+ def zero_if_any(params: Iterable[torch.Tensor]) -> None:
+     for p in params:
+         if p.grad is not None:
+             p.grad = None
+
+
+ def any_grad(params: Iterable[torch.Tensor]) -> bool:
+     for p in params:
+         if p.grad is not None:
+             return True
+     return False
+
+
+ # -----------------------------------------------------------------------------
+ # For fine-tuning
+ # -----------------------------------------------------------------------------
+
+ def ensure_trainable_parameters(module: nn.Module, *, requires_grad: bool = True) -> nn.Module:
+     """
+     Rebuild all parameters as fresh nn.Parameter tensors (detach + clone),
+     which drops any 'inference tensor' tag and re-enables autograd.
+     """
+     for mod in module.modules():
+         for name, p in list(mod._parameters.items()):
+             if p is None:
+                 continue
+             new_p = nn.Parameter(p.detach().clone(), requires_grad=requires_grad)
+             setattr(mod, name, new_p)
+     return module
+
+
+ # -----------------------------------------------------------------------------
+ # Misc
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class ExportRounding:
+     head_floor_post: int = 1
+     head_multiple_post: int = 1
+     ffn_min_keep_ratio_post: float = 0.0
+     ffn_snap_groups_post: int = 1
+
+
+ def shape_signature_vit(cfg, sample_shape: Tuple[int, int, int, int]) -> Tuple:
+     # Normalize patch_size: keep tuples as tuples, coerce scalars to int
+     patch = getattr(cfg, "patch_size", 16)
+     return (
+         "ViT",
+         sample_shape,
+         int(getattr(cfg, "num_attention_heads", 12)),
+         int(getattr(cfg, "hidden_size", 768)),
+         int(getattr(cfg, "intermediate_size", 3072)),
+         tuple(patch) if isinstance(patch, (tuple, list)) else int(patch),
+     )
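The signature tuple returned by `shape_signature_vit` is what callers can key caches on: two models with the same shape signature can share profiled artifacts. A minimal pure-Python sketch of the same normalization (the helper name `shape_signature_vit_sketch` and the `SimpleNamespace` config are illustrative, not part of the library):

```python
from types import SimpleNamespace

def shape_signature_vit_sketch(cfg, sample_shape):
    # Mirrors shape_signature_vit above: normalize patch_size to int or tuple
    patch = getattr(cfg, "patch_size", 16)
    patch = tuple(patch) if isinstance(patch, (tuple, list)) else int(patch)
    return (
        "ViT",
        sample_shape,
        int(getattr(cfg, "num_attention_heads", 12)),
        int(getattr(cfg, "hidden_size", 768)),
        int(getattr(cfg, "intermediate_size", 3072)),
        patch,
    )

cfg = SimpleNamespace(num_attention_heads=12, hidden_size=768,
                      intermediate_size=3072, patch_size=16)
sig = shape_signature_vit_sketch(cfg, (1, 3, 224, 224))
print(sig)  # ('ViT', (1, 3, 224, 224), 12, 768, 3072, 16)
```

Because the tuple contains only hashables, it can be used directly as a dict key.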
core/__init__.py ADDED
File without changes
core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (127 Bytes).
core/__pycache__/distill.cpython-310.pyc ADDED
Binary file (6.94 kB).
core/__pycache__/export.cpython-310.pyc ADDED
Binary file (7.31 kB).
core/__pycache__/finetune.cpython-310.pyc ADDED
Binary file (7.35 kB).
core/__pycache__/gates.cpython-310.pyc ADDED
Binary file (13.6 kB).
core/__pycache__/profiler.cpython-310.pyc ADDED
Binary file (7.68 kB).
core/__pycache__/proxy_cost.cpython-310.pyc ADDED
Binary file (22.8 kB).
core/__pycache__/search_export.cpython-310.pyc ADDED
Binary file (2.95 kB).
core/__pycache__/train.cpython-310.pyc ADDED
Binary file (9.12 kB).
core/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.98 kB).
core/distill.py ADDED
@@ -0,0 +1,183 @@
+ """Knowledge-distillation utilities (model-family agnostic).
+
+ This module provides:
+ - Losses: KL distillation, soft cross-entropy, cosine feature loss
+ - Helper to obtain logits from models with/without built-in heads
+ - Lightweight classification head for backbone models (e.g., ViTModel)
+ - Simple evaluators (agreement %, KL) and diagnostics
+
+ Adapters may override `adapter_get_logits(model, x)` if a family needs a
+ custom extraction (e.g., language models with past_key_values).
+ """
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Optional, Protocol, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ # -----------------------------------------------------------------------------
+ # Config
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class KDConfig:
+     temperature: float = 2.0
+     alpha: float = 1.0  # multiplier for KL term; task loss handled outside
+
+
+ # -----------------------------------------------------------------------------
+ # Losses
+ # -----------------------------------------------------------------------------
+
+ def kl_divergence(student_logits: torch.Tensor, teacher_logits: torch.Tensor, T: float = 2.0) -> torch.Tensor:
+     """Batchmean KL(teacher/T || student/T), scaled by T^2 (Hinton-style).
+
+     Note `F.kl_div(input, target)` computes KL(target || input), so the
+     student supplies log-probabilities and the teacher the target probs.
+     """
+     p_s = F.log_softmax(student_logits / T, dim=-1)
+     p_t = F.softmax(teacher_logits / T, dim=-1)
+     return F.kl_div(p_s, p_t, reduction="batchmean") * (T * T)
+
+
+ def kd_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor, cfg: KDConfig) -> torch.Tensor:
+     return cfg.alpha * kl_divergence(student_logits, teacher_logits, T=cfg.temperature)
+
+
+ def mse_reg(student_logits: torch.Tensor, teacher_logits: torch.Tensor, T: float = 2.0) -> torch.Tensor:
+     mse = F.mse_loss(student_logits, teacher_logits, reduction="mean")
+     return mse * (T * T)
+
+
+ def soft_ce(student_logits: torch.Tensor, soft_targets: torch.Tensor) -> torch.Tensor:
+     """Soft cross-entropy: expects `soft_targets` already normalized."""
+     logp = F.log_softmax(student_logits, dim=-1)
+     return -(soft_targets * logp).sum(dim=-1).mean()
+
+
+ def cosine_feature_loss(student_feats: torch.Tensor, teacher_feats: torch.Tensor) -> torch.Tensor:
+     """1 - cosine similarity averaged over batch and time/patch dims."""
+     s = F.normalize(student_feats, dim=-1)
+     t = F.normalize(teacher_feats, dim=-1)
+     return (1.0 - (s * t).sum(dim=-1)).mean()
+
+
+ # -----------------------------------------------------------------------------
+ # Logit extraction
+ # -----------------------------------------------------------------------------
+
+ class LogitsProvider(Protocol):
+     def __call__(self, model: nn.Module, x: torch.Tensor) -> torch.Tensor: ...
+
+
+ class ClsHead(nn.Module):
+     """Minimal classification head: LN + Linear.
+
+     Useful when the backbone outputs hidden states (e.g., ViTModel) and you
+     want logits comparable to a teacher with a classification head.
+     """
+
+     def __init__(self, hidden_size: int, num_classes: int = 1000, base_head: Optional[nn.Module] = None):
+         super().__init__()
+         self.norm = nn.LayerNorm(hidden_size)
+         self.fc = nn.Linear(hidden_size, num_classes)
+         if base_head is not None:
+             # Try to load weights if shapes match (e.g., from HF classifier)
+             try:
+                 self.load_state_dict(base_head.state_dict(), strict=False)
+             except Exception:
+                 pass
+
+     def forward(self, cls_token: torch.Tensor) -> torch.Tensor:
+         return self.fc(self.norm(cls_token))
+
+
+ @torch.no_grad()
+ def infer_hidden_size(model: nn.Module, sample: torch.Tensor) -> int:
+     # Run a tiny forward to inspect hidden size when unknown
+     model.eval()
+     out = model(pixel_values=sample)
+     if hasattr(out, "last_hidden_state"):
+         return int(out.last_hidden_state.shape[-1])
+     if hasattr(out, "logits"):
+         return int(out.logits.shape[-1])
+     raise RuntimeError("Cannot infer hidden size; provide explicitly.")
+
+
+ def default_get_logits(model: nn.Module, x: torch.Tensor, *, head: Optional[nn.Module] = None) -> torch.Tensor:
+     """Default logits extractor for vision backbones (`pixel_values` input).
+
+     - If the model output has `.logits`, return it.
+     - Else expects `.last_hidden_state` and uses [CLS] via the provided `head`.
+     """
+     out = model(pixel_values=x)
+     if hasattr(out, "logits"):
+         return out.logits
+     if hasattr(out, "last_hidden_state"):
+         if head is None:
+             raise ValueError("Backbone returned hidden states; supply a classification head.")
+         cls_tok = out.last_hidden_state[:, 0, :]
+         return head(cls_tok)
+     raise ValueError("Model output lacks logits and last_hidden_state.")
+
+
+ # -----------------------------------------------------------------------------
+ # Evaluators & diagnostics
+ # -----------------------------------------------------------------------------
+
+ @torch.inference_mode()
+ def logits_std(model: nn.Module, loader, *, get_logits: LogitsProvider, batches: int = 10, device: str = "cuda") -> Tuple[float, int]:
+     s = 0.0
+     k = 0
+     for x in loader:
+         if k >= batches:
+             break
+         x = x.to(device)
+         y = get_logits(model, x)
+         s += y.std().item()
+         k += 1
+     return (s / max(1, k), k)
+
+
+ @torch.inference_mode()
+ def agreement_metrics(
+     student: nn.Module,
+     teacher: nn.Module,
+     loader,
+     *,
+     get_student_logits: LogitsProvider,
+     get_teacher_logits: LogitsProvider,
+     batches: int = 20,
+     T: float = 1.0,
+     device: str = "cuda",
+ ) -> dict:
+     kl_sum = 0.0
+     n = 0
+     top1 = 0
+     tot = 0
+     for i, x in enumerate(loader):
+         if i >= batches:
+             break
+         x = x.to(device)
+         t = get_teacher_logits(teacher, x)
+         s = get_student_logits(student, x)
+         p_s = F.log_softmax(s / T, dim=-1)
+         p_t = F.softmax(t / T, dim=-1)
+         kl_sum += (F.kl_div(p_s, p_t, reduction="batchmean") * (T * T)).item()
+         top1 += (s.argmax(-1) == t.argmax(-1)).sum().item()
+         tot += x.size(0)
+         n += 1
+     return {"kl_TT": kl_sum / max(1, n), "top1_agreement": top1 / max(1, tot)}
+
+
+ # -----------------------------------------------------------------------------
+ # Small trainer helpers
+ # -----------------------------------------------------------------------------
+
+ class DualEMA:
+     """Simple exponential moving average for a scalar (e.g., lambda or latency)."""
+
+     def __init__(self, beta: float = 0.9, value: float = 0.0):
+         self.beta = float(beta)
+         self.value = float(value)
+
+     def update(self, x: float) -> float:
+         self.value = self.beta * self.value + (1 - self.beta) * float(x)
+         return self.value
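The temperature-scaled KL used by `kl_divergence` can be checked by hand on a two-class example. A pure-Python sketch (no torch; `softmax` and `kl_hinton` are illustrative names, computing the same per-sample quantity for a batch of one):

```python
import math

def softmax(xs, T=1.0):
    # Temperature-scaled softmax over a single logit vector
    exps = [math.exp(x / T) for x in xs]
    total = sum(exps)
    return [e / total for e in exps]

def kl_hinton(student, teacher, T=2.0):
    # KL(teacher_T || student_T) * T^2, matching kl_divergence above
    p_t = softmax(teacher, T)
    p_s = softmax(student, T)
    return (T * T) * sum(t * math.log(t / s) for t, s in zip(p_t, p_s))

teacher = [2.0, 0.0]
student = [1.0, 0.5]
print(round(kl_hinton(student, teacher, T=2.0), 3))
```

The T^2 factor keeps gradient magnitudes roughly constant as T varies, which is why `finetune.py` can anneal the temperature without rescaling `alpha`.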
core/export.py ADDED
@@ -0,0 +1,220 @@
+ """Core export utilities for hard-pruning and kernel-aligned rounding.
+
+ This module is *family-agnostic*. Adapters (e.g., ViT, ResNet, LLM) should:
+   1) decide which gates map to which structural dims (heads, hidden groups, channels),
+   2) obtain KEEP indices using helpers in this file, and
+   3) rebuild family-specific modules with the sliced weights.
+
+ Provided here:
+   - Rounding policies and helpers (floors, multiples, warmup keep-all)
+   - KEEP index selection from a `Gate` (or gate-like) object
+   - Generic weight slicers for Linear / Conv2d / Embedding
+   - Small safeguards for dtype/device and shape checks
+
+ The library avoids touching family internals here. Exporters in adapters should
+ use these primitives to assemble a clean pruned model.
+ """
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Optional, Sequence, Tuple
+
+ import torch
+ import torch.nn as nn
+
+ from .gates import Gate, expand_group_indices
+
+ # -----------------------------------------------------------------------------
+ # Policies & rounding
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class Rounding:
+     """Rounding policy for a single gated axis.
+
+     Attributes
+     ----------
+     floor_groups : int
+         Minimum number of groups to keep after rounding.
+     multiple_groups : int
+         Snap the number of kept groups down to a multiple of this (>= 1).
+     min_keep_ratio : float
+         Optional fractional lower bound on expected keep; applied before rounding.
+     """
+
+     floor_groups: int = 1
+     multiple_groups: int = 1
+     min_keep_ratio: float = 0.0
+
+
+ @dataclass
+ class ExportPolicy:
+     """Export-time policy shared by families.
+
+     - `warmup_steps`: if current `step < warmup_steps`, keep-all.
+     - `rounding`: default rounding used unless adapter overrides per-axis.
+     """
+
+     warmup_steps: int = 0
+     # A dataclass instance is not a valid field default; use default_factory.
+     rounding: Rounding = field(default_factory=Rounding)
+
+
+ def _round_down_mult(n: int, m: int) -> int:
+     if m is None or m <= 1:
+         return max(1, int(n))
+     n = int(n)
+     return max(m, (n // m) * m)
+
+
+ def _compute_keep_k(
+     expected_kept: float,
+     total_groups: int,
+     *,
+     rounding: Rounding,
+ ) -> int:
+     # Start from nearest-integer expectation
+     k = int(round(expected_kept))
+     # Apply ratio floor, then absolute floor, then multiple snapping
+     k = max(k, int(rounding.min_keep_ratio * total_groups))
+     k = max(k, int(rounding.floor_groups))
+     k = min(k, total_groups)
+     k = _round_down_mult(k, int(rounding.multiple_groups))
+     return max(1, min(k, total_groups))
+
+
+ # -----------------------------------------------------------------------------
+ # KEEP index selection from a gate
+ # -----------------------------------------------------------------------------
+
+ @torch.no_grad()
+ def keep_group_indices_from_gate(
+     gate: Gate,
+     *,
+     policy: ExportPolicy,
+     step: Optional[int] = None,
+     custom_rounding: Optional[Rounding] = None,
+ ) -> torch.Tensor:
+     """Return sorted indices of groups to KEEP based on `gate` and policy.
+
+     If `step < warmup_steps`, returns all indices (keep-all). Otherwise, the
+     number of groups to keep is computed from the *expected keep* under the
+     current logits and snapped according to the rounding policy.
+     """
+     G = int(gate.num_groups)
+     if step is not None and step < int(policy.warmup_steps):
+         return torch.arange(G, device=gate.logits.device)
+
+     rounding = custom_rounding or policy.rounding
+     p = torch.sigmoid(gate.logits.detach().float() / float(gate.tau))
+     k = _compute_keep_k(expected_kept=float(p.sum()), total_groups=G, rounding=rounding)
+     idx = torch.topk(p, k, largest=True).indices.sort().values
+     return idx.to(torch.long)
+
+
+ @torch.no_grad()
+ def keep_element_indices_from_gate(
+     gate: Gate,
+     *,
+     policy: ExportPolicy,
+     step: Optional[int] = None,
+     custom_rounding: Optional[Rounding] = None,
+ ) -> torch.Tensor:
+     """Expand kept *group* indices into element indices using `group_size`."""
+     grp_idx = keep_group_indices_from_gate(gate, policy=policy, step=step, custom_rounding=custom_rounding)
+     return expand_group_indices(grp_idx, gate.group_size)
+
+
+ # -----------------------------------------------------------------------------
+ # Generic slicers
+ # -----------------------------------------------------------------------------
+
+ @torch.no_grad()
+ def slice_linear(mat: nn.Linear, keep_in: Optional[Sequence[int]] = None, keep_out: Optional[Sequence[int]] = None) -> nn.Linear:
+     """Create a new Linear with selected input/output features preserved.
+
+     - `keep_out` selects rows (output features)
+     - `keep_in` selects columns (input features)
+     """
+     W = mat.weight.detach()
+     b = mat.bias.detach() if mat.bias is not None else None
+
+     if keep_out is not None:
+         W = W.index_select(0, torch.as_tensor(keep_out, device=W.device))
+         if b is not None:
+             b = b.index_select(0, torch.as_tensor(keep_out, device=b.device))
+     if keep_in is not None:
+         W = W.index_select(1, torch.as_tensor(keep_in, device=W.device))
+
+     out_f, in_f = W.shape
+     new = nn.Linear(in_f, out_f, bias=(b is not None)).to(W.device)
+     new.weight.copy_(W)
+     if b is not None:
+         new.bias.copy_(b)
+     return new
+
+
+ @torch.no_grad()
+ def slice_conv2d(conv: nn.Conv2d, keep_in: Optional[Sequence[int]] = None, keep_out: Optional[Sequence[int]] = None) -> nn.Conv2d:
+     """Create a new Conv2d with selected in/out channels preserved.
+
+     Only supports standard conv2d (no groups/depthwise changes). For grouped
+     convs, the adapter should handle group alignment before calling this.
+     """
+     W = conv.weight.detach()
+     b = conv.bias.detach() if conv.bias is not None else None
+
+     if keep_out is not None:
+         W = W.index_select(0, torch.as_tensor(keep_out, device=W.device))
+         if b is not None:
+             b = b.index_select(0, torch.as_tensor(keep_out, device=b.device))
+     if keep_in is not None:
+         W = W.index_select(1, torch.as_tensor(keep_in, device=W.device))
+
+     out_c, in_c = W.shape[:2]
+     new = nn.Conv2d(
+         in_c,
+         out_c,
+         kernel_size=conv.kernel_size,
+         stride=conv.stride,
+         padding=conv.padding,
+         dilation=conv.dilation,
+         groups=1,
+         bias=(b is not None),
+         padding_mode=conv.padding_mode,
+     ).to(W.device)
+     new.weight.copy_(W)
+     if b is not None:
+         new.bias.copy_(b)
+     return new
+
+
+ @torch.no_grad()
+ def slice_embedding(emb: nn.Embedding, keep_rows: Optional[Sequence[int]] = None, keep_dim: Optional[Sequence[int]] = None) -> nn.Embedding:
+     """Create a new Embedding with selected rows (vocab) and/or dims kept."""
+     W = emb.weight.detach()
+     if keep_rows is not None:
+         W = W.index_select(0, torch.as_tensor(keep_rows, device=W.device))
+     if keep_dim is not None:
+         W = W.index_select(1, torch.as_tensor(keep_dim, device=W.device))
+     num, dim = W.shape
+     new = nn.Embedding(
+         num,
+         dim,
+         padding_idx=emb.padding_idx,
+         max_norm=emb.max_norm,
+         norm_type=emb.norm_type,
+         scale_grad_by_freq=emb.scale_grad_by_freq,
+         sparse=emb.sparse,
+         device=W.device,
+         dtype=W.dtype,
+     )
+     new.weight.copy_(W)
+     return new
+
+
+ # -----------------------------------------------------------------------------
+ # Small helpers for adapters
+ # -----------------------------------------------------------------------------
+
+ @torch.no_grad()
+ def concat_index_ranges(ranges: Sequence[Tuple[int, int]]) -> torch.Tensor:
+     """Given [(start, end_exclusive), ...], return concatenated 1D indices."""
+     parts = [torch.arange(a, b, dtype=torch.long) for a, b in ranges if b > a]
+     return torch.cat(parts, dim=0) if parts else torch.empty(0, dtype=torch.long)
+
+
+ @torch.no_grad()
+ def block_indices_from_groups(groups: Sequence[int], group_size: int) -> torch.Tensor:
+     """Convert sorted group ids to expanded feature indices."""
+     groups = torch.as_tensor(groups, dtype=torch.long)
+     return expand_group_indices(groups, int(group_size))
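The rounding pipeline in `_compute_keep_k` (expectation, then ratio floor, absolute floor, clamp, multiple-snap) can be illustrated without torch. A pure-Python mirror of that logic (`compute_keep_k` and `round_down_mult` are sketch names, not library API):

```python
def round_down_mult(n, m):
    # Snap n down to a multiple of m, never below m itself
    if m is None or m <= 1:
        return max(1, int(n))
    return max(m, (int(n) // m) * m)

def compute_keep_k(expected_kept, total_groups, *, floor_groups=1,
                   multiple_groups=1, min_keep_ratio=0.0):
    # Mirrors _compute_keep_k above: expectation -> floors -> clamp -> snap
    k = int(round(expected_kept))
    k = max(k, int(min_keep_ratio * total_groups))
    k = max(k, int(floor_groups))
    k = min(k, total_groups)
    k = round_down_mult(k, int(multiple_groups))
    return max(1, min(k, total_groups))

# 12 attention heads, expected keep 6.6, kernel-aligned to multiples of 4:
print(compute_keep_k(6.6, 12, multiple_groups=4))  # -> 4
```

Note the snap is always downward, so `multiple_groups=4` turns an expectation of 7 kept heads into 4, not 8; the floors are the only upward pressure.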
core/finetune.py ADDED
@@ -0,0 +1,267 @@
+ # core/finetune.py
+ """Post-pruning fine-tuning utilities (distillation)."""
+
+ from __future__ import annotations
+ from dataclasses import dataclass, field
+ from typing import Callable, Optional, Tuple, Iterable
+
+ import torch
+ import torch.nn as nn
+
+ from core.distill import KDConfig, kd_loss, mse_reg
+ from core.utils import ensure_trainable_parameters
+
+ import copy
+
+
+ @dataclass
+ class FinetuneConfig:
+     epochs: int = 5
+     lr: float = 3e-4
+     wd: float = 0.0
+     # Mutable defaults need default_factory, or the dataclass raises at import
+     kd: KDConfig = field(default_factory=lambda: KDConfig(temperature=2.0, alpha=1.0))
+     amp: bool = True
+     # "auto" -> bf16 if supported else fp16; "bf16" | "fp16" | "off" also allowed
+     amp_dtype: str = "auto"
+     device: str = "cuda"
+     log_every: int = 200
+     # diagnostics
+     grad_check_every: int = 50
+     grad_warn_if_zero_steps: int = 2  # consecutive checks with zero grad -> warn
+     mse_weight: float = 0.0
+
+
+ def _autocast_and_scaler(amp: bool, amp_dtype: str) -> Tuple[torch.autocast, Optional[torch.amp.GradScaler], bool, str]:
+     """
+     Returns (autocast_ctx, scaler_or_None, use_scaler_bool, amp_mode_str)
+     - BF16 -> autocast(bfloat16), NO GradScaler
+     - FP16 -> autocast(float16), GradScaler ENABLED
+     - OFF  -> disabled autocast, NO GradScaler
+     """
+     if not amp or amp_dtype == "off":
+         ctx = torch.amp.autocast(device_type="cuda", enabled=False)
+         return ctx, None, False, "OFF"
+
+     if amp_dtype == "auto":
+         use_bf16 = torch.cuda.is_bf16_supported()
+     elif amp_dtype == "bf16":
+         use_bf16 = True
+     elif amp_dtype == "fp16":
+         use_bf16 = False
+     else:
+         raise ValueError(f"Unknown amp_dtype={amp_dtype!r}")
+
+     if use_bf16:
+         ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True)
+         return ctx, None, False, "BF16"
+     else:
+         ctx = torch.amp.autocast(device_type="cuda", dtype=torch.float16, enabled=True)
+         try:
+             scaler = torch.amp.GradScaler("cuda", enabled=True)
+         except TypeError:
+             scaler = torch.cuda.amp.GradScaler(enabled=True)
+         return ctx, scaler, True, "FP16"
+
+
+ def _images_from_batch(batch):
+     if isinstance(batch, dict):
+         # Avoid `x or y` here: the truthiness of a multi-element tensor raises.
+         if batch.get("pixel_values") is not None:
+             return batch["pixel_values"]
+         return batch.get("input")
+     if isinstance(batch, (tuple, list)):
+         return batch[0]
+     return batch
+
+
+ def _param_iter_trainable(model: nn.Module) -> Iterable[torch.nn.Parameter]:
+     for p in model.parameters():
+         if p.requires_grad:
+             yield p
+
+
+ def _grad_norm_and_nonzero(params: Iterable[torch.nn.Parameter]) -> Tuple[float, int]:
+     total_sq, nonzero = 0.0, 0
+     for p in params:
+         g = p.grad
+         if g is None:
+             continue
+         if g.is_sparse:
+             g = g.coalesce().values()
+         gn = float(g.detach().norm().cpu())
+         if gn > 0.0:
+             nonzero += 1
+         total_sq += gn * gn
+     return (total_sq ** 0.5), nonzero
+
+
+ @torch.no_grad()
+ def recalibrate_bn_stats(model, loader, max_batches=200, device="cuda"):
+     model.train()  # use training mode to update running stats
+     seen = 0
+     for i, batch in enumerate(loader):
+         if i >= max_batches:
+             break
+         x = batch[0] if isinstance(batch, (tuple, list)) else batch
+         if not torch.is_tensor(x):
+             continue
+         x = x.to(device, non_blocking=True)
+         model(x)
+         seen += x.size(0)
+     return seen
+
+
+ def finetune_student(
+     student: nn.Module,
+     teacher: nn.Module,
+     train_loader,
+     *,
+     get_student_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
+     get_teacher_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
+     cfg: Optional[FinetuneConfig] = None,
+     val_loader=None,
+     on_step: Optional[Callable[[int, float], None]] = None,
+     save_best: bool = False,
+ ) -> nn.Module:
+     """Fine-tune a pruned student against a frozen teacher using KD."""
+     if cfg is None:
+         # Build per-call: cfg.kd.temperature is mutated below, so a shared
+         # default instance would leak state across calls.
+         cfg = FinetuneConfig()
+     dev = cfg.device
+     student = student.to(dev)
+     teacher = teacher.to(dev).eval()
+     for p in teacher.parameters():
+         p.requires_grad_(False)
+     for p in student.parameters():
+         p.requires_grad_(True)
+
+     # Make sure we can actually train
+     ensure_trainable_parameters(student, requires_grad=True)
+     trainable = sum(p.numel() for p in student.parameters() if p.requires_grad)
+     if trainable == 0:
+         raise RuntimeError("No trainable parameters in student — cannot finetune.")
+
+     opt = torch.optim.AdamW(
+         _param_iter_trainable(student),
+         lr=cfg.lr,
+         weight_decay=cfg.wd,
+     )
+     # T_max counts optimizer *steps*, so the scheduler is stepped once per batch below
+     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=cfg.epochs * len(train_loader), eta_min=3e-5)
+
+     autocast_ctx, scaler, use_scaler, amp_mode = _autocast_and_scaler(cfg.amp, cfg.amp_dtype)
+     print(f"[AMP] Mode={amp_mode} | GradScaler={'ON' if use_scaler else 'OFF'} | "
+           f"KD: T={cfg.kd.temperature} alpha={cfg.kd.alpha} | LR={cfg.lr} WD={cfg.wd} | Trainable params={trainable:,}")
+
+     zero_grad_streak = 0
+     global_step = 0
+
+     T_max = cfg.kd.temperature
+     T_min = 2.0
+     kd_conf = cfg.kd
+
+     best_state = None
+     best_val = float("inf")
+
+     for ep in range(cfg.epochs):
+         student.train()
+         running, seen = 0.0, 0
+
+         for i, batch in enumerate(train_loader):
+
+             step = ep * len(train_loader) + i  # global step for T scheduling
+             max_steps = cfg.epochs * len(train_loader)
+             kd_conf.temperature = T_max - (step / max_steps) * (T_max - T_min)
+
+             x = _images_from_batch(batch)
+             if not torch.is_tensor(x):
+                 raise ValueError("Train loader must yield tensors or (tensor, target) tuples.")
+             x = x.to(dev, non_blocking=True)
+
+             with torch.no_grad():
+                 t = get_teacher_logits(teacher, x)
+                 # Force numerically stable dtype for the loss
+                 t = t.float()
+
+             # ---- forward student under autocast
+             with autocast_ctx:
+                 s = get_student_logits(student, x)
+
+             # ---- compute KD loss in FP32 (outside autocast) for stability
+             s32 = s.float()
+             mse = cfg.mse_weight * mse_reg(s32, t, kd_conf.temperature)
+             loss = kd_loss(s32, t, kd_conf) + mse
+
+             opt.zero_grad(set_to_none=True)
+             if use_scaler:
+                 scaler.scale(loss).backward()
+                 scaler.step(opt)
+                 scaler.update()
+             else:
+                 loss.backward()
+                 opt.step()
+             scheduler.step()  # per-step cosine schedule, matching T_max above
+
+             # ---- diagnostics
+             bs = x.size(0)
+             running += float(loss.detach()) * bs
+             seen += bs
+             global_step += 1
+
+             if cfg.grad_check_every and (global_step % cfg.grad_check_every == 0):
+                 gnorm, n_nonzero = _grad_norm_and_nonzero(_param_iter_trainable(student))
+                 if n_nonzero == 0 or gnorm == 0.0:
+                     zero_grad_streak += 1
+                     if zero_grad_streak >= cfg.grad_warn_if_zero_steps:
+                         print(f"[WARN] Step {global_step}: zero gradients detected "
+                               f"(nonzero={n_nonzero}, grad_norm={gnorm:.3e}). "
+                               f"Check get_student_logits, requires_grad, AMP settings, and data pipeline.")
+                 else:
+                     zero_grad_streak = 0
+
+             if cfg.log_every and (i + 1) % cfg.log_every == 0:
+                 print(f"Step {i+1}/{len(train_loader)} (ep {ep+1}/{cfg.epochs}): "
+                       f"running loss = {running / max(1, seen):.4f}")
+
+             if on_step is not None:
+                 on_step(global_step, float(loss.detach()))
+
+             # free ASAP
+             del s, s32, t, loss
+
+         # ---- validation
+         if val_loader is not None:
+             _ = recalibrate_bn_stats(student, train_loader, max_batches=1000, device=cfg.device)
+             student.eval()
+             val_loss, vseen = 0.0, 0
+             with torch.no_grad():
+                 for vbatch in val_loader:
+                     vx = _images_from_batch(vbatch)
+                     if not torch.is_tensor(vx):
+                         raise ValueError("Val loader must yield tensors or (tensor, target) tuples.")
+                     vx = vx.to(dev, non_blocking=True)
+
+                     vt = get_teacher_logits(teacher, vx).float()
+                     with autocast_ctx:
+                         vs = get_student_logits(student, vx)
+
+                     vs32 = vs.float()
+                     vmse = cfg.mse_weight * mse_reg(vs32, vt, kd_conf.temperature)
+                     vloss = kd_loss(vs32, vt, kd_conf) + vmse
+                     val_loss += float(vloss.detach()) * vx.size(0)
+                     vseen += vx.size(0)
+
+             mean_val = val_loss / max(1, vseen)
+             print("\n------------------------------------------------")
+             print(f"Epoch {ep+1}/{cfg.epochs}: T={kd_conf.temperature:.2f}, train={running / max(1, seen):.6f}, "
+                   f"val={mean_val:.6f}")
+
+             if save_best and (mean_val < best_val):
+                 best_val = mean_val
+                 best_state = copy.deepcopy(student.state_dict())
+
+             print("------------------------------------------------\n")
+
+         else:
+             print(f"Epoch {ep+1}/{cfg.epochs}: train={running / max(1, seen):.6f}")
+
+     if save_best and val_loader is not None and best_state is not None:
+         student.load_state_dict(best_state)
+
+     student.eval()
+     return student
+
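The AMP mode selection in `_autocast_and_scaler` reduces to a small decision table (BF16 needs no GradScaler because its exponent range matches FP32; FP16 does). A pure-Python mirror of that table (`amp_mode` is a sketch name; `bf16_supported` stands in for `torch.cuda.is_bf16_supported()`):

```python
def amp_mode(amp: bool, amp_dtype: str, bf16_supported: bool) -> str:
    # Mirrors the mode selection in _autocast_and_scaler, without torch objects
    if not amp or amp_dtype == "off":
        return "OFF"
    if amp_dtype == "auto":
        return "BF16" if bf16_supported else "FP16"
    if amp_dtype == "bf16":
        return "BF16"
    if amp_dtype == "fp16":
        return "FP16"
    raise ValueError(f"Unknown amp_dtype={amp_dtype!r}")

print(amp_mode(True, "auto", bf16_supported=True))   # BF16
print(amp_mode(True, "auto", bf16_supported=False))  # FP16
print(amp_mode(False, "auto", bf16_supported=True))  # OFF
```

Only the FP16 branch pairs the autocast context with a GradScaler; the other two modes run with unscaled gradients.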
core/gates.py ADDED
@@ -0,0 +1,389 @@
+ """Core gating primitives for hardware-aware model optimization.
+
+ This module defines:
+   - Base `Gate` interface (nn.Module) with a small, consistent API
+   - Concrete gates: HeadGate, GroupGate, LayerGate
+   - Straight-Through (ST) relaxed Bernoulli with Gumbel noise
+   - Penalties/regularizers commonly used during training
+   - Constraint projection helpers
+
+ Design goals:
+   - TorchScript-friendly where possible
+   - Minimal assumptions about model family (ViT, ResNet, LLM)
+   - Gates operate on *groups* of units; group_size controls expansion
+   - No direct knowledge of attention/FFN/etc. — adapters wire masks
+
+ Typical usage (adapter side):
+     >>> gate = GroupGate(num_groups=H, group_size=Dh, tau=1.5, init_logit=3.0)
+     >>> m = gate.mask(training=self.training)   # [H * Dh]
+     >>> tensor = tensor * m.view(1, H, 1, Dh)   # example broadcast
+
+ Penalties scan the module tree for objects exposing `.logits` and `.tau`.
+ """
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable, List, Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # -----------------------------------------------------------------------------
+ # Utilities
+ # -----------------------------------------------------------------------------
+
+ def _as_like(x: torch.Tensor, val) -> torch.Tensor:
+     return torch.as_tensor(val, device=x.device, dtype=x.dtype)
+
+
+ def _gumbel_like(x: torch.Tensor) -> torch.Tensor:
+     # Logistic noise (difference of two Gumbels): log(u) - log(1 - u),
+     # with Uniform(0,1) samples clamped for numerical stability.
+     u = torch.rand_like(x).clamp_(1e-6, 1 - 1e-6)
+     return u.log() - (1 - u).log()
+
+
+ # -----------------------------------------------------------------------------
+ # Base Gate
+ # -----------------------------------------------------------------------------
+
+ class Gate(nn.Module):
+     """Abstract gate over *groups*.
+
+     A gate controls `num_groups` binary decisions, typically expanded by
+     `group_size` when applied to tensors. For example, gating ViT MLP hidden
+     units in groups of 16: `num_groups = hidden // 16`, `group_size = 16`.
+
+     Subclasses may override `sample_mask` for custom relaxations.
+     """
+
+     def __init__(
+         self,
+         num_groups: int,
+         *,
+         group_size: int = 1,
+         tau: float = 1.5,
+         init_logit: float = 3.0,
+         hard_during_eval: bool = True,
+     ) -> None:
+         super().__init__()
+         assert num_groups > 0 and group_size > 0
+         self.num_groups = int(num_groups)
+         self.group_size = int(group_size)
+         self.tau = float(tau)
+         self.hard_during_eval = bool(hard_during_eval)
+         self.logits = nn.Parameter(torch.full((self.num_groups,), float(init_logit)))
+
+     # ----- probabilities & stats ------------------------------------------------
+     def probs(self) -> torch.Tensor:
+         """Return per-group keep probabilities (sigmoid(logit / tau))."""
+         # Using /tau here makes `tau` affect both train and eval statistics
+         return torch.sigmoid(self.logits / self.tau)
+
+     def expected_kept(self) -> torch.Tensor:
+         """Expected *elements* kept (groups × group_size)."""
+         return self.probs().sum() * _as_like(self.logits, self.group_size)
+
+     # ----- masks ----------------------------------------------------------------
+     def _hard_mask(self) -> torch.Tensor:
+         m = (self.logits > 0).to(self.logits.dtype)
+         return m.repeat_interleave(self.group_size)
+
+     def _soft_st_mask(self) -> torch.Tensor:
+         # Straight-through relaxed Bernoulli via Gumbel-sigmoid
+         s = _gumbel_like(self.logits)
+         y = torch.sigmoid((self.logits + s) / self.tau)
+         y_hard = (y > 0.5).to(y.dtype)
+         m = (y_hard - y).detach() + y
+         return m.repeat_interleave(self.group_size)
+
+     def mask(self, training: Optional[bool] = None) -> torch.Tensor:
+         """Return a 1D mask of length `num_groups * group_size`.
+
+         - Training: straight-through relaxed mask
+         - Eval: hard (thresholded) mask if `hard_during_eval`, else probs expanded
+         """
+         if training is None:
+             training = self.training
+         if training:
+             return self._soft_st_mask()
+         if self.hard_during_eval:
+             return self._hard_mask()
+         p = self.probs()
+         return p.repeat_interleave(self.group_size)
+
+     # ----- export helpers -------------------------------------------------------
+     @torch.no_grad()
+     def topk_indices(self, k: int) -> torch.Tensor:
+         k = int(max(1, min(k, self.num_groups)))
+         return torch.topk(self.logits, k, largest=True).indices.sort().values
+
+     @torch.no_grad()
+     def threshold_count(self) -> int:
+         # Rounds to the nearest integer expectation, then clamps
+         p = self.probs()
+         k = int(torch.round(p.sum()).item())
+         return max(1, min(k, self.num_groups))
+
+
+ # -----------------------------------------------------------------------------
+ # Concrete gates
+ # -----------------------------------------------------------------------------
+
+ class HeadGate(Gate):
+     """Per-head gate. Often used with attention where group_size=head_dim."""
+
+     def __init__(self, num_heads: int, *, head_dim: int = 1, **kw):
+         super().__init__(num_groups=num_heads, group_size=head_dim, **kw)
+
+
+ class GroupGate(Gate):
+     """Generic group gate (e.g., MLP hidden grouped by `group_size`)."""
+
+     pass
+
+
+ class LayerGate(Gate):
+     """One bit per layer (group_size=1)."""
+
+     def __init__(self, num_layers: int, **kw):
+         super().__init__(num_groups=num_layers, group_size=1, **kw)
+
+ # -----------------------------------------------------------------------------
154
+ # Penalties / Regularizers
155
+ # -----------------------------------------------------------------------------
156
+
157
+ @dataclass
158
+ class PenaltyWeights:
159
+ """Scalars to blend regularization terms.
160
+
161
+ Attributes
162
+ ----------
163
+ l0 : float
164
+ Weight for the L0-like sparsity term (sum of keep probs).
165
+ keep_floor_ratio : float
166
+ Soft constraint: expected kept groups >= floor_ratio * groups.
167
+ bimodality : float
168
+ Encourages probabilities away from 0.5.
169
+ """
170
+
171
+ l0: float = 0.0
172
+ keep_floor_ratio: float = 0.0
173
+ bimodality: float = 0.0
174
+
175
+
176
+ def iter_gates(module: nn.Module) -> Iterable[Gate]:
177
+ for m in module.modules():
178
+ if isinstance(m, Gate):
179
+ yield m
180
+ else:
181
+ # Duck-typing compatibility: any module with `.logits` and `.tau`
182
+ if hasattr(m, "logits") and hasattr(m, "tau"):
183
+ logits = getattr(m, "logits")
184
+ if isinstance(logits, torch.Tensor) and logits.dim() == 1:
185
+ # Wrap view: expose basic API via adapter shim
186
+ g = _TensorBackedGateShim(m)
187
+ yield g
188
+
189
+
190
+ class _TensorBackedGateShim:
191
+ """Lightweight adapter exposing .logits, .tau, .group_size, .num_groups.
192
+
193
+ It is intentionally NOT an nn.Module and NOT a Gate subclass to avoid
194
+ ctor/signature constraints and registration side-effects. It's only used
195
+ by projection/regularization utilities that read/update .logits.
196
+ """
197
+ __slots__ = ("host", "logits", "tau", "group_size", "num_groups")
198
+
199
+ def __init__(self, host):
200
+ self.host = host
201
+ # logits must be a Tensor/Parameter on the host
202
+ self.logits = getattr(host, "logits")
203
+ # default tau=1.5 if not present
204
+ self.tau = float(getattr(host, "tau", 1.5))
205
+ # support either group_size or group attribute names
206
+ self.group_size = int(getattr(host, "group_size", getattr(host, "group", 1)))
207
+ self.num_groups = int(self.logits.numel())
208
+
209
+ def forward(self, *args, **kwargs): # pragma: no cover - shim is not used as a layer
210
+ raise RuntimeError("Gate shim is not a callable layer")
211
+
212
+
213
+ def l0_like_sparsity(module: nn.Module) -> torch.Tensor:
214
+ """Sum of keep probabilities across all gates (acts like L0/L1)."""
215
+ val = _as_like(next(module.parameters(), torch.tensor(0.0, device="cpu")), 0.0)
216
+ out = torch.as_tensor(0.0, device=val.device, dtype=val.dtype)
217
+ for g in iter_gates(module):
218
+ out = out + g.probs().sum()
219
+ return out
220
+
221
+
222
+ def keep_floor(module: nn.Module, floor_ratio: float) -> torch.Tensor:
223
+ """Soft penalty if expected-kept falls below a fraction per gate.
224
+
225
+ For each gate with G groups, penalize relu(floor*G - sum(p)).
226
+ """
227
+ if floor_ratio <= 0:
228
+ return torch.tensor(0.0, device=next(module.parameters(), torch.tensor(0.0)).device)
229
+ floor_ratio = float(floor_ratio)
230
+ val = _as_like(next(module.parameters(), torch.tensor(0.0, device="cpu")), 0.0)
231
+ out = torch.as_tensor(0.0, device=val.device, dtype=val.dtype)
232
+ for g in iter_gates(module):
233
+ G = _as_like(val, g.num_groups)
234
+ floor_groups = _as_like(val, max(1.0, floor_ratio * float(g.num_groups)))
235
+ out = out + F.relu(floor_groups - g.probs().sum())
236
+ return out
237
+
238
+
239
+ def bimodality(module: nn.Module) -> torch.Tensor:
240
+ """Sum over p*(1-p) to push probs away from 0.5 (minimum at 0 or 1)."""
241
+ val = _as_like(next(module.parameters(), torch.tensor(0.0, device="cpu")), 0.0)
242
+ out = torch.as_tensor(0.0, device=val.device, dtype=val.dtype)
243
+ for g in iter_gates(module):
244
+ p = g.probs()
245
+ out = out + (p * (1.0 - p)).sum()
246
+ return out
247
+
248
+
249
+ def combined_penalty(
250
+ module: nn.Module,
251
+ weights: PenaltyWeights,
252
+ ) -> torch.Tensor:
253
+ out = torch.tensor(0.0, device=next(module.parameters(), torch.tensor(0.0)).device)
254
+ if weights.l0:
255
+ out = out + weights.l0 * l0_like_sparsity(module)
256
+ if weights.keep_floor_ratio:
257
+ out = out + keep_floor(module, weights.keep_floor_ratio)
258
+ if weights.bimodality:
259
+ out = out + weights.bimodality * bimodality(module)
260
+ return out
261
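
The penalty blend is plain arithmetic on keep probabilities; a dependency-free sketch for a single two-group gate (values are illustrative, not taken from the repo):

```python
import math

def sigmoid(x: float) -> float:
    return 1.0 / (1.0 + math.exp(-x))

# One gate, two groups: logits [3.0, -3.0] with tau=1.5 give keep
# probabilities sigmoid(2) ~= 0.881 and sigmoid(-2) ~= 0.119.
logits, tau = [3.0, -3.0], 1.5
p = [sigmoid(l / tau) for l in logits]

l0 = sum(p)                               # l0_like_sparsity: expected kept groups
floor_pen = max(0.0, 0.75 * len(p) - l0)  # keep_floor with floor_ratio=0.75
bimod = sum(q * (1.0 - q) for q in p)     # bimodality: zero once probs saturate

# combined_penalty with weights l0=1e-3, keep_floor_ratio on, bimodality=1e-2
total = 1e-3 * l0 + floor_pen + 1e-2 * bimod
```

Here the L0 term says one group is expected to survive, so a 75% keep floor over two groups is violated by 0.5 and pushes logits back up.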
+
+
+# -----------------------------------------------------------------------------
+# Constraint projection
+# -----------------------------------------------------------------------------
+
+@dataclass
+class Constraints:
+    """High-level feasibility constraints.
+
+    * min_keep_ratio: per-gate minimum fraction of groups to keep (soft cap via
+      projection onto [min_k, G]).
+    * min_groups: absolute lower bound per gate (after rounding).
+    * max_groups_drop: optional ceiling on groups dropped per gate.
+    """
+
+    min_keep_ratio: float = 0.0
+    min_groups: int = 1
+    max_groups_drop: Optional[int] = None
+
+
+@torch.no_grad()
+def project_gates_into_constraints(module: nn.Module, cons: Constraints) -> None:
+    """Project gate logits so that expected kept groups respect constraints.
+
+    We shift all logits by an additive bias until the expected sum of keep
+    probabilities satisfies the violated lower/upper bound. This light-touch
+    projection keeps the relative ordering of logits intact.
+    """
+    for g in iter_gates(module):
+        p = torch.sigmoid(g.logits / g.tau)
+        G = p.numel()
+        # Lower bound
+        min_keep = max(cons.min_groups, int(cons.min_keep_ratio * G))
+        if p.sum().item() < min_keep:
+            # Additive bias to increase sum(p), doubled until satisfied
+            bias = torch.tensor(2.0, device=p.device, dtype=p.dtype)
+            for _ in range(6):
+                p = torch.sigmoid((g.logits + bias) / g.tau)
+                if p.sum().item() >= min_keep:
+                    break
+                bias = bias * 2
+            g.logits.add_(bias)
+        # Optional upper bound on drops
+        if cons.max_groups_drop is not None:
+            max_drop = int(cons.max_groups_drop)
+            max_keep = max(1, G - max_drop)
+            if p.sum().item() > max_keep:
+                bias = torch.tensor(-2.0, device=p.device, dtype=p.dtype)
+                for _ in range(6):
+                    p = torch.sigmoid((g.logits + bias) / g.tau)
+                    if p.sum().item() <= max_keep:
+                        break
+                    bias = bias * 2
+                g.logits.add_(bias)
+
+
+# -----------------------------------------------------------------------------
+# Export helpers (indices from gates)
+# -----------------------------------------------------------------------------
+
+@torch.no_grad()
+def topk_group_indices(g: Gate, keep_k: Optional[int] = None) -> torch.Tensor:
+    """Return sorted group indices to KEEP based on logits/probs.
+
+    If `keep_k` is None, use the nearest integer of the expected kept count.
+    """
+    if keep_k is None:
+        keep_k = g.threshold_count()
+    idx = torch.topk(g.logits, int(keep_k), largest=True).indices
+    return idx.sort().values
+
+
+@torch.no_grad()
+def expand_group_indices(idx: torch.Tensor, group_size: int) -> torch.Tensor:
+    """Expand group indices into element indices by `group_size` blocks."""
+    if group_size == 1:
+        return idx.clone()
+    starts = idx * group_size
+    parts = [torch.arange(s, s + group_size, device=idx.device) for s in starts]
+    return torch.cat(parts, dim=0).long()
+
+
+# -----------------------------------------------------------------------------
+# Parameter utilities
+# -----------------------------------------------------------------------------
+
+def collect_gate_params(module: nn.Module) -> List[nn.Parameter]:
+    return [g.logits for g in iter_gates(module) if isinstance(g.logits, torch.Tensor)]
+
+
+def collect_param_groups(
+    module: nn.Module,
+    *,
+    lr_gate: float = 1e-2,
+    lr_linear: float = 1e-4,
+    lr_affine: float = 3e-4,
+    wd_linear: float = 1e-4,
+) -> List[dict]:
+    """Convenience grouping matching common training setups.
+
+    Group 0: gate logits (no weight decay)
+    Group 1: linear weights (with weight decay)
+    Group 2: linear biases (no decay)
+    Group 3: norm affine params (no decay)
+    """
+    gates, ln_affine, linear_w, linear_b = [], [], [], []
+    for n, p in module.named_parameters():
+        if not p.requires_grad:
+            continue
+        if n.endswith((".logits", ".head_gate", ".channel_gate")):
+            gates.append(p)
+            continue
+        is_linear_path = (".weight" in n or ".bias" in n) and (
+            ".dense" in n or ".query" in n or ".key" in n or ".value" in n or ".proj" in n
+        )
+        if n.endswith(".weight") and is_linear_path:
+            linear_w.append(p)
+        elif n.endswith(".bias") and is_linear_path:
+            linear_b.append(p)
+        elif "layernorm" in n.lower() or "layer_norm" in n.lower():
+            ln_affine.append(p)
+    return [
+        {"params": gates, "lr": lr_gate, "weight_decay": 0.0},
+        {"params": linear_w, "lr": lr_linear, "weight_decay": wd_linear},
+        {"params": linear_b, "lr": lr_linear, "weight_decay": 0.0},
+        {"params": ln_affine, "lr": lr_affine, "weight_decay": 0.0},
+    ]
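
As a sanity check on the export path, the keep-count and index-expansion arithmetic used by `threshold_count` and `expand_group_indices` can be reproduced without torch (a minimal sketch; numbers are illustrative):

```python
import math

def sigmoid(x: float) -> float:
    return 1.0 / (1.0 + math.exp(-x))

# Mirror of Gate.threshold_count(): round the expected kept-group count.
def threshold_count(logits, tau=1.5):
    expected = sum(sigmoid(l / tau) for l in logits)
    return max(1, min(round(expected), len(logits)))

# Mirror of expand_group_indices(): group ids -> element ids in contiguous blocks.
def expand_group_indices(idx, group_size):
    return [g * group_size + j for g in idx for j in range(group_size)]

logits = [3.0, -2.0, 0.5, -4.0]   # keep probs ~= 0.88, 0.21, 0.58, 0.07
k = threshold_count(logits)        # expected kept ~= 1.74 -> rounds to 2
elems = expand_group_indices([0, 2], group_size=4)
```

Keeping groups 0 and 2 with `group_size=4` selects elements 0-3 and 8-11, which is exactly the index set an exported (hard-pruned) layer would slice out.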
core/profiler.py ADDED
@@ -0,0 +1,236 @@
+"""Simple, robust latency measurement utilities.
+
+This module provides GPU-friendly profilers with warmup, multiple repeats,
+median/percentile reporting, and optional outlier rejection via MAD
+(median absolute deviation).
+
+Design goals:
+- Family-agnostic: take a callable `forward(model, x)` or rely on HF `.forward`
+- Deterministic when desired; avoids autograd by default
+- Works with CUDA or CPU; uses `torch.cuda.Event` for accurate GPU timing
+
+Key APIs:
+- measure_latency_ms(model, input_shape | input_tensor, ...)
+- profile(model, sample, settings) -> {mean, p50, p90, p95, p99}
+- LatencyProfiler(settings).measure(...)
+- profile_many_shapes(model, shapes, settings)
+"""
+from __future__ import annotations
+
+import contextlib
+import math
+import time
+from dataclasses import dataclass
+from statistics import median
+from typing import Callable, Dict, Iterable, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+
+
+# -----------------------------------------------------------------------------
+# Settings
+# -----------------------------------------------------------------------------
+
+@dataclass
+class ProfileSettings:
+    warmup: int = 10
+    iters: int = 50
+    percentile: Sequence[int] = (50, 90, 95, 99)
+    sync_each_iter: bool = True
+    use_inference_mode: bool = True
+    cuda_graph: bool = False  # advanced users can enable with static shapes
+    reject_outliers_mad: float = 0.0  # e.g., 3.5 to drop extreme spikes
+    cudnn_benchmark: bool = True
+    deterministic: bool = False  # sets cudnn.deterministic
+
+
+# -----------------------------------------------------------------------------
+# Context helpers
+# -----------------------------------------------------------------------------
+
+@contextlib.contextmanager
+def _torch_backend_ctx(settings: ProfileSettings):
+    prev_bench = torch.backends.cudnn.benchmark
+    prev_det = torch.backends.cudnn.deterministic
+    try:
+        torch.backends.cudnn.benchmark = bool(settings.cudnn_benchmark)
+        torch.backends.cudnn.deterministic = bool(settings.deterministic)
+        yield
+    finally:
+        torch.backends.cudnn.benchmark = prev_bench
+        torch.backends.cudnn.deterministic = prev_det
+
+
+def _percentiles(sorted_vals: Sequence[float], qs: Sequence[int]) -> Dict[int, float]:
+    """Linear-interpolation percentiles over an already-sorted sequence."""
+    n = len(sorted_vals)
+    if n == 0:
+        return {q: float("nan") for q in qs}
+    out = {}
+    for q in qs:
+        if n == 1:
+            out[q] = sorted_vals[0]
+            continue
+        k = (q / 100.0) * (n - 1)
+        f = math.floor(k)
+        c = min(n - 1, f + 1)
+        if f == c:
+            out[q] = sorted_vals[int(k)]
+        else:
+            d0 = sorted_vals[f] * (c - k)
+            d1 = sorted_vals[c] * (k - f)
+            out[q] = d0 + d1
+    return out
+
+
+def _apply_mad_filter(vals: Sequence[float], thresh: float) -> Sequence[float]:
+    if thresh <= 0 or len(vals) < 5:
+        return vals
+    med = median(vals)
+    dev = [abs(v - med) for v in vals]
+    mad = median(dev) or 1e-12
+    keep = [v for v, d in zip(vals, dev) if (d / mad) <= thresh]
+    return keep if keep else vals
+
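
For intuition, the MAD filter can be exercised standalone; a pure-Python replica of `_apply_mad_filter` with an illustrative latency spike:

```python
from statistics import median

# Replica of _apply_mad_filter: drop samples whose absolute deviation from the
# median exceeds `thresh` median-absolute-deviations; never drop everything.
def mad_filter(vals, thresh):
    if thresh <= 0 or len(vals) < 5:
        return vals
    med = median(vals)
    dev = [abs(v - med) for v in vals]
    mad = median(dev) or 1e-12
    keep = [v for v, d in zip(vals, dev) if (d / mad) <= thresh]
    return keep if keep else vals

times = [10.0, 10.5, 11.0, 10.2, 10.8, 95.0]  # one GC/clock-spike iteration
filtered = mad_filter(times, thresh=3.5)       # the 95 ms spike is rejected
```

With fewer than five samples the filter is a no-op, which keeps tiny measurement runs honest rather than silently trimmed.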
+
+# -----------------------------------------------------------------------------
+# Core measurement
+# -----------------------------------------------------------------------------
+
+@torch.inference_mode()
+def measure_latency_ms(
+    model: nn.Module,
+    sample: torch.Tensor | Tuple[int, ...],
+    *,
+    settings: Optional[ProfileSettings] = None,
+    device: str = "cuda",
+    forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
+) -> Tuple[float, float]:
+    """Return (mean_ms, p95_ms) over `iters` measurements.
+
+    If `sample` is a shape tuple, a random tensor is created on-device.
+    The default forward tries `model(pixel_values=x)` and falls back to `model(x)`.
+    """
+    cfg = settings or ProfileSettings()
+
+    with _torch_backend_ctx(cfg):
+        m = model.to(device).eval()
+        if isinstance(sample, torch.Tensor):
+            x = sample.to(device)
+        else:
+            x = torch.randn(*sample, device=device)
+
+        # Default forward: HF vision kwarg first, positional call as fallback
+        def _fwd(mod, inp):
+            try:
+                return mod(pixel_values=inp)
+            except TypeError:
+                return mod(inp)
+
+        fn = forward_fn or _fwd
+        use_cuda = torch.cuda.is_available() and device.startswith("cuda")
+
+        # Warmup
+        for _ in range(cfg.warmup):
+            _ = fn(m, x)
+        if use_cuda:
+            torch.cuda.synchronize()
+
+        times: list[float] = []
+        if use_cuda:
+            for _ in range(cfg.iters):
+                t0 = torch.cuda.Event(enable_timing=True)
+                t1 = torch.cuda.Event(enable_timing=True)
+                t0.record()
+                _ = fn(m, x)
+                t1.record()
+                if cfg.sync_each_iter:
+                    torch.cuda.synchronize()
+                times.append(t0.elapsed_time(t1))  # milliseconds
+        else:
+            for _ in range(cfg.iters):
+                t0 = time.perf_counter()
+                _ = fn(m, x)
+                t1 = time.perf_counter()
+                times.append((t1 - t0) * 1000.0)
+
+    times = sorted(_apply_mad_filter(times, cfg.reject_outliers_mad))
+    mean_ms = sum(times) / max(1, len(times))
+    p = _percentiles(times, cfg.percentile)
+    p95 = p.get(95, times[int(0.95 * (len(times) - 1))] if times else float("nan"))
+    return mean_ms, p95
+
+
+# Higher-level wrapper returning multiple percentiles
+@torch.inference_mode()
+def profile(
+    model: nn.Module,
+    sample: torch.Tensor | Tuple[int, ...],
+    *,
+    settings: Optional[ProfileSettings] = None,
+    device: str = "cuda",
+    forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
+) -> Dict[str, float]:
+    cfg = settings or ProfileSettings()
+
+    with _torch_backend_ctx(cfg):
+        m = model.to(device).eval()
+        if isinstance(sample, torch.Tensor):
+            x = sample.to(device)
+        else:
+            x = torch.randn(*sample, device=device)
+
+        def _fwd(mod, inp):
+            try:
+                return mod(pixel_values=inp)
+            except TypeError:
+                return mod(inp)
+
+        fn = forward_fn or _fwd
+        use_cuda = torch.cuda.is_available() and device.startswith("cuda")
+
+        for _ in range(cfg.warmup):
+            _ = fn(m, x)
+        if use_cuda:
+            torch.cuda.synchronize()
+
+        times = []
+        if use_cuda:
+            for _ in range(cfg.iters):
+                t0 = torch.cuda.Event(enable_timing=True)
+                t1 = torch.cuda.Event(enable_timing=True)
+                t0.record()
+                _ = fn(m, x)
+                t1.record()
+                if cfg.sync_each_iter:
+                    torch.cuda.synchronize()
+                times.append(t0.elapsed_time(t1))
+        else:
+            for _ in range(cfg.iters):
+                t0 = time.perf_counter()
+                _ = fn(m, x)
+                t1 = time.perf_counter()
+                times.append((t1 - t0) * 1000.0)
+
+    times = sorted(_apply_mad_filter(times, cfg.reject_outliers_mad))
+    percs = _percentiles(times, cfg.percentile)
+    out = {"mean": sum(times) / max(1, len(times))}
+    out.update({f"p{q}": v for q, v in percs.items()})
+    return out
+
+
+class LatencyProfiler:
+    """Reusable profiler with fixed settings."""
+
+    def __init__(self, settings: Optional[ProfileSettings] = None, device: str = "cuda"):
+        self.settings = settings or ProfileSettings()
+        self.device = device
+
+    def measure(
+        self,
+        model: nn.Module,
+        sample: torch.Tensor | Tuple[int, ...],
+        *,
+        forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
+    ) -> Tuple[float, float]:
+        return measure_latency_ms(model, sample, settings=self.settings, device=self.device, forward_fn=forward_fn)
+
+    def profile(
+        self,
+        model: nn.Module,
+        sample: torch.Tensor | Tuple[int, ...],
+        *,
+        forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
+    ) -> Dict[str, float]:
+        return profile(model, sample, settings=self.settings, device=self.device, forward_fn=forward_fn)
+
+
+@torch.inference_mode()
+def profile_many_shapes(
+    model: nn.Module,
+    shapes: Iterable[Tuple[int, ...]],
+    *,
+    settings: Optional[ProfileSettings] = None,
+    device: str = "cuda",
+    forward_fn: Optional[Callable[[nn.Module, torch.Tensor], torch.Tensor]] = None,
+) -> Dict[Tuple[int, ...], Dict[str, float]]:
+    out: Dict[Tuple[int, ...], Dict[str, float]] = {}
+    for shp in shapes:
+        out[tuple(shp)] = profile(model, shp, settings=settings, device=device, forward_fn=forward_fn)
+    return out
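
The interpolation convention used by `_percentiles` is easy to verify by hand; a dependency-free replica with illustrative timings (same linear-interpolation rule between order statistics):

```python
import math

# Replica of _percentiles: linear interpolation over an already-sorted sample,
# matching the common "linear" percentile convention.
def percentiles(sorted_vals, qs):
    n = len(sorted_vals)
    if n == 0:
        return {q: float("nan") for q in qs}
    out = {}
    for q in qs:
        if n == 1:
            out[q] = sorted_vals[0]
            continue
        k = (q / 100.0) * (n - 1)       # fractional rank
        f = math.floor(k)
        c = min(n - 1, f + 1)
        if f == c:
            out[q] = sorted_vals[int(k)]
        else:
            out[q] = sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f)
    return out

times = [10.0, 11.0, 12.0, 13.0, 100.0]  # sorted; one straggler iteration
p = percentiles(times, (50, 90))
# p50 is the median (12.0); p90 has rank k = 0.9 * 4 = 3.6, i.e.
# 0.4 * 13.0 + 0.6 * 100.0 = 65.2
```

This is why the profiler reports p95 alongside the mean: a single straggler barely moves the median but dominates the mean and the tail percentiles.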
core/proxy_cost.py ADDED
@@ -0,0 +1,771 @@
1
+ # core/proxy_cost.py
2
+ """Latency proxy models and a tiny LUT for hardware correction.
3
+
4
+ This file defines a family-agnostic interface plus concrete proxies (ViT, ResNet, LLM)
5
+ that estimate latency from *soft structure* (gates) and input size. All proxies accept
6
+ the trainer's `(model, batch) -> ms` call signature directly (batches may be dict/tuple/tensor).
7
+ A small, in-memory LUT can be populated from real measurements during training to correct
8
+ analytic estimates.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass
13
+ from typing import Any, Dict, Optional, Tuple, Union, List
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+
18
+ from .gates import iter_gates, _as_like # _as_like is used by ViT proxy
19
+
20
+
21
+ # -----------------------------------------------------------------------------
22
+ # Small batch helpers (shared)
23
+ # -----------------------------------------------------------------------------
24
+
25
+ TensorOrBatch = Union[torch.Tensor, Tuple, List, Dict[str, Any]]
26
+
27
+ def _first_tensor(batch: TensorOrBatch) -> torch.Tensor:
28
+ """Find the first tensor inside a batch-like structure."""
29
+ if torch.is_tensor(batch):
30
+ return batch
31
+ if isinstance(batch, dict):
32
+ # Common keys across tasks
33
+ for k in ("input_ids", "pixel_values", "images", "x"):
34
+ v = batch.get(k, None)
35
+ if torch.is_tensor(v):
36
+ return v
37
+ # fallback: first tensor value
38
+ for v in batch.values():
39
+ if torch.is_tensor(v):
40
+ return v
41
+ raise ValueError("Batch dict has no tensor field I recognize.")
42
+ if isinstance(batch, (list, tuple)):
43
+ for v in batch:
44
+ if torch.is_tensor(v):
45
+ return v
46
+ # torchvision pattern: ([aug1, aug2], label)
47
+ if len(batch) and isinstance(batch[0], (list, tuple)):
48
+ for v in batch[0]:
49
+ if torch.is_tensor(v):
50
+ return v
51
+ raise ValueError("Cannot find a tensor in the provided batch.")
52
+
53
+ def _ids_from_batch(batch: TensorOrBatch) -> torch.Tensor:
54
+ """Return a 2D [B,S] tensor representing token ids for LLMs."""
55
+ if isinstance(batch, dict) and "input_ids" in batch and torch.is_tensor(batch["input_ids"]):
56
+ return batch["input_ids"]
57
+ t = _first_tensor(batch)
58
+ if t.dim() >= 2:
59
+ return t
60
+ raise ValueError("Cannot infer [B,S] from batch; need 'input_ids' or a 2D tensor.")
61
+
62
+ def _nchw_from_batch(batch: TensorOrBatch) -> Tuple[int, int, int, int]:
63
+ """Return NCHW shape from a batch or an explicit (N,C,H,W) tuple/list/tensor."""
64
+ if isinstance(batch, (tuple, list)) and len(batch) == 4 and all(isinstance(x, int) for x in batch):
65
+ return tuple(batch) # type: ignore[return-value]
66
+ x = _first_tensor(batch)
67
+ if x.dim() != 4:
68
+ raise ValueError(f"Expected NCHW tensor for CNN proxy; got tensor with shape {tuple(x.shape)}")
69
+ N, C, H, W = map(int, x.shape)
70
+ return (N, C, H, W)
71
+
72
+
73
+ # -----------------------------------------------------------------------------
74
+ # Base proxy + LUT
75
+ # -----------------------------------------------------------------------------
76
+
77
+ class LatencyProxy(nn.Module):
78
+ """Abstract proxy producing a scalar latency-like value (ms).
79
+
80
+ Subclasses implement `_predict_raw` and may define `_signature` keys used by
81
+ a LUT to refine estimates with real measurements. Proxies accept either a
82
+ batch-like object (dict/tuple/tensor) or an explicit shape tuple.
83
+ """
84
+
85
+ def __init__(self):
86
+ super().__init__()
87
+
88
+ def predict(
89
+ self,
90
+ model: nn.Module,
91
+ sample: TensorOrBatch,
92
+ *,
93
+ policy=None,
94
+ step: Optional[int] = None,
95
+ **kwargs,
96
+ ) -> torch.Tensor:
97
+ """Batch-friendly entry point. `sample` may be a batch or explicit shape."""
98
+ return self._predict_raw(model, sample, policy=policy, step=step, **kwargs)
99
+
100
+ def _predict_raw(
101
+ self,
102
+ model: nn.Module,
103
+ sample: TensorOrBatch,
104
+ *,
105
+ policy=None,
106
+ step: Optional[int] = None,
107
+ **kwargs,
108
+ ) -> torch.Tensor: # pragma: no cover - abstract
109
+ raise NotImplementedError
110
+
111
+ def signature(
112
+ self,
113
+ model: nn.Module,
114
+ sample: TensorOrBatch,
115
+ *,
116
+ policy=None,
117
+ step: Optional[int] = None
118
+ ) -> Tuple:
119
+ """Return a hashable signature describing the workload shape."""
120
+ if torch.is_tensor(sample):
121
+ shp = tuple(sample.shape)
122
+ elif isinstance(sample, (tuple, list)):
123
+ shp = tuple(sample)
124
+ elif isinstance(sample, dict):
125
+ # summarize the shapes of any tensors in dict
126
+ shp = tuple((k, tuple(v.shape)) for k, v in sample.items() if torch.is_tensor(v))
127
+ else:
128
+ shp = (str(type(sample)),)
129
+ return (type(self).__name__, shp)
130
+
131
+
132
+ class LatencyLUT:
133
+ """Tiny LUT mapping `(signature) -> measured_ms`."""
134
+
135
+ def __init__(self):
136
+ self._table: Dict[Tuple[Any, ...], float] = {}
137
+
138
+ def update(self, signature: Tuple[Any, ...], measured_ms: float) -> None:
139
+ self._table[signature] = float(measured_ms)
140
+
141
+ def get(self, signature: Tuple[Any, ...]) -> Optional[float]:
142
+ return self._table.get(signature)
143
+
144
+ def blend(self, raw_estimate: torch.Tensor, signature: Tuple[Any, ...]) -> torch.Tensor:
145
+ val = self.get(signature)
146
+ if val is None:
147
+ return raw_estimate
148
+ # Put on same device/dtype as raw_estimate
149
+ return _as_like(raw_estimate, val)
150
+
151
+
152
+ # -----------------------------------------------------------------------------
153
+ # ViT proxy (analytic + gates), with scale and per-term weights
154
+ # -----------------------------------------------------------------------------
155
+
156
+ @dataclass
157
+ class ViTProxyConfig:
158
+ scale_ms: float = 1.0
159
+ alpha_qkv: float = 1.0
160
+ alpha_scores: float = 1.0
161
+ alpha_out: float = 1.0
162
+ alpha_mlp: float = 1.0
163
+
164
+ def _vit_layers(m):
165
+ enc = getattr(m, "encoder", None)
166
+ if enc is not None and hasattr(enc, "layer"):
167
+ return enc.layer
168
+ vit = getattr(m, "vit", None)
169
+ if vit is not None and hasattr(vit, "encoder") and hasattr(vit.encoder, "layer"):
170
+ return vit.encoder.layer
171
+ raise TypeError("Expected a HF ViT with *.encoder.layer (ViTModel or ViTForImageClassification).")
172
+
173
+
174
+ class ViTLatencyProxy(LatencyProxy):
175
+ """Latency proxy for ViT models. Accepts batches or (N,C,H,W) tuples."""
176
+
177
+ def __init__(self, cfg: Optional[ViTProxyConfig] = None, lut: Optional[LatencyLUT] = None):
178
+ super().__init__()
179
+ self.cfg = cfg or ViTProxyConfig()
180
+ self.lut = lut or LatencyLUT()
181
+
182
+ # ---- helpers -------------------------------------------------------------
183
+ @staticmethod
184
+ def _input_spec(sample: TensorOrBatch) -> Tuple[int, int, int]:
185
+ if isinstance(sample, (tuple, list)) and len(sample) == 4 and all(isinstance(x, int) for x in sample):
186
+ B, C, H, W = sample
187
+ return int(B), int(H), int(W)
188
+ x = _first_tensor(sample)
189
+ if x.dim() != 4:
190
+ raise ValueError("ViTLatencyProxy expects a tensor [B,3,H,W] or a 4-tuple (B,3,H,W)")
191
+ B, C, H, W = x.shape
192
+ return int(B), int(H), int(W)
193
+
194
+ @staticmethod
195
+ def _patch_hw(cfg) -> Tuple[int, int]:
196
+ patch = getattr(cfg, "patch_size", 16)
197
+ if isinstance(patch, (tuple, list)):
198
+ return int(patch[0]), int(patch[1])
199
+ return int(patch), int(patch)
200
+
201
+ @staticmethod
202
+ def _soft_heads_from_block(blk) -> Optional[torch.Tensor]:
203
+ # Prefer a nested attention with kept_heads_soft()
204
+ attn = getattr(getattr(blk, "attention", None), "attention", None)
205
+ if attn is not None and hasattr(attn, "kept_heads_soft"):
206
+ return attn.kept_heads_soft()
207
+ return None
208
+
209
+ @staticmethod
210
+ def _find_ffn_gate(blk):
211
+ inter = getattr(blk, "intermediate", None)
212
+ if inter is None:
213
+ return None
214
+ # Common attribute names
215
+ for nm in ("neuron_gate", "gate", "ffn_gate"):
216
+ g = getattr(inter, nm, None)
217
+ if g is not None and hasattr(g, "logits") and hasattr(g, "tau"):
218
+ return g
219
+ # Last resort: scan children
220
+ for m in blk.modules():
221
+ if hasattr(m, "logits") and hasattr(m, "tau"):
222
+ return m
223
+ return None
224
+
225
+ # ---- proxy ---------------------------------------------------------------
226
+ def _predict_raw(
227
+ self,
228
+ model: nn.Module,
229
+ sample: TensorOrBatch,
230
+ *,
231
+ policy=None,
232
+ step: Optional[int] = None
233
+ ) -> torch.Tensor:
234
+ anchor = next((p for p in model.parameters()), torch.tensor(0.0))
235
+
236
+ B, H_img, W_img = self._input_spec(sample)
237
+ cfg = getattr(model, "config", None)
238
+ if cfg is None:
239
+ raise ValueError("Model must expose a HuggingFace-like .config for ViT proxy")
240
+ ph, pw = self._patch_hw(cfg)
241
+
242
+ S = _as_like(anchor, 1 + (H_img // ph) * (W_img // pw))
243
+ D = _as_like(anchor, int(getattr(cfg, "hidden_size", 768)))
244
+ Hh = _as_like(anchor, int(getattr(cfg, "num_attention_heads", 12)))
245
+ Dh = D // Hh
246
+
247
+ warm = False
248
+ if policy is not None and step is not None:
249
+ warm = (step < int(getattr(policy, "warmup_steps", 0)))
250
+
251
+ total_qkv = _as_like(anchor, 0.0)
252
+ total_scores = _as_like(anchor, 0.0)
253
+ total_out = _as_like(anchor, 0.0)
254
+ total_mlp = _as_like(anchor, 0.0)
255
+
256
+ default_hidden = _as_like(anchor, int(getattr(cfg, "intermediate_size", 4 * int(D))))
257
+
258
+ layers = _vit_layers(model)
259
+ for blk in layers:
260
+ heads_soft = Hh if warm else (self._soft_heads_from_block(blk) or Hh)
261
+
262
+ # FFN hidden expectation
263
+ if warm:
264
+ hidden_soft = default_hidden
265
+ else:
266
+ g = self._find_ffn_gate(blk)
267
+ if g is None:
268
+ hidden_soft = default_hidden
269
+ else:
270
+ probs = torch.sigmoid(g.logits / g.tau)
271
+ group = int(getattr(g, "group", getattr(g, "group_size", 16)))
272
+ hidden_soft = probs.sum() * _as_like(anchor, group)
273
+
274
+ D_kept = heads_soft * Dh
275
+
276
+ total_qkv += 3 * S * D * D_kept
277
+ total_scores += (S * S) * heads_soft * Dh
278
+ total_out += S * D_kept * D
279
+ total_mlp += 2 * S * D * hidden_soft
280
+
281
+ raw = (
282
+ self.cfg.alpha_qkv * total_qkv
283
+ + self.cfg.alpha_scores * total_scores
284
+ + self.cfg.alpha_out * total_out
285
+ + self.cfg.alpha_mlp * total_mlp
286
+ )
287
+ raw_ms = raw * _as_like(anchor, float(self.cfg.scale_ms))
288
+
289
+ # optional LUT correction
290
+ sig = self.signature(model, sample, policy=policy, step=step)
291
+ return self.lut.blend(raw_ms, sig)
292
+
293
+ # A reasonable default signature for ViT workloads
294
+ def signature(self, model: nn.Module, sample, *, policy=None, step: Optional[int] = None) -> Tuple:
295
+ if torch.is_tensor(sample):
296
+ shp = tuple(sample.shape)
297
+ elif isinstance(sample, (tuple, list)):
298
+ shp = tuple(sample)
299
+ elif isinstance(sample, dict):
300
+ shp = tuple((k, tuple(v.shape)) for k, v in sample.items() if torch.is_tensor(v))
301
+ else:
302
+ shp = (str(type(sample)),)
303
+ cfg = getattr(model, "config", None)
304
+ heads = int(getattr(cfg, "num_attention_heads", 12))
305
+ hidden = int(getattr(cfg, "hidden_size", 768))
306
+ inter = int(getattr(cfg, "intermediate_size", 3072))
307
+ return ("ViT", shp, heads, hidden, inter)
308
+
309
+ @torch.no_grad()
310
+ def calibrate(self, model: nn.Module, shape: tuple, measure_fn, *, device: str = "cuda") -> float:
311
+ """Set proxy scale so that keep-all student matches measured ms.
312
+
313
+ `measure_fn(model, shape, device=...)` should return `(mean_ms, p95_ms)`, matching the call below.
314
+ """
315
+
316
+ sample_t = torch.randn(shape, device=device)
319
+ model = model.to(device).eval()
320
+ mean_ms, _ = measure_fn(model, shape, device=device)
321
+ soft_ms = self.predict(model, sample_t).item()
322
+ self.cfg.scale_ms = float(self.cfg.scale_ms * mean_ms / max(soft_ms, 1e-9)) # predict() already applies the old scale
323
+ return self.cfg.scale_ms
324
+
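The rescaling rule behind `calibrate` can be exercised standalone; `DummyProxy` below is a hypothetical stand-in whose `predict` is just `flops_like * scale_ms`, not the class above. Multiplying by the previous scale keeps repeated calibration idempotent:

```python
class DummyProxy:
    """Toy stand-in for the proxy: predict() = flops_like * scale_ms."""
    def __init__(self) -> None:
        self.scale_ms = 1.0

    def predict(self, flops_like: float) -> float:
        return flops_like * self.scale_ms


def calibrate(proxy: DummyProxy, flops_like: float, measured_ms: float) -> float:
    # rescale so predict() reproduces the measured latency; multiplying by the
    # old scale keeps repeated calibration idempotent
    soft_ms = proxy.predict(flops_like)
    proxy.scale_ms = proxy.scale_ms * measured_ms / max(soft_ms, 1e-9)
    return proxy.scale_ms


proxy = DummyProxy()
calibrate(proxy, flops_like=2.0e9, measured_ms=14.0)
print(round(proxy.predict(2.0e9), 6))  # 14.0
```

After calibration, the proxy's soft estimate reproduces the measured keep-all latency for that shape; gated models then predict proportionally smaller values.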
325
+ # ------------------------------ ResNet Proxy ------------------------------
326
+
327
+ @dataclass
328
+ class ResNetProxyConfig:
329
+ scale_ms: float = 1.0
330
+ alpha_conv: float = 1.0 # weight for conv FLOPs term
331
+
332
+
333
+ def _as_const_like_resnet(x_like: torch.Tensor, val):
334
+ return torch.as_tensor(val, device=x_like.device, dtype=x_like.dtype)
335
+
336
+
337
+ def _find_anchor_param(model: nn.Module) -> torch.Tensor:
338
+ # Prefer any gate-like parameter; otherwise any parameter; else cpu scalar
339
+ for m in model.modules():
340
+ for nm in ("logits", "head_gate"):
341
+ t = getattr(m, nm, None)
342
+ if isinstance(t, torch.Tensor):
343
+ return t
344
+ for p in model.parameters():
345
+ return p
346
+ return torch.tensor(0.0)
347
+
348
+
349
+ def _kept_from_gate(module, anchor: torch.Tensor) -> Optional[torch.Tensor]:
350
+ """Return expected kept channels for a BN gate: probs.sum() * group_size.
351
+ If no gate is found, return None.
352
+ """
353
+ g = None
354
+ for nm in ("gate", "neuron_gate", "channel_gate", "bn_gate"):
355
+ if hasattr(module, nm):
356
+ g = getattr(module, nm)
357
+ break
358
+ if g is None and hasattr(module, "logits") and hasattr(module, "tau"):
359
+ g = module
360
+
361
+ if g is None or not hasattr(g, "logits"):
362
+ return None
363
+ logits = g.logits
364
+ tau = float(getattr(g, "tau", 1.5))
365
+ group = int(getattr(g, "group", getattr(g, "group_size", 1)))
366
+ if group <= 0: group = 1
367
+ probs = torch.sigmoid(logits / tau)
368
+ return probs.sum() * _as_const_like_resnet(anchor, group)
369
+
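`_kept_from_gate` returns a differentiable expectation rather than a hard count. Its arithmetic can be checked in isolation (plain-Python sigmoid, no torch):

```python
import math

def expected_kept(logits, tau: float, group: int) -> float:
    # E[kept channels] = sum(sigmoid(logit / tau)) * group_size,
    # mirroring probs.sum() * group in _kept_from_gate
    probs = [1.0 / (1.0 + math.exp(-l / tau)) for l in logits]
    return sum(probs) * group

# two groups of 8 channels: one strongly kept, one strongly dropped
print(round(expected_kept([4.0, -4.0], tau=1.0, group=8), 3))  # 8.0
```

Undecided gates (logits near 0) contribute roughly half their group, which is what lets the latency penalty push them smoothly toward keep or drop.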
370
+
371
+ class ResNetLatencyProxy(LatencyProxy):
372
+ """Latency proxy for ResNet-like backbones with BN gates.
373
+
374
+ Approximates latency with a FLOPs-style sum over convs, using the *expected*
375
+ kept channels after each BN gate (probs.sum()*group_size). Falls back to the
376
+ full channel count when a gate is not found.
377
+
378
+ Accepts a batch or an explicit (N,C,H,W) shape.
379
+ """
380
+
381
+ def __init__(self, cfg: Optional[ResNetProxyConfig] = None):
382
+ super().__init__()
383
+ self.cfg = cfg or ResNetProxyConfig()
384
+
385
+ def _add_cost(self, cost_like: torch.Tensor, oc, ic, k, stride, H, W):
386
+ alpha = _as_const_like_resnet(cost_like, self.cfg.alpha_conv)
387
+ # update spatial dims with conv stride (roughly, ignoring padding effects)
388
+ H = (H + stride - 1) // stride
389
+ W = (W + stride - 1) // stride
390
+ flops = _as_const_like_resnet(cost_like, oc) * _as_const_like_resnet(cost_like, ic) * (k * k) * _as_const_like_resnet(cost_like, H) * _as_const_like_resnet(cost_like, W)
391
+ return cost_like + alpha * flops, H, W
392
+
393
+ def _predict_raw(self, model: nn.Module, sample: TensorOrBatch, **_) -> torch.Tensor:
394
+ N, C_in, H0, W0 = _nchw_from_batch(sample)
395
+ anchor = _find_anchor_param(model)
396
+ cost = _as_const_like_resnet(anchor, 0.0)
397
+ H = _as_const_like_resnet(anchor, int(H0))
398
+ W = _as_const_like_resnet(anchor, int(W0))
399
+
400
+ # Stem
401
+ conv1 = getattr(model, "conv1")
402
+ bn1 = getattr(model, "bn1", None)
403
+ k = conv1.kernel_size[0]
404
+ s = conv1.stride[0]
405
+ kept_out = None
406
+ if bn1 is not None:
407
+ kept = _kept_from_gate(bn1, anchor)
408
+ if kept is not None:
409
+ kept_out = kept
410
+ oc_eff = kept_out if kept_out is not None else _as_const_like_resnet(anchor, conv1.out_channels)
411
+ cost, H, W = self._add_cost(cost, oc_eff, _as_const_like_resnet(anchor, C_in), k, s, H, W)
412
+ in_ch = oc_eff
413
+
414
+ def _block_cost(block, in_ch, H, W, cost):
415
+ # conv1 -> bn1
416
+ c1 = block.conv1
417
+ b1 = block.bn1 if hasattr(block, "bn1") else None
418
+ k1, s1 = c1.kernel_size[0], c1.stride[0]
419
+ kept1 = _kept_from_gate(b1, anchor) # explicit None-check: `or` on a 0-dim tensor falls back when the expectation is exactly 0
+ oc1_eff = kept1 if kept1 is not None else _as_const_like_resnet(anchor, c1.out_channels)
420
+ cost, H, W = self._add_cost(cost, oc1_eff, in_ch, k1, s1, H, W)
421
+
422
+ # conv2 -> bn2
423
+ c2 = block.conv2
424
+ b2 = block.bn2 if hasattr(block, "bn2") else None
425
+ k2, s2 = c2.kernel_size[0], c2.stride[0]
426
+ kept2 = _kept_from_gate(b2, anchor)
+ oc2_eff = kept2 if kept2 is not None else _as_const_like_resnet(anchor, c2.out_channels)
427
+ cost, H, W = self._add_cost(cost, oc2_eff, oc1_eff, k2, s2, H, W)
428
+
429
+ return oc2_eff, H, W, cost
430
+
431
+ # Layers
432
+ for lname in ("layer1", "layer2", "layer3", "layer4"):
433
+ layer = getattr(model, lname, None)
434
+ if layer is None:
435
+ continue
436
+ for blk in layer:
437
+ in_ch, H, W, cost = _block_cost(blk, in_ch, H, W, cost)
438
+
439
+ scale = _as_const_like_resnet(anchor, self.cfg.scale_ms)
440
+ return cost * scale
441
+
442
+ @torch.no_grad()
443
+ def calibrate(self, model: nn.Module, keepall_export_fn, profiler_fn, sample: TensorOrBatch, device: str = "cuda") -> float:
444
+ """Calibrate `scale_ms` so proxy(model_keepall) ~= real latency in ms."""
445
+ keep = keepall_export_fn(model)
446
+ sample_shape = _nchw_from_batch(sample)
447
+ mean_ms, _ = profiler_fn(keep, sample_shape, device=device)
448
+ soft = float(self.predict(model, sample).detach().cpu())
449
+ self.cfg.scale_ms = self.cfg.scale_ms * mean_ms / max(soft, 1e-9) # soft already includes the old scale
450
+ return mean_ms
451
+
452
+
453
+ # -----------------------------------------------------------------------------
454
+ # LLM proxy
455
+ # -----------------------------------------------------------------------------
456
+
457
+ """
458
+ LatencyProxyLLM
459
+ ---------------
460
+ A lightweight latency proxy for decoder-only HF LLMs (LLaMA/Mistral style).
461
+
462
+ - Estimates end-to-end latency (ms-like scalar) for a given (B, S, T):
463
+ * Prefill on S tokens (build KV cache)
464
+ * Cached decode for T steps
465
+ - Uses soft gate expectations:
466
+ * Attention heads (HeadGate on GatedSelfAttentionLLM)
467
+ * FFN hidden (SwiGLUWidthGate via .mlp.neuron_gate)
468
+ - Calibrate .scale_ms so proxy ≈ real latency of a keep-all model.
469
+
470
+ Public API
471
+ ----------
472
+ - LatencyProxyLLM(...).predict(model, batch_or_shape) # trainer entry
473
+ - LatencyProxyLLM(...).predict(model, B=?, S=?, T=?) # explicit entry
474
+ - LatencyProxyLLM(...).debug_layer_view(...)
475
+ - calibrate_proxy_llm(...), calibrate_proxy_llm_from_batch(...)
476
+ """
477
+
478
+ # ------------------------------------------------------------
479
+ # Shared tiny utils (device/dtype-safe constants)
480
+ # ------------------------------------------------------------
481
+ def _find_gate_param_or_fallback(model: nn.Module) -> torch.Tensor:
482
+ """
483
+ Return a tensor to anchor device/dtype for proxy constants.
484
+ Prefer gate logits; else any parameter; else CPU fp32 scalar.
485
+ """
486
+ for m in model.modules():
487
+ if hasattr(m, "head_gate") and hasattr(getattr(m, "head_gate"), "logits"):
488
+ return m.head_gate.logits
489
+ if hasattr(m, "neuron_gate") and hasattr(m.neuron_gate, "logits"):
490
+ return m.neuron_gate.logits
491
+ if hasattr(m, "logits") and isinstance(getattr(m, "logits"), torch.Tensor):
492
+ return m.logits
493
+ for p in model.parameters():
494
+ return p
495
+ return torch.tensor(0.0)
496
+
497
+ def _as_const_like(x_like: torch.Tensor, val):
498
+ return torch.as_tensor(val, device=x_like.device, dtype=x_like.dtype)
499
+
500
+
501
+ # ------------------------------------------------------------
502
+ # Proxy
503
+ # ------------------------------------------------------------
504
+ @dataclass
505
+ class _WarmupOnlyPolicy:
506
+ """Tiny policy shim so you can pass warmup_steps to .predict()."""
507
+ warmup_steps: int = 0
508
+
509
+ class LatencyProxyLLM(LatencyProxy):
510
+ """
511
+ LLM latency proxy (ms ~ weighted FLOPs/bandwidth terms) for prefill + cached decode.
512
+ Accepts either a batch or explicit B,S,T.
513
+ """
514
+
515
+ def __init__(
516
+ self,
517
+ *,
518
+ scale_ms: float = 1.0,
519
+ alpha_qkv: float = 1.0,
520
+ alpha_scores: float = 1.0,
521
+ alpha_out: float = 1.0,
522
+ alpha_mlp: float = 1.0,
523
+ gate_kv_in_proxy: bool = False,
524
+ default_T: int = 128,
525
+ ):
526
+ super().__init__()
527
+ self.scale_ms = float(scale_ms)
528
+ self.alpha_qkv = float(alpha_qkv)
529
+ self.alpha_scores = float(alpha_scores)
530
+ self.alpha_out = float(alpha_out)
531
+ self.alpha_mlp = float(alpha_mlp)
532
+ self.gate_kv_in_proxy = bool(gate_kv_in_proxy)
533
+ self.default_T = int(default_T)
534
+
535
+ # ---------- gate discovery ----------
536
+ @staticmethod
537
+ def _soft_heads_from_block_llm(blk) -> Optional[torch.Tensor]:
538
+ attn = getattr(blk, "self_attn", None)
539
+ if attn is None:
540
+ return None
541
+ if hasattr(attn, "kept_heads_soft") and callable(attn.kept_heads_soft):
542
+ return attn.kept_heads_soft()
543
+ logits, tau = None, None
544
+ if hasattr(attn, "head_gate") and hasattr(attn.head_gate, "logits"):
545
+ logits = attn.head_gate.logits
546
+ tau = float(getattr(attn.head_gate, "tau", getattr(attn, "tau", 1.5)))
547
+ elif hasattr(attn, "logits"):
548
+ logits = attn.logits
549
+ tau = float(getattr(attn, "tau", 1.5))
550
+ if logits is None:
551
+ return None
552
+ return torch.sigmoid(logits / tau).sum()
553
+
554
+ @staticmethod
555
+ def _find_ffn_gate_llm(blk):
556
+ mlp = getattr(blk, "mlp", None)
557
+ g = getattr(mlp, "neuron_gate", None) if mlp is not None else None
558
+ if g is not None and hasattr(g, "logits") and hasattr(g, "tau"):
559
+ return g
560
+ return None
561
+
562
+ def _soft_hidden_from_block_llm(self, blk, default_hidden, anchor, warm=False):
563
+ if warm:
564
+ return default_hidden
565
+ g = self._find_ffn_gate_llm(blk)
566
+ if g is None:
567
+ return default_hidden
568
+ probs = torch.sigmoid(g.logits / float(g.tau)) # [#groups]
569
+ group = int(getattr(g, "group", getattr(g, "group_size", 128)))
570
+ kept_hidden = probs.sum() * _as_const_like(anchor, group)
571
+ return kept_hidden
572
+
573
+ # ---------- main ----------
574
+ def predict( # trainer entry and explicit-shape entry unified
575
+ self,
576
+ model: nn.Module,
577
+ sample: Optional[TensorOrBatch] = None,
578
+ *,
579
+ B: Optional[int] = None,
580
+ S: Optional[int] = None,
581
+ T: Optional[int] = None,
582
+ policy: Optional[object] = None,
583
+ step: Optional[int] = None,
584
+ return_terms: bool = False,
585
+ ):
586
+ # Allow explicit B,S,(T) path
587
+ if B is not None and S is not None:
588
+ ids_B, ids_S = int(B), int(S)
589
+ ids_T = int(T) if T is not None else int(self.default_T)
590
+ else:
591
+ if sample is None:
592
+ raise ValueError("LatencyProxyLLM.predict needs either a batch sample or explicit B,S.")
593
+ if isinstance(sample, (tuple, list)) and len(sample) in (2, 3) and all(isinstance(x, int) for x in sample):
594
+ # explicit (B,S) or (B,S,T)
595
+ ids_B, ids_S = int(sample[0]), int(sample[1])
596
+ ids_T = int(sample[2]) if len(sample) == 3 else int(self.default_T)
597
+ else:
598
+ ids = _ids_from_batch(sample)
599
+ ids_B, ids_S = int(ids.size(0)), int(ids.size(1))
600
+ ids_T = int(self.default_T) if T is None else int(T)
601
+
602
+ anchor = _find_gate_param_or_fallback(model)
603
+
604
+ # scalar tensors (same device/dtype)
605
+ B_t = _as_const_like(anchor, ids_B)
606
+ S_t = _as_const_like(anchor, ids_S)
607
+ T_t = _as_const_like(anchor, ids_T)
608
+
609
+ cfg = model.config
610
+ D = _as_const_like(anchor, int(cfg.hidden_size))
611
+ Hh = _as_const_like(anchor, int(cfg.num_attention_heads))
612
+ Hkv = _as_const_like(anchor, int(getattr(cfg, "num_key_value_heads", int(Hh))))
613
+ Dh = D // Hh
614
+
615
+ warmup_steps = int(getattr(policy, "warmup_steps", 0)) if policy is not None else 0
616
+ warm = bool(step is not None and step < warmup_steps)
617
+
618
+ total_qkv = anchor.new_zeros(())
619
+ total_scores = anchor.new_zeros(())
620
+ total_out = anchor.new_zeros(())
621
+ total_mlp = anchor.new_zeros(())
622
+
623
+ default_hidden = _as_const_like(anchor, int(getattr(cfg, "intermediate_size", 4 * int(D))))
624
+
625
+ layers = getattr(getattr(model, "model", model), "layers", [])
626
+ for blk in layers:
627
+ heads_soft = Hh if warm else (self._soft_heads_from_block_llm(blk) or Hh)
628
+ Dq = heads_soft * Dh
629
+ # K/V effective width
630
+ if self.gate_kv_in_proxy:
631
+ Dkv = heads_soft * Dh
632
+ else:
633
+ Dkv = Hkv * Dh
634
+ hidden_soft = self._soft_hidden_from_block_llm(blk, default_hidden, anchor, warm=warm)
635
+
636
+ # Prefill + decode (simplified aggregation)
637
+ Seff = S_t + T_t
638
+
639
+ # q/k/v linear FLOP-like terms
640
+ total_qkv = total_qkv + (
641
+ # q
642
+ B_t * Seff * D * Dq +
643
+ # k + v
644
+ 2 * B_t * Seff * D * Dkv
645
+ )
646
+ # attention scores (prefill SxS + decode triangular)
647
+ total_scores = total_scores + (
648
+ B_t * (S_t * S_t) * heads_soft * Dh +
649
+ B_t * heads_soft * Dh * (T_t * S_t + (T_t * (T_t + 1)) // 2)
650
+ )
651
+ # out proj
652
+ total_out = total_out + B_t * Seff * Dq * D
653
+ # mlp
654
+ total_mlp = total_mlp + B_t * Seff * 2 * D * hidden_soft
655
+
656
+ flops_like = (
657
+ self.alpha_qkv * total_qkv
658
+ + self.alpha_scores * total_scores
659
+ + self.alpha_out * total_out
660
+ + self.alpha_mlp * total_mlp
661
+ )
662
+
663
+ ms = flops_like * _as_const_like(anchor, self.scale_ms)
664
+ if return_terms:
665
+ return ms, {
666
+ "qkv": float((self.alpha_qkv * total_qkv).detach().cpu()),
667
+ "scores": float((self.alpha_scores * total_scores).detach().cpu()),
668
+ "out": float((self.alpha_out * total_out).detach().cpu()),
669
+ "mlp": float((self.alpha_mlp * total_mlp).detach().cpu()),
670
+ }
671
+ return ms
672
+
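The triangular term in `total_scores` aggregates per-step decode attention: at decode step t the query attends to the S cached prompt tokens plus the t tokens generated so far, so summing over T steps gives T·S + T(T+1)/2. A quick standalone check of that closed form:

```python
def decode_kv_positions(S: int, T: int) -> int:
    # closed form used in the scores term above: T*S + T*(T+1)/2
    return T * S + (T * (T + 1)) // 2

def decode_kv_positions_loop(S: int, T: int) -> int:
    # decode step t (1-indexed) attends to S prompt tokens plus the t
    # tokens generated so far, including the current one
    return sum(S + t for t in range(1, T + 1))

print(decode_kv_positions(16, 4), decode_kv_positions_loop(16, 4))  # 74 74
```

Per layer, the proxy multiplies this token-position count by `heads_soft * Dh`, so pruning heads shrinks the decode-attention term linearly.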
673
+ # ---------- per-layer debug ----------
674
+ @torch.no_grad()
675
+ def debug_layer_view(
676
+ self,
677
+ model: nn.Module,
678
+ *,
679
+ B: int,
680
+ S: int,
681
+ T: int,
682
+ policy: Optional[object] = None,
683
+ step: Optional[int] = None,
684
+ ) -> list:
685
+ anchor = _find_gate_param_or_fallback(model)
686
+ cfg = getattr(model, "config", None)
687
+ D = _as_const_like(anchor, int(getattr(cfg, "hidden_size", 0)))
688
+ Hq = _as_const_like(anchor, int(getattr(cfg, "num_attention_heads", 0)))
689
+ Hkv = _as_const_like(anchor, int(getattr(cfg, "num_key_value_heads", int(Hq))))
690
+ Dh = D // Hq
691
+
692
+ warm = False
693
+ if policy is not None and step is not None:
694
+ warm = (int(step) < int(getattr(policy, "warmup_steps", 0)))
695
+
696
+ rows = []
697
+ layers = getattr(getattr(model, "model", model), "layers", None) or []
698
+ for i, blk in enumerate(layers):
699
+ heads_soft = Hq if warm else (self._soft_heads_from_block_llm(blk) or Hq)
700
+ Dq = heads_soft * Dh
701
+ Dkv = (heads_soft * Dh) if self.gate_kv_in_proxy else (Hkv * Dh)
702
+ hidden_soft = self._soft_hidden_from_block_llm(
703
+ blk, _as_const_like(anchor, int(getattr(cfg, "intermediate_size", 4 * int(D)))), anchor, warm=warm
704
+ )
705
+ rows.append({
706
+ "layer": i,
707
+ "heads_soft": float(heads_soft.detach().cpu()),
708
+ "Dq≈heads*Dh": float(Dq.detach().cpu()),
709
+ "Dkv_used": float(Dkv.detach().cpu()),
710
+ "ffn_hidden_soft": float(hidden_soft.detach().cpu()),
711
+ })
712
+ return rows
713
+
714
+
715
+ # ------------------------------------------------------------
716
+ # Calibration helpers for LLM
717
+ # ------------------------------------------------------------
718
+ @torch.inference_mode()
719
+ def calibrate_proxy_llm(
720
+ proxy: LatencyProxyLLM,
721
+ model: nn.Module,
722
+ *,
723
+ B: int,
724
+ S: int,
725
+ T: int,
726
+ export_keepall_fn,
727
+ device: str = "cuda",
728
+ warmup: int = 10,
729
+ iters: int = 30,
730
+ ) -> float:
731
+ """
732
+ Calibrate proxy.scale_ms so proxy.predict(...) matches real keep-all latency for (B,S,T).
733
+ Returns the measured real mean latency in ms.
734
+ """
735
+ keepall = export_keepall_fn(model).to(device).eval()
736
+
737
+ # Measure real latency (prefill + decode)
738
+ from core.measure import measure_latency_text_ms as _measure # adjust if your path differs
739
+ real_ms, _ = _measure(keepall, B=B, S=S, T=T, warmup=warmup, iters=iters, device=device)
740
+
741
+ # Soft/proxy latency on *gated* model
742
+ ms_like = proxy.predict(model, B=B, S=S, T=T)
743
+ soft_ms = float(ms_like.detach().item()) if torch.is_tensor(ms_like) else float(ms_like)
744
+
745
+ proxy.scale_ms = float(proxy.scale_ms * real_ms / max(soft_ms, 1e-9)) # predict() already applies the old scale
746
+ return real_ms
747
+
748
+
749
+ @torch.inference_mode()
750
+ def calibrate_proxy_llm_from_batch(
751
+ proxy: LatencyProxyLLM,
752
+ model: nn.Module,
753
+ batch: Dict[str, torch.Tensor],
754
+ *,
755
+ T: int,
756
+ export_keepall_fn,
757
+ device: str = "cuda",
758
+ warmup: int = 10,
759
+ iters: int = 30,
760
+ ) -> Tuple[int, int, int, float]:
761
+ """
762
+ Infers (B,S) from a batch like {'input_ids': [B,S], ...},
763
+ calibrates for (B,S,T), and returns (B,S,T, real_ms).
764
+ """
765
+ input_ids = batch["input_ids"]
766
+ B, S = int(input_ids.size(0)), int(input_ids.size(1))
767
+ ms = calibrate_proxy_llm(
768
+ proxy, model, B=B, S=S, T=T, export_keepall_fn=export_keepall_fn,
769
+ device=device, warmup=warmup, iters=iters
770
+ )
771
+ return B, S, T, ms
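`calibrate_proxy_llm_from_batch` infers (B, S) and forwards explicit integers into `predict`. The shape normalization that `predict` applies to explicit-int samples can be sketched standalone (`normalize_sample` is a hypothetical helper, not part of the module):

```python
def normalize_sample(sample, default_T: int = 128):
    # mirrors the explicit-int branch of LatencyProxyLLM.predict:
    # a (B, S) pair gets the proxy's default decode length, (B, S, T) is kept
    if (isinstance(sample, (tuple, list)) and len(sample) in (2, 3)
            and all(isinstance(x, int) for x in sample)):
        B, S = sample[0], sample[1]
        T = sample[2] if len(sample) == 3 else default_T
        return B, S, T
    raise ValueError("expected explicit (B, S) or (B, S, T)")

print(normalize_sample((4, 512)))      # (4, 512, 128)
print(normalize_sample((4, 512, 64)))  # (4, 512, 64)
```

Anything that is not an all-int tuple is treated as a real batch and routed through `_ids_from_batch` instead.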
core/search_export.py ADDED
@@ -0,0 +1,76 @@
1
+ """Export-parameter search (hardware-aware).
2
+
3
+ This module performs a small grid search over export rounding/multiple knobs and
4
+ picks the configuration that minimizes *measured* latency for the target batch
5
+ shape. It is family-agnostic; adapters provide the export function.
6
+
7
+ For ViT, see `vit_search_best_export` which scans per-head multiples and FFN
8
+ snap group sizes, mirroring kernel-friendly widths.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass
13
+ from typing import Callable, Iterable, List, Optional, Sequence, Tuple
14
+
15
+ import copy
16
+ import itertools
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from .export import ExportPolicy as CoreExportPolicy, Rounding as CoreRounding
22
+ from .profiler import measure_latency_ms, ProfileSettings
23
+
24
+
25
+ # Type alias: adapter export function
26
+ ExportFn = Callable[[nn.Module, object, int], nn.Module]
27
+
28
+
29
+ @dataclass
30
+ class SearchResult:
31
+ best_model: nn.Module
32
+ best_params: dict
33
+ trials: List[dict]
34
+
35
+
36
+ def grid_search_latency(
37
+ model_with_gates: nn.Module,
38
+ export_fn: ExportFn,
39
+ *,
40
+ head_multiples: Sequence[int],
41
+ ffn_snaps: Sequence[int],
42
+ step: int,
43
+ batch_shape: Tuple[int, int, int, int], # (B,C,H,W)
44
+ measure_settings: Optional[ProfileSettings] = None,
45
+ device: str = "cuda",
46
+ make_policy: Optional[Callable[[int, int], object]] = None,
47
+ ) -> SearchResult:
48
+ """Generic grid search over (head_multiple, ffn_snap_groups).
49
+
50
+ - `make_policy(h_mult, ffn_snap)` must return an adapter-acceptable export policy.
51
+ If not provided, falls back to a single-rounding `CoreExportPolicy` using
52
+ `multiple_groups=head_multiple` for both heads and FFN.
53
+ """
54
+ trials: List[dict] = []
55
+ best = None
56
+
57
+ to_try = list(itertools.product(head_multiples, ffn_snaps)) # materialize: the len() below must not exhaust the iterator mid-loop
58
+ for i, (hm, fs) in enumerate(to_try, 1):
59
+ policy = make_policy(hm, fs) if make_policy is not None else CoreExportPolicy(
60
+ warmup_steps=0,
61
+ rounding=CoreRounding(floor_groups=1, multiple_groups=int(hm), min_keep_ratio=0.0),
62
+ )
63
+ slim = export_fn(model_with_gates, policy, step)
64
+ mean_ms, p95_ms = measure_latency_ms(slim, batch_shape, settings=measure_settings, device=device)
65
+ rec = {"head_multiple": int(hm), "ffn_snap": int(fs), "mean_ms": float(mean_ms), "p95_ms": float(p95_ms)}
66
+ print(f"[{i}/{len(to_try)}] head_multiple {int(hm)} | ffn_snap {int(fs)} | mean_ms = {float(mean_ms):.3f}")
67
+ trials.append(rec)
68
+ if best is None or mean_ms < best[0]:
69
+ best = (mean_ms, hm, fs, slim)
70
+
71
+ assert best is not None
72
+ _, hm_best, fs_best, slim_best = best
73
+ return SearchResult(best_model=slim_best, best_params={"head_multiple": int(hm_best), "ffn_snap": int(fs_best)}, trials=trials)
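The selection loop above reduces to: enumerate every (head_multiple, ffn_snap) pair, measure each exported candidate, keep the fastest. A standalone sketch with a toy cost function standing in for `measure_latency_ms`:

```python
import itertools

def pick_best(head_multiples, ffn_snaps, measure):
    # mirrors grid_search_latency's selection loop: try every (hm, fs)
    # pair and keep the pair with the lowest measured latency
    trials, best = [], None
    for hm, fs in itertools.product(head_multiples, ffn_snaps):
        mean_ms = measure(hm, fs)
        trials.append({"head_multiple": hm, "ffn_snap": fs, "mean_ms": mean_ms})
        if best is None or mean_ms < best[0]:
            best = (mean_ms, hm, fs)
    return best, trials

# toy cost model standing in for a real latency measurement
best, trials = pick_best([8, 16], [32, 64], lambda hm, fs: hm * 10 + fs)
print(best)         # (112, 8, 32)
print(len(trials))  # 4
```

In the real search the grid is small (a handful of kernel-friendly multiples), so exhaustive measurement is cheap relative to training.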
74
+
75
+
76
+
core/train.py ADDED
@@ -0,0 +1,327 @@
1
+ """Generic Lagrangian trainer (family-agnostic).
2
+
3
+ This module provides a light framework to optimize *gated* students against
4
+ teachers with a latency target enforced via a proxy + optional real probes.
5
+
6
+ It does not assume ViT/ResNet/LLM specifics; adapters provide tiny callables.
7
+
8
+ Key ingredients:
9
+ - Two-phase update per step: (A) weights w.r.t. KD/task, (B) gates w.r.t. KD +
10
+ sparsity + latency penalty with a dual variable λ.
11
+ - Optional periodic export + real-latency probe to correct λ.
12
+ - Constraint projection for gates after each step.
13
+
14
+ Adapters must provide:
15
+ - get_student_logits(model, x) -> Tensor
16
+ - get_teacher_logits(model, x) -> Tensor
17
+ - export_keepall(model) -> nn.Module (clean copy without gates)
18
+ - export_pruned(model, policy, step) -> nn.Module (transient copy for profiling)
19
+
20
+ Core modules used:
21
+ - `distill.KDConfig`, `distill.kd_loss`
22
+ - `gates.combined_penalty`, `gates.PenaltyWeights`, `gates.project_gates_into_constraints`
23
+ - `proxy_cost.LatencyProxy`
24
+ - `profiler.measure_latency_ms`
25
+ """
26
+ from __future__ import annotations
27
+
28
+ from dataclasses import dataclass
29
+ from typing import Callable, Optional
30
+ import gc
31
+
32
+ import torch
33
+ import torch.nn as nn
34
+
35
+ from .distill import KDConfig, kd_loss, mse_reg
36
+ from .gates import PenaltyWeights, Constraints, combined_penalty, project_gates_into_constraints, collect_param_groups
37
+ from .proxy_cost import LatencyProxy
38
+ from .profiler import measure_latency_ms
39
+
40
+ # -----------------------------------------------------------------------------
41
+ # Config
42
+ # -----------------------------------------------------------------------------
43
+
44
+ @dataclass
45
+ class DualConfig:
46
+ lr: float = 0.05 # step for λ update
47
+ ema_beta: float = 0.5 # blend proxy-driven λ and real probe λ
48
+ clip: float = 10.0
49
+
50
+
51
+ @dataclass
52
+ class TrainerConfig:
53
+ kd: Optional[KDConfig] = None # defaults built in __post_init__ (mutable dataclass defaults are rejected on Python 3.11+)
54
+ penalties: Optional[PenaltyWeights] = None
55
+ constraints: Optional[Constraints] = None
56
+
57
+ latency_target_ms: float = 30.0
58
+ real_probe_every: int = 0 # steps; 0 disables real probes
59
+ probe_batch_override: Optional[int] = None
60
+ gate_warmup_steps: int = 0 # Freeze gates for early steps
61
+ mse_weight: float = 0.0
62
+
63
+ early_stopping_patience: int = 0
64
+ early_stopping_lambda: float = 1e-4
65
+
66
+ amp: bool = True
67
+ device: str = "cuda"
68
+
69
+ # Optimizers
70
+ lr_gate: float = 1e-2
71
+ lr_linear: float = 1e-4
72
+ lr_affine: float = 3e-4
73
+ wd_linear: float = 1e-4
74
+
75
+ # Mixed precision scaler
76
+ use_grad_scaler: bool = True
77
+
78
+ # Dual update
79
+ dual: Optional[DualConfig] = None
+
+ def __post_init__(self):
+ if self.kd is None: self.kd = KDConfig()
+ if self.penalties is None: self.penalties = PenaltyWeights(l0=0.0, keep_floor_ratio=0.0, bimodality=0.0)
+ if self.constraints is None: self.constraints = Constraints(min_keep_ratio=0.0, min_groups=1, max_groups_drop=None)
+ if self.dual is None: self.dual = DualConfig()
80
+
81
+
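`DualConfig` parameterizes a projected dual-ascent update on λ: the multiplier grows while measured/proxied latency exceeds `latency_target_ms` and shrinks toward zero once under it. A standalone sketch of that update rule (applying the `clip` bound here is an assumption about intent):

```python
def dual_update(lmbda: float, latency_ms: float, target_ms: float,
                lr: float = 0.05, clip: float = 10.0) -> float:
    # projected dual ascent: raise lambda while latency exceeds the target,
    # lower it (never below 0, never above clip) once the constraint holds
    return min(max(0.0, lmbda + lr * (latency_ms - target_ms)), clip)

lam = 0.0
for latency in (40.0, 36.0, 31.0, 29.0):  # latency drifting down toward 30 ms
    lam = dual_update(lam, latency, target_ms=30.0)
print(round(lam, 3))  # 0.8
```

The trainer runs this update twice per step, once with the proxy latency and once with the latest real probe, and blends the two λ values.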
82
+ # -----------------------------------------------------------------------------
83
+ # Trainer
84
+ # -----------------------------------------------------------------------------
85
+
86
+ class LagrangeTrainer:
87
+ def __init__(
88
+ self,
89
+ student: nn.Module,
90
+ teacher: nn.Module,
91
+ proxy: LatencyProxy,
92
+ *,
93
+ adapter_get_student_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
94
+ adapter_get_teacher_logits: Callable[[nn.Module, torch.Tensor], torch.Tensor],
95
+ adapter_export_keepall: Callable[[nn.Module], nn.Module],
96
+ adapter_export_pruned: Callable[[nn.Module, object, int], nn.Module],
97
+ export_policy: object,
98
+ cfg: TrainerConfig,
99
+ ) -> None:
100
+ self.student = student
101
+ self.teacher = teacher.eval()
102
+ for p in self.teacher.parameters():
103
+ p.requires_grad_(False)
104
+ self.proxy = proxy
105
+ self.get_s = adapter_get_student_logits
106
+ self.get_t = adapter_get_teacher_logits
107
+ self.export_keepall = adapter_export_keepall
108
+ self.export_pruned = adapter_export_pruned
109
+ self.export_policy = export_policy
110
+ self.cfg = cfg
111
+
112
+ # Build optimizers (grouped)
113
+ param_groups = collect_param_groups(
114
+ student,
115
+ lr_gate=cfg.lr_gate,
116
+ lr_linear=cfg.lr_linear,
117
+ lr_affine=cfg.lr_affine,
118
+ wd_linear=cfg.wd_linear,
119
+ )
120
+ # gates-only optimizer uses first group
121
+ self.opt_g = torch.optim.Adam([param_groups[0]], lr=param_groups[0]["lr"]) # type: ignore[arg-type]
122
+ # weights optimizer for the rest
123
+ self.opt_w = torch.optim.Adam(param_groups[1:])
124
+
125
+ self.scaler = torch.amp.GradScaler('cuda', enabled=(cfg.amp and cfg.use_grad_scaler))
126
+ self.lambda_: float = 0.0
127
+ self.mse_weight = cfg.mse_weight
128
+
129
+ # ---- internal helpers -----------------------------------------------------
130
+ def _zero_grads(self, params):
131
+ for p in params:
132
+ if p.grad is not None:
133
+ p.grad = None
134
+
135
+ def _has_grad(self, params) -> bool:
136
+ for p in params:
137
+ if p.grad is not None:
138
+ return True
139
+ return False
140
+
141
+ # ---- training -------------------------------------------------------------
142
+ def train_epoch(self, loader, *, real_policy=None, verbose_every: int = 50):
143
+ device = self.cfg.device
144
+ self.student.train().to(device)
145
+ self.teacher.to(device).eval()
146
+
147
+ running = 0.0
148
+ seen = 0
149
+ lam_real = self.lambda_
+ mean_ms = p95_ms = float("nan") # updated by the periodic real-latency probe; NaN until the first probe
150
+
154
+ for step, batch in enumerate(loader, 1):
155
+ # Move batch to device in a type-safe way
156
+ batch = _move_batch_to_device(batch, device)
157
+
158
+ # with torch.inference_mode():
159
+ with torch.no_grad():
160
+ t_logits = self.get_t(self.teacher, batch) # [B,1,V]
161
+ # match AMP compute dtype to avoid upcasting later
162
+ if self.cfg.amp:
163
+ # infer autocast dtype from student params (bf16 or fp16)
164
+ sparam = next(self.student.parameters())
165
+ t_logits = t_logits.to(dtype=sparam.dtype, non_blocking=True)
166
+
167
+
168
+ # -------- Pass A: WEIGHTS (KD only) --------
169
+ self.opt_w.zero_grad(set_to_none=True)
170
+
171
+ with torch.amp.autocast('cuda', enabled=self.cfg.amp):
172
+ # Adapters receive the batch object (dict/tuple/tensor)
173
+ s_logits = self.get_s(self.student, batch)
174
+ # with torch.no_grad():
175
+ # t_logits = self.get_t(self.teacher, batch)
176
+ mse = self.mse_weight*mse_reg(s_logits, t_logits, self.cfg.kd.temperature)
177
+ loss_w = kd_loss(s_logits, t_logits, self.cfg.kd) + mse
178
+
179
+ self.scaler.scale(loss_w).backward()
180
+ # Prevent gate params from changing in pass A
181
+ gate_params = self.opt_g.param_groups[0]["params"]
182
+ self._zero_grads(gate_params)
183
+
184
+ if any(p.grad is not None for pg in self.opt_w.param_groups for p in pg["params"]):
185
+ self.scaler.step(self.opt_w)
186
+ self.scaler.update()
187
+ else:
188
+ self.opt_w.zero_grad(set_to_none=True)
189
+
190
+ del s_logits
191
+ gc.collect()
192
+ torch.cuda.empty_cache()
193
+
194
+ if step > int(self.cfg.gate_warmup_steps):
195
+
196
+ # -------- Pass B: GATES (KD + sparsity + λ * gap) --------
197
+ self.opt_g.zero_grad(set_to_none=True)
198
+ with torch.amp.autocast('cuda', enabled=self.cfg.amp):
199
+ s_logits = self.get_s(self.student, batch)
200
+ # with torch.no_grad():
201
+ # t_logits = self.get_t(self.teacher, batch)
202
+ kd_g = kd_loss(s_logits, t_logits, self.cfg.kd)
203
+
204
+ # Proxy gets the batch object too; family-specific proxy can read (B,S) etc.
205
+ o1_ms = self.proxy.predict(self.student, batch)
206
+ gap = torch.relu(o1_ms - float(self.cfg.latency_target_ms))
207
+ reg = combined_penalty(self.student, self.cfg.penalties)
208
+ mse = self.mse_weight*mse_reg(s_logits, t_logits, self.cfg.kd.temperature)
209
+ loss_g = kd_g + _to_tensor(self.lambda_, o1_ms) * gap + reg + mse
210
+
211
+ self.scaler.scale(loss_g).backward()
212
+ # Prevent non-gate params from changing in pass B
213
+ for pg in self.opt_w.param_groups:
214
+ self._zero_grads(pg["params"])
215
+
216
+ if self._has_grad(self.opt_g.param_groups[0]["params"]):
217
+ self.scaler.step(self.opt_g)
218
+ self.scaler.update()
219
+ else:
220
+ self.opt_g.zero_grad(set_to_none=True)
221
+ else:
222
+ o1_ms = self.proxy.predict(self.student, batch)
223
+ s_logits = loss_g = kd_g = reg = torch.tensor(0.0, device=device)
224
+
225
+ # -------- Dual (λ) update using proxy --------
226
+ with torch.no_grad():
227
+ lam_proxy = max(0.0, self.lambda_ + self.cfg.dual.lr * (float(o1_ms.detach()) - self.cfg.latency_target_ms))
228
+ beta = self.cfg.dual.ema_beta # documented blend of real-probe and proxy-driven λ
+ self.lambda_ = min(beta * lam_real + (1.0 - beta) * lam_proxy, self.cfg.dual.clip)
229
+
230
+ # -------- Constraint projection, optional real probe --------
231
+ project_gates_into_constraints(self.student, self.cfg.constraints)
232
+
233
+
234
+ if self.cfg.real_probe_every and (step % int(self.cfg.real_probe_every) == 0):
235
+ # Build a probe shape for latency func if needed
236
+ try:
237
+ from core.measure import measure_latency_text_ms # text-friendly
238
+ if isinstance(batch, dict) and "input_ids" in batch and torch.is_tensor(batch["input_ids"]):
239
+ B, S = int(batch["input_ids"].size(0)), int(batch["input_ids"].size(1))
240
+ else:
241
+ # Fallback: try tensor-like batch
242
+ x0 = batch["input_ids"] if isinstance(batch, dict) else (batch[0] if isinstance(batch, (tuple, list)) else batch)
243
+ B = int(x0.size(0)); S = int(x0.size(1))
244
+ slim = self.export_pruned(self.student, real_policy or self.export_policy, step)
245
+ mean_ms, p95_ms = measure_latency_text_ms(slim, B=B, S=S, T=128, device=device)
246
+ except Exception:
247
+ # Fall back to the project's own profiler to retain compatibility:
248
+ from .profiler import measure_latency_ms
249
+ x0 = batch["input_ids"] if isinstance(batch, dict) else (batch[0] if isinstance(batch, (tuple, list)) else batch)
250
+ shape = (int(x0.size(0)), *list(x0.shape[1:]))
251
+ slim = self.export_pruned(self.student, real_policy or self.export_policy, step)
252
+ mean_ms, p95_ms = measure_latency_ms(slim, shape, device=device)
253
+
254
+ with torch.no_grad():
255
+ lam_real = max(0.0, self.lambda_ + self.cfg.dual.lr * (mean_ms - self.cfg.latency_target_ms))
256
+
257
+ # scale_correction = mean_ms / max(1e-9, o1_ms.detach())
258
+ # self.proxy.cfg.scale_ms = 0.9 * self.proxy.cfg.scale_ms + 0.1 * scale_correction * self.proxy.cfg.scale_ms
259
+
260
+
261
+ if (step % verbose_every) == 0:
262
+ print(
263
+ f"Step {step}/{len(loader)} | KL={float(loss_w.item()):.6f} | MSE={float(mse.item()):.6f} | "
264
+ f"Gate={float(loss_g.item()):.6f} | "
265
+ f"proxy={float(o1_ms.detach()):.3f}ms | real_mean={mean_ms:.3f}ms p95={p95_ms:.3f}ms | λ={self.lambda_:.6f}"
266
+ )
267
+
268
+ running += float(loss_g.detach())
269
+ seen += _batch_size(batch)
270
+
271
+ del s_logits, t_logits, o1_ms, kd_g, reg, loss_g, loss_w
272
+ torch.cuda.empty_cache()
273
+ gc.collect()
274
+
275
+ print(f"Epoch loss {running / max(1, seen):.6f}")
276
+ return self.lambda_
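The λ update in this loop is projected dual (gradient) ascent on the latency constraint: λ grows while the measured latency exceeds the target and is clipped at zero otherwise. A torch-free sketch with hypothetical numbers (helper name and learning rate are illustrative, not from this codebase):

```python
def dual_update(lam, latency_ms, target_ms, lr=0.01):
    # Projected gradient ascent: step lambda along the constraint violation,
    # then project back onto lambda >= 0.
    return max(0.0, lam + lr * (latency_ms - target_ms))

lam = 0.0
lam = dual_update(lam, 12.0, 10.0)  # over budget: lambda increases
lam = dual_update(lam, 4.0, 10.0)   # under budget: lambda decays, floored at 0
```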
277
+
278
+
279
+ # -----------------------------------------------------------------------------
280
+ # Helpers
281
+ # -----------------------------------------------------------------------------
282
+
283
+ def _to_tensor(val: float, like: torch.Tensor) -> torch.Tensor:
284
+ return torch.as_tensor(val, device=like.device, dtype=like.dtype)
285
+
286
+ def _move_batch_to_device(batch, device: str):
287
+ """
288
+ Supports:
289
+ - dict with keys 'input_ids' and optional 'attention_mask'
290
+ - (x,) or (x, y) tuples/lists -> move each tensor-like to device
291
+ - single Tensor
292
+ Converts attention_mask to bool (preferred by HF SDPA).
293
+ """
294
+ if isinstance(batch, dict):
295
+ out = {}
296
+ for k, v in batch.items():
297
+ if torch.is_tensor(v):
298
+ v = v.to(device, non_blocking=True)
299
+ if k == "attention_mask" and v.dtype != torch.bool:
300
+ v = v.to(torch.bool)
301
+ out[k] = v
302
+ return out
303
+
304
+ if isinstance(batch, (tuple, list)):
305
+ moved = []
306
+ for v in batch:
307
+ if torch.is_tensor(v):
308
+ v = v.to(device, non_blocking=True)
309
+ moved.append(v)
310
+ return type(batch)(moved)
311
+
312
+ if torch.is_tensor(batch):
313
+ return batch.to(device, non_blocking=True)
314
+
315
+ # Unknown type: return as-is (adapters/proxy should handle it)
316
+ return batch
317
+
318
+
319
+ def _batch_size(batch) -> int:
320
+ """Best-effort batch size for logging/averages."""
321
+ if isinstance(batch, dict) and "input_ids" in batch and torch.is_tensor(batch["input_ids"]):
322
+ return int(batch["input_ids"].size(0))
323
+ if torch.is_tensor(batch):
324
+ return int(batch.size(0))
325
+ if isinstance(batch, (tuple, list)) and len(batch) and torch.is_tensor(batch[0]):
326
+ return int(batch[0].size(0))
327
+ return 1
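`_batch_size` above is best-effort by design; the same duck-typed fallback order (dict with `input_ids`, then sequence, then a default of 1) can be sketched without torch (inputs are hypothetical):

```python
def batch_size_of(batch):
    # Mirrors _batch_size: dict -> leading dim of "input_ids",
    # tuple/list -> length of the first element, anything else -> 1.
    if isinstance(batch, dict) and "input_ids" in batch:
        return len(batch["input_ids"])
    if isinstance(batch, (tuple, list)) and batch:
        return len(batch[0])
    return 1
```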
core/utils.py ADDED
@@ -0,0 +1,190 @@
1
+ """Shared utilities used across core and adapters.
2
+
3
+ Consolidates helpers that are generic (device/dtype, seeding, shapes, rounding,
4
+ parameter grouping, model copying, etc.). Keep this file dependency-light.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Iterable, Iterator, List, Optional, Sequence, Tuple
10
+
11
+ import copy
12
+ import random
13
+
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn as nn
17
+
18
+
19
+ # -----------------------------------------------------------------------------
20
+ # Device / dtype helpers
21
+ # -----------------------------------------------------------------------------
22
+
23
+ def as_like(x: torch.Tensor, val) -> torch.Tensor:
24
+ """Create a scalar/tensor constant on same device/dtype as `x`."""
25
+ return torch.as_tensor(val, device=x.device, dtype=x.dtype)
26
+
27
+
28
+ def first_param(module: nn.Module) -> torch.Tensor:
29
+ for p in module.parameters(recurse=True):
30
+ return p
31
+ return torch.tensor(0.0)
32
+
33
+
34
+ def to_device_dtype(x: torch.Tensor, ref: torch.Tensor) -> torch.Tensor:
35
+ return x.to(device=ref.device, dtype=ref.dtype)
36
+
37
+
38
+ # -----------------------------------------------------------------------------
39
+ # Seeding & determinism
40
+ # -----------------------------------------------------------------------------
41
+
42
+ def set_seed(seed: int = 42, deterministic: bool = False) -> None:
43
+ random.seed(seed)
44
+ np.random.seed(seed)
45
+ torch.manual_seed(seed)
46
+ torch.cuda.manual_seed_all(seed)
47
+ if deterministic:
48
+ torch.backends.cudnn.deterministic = True
49
+ torch.backends.cudnn.benchmark = False
50
+
51
+
52
+ # -----------------------------------------------------------------------------
53
+ # Model parameter helpers
54
+ # -----------------------------------------------------------------------------
55
+
56
+ def freeze(module: nn.Module) -> None:
57
+ for p in module.parameters():
58
+ p.requires_grad_(False)
59
+
60
+
61
+ def unfreeze(module: nn.Module) -> None:
62
+ for p in module.parameters():
63
+ p.requires_grad_(True)
64
+
65
+
66
+ def count_parameters(module: nn.Module, *, trainable_only: bool = False) -> int:
67
+ if trainable_only:
68
+ return sum(p.numel() for p in module.parameters() if p.requires_grad)
69
+ return sum(p.numel() for p in module.parameters())
70
+
71
+
72
+ # -----------------------------------------------------------------------------
73
+ # Shape/signature helpers
74
+ # -----------------------------------------------------------------------------
75
+
76
+ def input_spec_vision(sample) -> Tuple[int, int, int]:
77
+ """Accept either a 4D tensor [B,3,H,W] or a 4-tuple (B,3,H,W). Returns (B,H,W)."""
78
+ if isinstance(sample, torch.Tensor):
79
+ B, C, H, W = sample.shape
80
+ return int(B), int(H), int(W)
81
+ if isinstance(sample, (tuple, list)) and len(sample) == 4:
82
+ B, C, H, W = sample
83
+ return int(B), int(H), int(W)
84
+ raise ValueError("sample must be a tensor [B,3,H,W] or a 4-tuple (B,3,H,W)")
85
+
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # Rounding / multiples
89
+ # -----------------------------------------------------------------------------
90
+
91
+ def round_down_multiple(n: int, m: int) -> int:
92
+ if m is None or m <= 1:
93
+ return max(1, int(n))
94
+ n = int(n)
95
+ return max(m, (n // m) * m)
96
+
97
+
98
+ def clamp_int(v: int, lo: int, hi: int) -> int:
99
+ return max(lo, min(int(v), hi))
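Note that `round_down_multiple` never returns less than one full multiple. A torch-free copy showing the edge cases:

```python
def round_down_multiple(n, m):
    # Same contract as above: floor n to a multiple of m, but keep at least
    # one full multiple; m <= 1 (or None) disables snapping.
    if m is None or m <= 1:
        return max(1, int(n))
    n = int(n)
    return max(m, (n // m) * m)
```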
100
+
101
+
102
+ # -----------------------------------------------------------------------------
103
+ # Slicing helpers
104
+ # -----------------------------------------------------------------------------
105
+
106
+ @torch.no_grad()
107
+ def slice_linear(mat: nn.Linear, keep_in: Optional[Sequence[int]] = None, keep_out: Optional[Sequence[int]] = None) -> nn.Linear:
108
+ W = mat.weight.detach()
109
+ b = mat.bias.detach() if mat.bias is not None else None
110
+ if keep_out is not None:
111
+ idx_out = torch.as_tensor(keep_out, device=W.device)
112
+ W = W.index_select(0, idx_out)
113
+ if b is not None:
114
+ b = b.index_select(0, idx_out)
115
+ if keep_in is not None:
116
+ idx_in = torch.as_tensor(keep_in, device=W.device)
117
+ W = W.index_select(1, idx_in)
118
+ out_f, in_f = W.shape
119
+ new = nn.Linear(in_f, out_f, bias=(b is not None)).to(device=W.device, dtype=W.dtype)
120
+ new.weight.copy_(W)
121
+ if b is not None:
122
+ new.bias.copy_(b)
123
+ return new
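Callers typically build the `keep_in`/`keep_out` indices for `slice_linear` by expanding kept attention-head indices into contiguous channel ranges; a plain-Python sketch of that expansion (helper name hypothetical):

```python
def head_channels(keep_heads, head_dim):
    # Head h owns channels [h*head_dim, (h+1)*head_dim); concatenating the
    # kept ranges yields keep_out rows for q_proj / keep_in cols for o_proj.
    idx = []
    for h in keep_heads:
        idx.extend(range(h * head_dim, (h + 1) * head_dim))
    return idx
```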
124
+
125
+
126
+ # -----------------------------------------------------------------------------
127
+ # Copying & detaching models
128
+ # -----------------------------------------------------------------------------
129
+
130
+ def deepcopy_eval_cpu(module: nn.Module) -> nn.Module:
131
+ m = copy.deepcopy(module).cpu().eval()
132
+ return m
133
+
134
+
135
+ # -----------------------------------------------------------------------------
136
+ # Gradient utilities
137
+ # -----------------------------------------------------------------------------
138
+
139
+ def zero_if_any(params: Iterable[torch.Tensor]) -> None:
140
+ for p in params:
141
+ if p.grad is not None:
142
+ p.grad = None
143
+
144
+
145
+ def any_grad(params: Iterable[torch.Tensor]) -> bool:
146
+ for p in params:
147
+ if p.grad is not None:
148
+ return True
149
+ return False
150
+
151
+ # -----------------------------------------------------------------------------
152
+ # For fine-tuning
153
+ # -----------------------------------------------------------------------------
154
+
155
+ def ensure_trainable_parameters(module: nn.Module, *, requires_grad: bool = True) -> nn.Module:
156
+ """
157
+ Rebuild all parameters as fresh nn.Parameter tensors (detach+clone),
158
+ which drops any 'inference tensor' tag and re-enables autograd.
159
+ """
160
+ for mod in module.modules():
161
+ for name, p in list(mod._parameters.items()):
162
+ if p is None:
163
+ continue
164
+ new_p = nn.Parameter(p.detach().clone(), requires_grad=requires_grad)
165
+ setattr(mod, name, new_p)
166
+ return module
167
+
168
+
169
+ # -----------------------------------------------------------------------------
170
+ # Misc
171
+ # -----------------------------------------------------------------------------
172
+
173
+ @dataclass
174
+ class ExportRounding:
175
+ head_floor_post: int = 1
176
+ head_multiple_post: int = 1
177
+ ffn_min_keep_ratio_post: float = 0.0
178
+ ffn_snap_groups_post: int = 1
179
+
180
+
181
+ def shape_signature_vit(cfg, sample_shape: Tuple[int, int, int, int]) -> Tuple:
182
+ B, C, H, W = sample_shape
183
+ return (
184
+ "ViT",
185
+ sample_shape,
186
+ int(getattr(cfg, "num_attention_heads", 12)),
187
+ int(getattr(cfg, "hidden_size", 768)),
188
+ int(getattr(cfg, "intermediate_size", 3072)),
189
+ int(getattr(cfg, "patch_size", 16)) if not isinstance(getattr(cfg, "patch_size", 16), (tuple, list)) else tuple(getattr(cfg, "patch_size", (16, 16))),
190
+ )
custom_code.py ADDED
@@ -0,0 +1 @@
1
+ # Marker file so Hub shows 'custom code' banner.
huggingface/.ipynb_checkpoints/llama-checkpoint.py ADDED
@@ -0,0 +1,607 @@
1
+ """HuggingFace LLaMA/Mistral adapter
2
+
3
+ Bridges the family-agnostic core (gates/export/proxy/train) to HF causal LMs
4
+ (LlamaForCausalLM / MistralForCausalLM, etc.).
5
+
6
+ Responsibilities
7
+ ----------------
8
+ - Attach gates to attention Q heads (and optional KV) + grouped MLP (SwiGLU)
9
+ - Provide a logits getter (student/teacher)
10
+ - Exporters:
11
+ * keep-all (unwrap gates, restore clean HF modules)
12
+ * pruned (slice q_proj/o_proj and SwiGLU up/gate/down; update HF metadata)
13
+ - Grid-search wrapper for post-export rounding/snap params
14
+
15
+ This adapter intentionally keeps the core unaware of LLaMA internals.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ # Ensure repo root on sys.path for absolute imports (core, adapters, data)
20
+ import sys, pathlib
21
+ sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
22
+
23
+ from dataclasses import dataclass
24
+ from typing import Optional, Sequence, Callable, Tuple
25
+
26
+ import copy
27
+ import math
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+
32
+ # Core (absolute imports so running `-m examples.run_llama_optimize` works)
33
+ from core.gates import HeadGate, GroupGate
34
+ from core.export import (
35
+ ExportPolicy as CoreExportPolicy,
36
+ Rounding as CoreRounding,
37
+ keep_group_indices_from_gate,
38
+ slice_linear,
39
+ )
40
+ from core.utils import deepcopy_eval_cpu
41
+ from core.search_export import grid_search_latency
42
+
43
+ # -------------------------------------------------------------------------
44
+ # Configs
45
+ # -------------------------------------------------------------------------
46
+
47
+ @dataclass
48
+ class LlamaGatingConfig:
49
+ tau: float = 1.5
50
+ init_logit: float = 3.0
51
+ head_gating: bool = True
52
+ gate_kv: bool = False # optional: gate KV along with Q
53
+ ffn_group: int = 128 # SwiGLU groups
54
+ ffn_gating: bool = True
55
+ hard_eval: bool = True # use hard gates in eval forward
56
+
57
+
58
+ # -------------------------------------------------------------------------
59
+ # Helpers (GQA, rotary, cache-safe)
60
+ # -------------------------------------------------------------------------
61
+
62
+
63
+ def _last_nonpad_index(attn_mask: Optional[torch.Tensor], seq_len: int, device) -> torch.Tensor:
64
+ if attn_mask is None:
65
+ return torch.full((1,), seq_len - 1, device=device, dtype=torch.long) # will be expanded per-batch later
66
+ # attn_mask: [B, S] in {0,1}; index = token count - 1, so this assumes right padding
67
+ return (attn_mask.sum(dim=1) - 1).clamp(min=0).long()
68
+
69
+ def _repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
70
+ if n_rep == 1:
71
+ return x
72
+ B, Hkv, T, Dh = x.shape
73
+ return x.unsqueeze(2).expand(B, Hkv, n_rep, T, Dh).reshape(B, Hkv * n_rep, T, Dh)
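`_repeat_kv` duplicates each KV head `n_rep` times back-to-back (unsqueeze, expand, reshape), so K/V line up with the query heads. The resulting head order can be sketched in plain Python:

```python
def repeat_kv_heads(kv_heads, n_rep):
    # Each KV head is repeated n_rep times consecutively, matching the
    # [B, Hkv*n_rep, T, Dh] layout produced by expand + reshape.
    out = []
    for h in kv_heads:
        out.extend([h] * n_rep)
    return out
```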
74
+
75
+ try:
76
+ from transformers.cache_utils import Cache
77
+ except Exception:
78
+ class Cache: # type: ignore
79
+ pass
80
+
81
+
82
+ # -------------------------------------------------------------------------
83
+ # Gated attention wrapper (Llama/Mistral ready)
84
+ # -------------------------------------------------------------------------
85
+
86
+ class GatedSelfAttentionLLM(nn.Module):
87
+ """
88
+ Thin wrapper around HF Llama/Mistral attention module.
89
+
90
+ - Uses the base module's q_proj/k_proj/v_proj/o_proj
91
+ - Applies per-Q-head gates (and optional KV gates)
92
+ - Handles rotary and cache (tuple or HF Cache)
93
+ - Runs SDPA directly, then o_proj
94
+ """
95
+ def __init__(self, attn_container: nn.Module,
96
+ num_q_heads: int, num_kv_heads: int, head_dim: int,
97
+ cfg: LlamaGatingConfig, layer_idx: int):
98
+ super().__init__()
99
+ self.base_attn = attn_container
100
+ self.q_proj = attn_container.q_proj
101
+ self.k_proj = attn_container.k_proj
102
+ self.v_proj = attn_container.v_proj
103
+ self.o_proj = getattr(attn_container, "o_proj", getattr(attn_container, "out_proj", None))
104
+
105
+ self.num_q_heads = int(num_q_heads)
106
+ self.num_kv_heads = int(num_kv_heads)
107
+ self.head_dim = int(head_dim)
108
+ self.gate_kv = bool(cfg.gate_kv)
109
+ self.drop_p = float(getattr(attn_container, "attention_dropout",
110
+ getattr(attn_container, "attn_dropout",
111
+ getattr(attn_container, "dropout", 0.0))))
112
+ self.head_gate = HeadGate(num_heads=self.num_q_heads,
113
+ head_dim=self.head_dim,
114
+ tau=cfg.tau, init_logit=cfg.init_logit,
115
+ hard_during_eval=cfg.hard_eval)
116
+
117
+ # rotary helpers if present on base
118
+ self.rotary_emb = getattr(attn_container, "rotary_emb", None)
119
+ self.apply_rotary_pos_emb = getattr(attn_container, "apply_rotary_pos_emb", None)
120
+ self.layer_idx = int(layer_idx)
121
+
122
+ @property
123
+ def logits(self) -> torch.Tensor:
124
+ return self.head_gate.logits
125
+
126
+ def kept_heads_soft(self) -> torch.Tensor:
127
+ p = self.head_gate.probs().detach().float().view(-1)
128
+ if p.numel() == self.num_q_heads * self.head_dim:
129
+ p = p.view(self.num_q_heads, self.head_dim).mean(dim=1)
130
+ return p.sum()
131
+
132
+
133
+ def forward(
134
+ self,
135
+ hidden_states: torch.Tensor, # [B,T,D]
136
+ attention_mask: Optional[torch.Tensor] = None, # additive mask [B,1,Tq,Tk] or None
137
+ position_ids: Optional[torch.Tensor] = None,
138
+ past_key_value = None, # tuple, list, Cache or None
139
+ output_attentions: bool = False,
140
+ use_cache: bool = False,
141
+ cache_position: Optional[torch.Tensor] = None,
142
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # (cos, sin) from the rotary embedding
143
+ **kwargs,
144
+ ):
145
+ B, T, D = hidden_states.shape
146
+ Hq, Hkv, Dh = self.num_q_heads, self.num_kv_heads, self.head_dim
147
+ assert Hq * Dh == D, "hidden_size must equal num_heads * head_dim"
148
+ n_rep = max(1, Hq // Hkv)
149
+
150
+ # qkv projections
151
+ q = self.q_proj(hidden_states).view(B, T, Hq, Dh).transpose(1, 2) # [B,Hq,T,Dh]
152
+ k = self.k_proj(hidden_states).view(B, T, Hkv, Dh).transpose(1, 2) # [B,Hkv,T,Dh]
153
+ v = self.v_proj(hidden_states).view(B, T, Hkv, Dh).transpose(1, 2) # [B,Hkv,T,Dh]
154
+
155
+ # rotary
156
+ if (self.rotary_emb is not None) and (self.apply_rotary_pos_emb is not None):
157
+ Tpast = 0
158
+ if isinstance(past_key_value, (tuple, list)) and len(past_key_value) == 2:
159
+ Tpast = int(past_key_value[0].size(2))
160
+ elif isinstance(past_key_value, Cache):
161
+ Tpast = int(cache_position.max().item() if cache_position is not None else 0)
162
+ seq_len = Tpast + T
163
+ try:
164
+ cos, sin = self.rotary_emb(v, seq_len=seq_len)
165
+ except TypeError:
166
+ cos, sin = self.rotary_emb(q, seq_len=seq_len)
167
+ # try rich signature first
168
+ try:
169
+ q, k = self.apply_rotary_pos_emb(
170
+ q, k, cos, sin,
171
+ position_ids=position_ids,
172
+ cache_position=cache_position,
173
+ position_embeddings=position_embeddings
174
+ )
175
+ except TypeError:
176
+ try:
177
+ q, k = self.apply_rotary_pos_emb(q, k, cos, sin, position_ids=position_ids)
178
+ except TypeError:
179
+ q, k = self.apply_rotary_pos_emb(q, k, cos, sin)
180
+
181
+ # cache merge
182
+ present = None
183
+ if past_key_value is None or (isinstance(past_key_value, (tuple, list)) and len(past_key_value) == 0):
184
+ pass
185
+ elif isinstance(past_key_value, (tuple, list)):
186
+ pk, pv = past_key_value # [B,Hkv,Tpast,Dh]
187
+ k = torch.cat([pk, k], dim=2)
188
+ v = torch.cat([pv, v], dim=2)
189
+ present = (k, v) if use_cache else None
190
+ elif isinstance(past_key_value, Cache):
191
+ k, v = past_key_value.update(k, v, self.layer_idx, cache_position)
192
+ present = past_key_value
193
+
194
+ # gates
195
+ # g = self.head_gate.mask(self.training).view(1, Hq, 1, 1)
196
+ # ---- gates (supports per-head OR per-channel HeadGate) ----
197
+ m = self.head_gate.mask(self.training) # 1D tensor
198
+ m = m.detach() if not self.training else m
199
+ if m.numel() == Hq:
200
+ # per-head gating
201
+ gH = m.view(1, Hq, 1, 1) # [1,Hq,1,1]
202
+ q = q * gH
203
+ if self.gate_kv:
204
+ if n_rep == 1:
205
+ k = k * gH; v = v * gH
206
+ else:
207
+ g_kv = gH.view(1, Hkv, n_rep, 1, 1).amax(dim=2)
208
+ k = k * g_kv; v = v * g_kv
209
+ elif m.numel() == Hq * Dh:
210
+ # per-channel gating
211
+ gHD = m.view(1, Hq, 1, Dh) # [1,Hq,1,Dh]
212
+ q = q * gHD
213
+ if self.gate_kv:
214
+ # collapse to per-head for KV, then map to Hkv via amax over replicas
215
+ gH = gHD.amax(dim=-1, keepdim=True) # [1,Hq,1,1]
216
+ if n_rep == 1:
217
+ k = k * gH; v = v * gH
218
+ else:
219
+ g_kv = gH.view(1, Hkv, n_rep, 1, 1).amax(dim=2)
220
+ k = k * g_kv; v = v * g_kv
221
+ else:
222
+ raise RuntimeError(
223
+ f"HeadGate mask has {m.numel()} elems; expected {Hq} or {Hq*Dh}"
224
+ )
225
+
226
+
227
+ # GQA replicate KV to Q count
228
+ k = _repeat_kv(k, n_rep)
229
+ v = _repeat_kv(v, n_rep)
230
+
231
+ attn = F.scaled_dot_product_attention(
232
+ q, k, v,
233
+ attn_mask=attention_mask,
234
+ dropout_p=self.drop_p if self.training else 0.0,
235
+ is_causal=attention_mask is None  # SDPA rejects an explicit attn_mask combined with is_causal=True
236
+ )
237
+ out = attn.transpose(1, 2).contiguous().view(B, T, Hq * Dh)
238
+ out = self.o_proj(out)
239
+
240
+ attn_weights = None
241
+ # HF expects (attn_output, attn_weights, present_key_value) always
242
+ if output_attentions:
243
+ return (out, attn_weights, present)
244
+ else:
245
+ return (out, None, present)
246
+
247
+
248
+
249
+ # -------------------------------------------------------------------------
250
+ # Adapter
251
+ # -------------------------------------------------------------------------
252
+
253
+ class LlamaAdapter:
254
+ def __init__(self, model: nn.Module):
255
+ self.model = model
256
+ core = getattr(model, "model", model)
257
+ if not hasattr(core, "layers"):
258
+ raise ValueError("Provided model does not look like HF LLaMA/Mistral (missing .model.layers or .layers)")
259
+
260
+ # ---------- Gating attachment ----------
261
+ def attach_gates(self, cfg: LlamaGatingConfig) -> nn.Module:
262
+ m = self.model
263
+ core = getattr(m, "model", m)
264
+ layers = core.layers
265
+
266
+ Hq = int(core.config.num_attention_heads)
267
+ Hkv = int(getattr(core.config, "num_key_value_heads", Hq))
268
+ Dh = int(core.config.hidden_size // Hq)
269
+
270
+ for li, layer in enumerate(layers):
271
+ # Attention heads
272
+ if cfg.head_gating:
273
+ base = layer.self_attn
274
+ if not isinstance(base, GatedSelfAttentionLLM):
275
+ gated = GatedSelfAttentionLLM(
276
+ attn_container=base,
277
+ num_q_heads=Hq,
278
+ num_kv_heads=Hkv,
279
+ head_dim=Dh,
280
+ cfg=cfg,
281
+ layer_idx=li,
282
+ )
283
+ layer.self_attn = gated # route via our wrapper
284
+
285
+ # MLP grouped gating (SwiGLU)
286
+ if cfg.ffn_gating:
287
+ mlp = layer.mlp
288
+ I = int(mlp.up_proj.out_features)
289
+ assert I % cfg.ffn_group == 0, f"SwiGLU size {I} not divisible by group {cfg.ffn_group}"
290
+ if not hasattr(mlp, "neuron_gate"):
291
+ mlp.neuron_gate = GroupGate(
292
+ num_groups=I // cfg.ffn_group,
293
+ group_size=cfg.ffn_group,
294
+ tau=cfg.tau, init_logit=cfg.init_logit,
295
+ hard_during_eval=cfg.hard_eval,
296
+ )
297
+ if not hasattr(mlp, "_orig_forward"):
298
+ mlp._orig_forward = mlp.forward
299
+
300
+ def _gated_mlp_forward(this, x):
301
+ # LLaMA SwiGLU: out = down(silu(gate(x)) * up(x)); mask m scales kept groups
302
+ u = this.up_proj(x)
303
+ g = this.gate_proj(x)
304
+ m = this.neuron_gate.mask(this.training).view(1, 1, -1)
305
+ z = torch.nn.functional.silu(g) * (u * m)
306
+ return this.down_proj(z)
307
+
308
+ mlp.forward = _gated_mlp_forward.__get__(mlp, mlp.__class__)
309
+ return m
310
+
311
+ # ---------- Logits helper ----------
312
+ @staticmethod
313
+ def _last_token_index(attn_mask: torch.Tensor) -> torch.Tensor:
314
+ # attn_mask: [B, S] with 1 for tokens, 0 for padding
315
+ # returns [B] indices of last non-pad
316
+ # works for both bool and int masks
317
+ if attn_mask is None:
318
+ # no mask → use last position S-1
319
+ return None
320
+ if attn_mask.dtype != torch.long:
321
+ attn_mask = attn_mask.to(torch.long)
322
+ # idx = lengths - 1
323
+ return (attn_mask.sum(dim=-1) - 1).clamp_min(0)
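The index arithmetic in `_last_token_index` is just `token count - 1` on a 0/1 mask, clamped at zero, which assumes right padding. A plain-Python sketch:

```python
def last_nonpad_index(mask_row):
    # sum of ones = token count; last index = count - 1, clamped at 0.
    # Valid for right padding; a left-padded row would need S - 1 instead.
    return max(0, sum(mask_row) - 1)
```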
324
+
325
+ @staticmethod
326
+ def get_logits(model: nn.Module,
327
+ input_ids: torch.Tensor,
328
+ attention_mask: Optional[torch.Tensor] = None,
329
+ last_only: bool = True,
330
+ **forward_kwargs) -> torch.Tensor:
331
+ """
332
+ Returns logits. If last_only=True, computes ONLY the last-token logits by:
333
+ 1) getting hidden states from the base decoder,
334
+ 2) selecting last non-pad position per sample,
335
+ 3) projecting through lm_head on that 1 position.
336
+ This avoids allocating [B,S,V].
337
+ """
338
+ # (1) run base decoder, not the full CausalLM head
339
+ core = getattr(model, "model", None)
340
+ if core is None:
341
+ # fallback if the model is already a bare decoder (rare)
342
+ core = model
343
+
344
+ # We only need last_hidden_state; no cache; avoid building logits for all S
345
+ # return_dict=False to grab tuple and avoid extra allocations
346
+ outputs = core(
347
+ input_ids=input_ids,
348
+ attention_mask=attention_mask,
349
+ use_cache=False,
350
+ return_dict=False,
351
+ **forward_kwargs
352
+ )
353
+ hidden = outputs[0] # [B, S, D]
354
+
355
+ if not last_only:
356
+ # If someone explicitly wants all logits, fine:
357
+ return model.lm_head(hidden) # [B,S,V] (expensive!)
358
+
359
+ # (2) select last token per sample
360
+ B, S, D = hidden.shape
361
+ if attention_mask is None:
362
+ # simple "last index"
363
+ idx = torch.full((B,), S - 1, device=hidden.device, dtype=torch.long)
364
+ else:
365
+ idx = LlamaAdapter._last_token_index(attention_mask)
366
+
367
+ # gather last hidden: [B, D]
368
+ last_h = hidden[torch.arange(B, device=hidden.device), idx] # [B, D]
369
+ # (3) project to logits for that 1 position
370
+ last_logits = model.lm_head(last_h).unsqueeze(1) # [B,1,V]
371
+ return last_logits
372
+
373
+ # ---------- Exporters ----------
374
+ @staticmethod
375
+ @torch.no_grad()
376
+ def export_keepall(model_with_gates: nn.Module) -> nn.Module:
377
+ """
378
+ Unwrap attention wrappers; restore original MLP.forward; drop gates.
379
+ """
380
+ slim = deepcopy_eval_cpu(model_with_gates)
381
+ core = getattr(slim, "model", slim)
382
+ if not hasattr(core, "layers"):
383
+ return slim
384
+
385
+ for layer in core.layers:
386
+ # attention
387
+ attn = layer.self_attn
388
+ if isinstance(attn, GatedSelfAttentionLLM):
389
+ gat = attn
390
+ new_attn = copy.deepcopy(gat.base_attn)
391
+ # keep metadata consistent
392
+ if hasattr(new_attn, "num_heads"):
393
+ new_attn.num_heads = int(gat.num_q_heads)
394
+ if hasattr(new_attn, "num_key_value_heads"):
395
+ new_attn.num_key_value_heads = int(gat.num_kv_heads)
396
+ if hasattr(new_attn, "head_dim"):
397
+ new_attn.head_dim = int(gat.head_dim)
398
+ layer.self_attn = new_attn
399
+
400
+ # mlp
401
+ mlp = layer.mlp
402
+ if hasattr(mlp, "_orig_forward"):
403
+ mlp.forward = mlp._orig_forward
404
+ delattr(mlp, "_orig_forward")
405
+ if hasattr(mlp, "neuron_gate"):
406
+ delattr(mlp, "neuron_gate")
407
+
408
+ return slim
409
+
410
+ @staticmethod
411
+ @torch.no_grad()
412
+ def export_pruned(model_with_gates: nn.Module, policy, step: int) -> nn.Module:
413
+ """
414
+ Produce a clean CPU eval model:
415
+ - Read gates to choose Q heads; slice q_proj rows and o_proj cols
416
+ - Snap kept heads to an LCM of (policy multiple, Hkv)
417
+ - Slice SwiGLU up/gate/down by groups
418
+ - Unwrap back to plain HF modules; update metadata
419
+ """
420
+ # Accept either CoreExportPolicy with per-axis rounding or family policy
421
+ if isinstance(policy, LlamaExportPolicy):
422
+ head_rounding = policy.head_rounding
423
+ ffn_rounding = policy.ffn_rounding
424
+ warmup_steps = policy.warmup_steps
425
+ else:
426
+ head_rounding = getattr(policy, "rounding", None)
427
+ ffn_rounding = getattr(policy, "rounding", None)
428
+ warmup_steps = int(getattr(policy, "warmup_steps", 0))
429
+
430
+ slim = deepcopy_eval_cpu(model_with_gates)
431
+ core = getattr(slim, "model", slim)
432
+ layers = getattr(core, "layers", None)
433
+ if layers is None:
434
+ return slim
435
+
436
+ warm = (step < warmup_steps)
437
+
438
+ def _lcm(a: int, b: int) -> int:
439
+ return abs(a * b) // math.gcd(max(a, 1), max(b, 1)) if a > 0 and b > 0 else max(a, b, 1)
440
+
441
+ for li, layer in enumerate(layers):
442
+ # ---- Attention (Q heads) ----
443
+ attn = layer.self_attn
444
+ if isinstance(attn, GatedSelfAttentionLLM):
445
+ gat = attn
446
+ base = gat.base_attn
447
+
448
+ Hq = int(gat.num_q_heads)
449
+ Hkv = int(gat.num_kv_heads)
450
+ Dh = int(gat.head_dim)
451
+
452
+ if warm:
453
+ keep_idx = torch.arange(Hq)
454
+ else:
455
+ # Build a "per-head" proxy gate if base gate is per-channel.
456
+ base_logits = gat.head_gate.logits.detach().float().view(-1)
457
+ tau = float(getattr(gat.head_gate, "tau", 1.0))
458
+
459
+ if base_logits.numel() == Hq:
460
+ # Native per-head gate: use as-is
461
+ proxy_gate = gat.head_gate
462
+ keep_idx = keep_group_indices_from_gate(
463
+ proxy_gate, policy=policy, step=step, custom_rounding=head_rounding
464
+ )
465
+ elif base_logits.numel() == Hq * Dh:
466
+ # Collapse per-channel → per-head (mean; or use .amax for stricter)
467
+ per_head_logits = base_logits.view(Hq, Dh).mean(dim=1)
468
+
469
+ class _PerHeadProxyGate:
470
+ def __init__(self, logits, tau):
471
+ self.logits = logits
472
+ self.tau = tau
473
+ self.num_groups = logits.numel()
474
+ self.group_size = 1
475
+
476
+ proxy_gate = _PerHeadProxyGate(per_head_logits, tau)
477
+ keep_idx = keep_group_indices_from_gate(
478
+ proxy_gate, policy=policy, step=step, custom_rounding=head_rounding
479
+ )
480
+ else:
481
+ raise RuntimeError(
482
+ f"Unexpected HeadGate logits len {base_logits.numel()} vs H={Hq} or H*Dh={Hq*Dh}"
483
+ )
484
+
485
+ # Enforce LCM with GQA (Hkv) via truncation to floor-multiple
489
+
490
+                 pol_mult = getattr(head_rounding, "multiple_groups", 1)
+                 snap = _lcm(int(pol_mult), max(1, Hkv))
+                 if keep_idx.numel() % snap != 0:
+                     k = (keep_idx.numel() // snap) * snap
+                     k = max(snap, min(Hq, k))
+                     # recompute top-k with the same per-head criterion used above
+                     if base_logits.numel() == Hq * Dh:
+                         scores = per_head_logits
+                     else:
+                         scores = base_logits
+                     keep_idx = torch.topk(scores, k=k, largest=True).indices.sort().values
+
+                 H_keep = int(keep_idx.numel())
+                 # channels for q/o slicing
+                 ch_idx = torch.cat([torch.arange(h * Dh, (h + 1) * Dh) for h in keep_idx]).long()
+
+                 # slice wrapper linears
+                 gat.q_proj = slice_linear(gat.q_proj, keep_out=ch_idx)
+                 gat.o_proj = slice_linear(gat.o_proj, keep_in=ch_idx)
+
+                 # transplant into a clean HF attention
+                 new_attn = copy.deepcopy(base)
+                 if hasattr(new_attn, "q_proj"):
+                     new_attn.q_proj = gat.q_proj
+                 if hasattr(new_attn, "o_proj"):
+                     new_attn.o_proj = gat.o_proj
+                 elif hasattr(new_attn, "out_proj"):
+                     new_attn.out_proj = gat.o_proj
+
+                 # update metadata
+                 if hasattr(new_attn, "num_heads"):
+                     new_attn.num_heads = int(H_keep)
+                 if hasattr(new_attn, "num_key_value_heads"):
+                     new_attn.num_key_value_heads = int(Hkv)
+                 if hasattr(new_attn, "head_dim"):
+                     new_attn.head_dim = int(Dh)
+                 if hasattr(core.config, "hidden_size"):
+                     core.config.hidden_size = int(H_keep * Dh)
+
+                 layer.self_attn = new_attn  # unwrap
+
+             # ---- MLP (SwiGLU grouped) ----
+             mlp = layer.mlp
+             g = getattr(mlp, "neuron_gate", None)
+             if g is not None:
+                 grp_idx = keep_group_indices_from_gate(
+                     g, policy=policy, step=step, custom_rounding=ffn_rounding,
+                 )
+                 group = int(g.group_size)  # GroupGate exposes group_size
+                 keep_exp = torch.cat([torch.arange(i * group, (i + 1) * group) for i in grp_idx]).long()
+
+                 mlp.up_proj = slice_linear(mlp.up_proj, keep_out=keep_exp)
+                 mlp.gate_proj = slice_linear(mlp.gate_proj, keep_out=keep_exp)
+                 mlp.down_proj = slice_linear(mlp.down_proj, keep_in=keep_exp)
+
+                 # restore clean forward & drop gate
+                 if hasattr(mlp, "_orig_forward"):
+                     mlp.forward = mlp._orig_forward
+                     delattr(mlp, "_orig_forward")
+                 if hasattr(mlp, "neuron_gate"):
+                     delattr(mlp, "neuron_gate")
+
+         return slim
+
+
+ # -------------------------------------------------------------------------
+ # Export policy (allows different rounding for heads vs FFN)
+ # -------------------------------------------------------------------------
+
+ from dataclasses import field  # default_factory avoids sharing one mutable Rounding default across instances
+
+ @dataclass
+ class LlamaExportPolicy:
+     warmup_steps: int = 0
+     head_rounding: CoreRounding = field(default_factory=CoreRounding)  # e.g., CoreRounding(floor=8, multiple=8)
+     ffn_rounding: CoreRounding = field(default_factory=CoreRounding)   # e.g., CoreRounding(min_keep_ratio=0.8, multiple=32)
+
+
+ # -------------------------------------------------------------------------
+ # Grid-search convenience
+ # -------------------------------------------------------------------------
+
+ @dataclass
+ class LlamaGrid:
+     head_multiple_grid: Optional[Sequence[int]] = (1, 2, 4, 8)
+     ffn_snap_grid: Sequence[int] = (1, 32, 64, 128)
+
+
+ def llama_search_best_export(
+     model_with_gates: nn.Module,
+     *,
+     export_fn: Callable[[nn.Module, CoreExportPolicy, int], nn.Module],
+     num_q_heads: int,
+     num_kv_heads: int,
+     step: int,
+     batch_shape: Tuple[int, int],  # (B, S) for text
+     grid: Optional[LlamaGrid] = None,
+     device: str = "cuda",
+     measure_settings=None,
+     make_policy: Optional[Callable[[int, int], object]] = None,
+ ):
+     """Convenience wrapper for LLaMA-style search.
+
+     Uses the same `grid_search_latency` as ViT; we just feed head/FFN grids.
+     """
+     g = grid or LlamaGrid()
+     head_grid = g.head_multiple_grid or [1]
+     ffn_grid = list(g.ffn_snap_grid)
+
+     return grid_search_latency(
+         model_with_gates,
+         export_fn,
+         head_multiples=head_grid,
+         ffn_snaps=ffn_grid,
+         step=step,
+         batch_shape=batch_shape,  # adapter's runner should interpret this as (B, S)
+         measure_settings=measure_settings,
+         device=device,
+         make_policy=make_policy,
+     )
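The snapping logic above rounds the kept Q-head count down to a multiple of lcm(head-rounding multiple, Hkv) so GQA groups stay intact, then clamps so at least one snap group survives. A self-contained sketch of just that arithmetic (`snap_head_count` is a hypothetical name, not part of the repo):

```python
import math

def snap_head_count(n_keep: int, policy_multiple: int, num_kv_heads: int, num_q_heads: int) -> int:
    """Floor n_keep to a multiple of lcm(policy_multiple, num_kv_heads), clamped to [snap, num_q_heads]."""
    snap = math.lcm(int(policy_multiple), max(1, num_kv_heads))
    if n_keep % snap == 0:
        return n_keep
    k = (n_keep // snap) * snap       # floor to the snap multiple
    return max(snap, min(num_q_heads, k))  # never drop below one snap group
```

For example, keeping 13 of 32 Q heads with 4 KV heads snaps down to 12, while keeping only 3 heads clamps up to one full snap group of lcm(2, 4) = 4.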
huggingface/.ipynb_checkpoints/vit-checkpoint.py ADDED
@@ -0,0 +1,383 @@
+ """HuggingFace ViT adapter
+
+ Bridges the family-agnostic core (gates/export/proxy/train) to ViT-like models
+ from Hugging Face (`ViTModel`, `ViTForImageClassification`, DeiT, etc.).
+
+ Responsibilities
+ ----------------
+ - Attach gates to attention heads and MLP hidden units in groups
+ - Provide logits getters for student/teacher
+ - Export helpers: keep-all (remove gates), and pruned (slice weights + metadata)
+
+ This adapter intentionally keeps the core unaware of ViT internals.
+ """
+ from __future__ import annotations
+
+ # Ensure repo root on sys.path for absolute imports (core, adapters, data)
+ import sys, pathlib
+ sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
+
+ from dataclasses import dataclass, field
+ from typing import Callable, Optional, Sequence, Tuple
+
+ import copy
+ import torch
+ import torch.nn as nn
+
+ # NOTE: absolute imports so running `-m examples.run_vit_optimize` works without package install
+ from core.gates import HeadGate, GroupGate
+ from core.export import (
+     ExportPolicy as CoreExportPolicy,
+     Rounding as CoreRounding,
+     keep_group_indices_from_gate,
+     keep_element_indices_from_gate,
+     slice_linear,
+ )
+ from core.utils import deepcopy_eval_cpu
+ from core.search_export import grid_search_latency
+
+ # -----------------------------------------------------------------------------
+ # Config
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class ViTGatingConfig:
+     tau: float = 1.5
+     init_logit: float = 3.0
+     head_gating: bool = True
+     ffn_group: int = 16
+     ffn_gating: bool = True
+     hard_eval: bool = True  # use hard masks in eval mode during forward
+
+
+ def _encoder_layers(m: nn.Module):
+     """Return the sequence of Transformer blocks for an HF ViT.
+
+     Supports:
+       - ViTModel: m.encoder.layer
+       - ViTForImageClassification: m.vit.encoder.layer
+     """
+     # ViTModel path
+     enc = getattr(m, "encoder", None)
+     if enc is not None and hasattr(enc, "layer"):
+         return enc.layer
+
+     # ViTForImageClassification path
+     vit = getattr(m, "vit", None)
+     if vit is not None and hasattr(vit, "encoder") and hasattr(vit.encoder, "layer"):
+         return vit.encoder.layer
+
+     raise ValueError("Provided model does not look like a HF ViT (missing *.encoder.layer)")
+
+
+ # -----------------------------------------------------------------------------
+ # Gated attention wrapper
+ # -----------------------------------------------------------------------------
+
+ class GatedSelfAttentionHF(nn.Module):
+     """A thin wrapper around HF ViT self-attention that multiplies in per-head gates.
+
+     It keeps references to the underlying query/key/value `nn.Linear` layers and
+     the output projection, while exposing a `HeadGate` in `head_gate`.
+     """
+
+     def __init__(self, attn_container: nn.Module, num_heads: int, head_dim: int, cfg: ViTGatingConfig):
+         super().__init__()
+         base_attn = attn_container.attention  # ViTSdpaSelfAttention or ViTSelfAttention
+         out_proj = attn_container.output.dense
+
+         self.base_attn = base_attn
+         self.out_proj = out_proj
+
+         self.q_proj = base_attn.query
+         self.k_proj = base_attn.key
+         self.v_proj = base_attn.value
+
+         self.num_heads = int(num_heads)
+         self.head_dim = int(head_dim)
+         self.drop_p = getattr(base_attn, "dropout", nn.Dropout(0.0)).p
+
+         self.head_gate = HeadGate(num_heads=self.num_heads, head_dim=self.head_dim, tau=cfg.tau,
+                                   init_logit=cfg.init_logit, hard_during_eval=cfg.hard_eval)
+
+     @property
+     def logits(self) -> torch.Tensor:
+         return self.head_gate.logits
+
+     def kept_heads_soft(self) -> torch.Tensor:
+         return self.head_gate.probs().sum()
+
+     def forward(self, hidden_states, head_mask=None):
+         B, N, _ = hidden_states.shape
+         H, Dh = self.num_heads, self.head_dim
+
+         wdev = self.q_proj.weight.device
+         if hidden_states.device != wdev:
+             hidden_states = hidden_states.to(wdev, non_blocking=True)
+
+         q_lin = self.q_proj(hidden_states)
+         k_lin = self.k_proj(hidden_states)
+         v_lin = self.v_proj(hidden_states)
+
+         q = q_lin.view(B, N, H, Dh).transpose(1, 2)
+         k = k_lin.view(B, N, H, Dh).transpose(1, 2)
+         v = v_lin.view(B, N, H, Dh).transpose(1, 2)
+
+         logits = self.head_gate.logits
+         tau = float(self.head_gate.tau)
+         if self.training:
+             # Gumbel/logistic-sigmoid sample with a straight-through hard threshold
+             u = torch.rand_like(logits).clamp_(1e-6, 1 - 1e-6)
+             s = u.log() - (1 - u).log()  # logistic noise
+             y = torch.sigmoid((logits + s) / tau)
+             g_head = ((y > 0.5).to(y.dtype) - y).detach() + y
+         else:
+             if getattr(self.head_gate, "hard_during_eval", True):
+                 g_head = (logits > 0).to(logits.dtype)
+             else:
+                 g_head = torch.sigmoid(logits / tau)
+         g = g_head.view(1, H, 1, 1)
+
+         q = q * g; k = k * g; v = v * g
+
+         attn_out = torch.nn.functional.scaled_dot_product_attention(
+             q, k, v, dropout_p=self.drop_p if self.training else 0.0
+         )  # [B, H, N, Dh]
+
+         attn_out = attn_out.transpose(1, 2).contiguous().view(B, N, H * Dh)
+         attn_out = self.out_proj(attn_out)
+         return attn_out, None
+
+
+ # -----------------------------------------------------------------------------
+ # Adapter
+ # -----------------------------------------------------------------------------
+
+ class ViTAdapter:
+     def __init__(self, model: nn.Module):
+         self.model = model
+         _ = _encoder_layers(model)  # validate the model shape early
+
+     # ---------- Gating attachment ----------
+     def attach_gates(self, cfg: ViTGatingConfig) -> nn.Module:
+         m = self.model
+         H = int(getattr(m.config, "num_attention_heads", 12))
+         D = int(getattr(m.config, "hidden_size", 768))
+         Dh = D // H
+
+         for layer in _encoder_layers(m):
+             # Attention heads
+             if cfg.head_gating:
+                 attn_container = layer.attention
+                 if not isinstance(getattr(attn_container, "attention", None), GatedSelfAttentionHF):
+                     gated = GatedSelfAttentionHF(attn_container, H, Dh, cfg)
+                     attn_container.attention = gated
+
+             # FFN hidden (grouped)
+             if cfg.ffn_gating:
+                 inter = layer.intermediate
+                 d_ff = int(inter.dense.out_features)
+                 assert d_ff % cfg.ffn_group == 0, f"FFN size {d_ff} not divisible by group {cfg.ffn_group}"
+                 if not hasattr(inter, "neuron_gate"):
+                     inter.neuron_gate = GroupGate(num_groups=d_ff // cfg.ffn_group, group_size=cfg.ffn_group,
+                                                   tau=cfg.tau, init_logit=cfg.init_logit,
+                                                   hard_during_eval=cfg.hard_eval)
+                 # Monkey-patch forward to apply the mask after activation (keeps HF shapes)
+                 if not hasattr(inter, "_orig_forward"):
+                     inter._orig_forward = inter.forward
+
+                     def _gated_forward(this, x):
+                         h = this.dense(x)
+                         h = this.intermediate_act_fn(h)
+                         msk = this.neuron_gate.mask(this.training).view(1, 1, -1)
+                         return h * msk
+
+                     inter.forward = _gated_forward.__get__(inter, inter.__class__)
+         return m
+
+     # ---------- Logits helpers ----------
+     @staticmethod
+     def get_logits(model: nn.Module, x: torch.Tensor, *, head: Optional[nn.Module] = None) -> torch.Tensor:
+         out = model(pixel_values=x)
+         if hasattr(out, "logits"):
+             return out.logits  # ViTForImageClassification path
+         if hasattr(out, "last_hidden_state"):  # ViTModel path (needs external head)
+             if head is None:
+                 raise ValueError("Provide a classification head when using ViTModel without logits.")
+             cls_tok = out.last_hidden_state[:, 0, :]
+             if next(head.parameters(), torch.tensor([], device=cls_tok.device)).device != cls_tok.device:
+                 head = head.to(cls_tok.device)
+             return head(cls_tok)
+         raise ValueError("Model output lacks logits and last_hidden_state.")
+
+     # ---------- Exporters ----------
+     @staticmethod
+     @torch.no_grad()
+     def export_keepall(model_with_gates: nn.Module) -> nn.Module:
+         slim = deepcopy_eval_cpu(model_with_gates)
+         for layer in _encoder_layers(slim):
+             # Attention: unwrap gate
+             attn_container = layer.attention
+             if isinstance(getattr(attn_container, "attention", None), GatedSelfAttentionHF):
+                 gat = attn_container.attention
+                 new_attn = copy.deepcopy(gat.base_attn)
+                 # restore HF metadata if present
+                 if hasattr(new_attn, "num_attention_heads"):
+                     new_attn.num_attention_heads = int(gat.num_heads)
+                 if hasattr(new_attn, "attention_head_size"):
+                     new_attn.attention_head_size = int(gat.head_dim)
+                 if hasattr(new_attn, "all_head_size"):
+                     new_attn.all_head_size = int(gat.num_heads * gat.head_dim)
+                 attn_container.attention = new_attn
+             # FFN: restore original forward and drop gate
+             inter = layer.intermediate
+             if hasattr(inter, "_orig_forward"):
+                 inter.forward = inter._orig_forward
+                 delattr(inter, "_orig_forward")
+             if hasattr(inter, "neuron_gate"):
+                 delattr(inter, "neuron_gate")
+         return slim
+
+     @staticmethod
+     @torch.no_grad()
+     def export_pruned(model_with_gates: nn.Module, policy, step: int) -> nn.Module:
+         # Support both CoreExportPolicy (single rounding) and ViTExportPolicy (per-axis)
+         if isinstance(policy, ViTExportPolicy):
+             head_rounding = policy.head_rounding
+             ffn_rounding = policy.ffn_rounding
+             warmup_steps = policy.warmup_steps
+         else:
+             # fall back to a single rounding for both axes
+             head_rounding = getattr(policy, "rounding", None)
+             ffn_rounding = getattr(policy, "rounding", None)
+             warmup_steps = int(getattr(policy, "warmup_steps", 0))
+
+         slim = deepcopy_eval_cpu(model_with_gates)
+         warm = (step < warmup_steps)
+
+         for layer in _encoder_layers(slim):
+             # --- Attention heads ---
+             attn_container = layer.attention
+             gat = getattr(attn_container, "attention", None)
+             if isinstance(gat, GatedSelfAttentionHF):
+                 # decide head indices via our helper; warmup is honored via `step`
+                 grp_idx = keep_group_indices_from_gate(
+                     gat.head_gate,
+                     policy=policy,
+                     step=step,
+                     custom_rounding=head_rounding,
+                 )
+                 H_keep = int(grp_idx.numel())
+                 Dh = int(gat.head_dim)
+
+                 ch_idx = torch.cat([torch.arange(h * Dh, (h + 1) * Dh) for h in grp_idx]).long()
+                 gat.q_proj = slice_linear(gat.q_proj, keep_out=ch_idx)
+                 gat.k_proj = slice_linear(gat.k_proj, keep_out=ch_idx)
+                 gat.v_proj = slice_linear(gat.v_proj, keep_out=ch_idx)
+                 attn_container.output.dense = slice_linear(attn_container.output.dense, keep_in=ch_idx)
+
+                 new_attn = copy.deepcopy(gat.base_attn)
+                 new_attn.query = gat.q_proj
+                 new_attn.key = gat.k_proj
+                 new_attn.value = gat.v_proj
+                 if hasattr(new_attn, "num_attention_heads"):
+                     new_attn.num_attention_heads = H_keep
+                 if hasattr(new_attn, "attention_head_size"):
+                     new_attn.attention_head_size = Dh
+                 if hasattr(new_attn, "all_head_size"):
+                     new_attn.all_head_size = H_keep * Dh
+                 attn_container.attention = new_attn
+
+             # --- FFN groups ---
+             inter, out = layer.intermediate, layer.output
+             g = getattr(inter, "neuron_gate", None)
+             if g is not None:
+                 grp_idx = keep_group_indices_from_gate(
+                     g,
+                     policy=policy,
+                     step=step,
+                     custom_rounding=ffn_rounding,
+                 )
+                 group = int(g.group_size)
+                 keep_exp = torch.cat([torch.arange(i * group, (i + 1) * group) for i in grp_idx]).long()
+                 inter.dense = slice_linear(inter.dense, keep_out=keep_exp)
+                 out.dense = slice_linear(out.dense, keep_in=keep_exp)
+
+                 # restore the class-level forward & drop gate bookkeeping
+                 inter.forward = inter.__class__.forward.__get__(inter, inter.__class__)
+                 if hasattr(inter, "neuron_gate"):
+                     delattr(inter, "neuron_gate")
+                 if hasattr(inter, "_orig_forward"):
+                     delattr(inter, "_orig_forward")
+
+         return slim
+
+
+ # -----------------------------------------------------------------------------
+ # Export policy
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class ViTExportPolicy:
+     """ViT-specific export policy that allows different rounding for heads vs FFN."""
+     warmup_steps: int = 0
+     # default_factory avoids sharing one mutable Rounding instance across policies
+     head_rounding: CoreRounding = field(default_factory=CoreRounding)
+     ffn_rounding: CoreRounding = field(default_factory=CoreRounding)
+
+
+ @dataclass
+ class ViTGrid:
+     head_multiple_grid: Optional[Sequence[int]] = (2, 4, 8)
+     ffn_snap_grid: Sequence[int] = (1, 8)
+     # head_multiple_grid: Optional[Sequence[int]] = None  # default --> 1..num_heads
+     # ffn_snap_grid: Sequence[int] = (1, 2, 4, 8, 16)
+
+
+ def vit_search_best_export(
+     model_with_gates: nn.Module,
+     *,
+     export_fn: ExportFn,
+     num_heads: int,
+     step: int,
+     batch_shape: Tuple[int, int, int, int],
+     grid: Optional[ViTGrid] = None,
+     device: str = "cuda",
+     measure_settings: Optional[ProfileSettings] = None,
+     make_policy: Optional[Callable[[int, int], object]] = None,
+ ) -> SearchResult:
+     """Convenience wrapper for ViT-style search.
+
+     If `make_policy` is not provided, the caller's adapter should accept a
+     policy with separate head/FFN rounding; see `adapters.huggingface.vit.ViTExportPolicy`.
+     """
+     g = grid or ViTGrid()
+     head_grid = g.head_multiple_grid or list(range(1, int(num_heads) + 1))
+     ffn_grid = list(g.ffn_snap_grid)
+
+     return grid_search_latency(
+         model_with_gates,
+         export_fn,
+         head_multiples=head_grid,
+         ffn_snaps=ffn_grid,
+         step=step,
+         batch_shape=batch_shape,
+         measure_settings=measure_settings,
+         device=device,
+         make_policy=make_policy,
+     )
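The training branch of `GatedSelfAttentionHF.forward` samples gates with a Gumbel/logistic-sigmoid relaxation and a straight-through hard threshold. A minimal sketch of just the sampling arithmetic in plain Python (gradient bookkeeping omitted since torch handles it; `gumbel_sigmoid_hard` is a hypothetical name, not part of the repo):

```python
import math
import random

def gumbel_sigmoid_hard(logit: float, tau: float, rng: random.Random) -> float:
    """Sample a hard {0,1} gate; in torch the straight-through trick keeps grads via the soft value y."""
    u = min(max(rng.random(), 1e-6), 1 - 1e-6)
    s = math.log(u) - math.log(1.0 - u)               # logistic noise
    y = 1.0 / (1.0 + math.exp(-(logit + s) / tau))    # soft gate in (0, 1)
    return 1.0 if y > 0.5 else 0.0                    # hard forward value

rng = random.Random(0)
samples = [gumbel_sigmoid_hard(3.0, 1.5, rng) for _ in range(1000)]
open_rate = sum(samples) / len(samples)
```

With `init_logit = 3.0` the gate starts almost always open (P(open) = sigmoid(3) ≈ 0.95), so pruning pressure has to push logits down before heads actually drop.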
huggingface/__init__.py ADDED
File without changes
huggingface/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (143 Bytes). View file
 
huggingface/__pycache__/vit.cpython-310.pyc ADDED
Binary file (10.6 kB). View file
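Both adapters recover the last non-pad position from an attention mask as `(mask.sum(-1) - 1).clamp(0)`, which assumes right padding (the index equals the sequence length minus one). The same index arithmetic on plain lists (`last_nonpad_indices` is a hypothetical name, not part of the repo):

```python
def last_nonpad_indices(attention_mask):
    """attention_mask: list of 0/1 rows; returns the last non-pad index per row (right padding assumed)."""
    return [max(sum(row) - 1, 0) for row in attention_mask]

mask = [
    [1, 1, 1, 0, 0],  # length 3 -> index 2
    [1, 1, 1, 1, 1],  # full row -> index 4
    [0, 0, 0, 0, 0],  # degenerate all-pad row -> clamped to 0
]
idx = last_nonpad_indices(mask)
```

Note the right-padding assumption: for a left-padded row like `[0, 0, 1, 1, 1]` the formula returns 2, not the true last index 4.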
 
huggingface/llama.py ADDED
@@ -0,0 +1,607 @@
1
+ """HuggingFace LLaMA/Mistral adapter
2
+
3
+ Bridges the family-agnostic core (gates/export/proxy/train) to HF causal LMs
4
+ (LlamaForCausalLM / MistralForCausalLM, etc.).
5
+
6
+ Responsibilities
7
+ ----------------
8
+ - Attach gates to attention Q heads (and optional KV) + grouped MLP (SwiGLU)
9
+ - Provide a logits getter (student/teacher)
10
+ - Exporters:
11
+ * keep-all (unwrap gates, restore clean HF modules)
12
+ * pruned (slice q_proj/o_proj and SwiGLU up/gate/down; update HF metadata)
13
+ - Grid-search wrapper for post-export rounding/snap params
14
+
15
+ This adapter intentionally keeps the core unaware of LLaMA internals.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ # Ensure repo root on sys.path for absolute imports (core, adapters, data)
20
+ import sys, pathlib
21
+ sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
22
+
23
+ from dataclasses import dataclass
24
+ from typing import Optional, Sequence, Callable, Tuple
25
+
26
+ import copy
27
+ import math
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+
32
+ # Core (absolute imports so running `-m examples.run_llama_optimize` works)
33
+ from core.gates import HeadGate, GroupGate
34
+ from core.export import (
35
+ ExportPolicy as CoreExportPolicy,
36
+ Rounding as CoreRounding,
37
+ keep_group_indices_from_gate,
38
+ slice_linear,
39
+ )
40
+ from core.utils import deepcopy_eval_cpu
41
+ from core.search_export import grid_search_latency
42
+
43
+ # -------------------------------------------------------------------------
44
+ # Configs
45
+ # -------------------------------------------------------------------------
46
+
47
+ @dataclass
48
+ class LlamaGatingConfig:
49
+ tau: float = 1.5
50
+ init_logit: float = 3.0
51
+ head_gating: bool = True
52
+ gate_kv: bool = False # optional: gate KV along with Q
53
+ ffn_group: int = 128 # SwiGLU groups
54
+ ffn_gating: bool = True
55
+ hard_eval: bool = True # use hard gates in eval forward
56
+
57
+
58
+ # -------------------------------------------------------------------------
59
+ # Helpers (GQA, rotary, cache-safe)
60
+ # -------------------------------------------------------------------------
61
+
62
+
63
+ def _last_nonpad_index(attn_mask: Optional[torch.Tensor], seq_len: int, device) -> torch.Tensor:
64
+ if attn_mask is None:
65
+ return torch.full((1,), seq_len - 1, device=device, dtype=torch.long) # will be expanded per-batch later
66
+ # attn_mask: [B, S] in {0,1}; works for left/right padding
67
+ return (attn_mask.sum(dim=1) - 1).clamp(min=0).long()
68
+
69
+ def _repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
70
+ if n_rep == 1:
71
+ return x
72
+ B, Hkv, T, Dh = x.shape
73
+ return x.unsqueeze(2).expand(B, Hkv, n_rep, T, Dh).reshape(B, Hkv * n_rep, T, Dh)
74
+
75
+ try:
76
+ from transformers.cache_utils import Cache
77
+ except Exception:
78
+ class Cache: # type: ignore
79
+ pass
80
+
81
+
82
+ # -------------------------------------------------------------------------
83
+ # Gated attention wrapper (Llama/Mistral ready)
84
+ # -------------------------------------------------------------------------
85
+
86
+ class GatedSelfAttentionLLM(nn.Module):
87
+ """
88
+ Thin wrapper around HF Llama/Mistral attention module.
89
+
90
+ - Uses the base module's q_proj/k_proj/v_proj/o_proj
91
+ - Applies per-Q-head gates (and optional KV gates)
92
+ - Handles rotary and cache (tuple or HF Cache)
93
+ - Runs SDPA directly, then o_proj
94
+ """
95
+ def __init__(self, attn_container: nn.Module,
96
+ num_q_heads: int, num_kv_heads: int, head_dim: int,
97
+ cfg: LlamaGatingConfig, layer_idx: int):
98
+ super().__init__()
99
+ self.base_attn = attn_container
100
+ self.q_proj = attn_container.q_proj
101
+ self.k_proj = attn_container.k_proj
102
+ self.v_proj = attn_container.v_proj
103
+ self.o_proj = getattr(attn_container, "o_proj", getattr(attn_container, "out_proj", None))
104
+
105
+ self.num_q_heads = int(num_q_heads)
106
+ self.num_kv_heads = int(num_kv_heads)
107
+ self.head_dim = int(head_dim)
108
+ self.gate_kv = bool(cfg.gate_kv)
109
+ self.drop_p = float(getattr(attn_container, "attention_dropout",
110
+ getattr(attn_container, "attn_dropout",
111
+ getattr(attn_container, "dropout", 0.0))))
112
+ self.head_gate = HeadGate(num_heads=self.num_q_heads,
113
+ head_dim=self.head_dim,
114
+ tau=cfg.tau, init_logit=cfg.init_logit,
115
+ hard_during_eval=cfg.hard_eval)
116
+
117
+ # rotary helpers if present on base
118
+ self.rotary_emb = getattr(attn_container, "rotary_emb", None)
119
+ self.apply_rotary_pos_emb = getattr(attn_container, "apply_rotary_pos_emb", None)
120
+ self.layer_idx = int(layer_idx)
121
+
122
+ @property
123
+ def logits(self) -> torch.Tensor:
124
+ return self.head_gate.logits
125
+
126
+ def kept_heads_soft(self) -> torch.Tensor:
127
+ p = self.head_gate.probs().detach().float().view(-1)
128
+ if p.numel() == self.num_q_heads * self.head_dim:
129
+ p = p.view(self.num_q_heads, self.head_dim).mean(dim=1)
130
+ return p.sum()
131
+
132
+
133
+ def forward(
134
+ self,
135
+ hidden_states: torch.Tensor, # [B,T,D]
136
+ attention_mask: Optional[torch.Tensor] = None, # additive mask [B,1,Tq,Tk] or None
137
+ position_ids: Optional[torch.Tensor] = None,
138
+ past_key_value = None, # tuple, list, Cache or None
139
+ output_attentions: bool = False,
140
+ use_cache: bool = False,
141
+ cache_position: Optional[torch.Tensor] = None,
142
+ position_embeddings: Optional[torch.Tensor] = None,
143
+ **kwargs,
144
+ ):
145
+ B, T, D = hidden_states.shape
146
+ Hq, Hkv, Dh = self.num_q_heads, self.num_kv_heads, self.head_dim
147
+ assert Hq * Dh == D, "hidden_size must equal num_heads * head_dim"
148
+ n_rep = max(1, Hq // Hkv)
149
+
150
+ # qkv projections
151
+ q = self.q_proj(hidden_states).view(B, T, Hq, Dh).transpose(1, 2) # [B,Hq,T,Dh]
152
+ k = self.k_proj(hidden_states).view(B, T, Hkv, Dh).transpose(1, 2) # [B,Hkv,T,Dh]
153
+ v = self.v_proj(hidden_states).view(B, T, Hkv, Dh).transpose(1, 2) # [B,Hkv,T,Dh]
154
+
155
+ # rotary
156
+ if (self.rotary_emb is not None) and (self.apply_rotary_pos_emb is not None):
157
+ Tpast = 0
158
+ if isinstance(past_key_value, (tuple, list)) and len(past_key_value) == 2:
159
+ Tpast = int(past_key_value[0].size(2))
160
+ elif isinstance(past_key_value, Cache):
161
+ Tpast = int(cache_position.max().item() if cache_position is not None else 0)
162
+ seq_len = Tpast + T
163
+ try:
164
+ cos, sin = self.rotary_emb(v, seq_len=seq_len)
165
+ except TypeError:
166
+ cos, sin = self.rotary_emb(q, seq_len=seq_len)
167
+ # try rich signature first
168
+ try:
169
+ q, k = self.apply_rotary_pos_emb(
170
+ q, k, cos, sin,
171
+ position_ids=position_ids,
172
+ cache_position=cache_position,
173
+ position_embeddings=position_embeddings
174
+ )
175
+ except TypeError:
176
+ try:
177
+ q, k = self.apply_rotary_pos_emb(q, k, cos, sin, position_ids=position_ids)
178
+ except TypeError:
179
+ q, k = self.apply_rotary_pos_emb(q, k, cos, sin)
180
+
181
+ # cache merge
182
+ present = None
183
+ if past_key_value is None or (isinstance(past_key_value, (tuple, list)) and len(past_key_value) == 0):
184
+ pass
185
+ elif isinstance(past_key_value, (tuple, list)):
186
+ pk, pv = past_key_value # [B,Hkv,Tpast,Dh]
187
+ k = torch.cat([pk, k], dim=2)
188
+ v = torch.cat([pv, v], dim=2)
189
+ present = (k, v) if use_cache else None
190
+ elif isinstance(past_key_value, Cache):
191
+ k, v = past_key_value.update(k, v, self.layer_idx, cache_position)
192
+ present = past_key_value
193
+
194
+ # gates
195
+ # g = self.head_gate.mask(self.training).view(1, Hq, 1, 1)
196
+ # ---- gates (supports per-head OR per-channel HeadGate) ----
197
+ m = self.head_gate.mask(self.training) # 1D tensor
198
+ m = m.detach() if not self.training else m
199
+ if m.numel() == Hq:
200
+ # per-head gating
201
+ gH = m.view(1, Hq, 1, 1) # [1,Hq,1,1]
202
+ q = q * gH
203
+ if self.gate_kv:
204
+ if n_rep == 1:
205
+ k = k * gH; v = v * gH
206
+ else:
207
+ g_kv = gH.view(1, Hkv, n_rep, 1, 1).amax(dim=2)
208
+ k = k * g_kv; v = v * g_kv
209
+ elif m.numel() == Hq * Dh:
210
+ # per-channel gating
211
+ gHD = m.view(1, Hq, 1, Dh) # [1,Hq,1,Dh]
212
+ q = q * gHD
213
+ if self.gate_kv:
214
+ # collapse to per-head for KV, then map to Hkv via amax over replicas
215
+ gH = gHD.amax(dim=-1, keepdim=True) # [1,Hq,1,1]
216
+ if n_rep == 1:
217
+ k = k * gH; v = v * gH
218
+ else:
219
+ g_kv = gH.view(1, Hkv, n_rep, 1, 1).amax(dim=2)
220
+ k = k * g_kv; v = v * g_kv
221
+ else:
222
+ raise RuntimeError(
223
+ f"HeadGate mask has {m.numel()} elems; expected {Hq} or {Hq*Dh}"
224
+ )
225
+
226
+
227
+ # GQA replicate KV to Q count
228
+ k = _repeat_kv(k, n_rep)
229
+ v = _repeat_kv(v, n_rep)
230
+
231
+ attn = F.scaled_dot_product_attention(
232
+ q, k, v,
233
+ attn_mask=attention_mask,
234
+ dropout_p=self.drop_p if self.training else 0.0,
235
+ is_causal=True
236
+ )
237
+ out = attn.transpose(1, 2).contiguous().view(B, T, Hq * Dh)
238
+ out = self.o_proj(out)
239
+
240
+ attn_weights = None
241
+ # HF expects (attn_output, attn_weights, present_key_value) always
242
+ if output_attentions:
243
+ return (out, attn_weights, present)
244
+ else:
245
+ return (out, None, present)
246
+
247
+
248
+
249
+ # -------------------------------------------------------------------------
250
+ # Adapter
251
+ # -------------------------------------------------------------------------
252
+
253
+ class LlamaAdapter:
254
+ def __init__(self, model: nn.Module):
255
+ self.model = model
256
+ core = getattr(model, "model", model)
257
+ if not hasattr(core, "layers"):
258
+ raise ValueError("Provided model does not look like HF LLaMA/Mistral (missing .model.layers or .layers)")
259
+
260
+ # ---------- Gating attachment ----------
261
+ def attach_gates(self, cfg: LlamaGatingConfig) -> nn.Module:
262
+ m = self.model
263
+ core = getattr(m, "model", m)
264
+ layers = core.layers
265
+
266
+ Hq = int(core.config.num_attention_heads)
267
+ Hkv = int(getattr(core.config, "num_key_value_heads", Hq))
268
+ Dh = int(core.config.hidden_size // Hq)
269
+
270
+ for li, layer in enumerate(layers):
271
+ # Attention heads
272
+ if cfg.head_gating:
273
+ base = layer.self_attn
274
+ if not isinstance(base, GatedSelfAttentionLLM):
275
+ gated = GatedSelfAttentionLLM(
276
+ attn_container=base,
277
+ num_q_heads=Hq,
278
+ num_kv_heads=Hkv,
279
+ head_dim=Dh,
280
+ cfg=cfg,
281
+ layer_idx=li,
282
+ )
283
+ layer.self_attn = gated # route via our wrapper
284
+
285
+ # MLP grouped gating (SwiGLU)
286
+ if cfg.ffn_gating:
287
+ mlp = layer.mlp
288
+ I = int(mlp.up_proj.out_features)
289
+ assert I % cfg.ffn_group == 0, f"SwiGLU size {I} not divisible by group {cfg.ffn_group}"
290
+ if not hasattr(mlp, "neuron_gate"):
291
+ mlp.neuron_gate = GroupGate(
292
+ num_groups=I // cfg.ffn_group,
293
+ group_size=cfg.ffn_group,
294
+ tau=cfg.tau, init_logit=cfg.init_logit,
295
+                 hard_during_eval=cfg.hard_eval,
+             )
+             if not hasattr(mlp, "_orig_forward"):
+                 mlp._orig_forward = mlp.forward
+
+             def _gated_mlp_forward(this, x):
+                 # LLaMA: z = silu(up(x)) * (gate(x) * msk); out = down(z)
+                 u = this.up_proj(x)
+                 g = this.gate_proj(x)
+                 msk = this.neuron_gate.mask(this.training).view(1, 1, -1)
+                 z = torch.nn.functional.silu(u) * (g * msk)
+                 return this.down_proj(z)
+
+             mlp.forward = _gated_mlp_forward.__get__(mlp, mlp.__class__)
+         return m
+
+     # ---------- Logits helper ----------
+     @staticmethod
+     def _last_token_index(attn_mask: torch.Tensor) -> torch.Tensor:
+         # attn_mask: [B, S] with 1 for tokens, 0 for padding
+         # returns [B] indices of the last non-pad token
+         # works for both bool and int masks
+         if attn_mask is None:
+             # no mask -> caller falls back to position S-1
+             return None
+         if attn_mask.dtype != torch.long:
+             attn_mask = attn_mask.to(torch.long)
+         # idx = lengths - 1
+         return (attn_mask.sum(dim=-1) - 1).clamp_min(0)
+
+     @staticmethod
+     def get_logits(model: nn.Module,
+                    input_ids: torch.Tensor,
+                    attention_mask: Optional[torch.Tensor] = None,
+                    last_only: bool = True,
+                    **forward_kwargs) -> torch.Tensor:
+         """
+         Returns logits. If last_only=True, computes ONLY the last-token logits by:
+           1) getting hidden states from the base decoder,
+           2) selecting the last non-pad position per sample,
+           3) projecting that single position through lm_head.
+         This avoids allocating a [B, S, V] tensor.
+         """
+         # (1) run the base decoder, not the full CausalLM head
+         core = getattr(model, "model", None)
+         if core is None:
+             # fallback if the model is already a bare decoder (rare)
+             core = model
+
+         # We only need last_hidden_state; no cache; avoid building logits for all S.
+         # return_dict=False returns a tuple and avoids extra allocations.
+         outputs = core(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             use_cache=False,
+             return_dict=False,
+             **forward_kwargs,
+         )
+         hidden = outputs[0]  # [B, S, D]
+
+         if not last_only:
+             # If someone explicitly wants all logits, fine:
+             return model.lm_head(hidden)  # [B, S, V] (expensive!)
+
+         # (2) select the last token per sample
+         B, S, D = hidden.shape
+         if attention_mask is None:
+             # simple "last index"
+             idx = torch.full((B,), S - 1, device=hidden.device, dtype=torch.long)
+         else:
+             idx = LlamaAdapter._last_token_index(attention_mask)
+
+         # gather the last hidden state: [B, D]
+         last_h = hidden[torch.arange(B, device=hidden.device), idx]  # [B, D]
+         # (3) project that single position to logits
+         last_logits = model.lm_head(last_h).unsqueeze(1)  # [B, 1, V]
+         return last_logits
+
+     # ---------- Exporters ----------
+     @staticmethod
+     @torch.no_grad()
+     def export_keepall(model_with_gates: nn.Module) -> nn.Module:
+         """
+         Unwrap attention wrappers; restore the original MLP.forward; drop gates.
+         """
+         slim = deepcopy_eval_cpu(model_with_gates)
+         core = getattr(slim, "model", slim)
+         if not hasattr(core, "layers"):
+             return slim
+
+         for layer in core.layers:
+             # attention
+             attn = layer.self_attn
+             if isinstance(attn, GatedSelfAttentionLLM):
+                 gat = attn
+                 new_attn = copy.deepcopy(gat.base_attn)
+                 # keep metadata consistent
+                 if hasattr(new_attn, "num_heads"):
+                     new_attn.num_heads = int(gat.num_q_heads)
+                 if hasattr(new_attn, "num_key_value_heads"):
+                     new_attn.num_key_value_heads = int(gat.num_kv_heads)
+                 if hasattr(new_attn, "head_dim"):
+                     new_attn.head_dim = int(gat.head_dim)
+                 layer.self_attn = new_attn
+
+             # mlp
+             mlp = layer.mlp
+             if hasattr(mlp, "_orig_forward"):
+                 mlp.forward = mlp._orig_forward
+                 delattr(mlp, "_orig_forward")
+             if hasattr(mlp, "neuron_gate"):
+                 delattr(mlp, "neuron_gate")
+
+         return slim
+
+     @staticmethod
+     @torch.no_grad()
+     def export_pruned(model_with_gates: nn.Module, policy, step: int) -> nn.Module:
+         """
+         Produce a clean CPU eval model:
+           - Read gates to choose Q heads; slice q_proj rows and o_proj cols
+           - Snap kept heads to an LCM of (policy multiple, Hkv)
+           - Slice SwiGLU up/gate/down by groups
+           - Unwrap back to plain HF modules; update metadata
+         """
+         # Accept either CoreExportPolicy with per-axis rounding or a family policy
+         if isinstance(policy, LlamaExportPolicy):
+             head_rounding = policy.head_rounding
+             ffn_rounding = policy.ffn_rounding
+             warmup_steps = policy.warmup_steps
+         else:
+             head_rounding = getattr(policy, "rounding", None)
+             ffn_rounding = getattr(policy, "rounding", None)
+             warmup_steps = int(getattr(policy, "warmup_steps", 0))
+
+         slim = deepcopy_eval_cpu(model_with_gates)
+         core = getattr(slim, "model", slim)
+         layers = getattr(core, "layers", None)
+         if layers is None:
+             return slim
+
+         warm = (step < warmup_steps)
+
+         def _lcm(a: int, b: int) -> int:
+             import math  # local import so this works even if the module header lacks it
+             return abs(a * b) // math.gcd(max(a, 1), max(b, 1)) if a > 0 and b > 0 else max(a, b, 1)
+
+         for li, layer in enumerate(layers):
+             # ---- Attention (Q heads) ----
+             attn = layer.self_attn
+             if isinstance(attn, GatedSelfAttentionLLM):
+                 gat = attn
+                 base = gat.base_attn
+
+                 Hq = int(gat.num_q_heads)
+                 Hkv = int(gat.num_kv_heads)
+                 Dh = int(gat.head_dim)
+
+                 if warm:
+                     keep_idx = torch.arange(Hq)
+                 else:
+                     # Build a "per-head" proxy gate if the base gate is per-channel.
+                     base_logits = gat.head_gate.logits.detach().float().view(-1)
+                     tau = float(getattr(gat.head_gate, "tau", 1.0))
+
+                     if base_logits.numel() == Hq:
+                         # Native per-head gate: use as-is
+                         proxy_gate = gat.head_gate
+                         keep_idx = keep_group_indices_from_gate(
+                             proxy_gate, policy=policy, step=step, custom_rounding=head_rounding
+                         )
+                     elif base_logits.numel() == Hq * Dh:
+                         # Collapse per-channel -> per-head (mean; or use .amax for stricter)
+                         per_head_logits = base_logits.view(Hq, Dh).mean(dim=1)
+
+                         class _PerHeadProxyGate:
+                             def __init__(self, logits, tau):
+                                 self.logits = logits
+                                 self.tau = tau
+                                 self.num_groups = logits.numel()
+                                 self.group_size = 1
+
+                         proxy_gate = _PerHeadProxyGate(per_head_logits, tau)
+                         keep_idx = keep_group_indices_from_gate(
+                             proxy_gate, policy=policy, step=step, custom_rounding=head_rounding
+                         )
+                     else:
+                         raise RuntimeError(
+                             f"Unexpected HeadGate logits len {base_logits.numel()} vs H={Hq} or H*Dh={Hq*Dh}"
+                         )
+
+                     # Enforce the LCM with GQA (Hkv) by truncating to a floor multiple.
+                     # (This lives inside the non-warm branch: it relies on base_logits.)
+                     pol_mult = getattr(head_rounding, "multiple_groups", 1)
+                     snap = _lcm(int(pol_mult), max(1, Hkv))
+                     if keep_idx.numel() % snap != 0:
+                         k = (keep_idx.numel() // snap) * snap
+                         k = max(snap, min(Hq, k))
+                         # recompute top-k by per-head logits (same criterion as above)
+                         if base_logits.numel() == Hq * Dh:
+                             scores = per_head_logits
+                         else:
+                             scores = base_logits
+                         keep_idx = torch.topk(scores, k=k, largest=True).indices.sort().values
+
+                 H_keep = int(keep_idx.numel())
+                 # channels for q/o slicing
+                 ch_idx = torch.cat([torch.arange(h * Dh, (h + 1) * Dh) for h in keep_idx]).long()
+
+                 # slice the wrapper linears
+                 gat.q_proj = slice_linear(gat.q_proj, keep_out=ch_idx)
+                 gat.o_proj = slice_linear(gat.o_proj, keep_in=ch_idx)
+
+                 # transplant into a clean HF attention
+                 new_attn = copy.deepcopy(base)
+                 if hasattr(new_attn, "q_proj"):
+                     new_attn.q_proj = gat.q_proj
+                 if hasattr(new_attn, "o_proj"):
+                     new_attn.o_proj = gat.o_proj
+                 elif hasattr(new_attn, "out_proj"):
+                     new_attn.out_proj = gat.o_proj
+
+                 # update metadata
+                 if hasattr(new_attn, "num_heads"):
+                     new_attn.num_heads = int(H_keep)
+                 if hasattr(new_attn, "num_key_value_heads"):
+                     new_attn.num_key_value_heads = int(Hkv)
+                 if hasattr(new_attn, "head_dim"):
+                     new_attn.head_dim = int(Dh)
+                 if hasattr(core.config, "hidden_size"):
+                     core.config.hidden_size = int(H_keep * Dh)
+
+                 layer.self_attn = new_attn  # unwrap
+
+             # ---- MLP (SwiGLU grouped) ----
+             mlp = layer.mlp
+             g = getattr(mlp, "neuron_gate", None)
+             if g is not None:
+                 grp_idx = keep_group_indices_from_gate(
+                     g, policy=policy, step=step, custom_rounding=ffn_rounding,
+                 )
+                 group = int(g.group_size)  # GroupGate exposes group_size
+                 keep_exp = torch.cat([torch.arange(i * group, (i + 1) * group) for i in grp_idx]).long()
+
+                 mlp.up_proj = slice_linear(mlp.up_proj, keep_out=keep_exp)
+                 mlp.gate_proj = slice_linear(mlp.gate_proj, keep_out=keep_exp)
+                 mlp.down_proj = slice_linear(mlp.down_proj, keep_in=keep_exp)
+
+             # Restore the clean forward & drop the gate
+             if hasattr(mlp, "_orig_forward"):
+                 mlp.forward = mlp._orig_forward
+                 delattr(mlp, "_orig_forward")
+             if hasattr(mlp, "neuron_gate"):
+                 delattr(mlp, "neuron_gate")
+
+         return slim
+
+
+ # -------------------------------------------------------------------------
+ # Export policy (allow different rounding for Heads vs FFN)
+ # -------------------------------------------------------------------------
+
+ @dataclass
+ class LlamaExportPolicy:
+     warmup_steps: int = 0
+     head_rounding: CoreRounding = CoreRounding()  # e.g., CoreRounding(floor=8, multiple=8)
+     ffn_rounding: CoreRounding = CoreRounding()   # e.g., CoreRounding(min_keep_ratio=0.8, multiple=32)
+
+
+ # -------------------------------------------------------------------------
+ # Grid-search convenience
+ # -------------------------------------------------------------------------
+
+ @dataclass
+ class LlamaGrid:
+     head_multiple_grid: Optional[Sequence[int]] = (1, 2, 4, 8)
+     ffn_snap_grid: Sequence[int] = (1, 32, 64, 128)
+
+
+ def llama_search_best_export(
+     model_with_gates: nn.Module,
+     *,
+     export_fn: Callable[[nn.Module, CoreExportPolicy, int], nn.Module],
+     num_q_heads: int,
+     num_kv_heads: int,
+     step: int,
+     batch_shape: Tuple[int, int],  # (B, S) for text
+     grid: Optional[LlamaGrid] = None,
+     device: str = "cuda",
+     measure_settings=None,
+     make_policy: Optional[Callable[[int, int], object]] = None,
+ ):
+     """
+     Convenience wrapper for LLaMA-style search.
+     Uses the same `grid_search_latency` as ViT; we just feed head/FFN grids.
+     """
+     g = grid or LlamaGrid()
+     head_grid = g.head_multiple_grid or [1]
+     ffn_grid = list(g.ffn_snap_grid)
+
+     return grid_search_latency(
+         model_with_gates,
+         export_fn,
+         head_multiples=head_grid,
+         ffn_snaps=ffn_grid,
+         step=step,
+         batch_shape=batch_shape,  # adapter's runner should interpret as (B, S)
+         measure_settings=measure_settings,
+         device=device,
+         make_policy=make_policy,
+     )
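
The last-token trick in `get_logits` above avoids the `[B, S, V]` logits tensor by computing a per-sample "last non-pad index" from the attention mask and gathering one hidden state per row. A minimal sketch of that indexing logic, written with plain Python lists (no torch dependency) and hypothetical helper names, makes it easy to verify:

```python
# Sketch of the last-token selection used by LlamaAdapter.get_logits.
# Helper names are illustrative, not part of the repo's API.

def last_token_index(attn_mask):
    """attn_mask: [B][S] rows of 0/1; returns the last non-pad index per row.

    Mirrors (attn_mask.sum(-1) - 1).clamp_min(0): an all-pad row maps to 0.
    """
    return [max(sum(row) - 1, 0) for row in attn_mask]

def gather_last_hidden(hidden, idx):
    """hidden: [B][S][D] nested lists; pick hidden[b][idx[b]] for each sample b."""
    return [hidden[b][i] for b, i in enumerate(idx)]

mask = [[1, 1, 1, 0],
        [1, 0, 0, 0]]
hidden = [[[0.0], [1.0], [2.0], [3.0]],
          [[4.0], [5.0], [6.0], [7.0]]]
print(gather_last_hidden(hidden, last_token_index(mask)))  # [[2.0], [4.0]]
```

Only these `[B, D]` gathered states are then projected through `lm_head`, so the vocabulary-sized projection runs on one position per sample instead of all S.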
huggingface/registry.py ADDED
File without changes
huggingface/vit.py ADDED
@@ -0,0 +1,383 @@
+ """HuggingFace ViT adapter
+
+ Bridges the family-agnostic core (gates/export/proxy/train) to ViT-like models
+ from Hugging Face (`ViTModel`, `ViTForImageClassification`, DeiT, etc.).
+
+ Responsibilities
+ ----------------
+ - Attach gates to attention heads and to MLP hidden units in groups
+ - Provide logits getters for student/teacher
+ - Export helpers: keep-all (remove gates), and pruned (slice weights + metadata)
+
+ This adapter intentionally keeps the core unaware of ViT internals.
+ """
+ from __future__ import annotations
+
+ # Ensure the repo root is on sys.path for absolute imports (core, adapters, data)
+ import sys, pathlib
+ sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
+
+ from dataclasses import dataclass
+ from typing import Callable, Optional, Sequence, Tuple
+
+ import copy
+ import torch
+ import torch.nn as nn
+
+ # NOTE: absolute imports so running `-m examples.run_vit_optimize` works without a package install
+ from core.gates import HeadGate, GroupGate
+ from core.export import (
+     ExportPolicy as CoreExportPolicy,
+     Rounding as CoreRounding,
+     keep_group_indices_from_gate,
+     keep_element_indices_from_gate,
+     slice_linear,
+ )
+
+ from core.utils import deepcopy_eval_cpu
+ from core.search_export import grid_search_latency
+
+ # -----------------------------------------------------------------------------
+ # Config
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class ViTGatingConfig:
+     tau: float = 1.5
+     init_logit: float = 3.0
+     head_gating: bool = True
+     ffn_group: int = 16
+     ffn_gating: bool = True
+     hard_eval: bool = True  # use hard masks in eval mode during forward
+
+
+ def _encoder_layers(m: nn.Module):
+     """
+     Return the sequence of Transformer blocks for a HF ViT.
+     Supports:
+       - ViTModel: m.encoder.layer
+       - ViTForImageClassification: m.vit.encoder.layer
+     """
+     # ViTModel path
+     enc = getattr(m, "encoder", None)
+     if enc is not None and hasattr(enc, "layer"):
+         return enc.layer
+
+     # ViTForImageClassification path
+     vit = getattr(m, "vit", None)
+     if vit is not None and hasattr(vit, "encoder") and hasattr(vit.encoder, "layer"):
+         return vit.encoder.layer
+
+     raise ValueError("Provided model does not look like a HF ViT (missing *.encoder.layer)")
+
+
+ # -----------------------------------------------------------------------------
+ # Gated attention wrapper
+ # -----------------------------------------------------------------------------
+
+ class GatedSelfAttentionHF(nn.Module):
+     """A thin wrapper around HF ViT self-attention that multiplies in per-head gates.
+
+     It keeps references to the underlying query/key/value `nn.Linear` layers and
+     the output projection, while exposing a `HeadGate` in `head_gate`.
+     """
+
+     def __init__(self, attn_container: nn.Module, num_heads: int, head_dim: int, cfg: ViTGatingConfig):
+         super().__init__()
+         base_attn = attn_container.attention  # ViTSdpaSelfAttention or ViTSelfAttention
+         out_proj = attn_container.output.dense
+
+         self.base_attn = base_attn
+         self.out_proj = out_proj
+
+         self.q_proj = base_attn.query
+         self.k_proj = base_attn.key
+         self.v_proj = base_attn.value
+
+         self.num_heads = int(num_heads)
+         self.head_dim = int(head_dim)
+         self.drop_p = getattr(base_attn, "dropout", nn.Dropout(0.0)).p
+
+         self.head_gate = HeadGate(
+             num_heads=self.num_heads,
+             head_dim=self.head_dim,
+             tau=cfg.tau,
+             init_logit=cfg.init_logit,
+             hard_during_eval=cfg.hard_eval,
+         )
+
+     @property
+     def logits(self) -> torch.Tensor:
+         return self.head_gate.logits
+
+     def kept_heads_soft(self) -> torch.Tensor:
+         return self.head_gate.probs().sum()
+
+     def forward(self, hidden_states, head_mask=None):
+         B, N, _ = hidden_states.shape
+         H, Dh = self.num_heads, self.head_dim
+
+         wdev = self.q_proj.weight.device
+         if hidden_states.device != wdev:
+             hidden_states = hidden_states.to(wdev, non_blocking=True)
+
+         q_lin = self.q_proj(hidden_states)
+         k_lin = self.k_proj(hidden_states)
+         v_lin = self.v_proj(hidden_states)
+
+         q = q_lin.view(B, N, H, Dh).transpose(1, 2)
+         k = k_lin.view(B, N, H, Dh).transpose(1, 2)
+         v = v_lin.view(B, N, H, Dh).transpose(1, 2)
+
+         logits = self.head_gate.logits
+         tau = float(self.head_gate.tau)
+         if self.training:
+             # Gumbel-sigmoid sample with a straight-through hard threshold
+             u = torch.rand_like(logits).clamp_(1e-6, 1 - 1e-6)
+             s = u.log() - (1 - u).log()
+             y = torch.sigmoid((logits + s) / tau)
+             g_head = ((y > 0.5).to(y.dtype) - y).detach() + y
+         else:
+             if getattr(self.head_gate, "hard_during_eval", True):
+                 g_head = (logits > 0).to(logits.dtype)
+             else:
+                 g_head = torch.sigmoid(logits / tau)
+         g = g_head.view(1, H, 1, 1)
+
+         q = q * g
+         k = k * g
+         v = v * g
+
+         attn_out = torch.nn.functional.scaled_dot_product_attention(
+             q, k, v, dropout_p=self.drop_p if self.training else 0.0
+         )  # [B, H, N, Dh]
+
+         attn_out = attn_out.transpose(1, 2).contiguous().view(B, N, H * Dh)
+         attn_out = self.out_proj(attn_out)
+         return attn_out, None
+
+
+ # -----------------------------------------------------------------------------
+ # Adapter
+ # -----------------------------------------------------------------------------
+
+ class ViTAdapter:
+     def __init__(self, model: nn.Module):
+         self.model = model
+         _ = _encoder_layers(model)
+
+     # ---------- Gating attachment ----------
+     def attach_gates(self, cfg: ViTGatingConfig) -> nn.Module:
+         m = self.model
+         H = int(getattr(m.config, "num_attention_heads", 12))
+         D = int(getattr(m.config, "hidden_size", 768))
+         Dh = D // H
+
+         for layer in _encoder_layers(m):
+             # Attention heads
+             if cfg.head_gating:
+                 attn_container = layer.attention
+                 if not isinstance(getattr(attn_container, "attention", None), GatedSelfAttentionHF):
+                     gated = GatedSelfAttentionHF(attn_container, H, Dh, cfg)
+                     attn_container.attention = gated
+
+             # FFN hidden (grouped)
+             if cfg.ffn_gating:
+                 inter = layer.intermediate
+                 d_ff = int(inter.dense.out_features)
+                 assert d_ff % cfg.ffn_group == 0, f"FFN size {d_ff} not divisible by group {cfg.ffn_group}"
+                 if not hasattr(inter, "neuron_gate"):
+                     inter.neuron_gate = GroupGate(
+                         num_groups=d_ff // cfg.ffn_group,
+                         group_size=cfg.ffn_group,
+                         tau=cfg.tau,
+                         init_logit=cfg.init_logit,
+                         hard_during_eval=cfg.hard_eval,
+                     )
+                 # Monkey-patch forward to apply the mask after activation (keeps HF shapes)
+                 if not hasattr(inter, "_orig_forward"):
+                     inter._orig_forward = inter.forward
+
+                 def _gated_forward(this, x):
+                     h = this.dense(x)
+                     h = this.intermediate_act_fn(h)
+                     msk = this.neuron_gate.mask(this.training).view(1, 1, -1)
+                     return h * msk
+
+                 inter.forward = _gated_forward.__get__(inter, inter.__class__)
+         return m
+
+     # ---------- Logits helpers ----------
+     @staticmethod
+     def get_logits(model: nn.Module, x: torch.Tensor, *, head: Optional[nn.Module] = None) -> torch.Tensor:
+         out = model(pixel_values=x)
+         if hasattr(out, "logits"):
+             return out.logits  # ViTForImageClassification path
+         if hasattr(out, "last_hidden_state"):  # ViTModel path (needs an external head)
+             if head is None:
+                 raise ValueError("Provide a classification head when using ViTModel without logits.")
+             cls_tok = out.last_hidden_state[:, 0, :]
+             if next(head.parameters(), torch.tensor([], device=cls_tok.device)).device != cls_tok.device:
+                 head = head.to(cls_tok.device)
+             return head(cls_tok)
+         raise ValueError("Model output lacks logits and last_hidden_state.")
+
+     # ---------- Exporters ----------
+     @staticmethod
+     @torch.no_grad()
+     def export_keepall(model_with_gates: nn.Module) -> nn.Module:
+         slim = deepcopy_eval_cpu(model_with_gates)
+         for layer in _encoder_layers(slim):
+             # Attention: unwrap the gate
+             attn_container = layer.attention
+             if isinstance(getattr(attn_container, "attention", None), GatedSelfAttentionHF):
+                 gat = attn_container.attention
+                 new_attn = copy.deepcopy(gat.base_attn)
+                 # restore HF metadata if present
+                 if hasattr(new_attn, "num_attention_heads"):
+                     new_attn.num_attention_heads = int(gat.num_heads)
+                 if hasattr(new_attn, "attention_head_size"):
+                     new_attn.attention_head_size = int(gat.head_dim)
+                 if hasattr(new_attn, "all_head_size"):
+                     new_attn.all_head_size = int(gat.num_heads * gat.head_dim)
+                 attn_container.attention = new_attn
+             # FFN: restore the original forward and drop the gate
+             inter = layer.intermediate
+             if hasattr(inter, "_orig_forward"):
+                 inter.forward = inter._orig_forward
+                 delattr(inter, "_orig_forward")
+             if hasattr(inter, "neuron_gate"):
+                 delattr(inter, "neuron_gate")
+         return slim
+
+     @staticmethod
+     @torch.no_grad()
+     def export_pruned(model_with_gates: nn.Module, policy, step: int) -> nn.Module:
+         # Support both CoreExportPolicy (single rounding) and ViTExportPolicy (per-axis)
+         if isinstance(policy, ViTExportPolicy):
+             head_rounding = policy.head_rounding
+             ffn_rounding = policy.ffn_rounding
+             warmup_steps = policy.warmup_steps
+         else:
+             # fall back to a single rounding for both
+             head_rounding = getattr(policy, "rounding", None)
+             ffn_rounding = getattr(policy, "rounding", None)
+             warmup_steps = int(getattr(policy, "warmup_steps", 0))
+
+         slim = deepcopy_eval_cpu(model_with_gates)
+         warm = (step < warmup_steps)
+
+         for layer in _encoder_layers(slim):
+             # --- Attention heads ---
+             attn_container = layer.attention
+             gat = getattr(attn_container, "attention", None)
+             if isinstance(gat, GatedSelfAttentionHF):
+                 # decide head indices via the shared helper; warmup is honored via `step`
+                 grp_idx = keep_group_indices_from_gate(
+                     gat.head_gate,
+                     policy=policy,
+                     step=step,
+                     custom_rounding=head_rounding,
+                 )
+                 H_keep = int(grp_idx.numel())
+                 Dh = int(gat.head_dim)
+
+                 ch_idx = torch.cat([torch.arange(h * Dh, (h + 1) * Dh) for h in grp_idx]).long()
+                 gat.q_proj = slice_linear(gat.q_proj, keep_out=ch_idx)
+                 gat.k_proj = slice_linear(gat.k_proj, keep_out=ch_idx)
+                 gat.v_proj = slice_linear(gat.v_proj, keep_out=ch_idx)
+                 attn_container.output.dense = slice_linear(attn_container.output.dense, keep_in=ch_idx)
+
+                 new_attn = copy.deepcopy(gat.base_attn)
+                 new_attn.query = gat.q_proj
+                 new_attn.key = gat.k_proj
+                 new_attn.value = gat.v_proj
+                 if hasattr(new_attn, "num_attention_heads"):
+                     new_attn.num_attention_heads = H_keep
+                 if hasattr(new_attn, "attention_head_size"):
+                     new_attn.attention_head_size = Dh
+                 if hasattr(new_attn, "all_head_size"):
+                     new_attn.all_head_size = H_keep * Dh
+                 attn_container.attention = new_attn
+
+             # --- FFN groups ---
+             inter, out = layer.intermediate, layer.output
+             g = getattr(inter, "neuron_gate", None)
+             if g is not None:
+                 grp_idx = keep_group_indices_from_gate(
+                     g,
+                     policy=policy,
+                     step=step,
+                     custom_rounding=ffn_rounding,
+                 )
+                 group = int(g.group_size)
+                 keep_exp = torch.cat([torch.arange(i * group, (i + 1) * group) for i in grp_idx]).long()
+                 inter.dense = slice_linear(inter.dense, keep_out=keep_exp)
+                 out.dense = slice_linear(out.dense, keep_in=keep_exp)
+
+             # Rebind the class's clean forward and drop gate bookkeeping
+             inter.forward = inter.__class__.forward.__get__(inter, inter.__class__)
+             if hasattr(inter, "neuron_gate"):
+                 delattr(inter, "neuron_gate")
+             if hasattr(inter, "_orig_forward"):
+                 delattr(inter, "_orig_forward")
+
+         return slim
+
+
+ # -----------------------------------------------------------------------------
+ # Export policy
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class ViTExportPolicy:
+     """ViT-specific export policy that allows different rounding for heads vs FFN."""
+     warmup_steps: int = 0
+     head_rounding: CoreRounding = CoreRounding()
+     ffn_rounding: CoreRounding = CoreRounding()
+
+
+ @dataclass
+ class ViTGrid:
+     head_multiple_grid: Optional[Sequence[int]] = (2, 4, 8)
+     ffn_snap_grid: Sequence[int] = (1, 8)
+     # head_multiple_grid: Optional[Sequence[int]] = None  # default -> 1..num_heads
+     # ffn_snap_grid: Sequence[int] = (1, 2, 4, 8, 16)
+
+
+ def vit_search_best_export(
+     model_with_gates: nn.Module,
+     *,
+     export_fn: ExportFn,
+     num_heads: int,
+     step: int,
+     batch_shape: Tuple[int, int, int, int],
+     grid: Optional[ViTGrid] = None,
+     device: str = "cuda",
+     measure_settings: Optional[ProfileSettings] = None,
+     make_policy: Optional[Callable[[int, int], object]] = None,
+ ) -> SearchResult:
+     """Convenience wrapper for ViT-style search.
+
+     If `make_policy` is not provided, the caller's adapter should accept a
+     policy with separate head/FFN rounding; see `adapters.huggingface.vit.ViTExportPolicy`.
+     """
+     g = grid or ViTGrid()
+     head_grid = g.head_multiple_grid or list(range(1, int(num_heads) + 1))
+     ffn_grid = list(g.ffn_snap_grid)
+
+     return grid_search_latency(
+         model_with_gates,
+         export_fn,
+         head_multiples=head_grid,
+         ffn_snaps=ffn_grid,
+         step=step,
+         batch_shape=batch_shape,
+         measure_settings=measure_settings,
+         device=device,
+         make_policy=make_policy,
+     )
+
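
Both exporters above turn kept group ids into flat channel indices before slicing the linears (the `keep_exp` / `ch_idx` construction: each kept group of size G contributes the contiguous channel range `[i*G, (i+1)*G)`). A small torch-free sketch of that expansion, with an illustrative helper name, shows the mapping:

```python
# Sketch of the grouped-index expansion used when slicing FFN/head channels.
# The function name is illustrative, not part of the repo's API.

def group_channel_indices(group_idx, group_size):
    """Expand kept group ids into flat channel indices.

    Mirrors torch.cat([torch.arange(i*G, (i+1)*G) for i in group_idx]).
    """
    return [c for i in group_idx for c in range(i * group_size, (i + 1) * group_size)]

# keeping groups 0 and 2 with group_size=4 keeps channels 0-3 and 8-11
print(group_channel_indices([0, 2], 4))  # [0, 1, 2, 3, 8, 9, 10, 11]
```

The same expansion serves both axes: for attention, `group_size` is the head dimension and the ids are kept heads; for the FFN, it is the gate's `group_size` and the ids are kept neuron groups.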
model_index.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "task": "image-classification",
+   "base_id": "google/vit-base-patch16-224",
+   "variant": "gated-student"
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eca9442ba47bd27888b3dc0b0df757113779d2c21182d5626cf1d54643fe637c
+ size 343618083