BiliSakura commited on about 15 hours ago

Commit

098ef8f

verified ·

1 Parent(s): 4968e7f

Upload folder using huggingface_hub

Browse files

Files changed (16) hide show

.gitattributes +1 -0
PixelFlow-256/demo.png +0 -0
PixelFlow-256/model_index.json +1003 -1
PixelFlow-256/pipeline.py +336 -349
PixelFlow-256/scheduler/scheduler_config.json +2 -2
PixelFlow-256/scheduler/scheduling_pixelflow.py +14 -0
PixelFlow-256/transformer/transformer_pixelflow.py +457 -1
PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc +0 -0
PixelFlow-T2I/model_index.json +1 -1
PixelFlow-T2I/pipeline.py +347 -213
PixelFlow-T2I/scheduler/scheduler_config.json +2 -2
PixelFlow-T2I/scheduler/scheduling_pixelflow.py +14 -0
PixelFlow-T2I/transformer/transformer_pixelflow.py +457 -1
README.md +122 -61
demo_inference_c2i.py +41 -0
demo_inference_t2i.py +38 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+PixelFlow-256/demo.png filter=lfs diff=lfs merge=lfs -text

PixelFlow-256/demo.png CHANGED Viewed

Git LFS Details

SHA256: 729a0166881da84ff71d6006df90284e4592b6330684fc81238ef70c49bf67b3
Pointer size: 131 Bytes
Size of remote file: 101 kB

PixelFlow-256/model_index.json CHANGED Viewed

@@ -8,5 +8,1007 @@
   "transformer": [
     "transformer_pixelflow",
     "PixelFlowTransformer2DModel"
-  ]
 }

   "transformer": [
     "transformer_pixelflow",
     "PixelFlowTransformer2DModel"
+  ],
+  "id2label": {
+    "0": "tench, Tinca tinca",
+    "1": "goldfish, Carassius auratus",
+    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
+    "3": "tiger shark, Galeocerdo cuvieri",
+    "4": "hammerhead, hammerhead shark",
+    "5": "electric ray, crampfish, numbfish, torpedo",
+    "6": "stingray",
+    "7": "cock",
+    "8": "hen",
+    "9": "ostrich, Struthio camelus",
+    "10": "brambling, Fringilla montifringilla",
+    "11": "goldfinch, Carduelis carduelis",
+    "12": "house finch, linnet, Carpodacus mexicanus",
+    "13": "junco, snowbird",
+    "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
+    "15": "robin, American robin, Turdus migratorius",
+    "16": "bulbul",
+    "17": "jay",
+    "18": "magpie",
+    "19": "chickadee",
+    "20": "water ouzel, dipper",
+    "21": "kite",
+    "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
+    "23": "vulture",
+    "24": "great grey owl, great gray owl, Strix nebulosa",
+    "25": "European fire salamander, Salamandra salamandra",
+    "26": "common newt, Triturus vulgaris",
+    "27": "eft",
+    "28": "spotted salamander, Ambystoma maculatum",
+    "29": "axolotl, mud puppy, Ambystoma mexicanum",
+    "30": "bullfrog, Rana catesbeiana",
+    "31": "tree frog, tree-frog",
+    "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
+    "33": "loggerhead, loggerhead turtle, Caretta caretta",
+    "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
+    "35": "mud turtle",
+    "36": "terrapin",
+    "37": "box turtle, box tortoise",
+    "38": "banded gecko",
+    "39": "common iguana, iguana, Iguana iguana",
+    "40": "American chameleon, anole, Anolis carolinensis",
+    "41": "whiptail, whiptail lizard",
+    "42": "agama",
+    "43": "frilled lizard, Chlamydosaurus kingi",
+    "44": "alligator lizard",
+    "45": "Gila monster, Heloderma suspectum",
+    "46": "green lizard, Lacerta viridis",
+    "47": "African chameleon, Chamaeleo chamaeleon",
+    "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
+    "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
+    "50": "American alligator, Alligator mississipiensis",
+    "51": "triceratops",
+    "52": "thunder snake, worm snake, Carphophis amoenus",
+    "53": "ringneck snake, ring-necked snake, ring snake",
+    "54": "hognose snake, puff adder, sand viper",
+    "55": "green snake, grass snake",
+    "56": "king snake, kingsnake",
+    "57": "garter snake, grass snake",
+    "58": "water snake",
+    "59": "vine snake",
+    "60": "night snake, Hypsiglena torquata",
+    "61": "boa constrictor, Constrictor constrictor",
+    "62": "rock python, rock snake, Python sebae",
+    "63": "Indian cobra, Naja naja",
+    "64": "green mamba",
+    "65": "sea snake",
+    "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
+    "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
+    "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
+    "69": "trilobite",
+    "70": "harvestman, daddy longlegs, Phalangium opilio",
+    "71": "scorpion",
+    "72": "black and gold garden spider, Argiope aurantia",
+    "73": "barn spider, Araneus cavaticus",
+    "74": "garden spider, Aranea diademata",
+    "75": "black widow, Latrodectus mactans",
+    "76": "tarantula",
+    "77": "wolf spider, hunting spider",
+    "78": "tick",
+    "79": "centipede",
+    "80": "black grouse",
+    "81": "ptarmigan",
+    "82": "ruffed grouse, partridge, Bonasa umbellus",
+    "83": "prairie chicken, prairie grouse, prairie fowl",
+    "84": "peacock",
+    "85": "quail",
+    "86": "partridge",
+    "87": "African grey, African gray, Psittacus erithacus",
+    "88": "macaw",
+    "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
+    "90": "lorikeet",
+    "91": "coucal",
+    "92": "bee eater",
+    "93": "hornbill",
+    "94": "hummingbird",
+    "95": "jacamar",
+    "96": "toucan",
+    "97": "drake",
+    "98": "red-breasted merganser, Mergus serrator",
+    "99": "goose",
+    "100": "black swan, Cygnus atratus",
+    "101": "tusker",
+    "102": "echidna, spiny anteater, anteater",
+    "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
+    "104": "wallaby, brush kangaroo",
+    "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
+    "106": "wombat",
+    "107": "jellyfish",
+    "108": "sea anemone, anemone",
+    "109": "brain coral",
+    "110": "flatworm, platyhelminth",
+    "111": "nematode, nematode worm, roundworm",
+    "112": "conch",
+    "113": "snail",
+    "114": "slug",
+    "115": "sea slug, nudibranch",
+    "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
+    "117": "chambered nautilus, pearly nautilus, nautilus",
+    "118": "Dungeness crab, Cancer magister",
+    "119": "rock crab, Cancer irroratus",
+    "120": "fiddler crab",
+    "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
+    "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
+    "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
+    "124": "crayfish, crawfish, crawdad, crawdaddy",
+    "125": "hermit crab",
+    "126": "isopod",
+    "127": "white stork, Ciconia ciconia",
+    "128": "black stork, Ciconia nigra",
+    "129": "spoonbill",
+    "130": "flamingo",
+    "131": "little blue heron, Egretta caerulea",
+    "132": "American egret, great white heron, Egretta albus",
+    "133": "bittern",
+    "134": "crane",
+    "135": "limpkin, Aramus pictus",
+    "136": "European gallinule, Porphyrio porphyrio",
+    "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
+    "138": "bustard",
+    "139": "ruddy turnstone, Arenaria interpres",
+    "140": "red-backed sandpiper, dunlin, Erolia alpina",
+    "141": "redshank, Tringa totanus",
+    "142": "dowitcher",
+    "143": "oystercatcher, oyster catcher",
+    "144": "pelican",
+    "145": "king penguin, Aptenodytes patagonica",
+    "146": "albatross, mollymawk",
+    "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
+    "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
+    "149": "dugong, Dugong dugon",
+    "150": "sea lion",
+    "151": "Chihuahua",
+    "152": "Japanese spaniel",
+    "153": "Maltese dog, Maltese terrier, Maltese",
+    "154": "Pekinese, Pekingese, Peke",
+    "155": "Shih-Tzu",
+    "156": "Blenheim spaniel",
+    "157": "papillon",
+    "158": "toy terrier",
+    "159": "Rhodesian ridgeback",
+    "160": "Afghan hound, Afghan",
+    "161": "basset, basset hound",
+    "162": "beagle",
+    "163": "bloodhound, sleuthhound",
+    "164": "bluetick",
+    "165": "black-and-tan coonhound",
+    "166": "Walker hound, Walker foxhound",
+    "167": "English foxhound",
+    "168": "redbone",
+    "169": "borzoi, Russian wolfhound",
+    "170": "Irish wolfhound",
+    "171": "Italian greyhound",
+    "172": "whippet",
+    "173": "Ibizan hound, Ibizan Podenco",
+    "174": "Norwegian elkhound, elkhound",
+    "175": "otterhound, otter hound",
+    "176": "Saluki, gazelle hound",
+    "177": "Scottish deerhound, deerhound",
+    "178": "Weimaraner",
+    "179": "Staffordshire bullterrier, Staffordshire bull terrier",
+    "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
+    "181": "Bedlington terrier",
+    "182": "Border terrier",
+    "183": "Kerry blue terrier",
+    "184": "Irish terrier",
+    "185": "Norfolk terrier",
+    "186": "Norwich terrier",
+    "187": "Yorkshire terrier",
+    "188": "wire-haired fox terrier",
+    "189": "Lakeland terrier",
+    "190": "Sealyham terrier, Sealyham",
+    "191": "Airedale, Airedale terrier",
+    "192": "cairn, cairn terrier",
+    "193": "Australian terrier",
+    "194": "Dandie Dinmont, Dandie Dinmont terrier",
+    "195": "Boston bull, Boston terrier",
+    "196": "miniature schnauzer",
+    "197": "giant schnauzer",
+    "198": "standard schnauzer",
+    "199": "Scotch terrier, Scottish terrier, Scottie",
+    "200": "Tibetan terrier, chrysanthemum dog",
+    "201": "silky terrier, Sydney silky",
+    "202": "soft-coated wheaten terrier",
+    "203": "West Highland white terrier",
+    "204": "Lhasa, Lhasa apso",
+    "205": "flat-coated retriever",
+    "206": "curly-coated retriever",
+    "207": "golden retriever",
+    "208": "Labrador retriever",
+    "209": "Chesapeake Bay retriever",
+    "210": "German short-haired pointer",
+    "211": "vizsla, Hungarian pointer",
+    "212": "English setter",
+    "213": "Irish setter, red setter",
+    "214": "Gordon setter",
+    "215": "Brittany spaniel",
+    "216": "clumber, clumber spaniel",
+    "217": "English springer, English springer spaniel",
+    "218": "Welsh springer spaniel",
+    "219": "cocker spaniel, English cocker spaniel, cocker",
+    "220": "Sussex spaniel",
+    "221": "Irish water spaniel",
+    "222": "kuvasz",
+    "223": "schipperke",
+    "224": "groenendael",
+    "225": "malinois",
+    "226": "briard",
+    "227": "kelpie",
+    "228": "komondor",
+    "229": "Old English sheepdog, bobtail",
+    "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
+    "231": "collie",
+    "232": "Border collie",
+    "233": "Bouvier des Flandres, Bouviers des Flandres",
+    "234": "Rottweiler",
+    "235": "German shepherd, German shepherd dog, German police dog, alsatian",
+    "236": "Doberman, Doberman pinscher",
+    "237": "miniature pinscher",
+    "238": "Greater Swiss Mountain dog",
+    "239": "Bernese mountain dog",
+    "240": "Appenzeller",
+    "241": "EntleBucher",
+    "242": "boxer",
+    "243": "bull mastiff",
+    "244": "Tibetan mastiff",
+    "245": "French bulldog",
+    "246": "Great Dane",
+    "247": "Saint Bernard, St Bernard",
+    "248": "Eskimo dog, husky",
+    "249": "malamute, malemute, Alaskan malamute",
+    "250": "Siberian husky",
+    "251": "dalmatian, coach dog, carriage dog",
+    "252": "affenpinscher, monkey pinscher, monkey dog",
+    "253": "basenji",
+    "254": "pug, pug-dog",
+    "255": "Leonberg",
+    "256": "Newfoundland, Newfoundland dog",
+    "257": "Great Pyrenees",
+    "258": "Samoyed, Samoyede",
+    "259": "Pomeranian",
+    "260": "chow, chow chow",
+    "261": "keeshond",
+    "262": "Brabancon griffon",
+    "263": "Pembroke, Pembroke Welsh corgi",
+    "264": "Cardigan, Cardigan Welsh corgi",
+    "265": "toy poodle",
+    "266": "miniature poodle",
+    "267": "standard poodle",
+    "268": "Mexican hairless",
+    "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
+    "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
+    "271": "red wolf, maned wolf, Canis rufus, Canis niger",
+    "272": "coyote, prairie wolf, brush wolf, Canis latrans",
+    "273": "dingo, warrigal, warragal, Canis dingo",
+    "274": "dhole, Cuon alpinus",
+    "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
+    "276": "hyena, hyaena",
+    "277": "red fox, Vulpes vulpes",
+    "278": "kit fox, Vulpes macrotis",
+    "279": "Arctic fox, white fox, Alopex lagopus",
+    "280": "grey fox, gray fox, Urocyon cinereoargenteus",
+    "281": "tabby, tabby cat",
+    "282": "tiger cat",
+    "283": "Persian cat",
+    "284": "Siamese cat, Siamese",
+    "285": "Egyptian cat",
+    "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
+    "287": "lynx, catamount",
+    "288": "leopard, Panthera pardus",
+    "289": "snow leopard, ounce, Panthera uncia",
+    "290": "jaguar, panther, Panthera onca, Felis onca",
+    "291": "lion, king of beasts, Panthera leo",
+    "292": "tiger, Panthera tigris",
+    "293": "cheetah, chetah, Acinonyx jubatus",
+    "294": "brown bear, bruin, Ursus arctos",
+    "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
+    "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
+    "297": "sloth bear, Melursus ursinus, Ursus ursinus",
+    "298": "mongoose",
+    "299": "meerkat, mierkat",
+    "300": "tiger beetle",
+    "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
+    "302": "ground beetle, carabid beetle",
+    "303": "long-horned beetle, longicorn, longicorn beetle",
+    "304": "leaf beetle, chrysomelid",
+    "305": "dung beetle",
+    "306": "rhinoceros beetle",
+    "307": "weevil",
+    "308": "fly",
+    "309": "bee",
+    "310": "ant, emmet, pismire",
+    "311": "grasshopper, hopper",
+    "312": "cricket",
+    "313": "walking stick, walkingstick, stick insect",
+    "314": "cockroach, roach",
+    "315": "mantis, mantid",
+    "316": "cicada, cicala",
+    "317": "leafhopper",
+    "318": "lacewing, lacewing fly",
+    "319": "dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
+    "320": "damselfly",
+    "321": "admiral",
+    "322": "ringlet, ringlet butterfly",
+    "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
+    "324": "cabbage butterfly",
+    "325": "sulphur butterfly, sulfur butterfly",
+    "326": "lycaenid, lycaenid butterfly",
+    "327": "starfish, sea star",
+    "328": "sea urchin",
+    "329": "sea cucumber, holothurian",
+    "330": "wood rabbit, cottontail, cottontail rabbit",
+    "331": "hare",
+    "332": "Angora, Angora rabbit",
+    "333": "hamster",
+    "334": "porcupine, hedgehog",
+    "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
+    "336": "marmot",
+    "337": "beaver",
+    "338": "guinea pig, Cavia cobaya",
+    "339": "sorrel",
+    "340": "zebra",
+    "341": "hog, pig, grunter, squealer, Sus scrofa",
+    "342": "wild boar, boar, Sus scrofa",
+    "343": "warthog",
+    "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
+    "345": "ox",
+    "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
+    "347": "bison",
+    "348": "ram, tup",
+    "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
+    "350": "ibex, Capra ibex",
+    "351": "hartebeest",
+    "352": "impala, Aepyceros melampus",
+    "353": "gazelle",
+    "354": "Arabian camel, dromedary, Camelus dromedarius",
+    "355": "llama",
+    "356": "weasel",
+    "357": "mink",
+    "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
+    "359": "black-footed ferret, ferret, Mustela nigripes",
+    "360": "otter",
+    "361": "skunk, polecat, wood pussy",
+    "362": "badger",
+    "363": "armadillo",
+    "364": "three-toed sloth, ai, Bradypus tridactylus",
+    "365": "orangutan, orang, orangutang, Pongo pygmaeus",
+    "366": "gorilla, Gorilla gorilla",
+    "367": "chimpanzee, chimp, Pan troglodytes",
+    "368": "gibbon, Hylobates lar",
+    "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
+    "370": "guenon, guenon monkey",
+    "371": "patas, hussar monkey, Erythrocebus patas",
+    "372": "baboon",
+    "373": "macaque",
+    "374": "langur",
+    "375": "colobus, colobus monkey",
+    "376": "proboscis monkey, Nasalis larvatus",
+    "377": "marmoset",
+    "378": "capuchin, ringtail, Cebus capucinus",
+    "379": "howler monkey, howler",
+    "380": "titi, titi monkey",
+    "381": "spider monkey, Ateles geoffroyi",
+    "382": "squirrel monkey, Saimiri sciureus",
+    "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
+    "384": "indri, indris, Indri indri, Indri brevicaudatus",
+    "385": "Indian elephant, Elephas maximus",
+    "386": "African elephant, Loxodonta africana",
+    "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
+    "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
+    "389": "barracouta, snoek",
+    "390": "eel",
+    "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
+    "392": "rock beauty, Holocanthus tricolor",
+    "393": "anemone fish",
+    "394": "sturgeon",
+    "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
+    "396": "lionfish",
+    "397": "puffer, pufferfish, blowfish, globefish",
+    "398": "abacus",
+    "399": "abaya",
+    "400": "academic gown, academic robe, judge robe",
+    "401": "accordion, piano accordion, squeeze box",
+    "402": "acoustic guitar",
+    "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
+    "404": "airliner",
+    "405": "airship, dirigible",
+    "406": "altar",
+    "407": "ambulance",
+    "408": "amphibian, amphibious vehicle",
+    "409": "analog clock",
+    "410": "apiary, bee house",
+    "411": "apron",
+    "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
+    "413": "assault rifle, assault gun",
+    "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
+    "415": "bakery, bakeshop, bakehouse",
+    "416": "balance beam, beam",
+    "417": "balloon",
+    "418": "ballpoint, ballpoint pen, ballpen, Biro",
+    "419": "Band Aid",
+    "420": "banjo",
+    "421": "bannister, banister, balustrade, balusters, handrail",
+    "422": "barbell",
+    "423": "barber chair",
+    "424": "barbershop",
+    "425": "barn",
+    "426": "barometer",
+    "427": "barrel, cask",
+    "428": "barrow, garden cart, lawn cart, wheelbarrow",
+    "429": "baseball",
+    "430": "basketball",
+    "431": "bassinet",
+    "432": "bassoon",
+    "433": "bathing cap, swimming cap",
+    "434": "bath towel",
+    "435": "bathtub, bathing tub, bath, tub",
+    "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
+    "437": "beacon, lighthouse, beacon light, pharos",
+    "438": "beaker",
+    "439": "bearskin, busby, shako",
+    "440": "beer bottle",
+    "441": "beer glass",
+    "442": "bell cote, bell cot",
+    "443": "bib",
+    "444": "bicycle-built-for-two, tandem bicycle, tandem",
+    "445": "bikini, two-piece",
+    "446": "binder, ring-binder",
+    "447": "binoculars, field glasses, opera glasses",
+    "448": "birdhouse",
+    "449": "boathouse",
+    "450": "bobsled, bobsleigh, bob",
+    "451": "bolo tie, bolo, bola tie, bola",
+    "452": "bonnet, poke bonnet",
+    "453": "bookcase",
+    "454": "bookshop, bookstore, bookstall",
+    "455": "bottlecap",
+    "456": "bow",
+    "457": "bow tie, bow-tie, bowtie",
+    "458": "brass, memorial tablet, plaque",
+    "459": "brassiere, bra, bandeau",
+    "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
+    "461": "breastplate, aegis, egis",
+    "462": "broom",
+    "463": "bucket, pail",
+    "464": "buckle",
+    "465": "bulletproof vest",
+    "466": "bullet train, bullet",
+    "467": "butcher shop, meat market",
+    "468": "cab, hack, taxi, taxicab",
+    "469": "caldron, cauldron",
+    "470": "candle, taper, wax light",
+    "471": "cannon",
+    "472": "canoe",
+    "473": "can opener, tin opener",
+    "474": "cardigan",
+    "475": "car mirror",
+    "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
+    "477": "carpenters kit, tool kit",
+    "478": "carton",
+    "479": "car wheel",
+    "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
+    "481": "cassette",
+    "482": "cassette player",
+    "483": "castle",
+    "484": "catamaran",
+    "485": "CD player",
+    "486": "cello, violoncello",
+    "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
+    "488": "chain",
+    "489": "chainlink fence",
+    "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
+    "491": "chain saw, chainsaw",
+    "492": "chest",
+    "493": "chiffonier, commode",
+    "494": "chime, bell, gong",
+    "495": "china cabinet, china closet",
+    "496": "Christmas stocking",
+    "497": "church, church building",
+    "498": "cinema, movie theater, movie theatre, movie house, picture palace",
+    "499": "cleaver, meat cleaver, chopper",
+    "500": "cliff dwelling",
+    "501": "cloak",
+    "502": "clog, geta, patten, sabot",
+    "503": "cocktail shaker",
+    "504": "coffee mug",
+    "505": "coffeepot",
+    "506": "coil, spiral, volute, whorl, helix",
+    "507": "combination lock",
+    "508": "computer keyboard, keypad",
+    "509": "confectionery, confectionary, candy store",
+    "510": "container ship, containership, container vessel",
+    "511": "convertible",
+    "512": "corkscrew, bottle screw",
+    "513": "cornet, horn, trumpet, trump",
+    "514": "cowboy boot",
+    "515": "cowboy hat, ten-gallon hat",
+    "516": "cradle",
+    "517": "crane",
+    "518": "crash helmet",
+    "519": "crate",
+    "520": "crib, cot",
+    "521": "Crock Pot",
+    "522": "croquet ball",
+    "523": "crutch",
+    "524": "cuirass",
+    "525": "dam, dike, dyke",
+    "526": "desk",
+    "527": "desktop computer",
+    "528": "dial telephone, dial phone",
+    "529": "diaper, nappy, napkin",
+    "530": "digital clock",
+    "531": "digital watch",
+    "532": "dining table, board",
+    "533": "dishrag, dishcloth",
+    "534": "dishwasher, dish washer, dishwashing machine",
+    "535": "disk brake, disc brake",
+    "536": "dock, dockage, docking facility",
+    "537": "dogsled, dog sled, dog sleigh",
+    "538": "dome",
+    "539": "doormat, welcome mat",
+    "540": "drilling platform, offshore rig",
+    "541": "drum, membranophone, tympan",
+    "542": "drumstick",
+    "543": "dumbbell",
+    "544": "Dutch oven",
+    "545": "electric fan, blower",
+    "546": "electric guitar",
+    "547": "electric locomotive",
+    "548": "entertainment center",
+    "549": "envelope",
+    "550": "espresso maker",
+    "551": "face powder",
+    "552": "feather boa, boa",
+    "553": "file, file cabinet, filing cabinet",
+    "554": "fireboat",
+    "555": "fire engine, fire truck",
+    "556": "fire screen, fireguard",
+    "557": "flagpole, flagstaff",
+    "558": "flute, transverse flute",
+    "559": "folding chair",
+    "560": "football helmet",
+    "561": "forklift",
+    "562": "fountain",
+    "563": "fountain pen",
+    "564": "four-poster",
+    "565": "freight car",
+    "566": "French horn, horn",
+    "567": "frying pan, frypan, skillet",
+    "568": "fur coat",
+    "569": "garbage truck, dustcart",
+    "570": "gasmask, respirator, gas helmet",
+    "571": "gas pump, gasoline pump, petrol pump, island dispenser",
+    "572": "goblet",
+    "573": "go-kart",
+    "574": "golf ball",
+    "575": "golfcart, golf cart",
+    "576": "gondola",
+    "577": "gong, tam-tam",
+    "578": "gown",
+    "579": "grand piano, grand",
+    "580": "greenhouse, nursery, glasshouse",
+    "581": "grille, radiator grille",
+    "582": "grocery store, grocery, food market, market",
+    "583": "guillotine",
+    "584": "hair slide",
+    "585": "hair spray",
+    "586": "half track",
+    "587": "hammer",
+    "588": "hamper",
+    "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
+    "590": "hand-held computer, hand-held microcomputer",
+    "591": "handkerchief, hankie, hanky, hankey",
+    "592": "hard disc, hard disk, fixed disk",
+    "593": "harmonica, mouth organ, harp, mouth harp",
+    "594": "harp",
+    "595": "harvester, reaper",
+    "596": "hatchet",
+    "597": "holster",
+    "598": "home theater, home theatre",
+    "599": "honeycomb",
+    "600": "hook, claw",
+    "601": "hoopskirt, crinoline",
+    "602": "horizontal bar, high bar",
+    "603": "horse cart, horse-cart",
+    "604": "hourglass",
+    "605": "iPod",
+    "606": "iron, smoothing iron",
+    "607": "jack-o-lantern",
+    "608": "jean, blue jean, denim",
+    "609": "jeep, landrover",
+    "610": "jersey, T-shirt, tee shirt",
+    "611": "jigsaw puzzle",
+    "612": "jinrikisha, ricksha, rickshaw",
+    "613": "joystick",
+    "614": "kimono",
+    "615": "knee pad",
+    "616": "knot",
+    "617": "lab coat, laboratory coat",
+    "618": "ladle",
+    "619": "lampshade, lamp shade",
+    "620": "laptop, laptop computer",
+    "621": "lawn mower, mower",
+    "622": "lens cap, lens cover",
+    "623": "letter opener, paper knife, paperknife",
+    "624": "library",
+    "625": "lifeboat",
+    "626": "lighter, light, igniter, ignitor",
+    "627": "limousine, limo",
+    "628": "liner, ocean liner",
+    "629": "lipstick, lip rouge",
+    "630": "Loafer",
+    "631": "lotion",
+    "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
+    "633": "loupe, jewelers loupe",
+    "634": "lumbermill, sawmill",
+    "635": "magnetic compass",
+    "636": "mailbag, postbag",
+    "637": "mailbox, letter box",
+    "638": "maillot",
+    "639": "maillot, tank suit",
+    "640": "manhole cover",
+    "641": "maraca",
+    "642": "marimba, xylophone",
+    "643": "mask",
+    "644": "matchstick",
+    "645": "maypole",
+    "646": "maze, labyrinth",
+    "647": "measuring cup",
+    "648": "medicine chest, medicine cabinet",
+    "649": "megalith, megalithic structure",
+    "650": "microphone, mike",
+    "651": "microwave, microwave oven",
+    "652": "military uniform",
+    "653": "milk can",
+    "654": "minibus",
+    "655": "miniskirt, mini",
+    "656": "minivan",
+    "657": "missile",
+    "658": "mitten",
+    "659": "mixing bowl",
+    "660": "mobile home, manufactured home",
+    "661": "Model T",
+    "662": "modem",
+    "663": "monastery",
+    "664": "monitor",
+    "665": "moped",
+    "666": "mortar",
+    "667": "mortarboard",
+    "668": "mosque",
+    "669": "mosquito net",
+    "670": "motor scooter, scooter",
+    "671": "mountain bike, all-terrain bike, off-roader",
+    "672": "mountain tent",
+    "673": "mouse, computer mouse",
+    "674": "mousetrap",
+    "675": "moving van",
+    "676": "muzzle",
+    "677": "nail",
+    "678": "neck brace",
+    "679": "necklace",
+    "680": "nipple",
+    "681": "notebook, notebook computer",
+    "682": "obelisk",
+    "683": "oboe, hautboy, hautbois",
+    "684": "ocarina, sweet potato",
+    "685": "odometer, hodometer, mileometer, milometer",
+    "686": "oil filter",
+    "687": "organ, pipe organ",
+    "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
+    "689": "overskirt",
+    "690": "oxcart",
+    "691": "oxygen mask",
+    "692": "packet",
+    "693": "paddle, boat paddle",
+    "694": "paddlewheel, paddle wheel",
+    "695": "padlock",
+    "696": "paintbrush",
+    "697": "pajama, pyjama, pjs, jammies",
+    "698": "palace",
+    "699": "panpipe, pandean pipe, syrinx",
+    "700": "paper towel",
+    "701": "parachute, chute",
+    "702": "parallel bars, bars",
+    "703": "park bench",
+    "704": "parking meter",
+    "705": "passenger car, coach, carriage",
+    "706": "patio, terrace",
+    "707": "pay-phone, pay-station",
+    "708": "pedestal, plinth, footstall",
+    "709": "pencil box, pencil case",
+    "710": "pencil sharpener",
+    "711": "perfume, essence",
+    "712": "Petri dish",
+    "713": "photocopier",
+    "714": "pick, plectrum, plectron",
+    "715": "pickelhaube",
+    "716": "picket fence, paling",
+    "717": "pickup, pickup truck",
+    "718": "pier",
+    "719": "piggy bank, penny bank",
+    "720": "pill bottle",
+    "721": "pillow",
+    "722": "ping-pong ball",
+    "723": "pinwheel",
+    "724": "pirate, pirate ship",
+    "725": "pitcher, ewer",
+    "726": "plane, carpenters plane, woodworking plane",
+    "727": "planetarium",
+    "728": "plastic bag",
+    "729": "plate rack",
+    "730": "plow, plough",
+    "731": "plunger, plumbers helper",
+    "732": "Polaroid camera, Polaroid Land camera",
+    "733": "pole",
+    "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
+    "735": "poncho",
+    "736": "pool table, billiard table, snooker table",
+    "737": "pop bottle, soda bottle",
+    "738": "pot, flowerpot",
+    "739": "potters wheel",
+    "740": "power drill",
+    "741": "prayer rug, prayer mat",
+    "742": "printer",
+    "743": "prison, prison house",
+    "744": "projectile, missile",
+    "745": "projector",
+    "746": "puck, hockey puck",
+    "747": "punching bag, punch bag, punching ball, punchball",
+    "748": "purse",
+    "749": "quill, quill pen",
+    "750": "quilt, comforter, comfort, puff",
+    "751": "racer, race car, racing car",
+    "752": "racket, racquet",
+    "753": "radiator",
+    "754": "radio, wireless",
+    "755": "radio telescope, radio reflector",
+    "756": "rain barrel",
+    "757": "recreational vehicle, RV, R.V.",
+    "758": "reel",
+    "759": "reflex camera",
+    "760": "refrigerator, icebox",
+    "761": "remote control, remote",
+    "762": "restaurant, eating house, eating place, eatery",
+    "763": "revolver, six-gun, six-shooter",
+    "764": "rifle",
+    "765": "rocking chair, rocker",
+    "766": "rotisserie",
+    "767": "rubber eraser, rubber, pencil eraser",
+    "768": "rugby ball",
+    "769": "rule, ruler",
+    "770": "running shoe",
+    "771": "safe",
+    "772": "safety pin",
+    "773": "saltshaker, salt shaker",
+    "774": "sandal",
+    "775": "sarong",
+    "776": "sax, saxophone",
+    "777": "scabbard",
+    "778": "scale, weighing machine",
+    "779": "school bus",
+    "780": "schooner",
+    "781": "scoreboard",
+    "782": "screen, CRT screen",
+    "783": "screw",
+    "784": "screwdriver",
+    "785": "seat belt, seatbelt",
+    "786": "sewing machine",
+    "787": "shield, buckler",
+    "788": "shoe shop, shoe-shop, shoe store",
+    "789": "shoji",
+    "790": "shopping basket",
+    "791": "shopping cart",
+    "792": "shovel",
+    "793": "shower cap",
+    "794": "shower curtain",
+    "795": "ski",
+    "796": "ski mask",
+    "797": "sleeping bag",
+    "798": "slide rule, slipstick",
+    "799": "sliding door",
+    "800": "slot, one-armed bandit",
+    "801": "snorkel",
+    "802": "snowmobile",
+    "803": "snowplow, snowplough",
+    "804": "soap dispenser",
+    "805": "soccer ball",
+    "806": "sock",
+    "807": "solar dish, solar collector, solar furnace",
+    "808": "sombrero",
+    "809": "soup bowl",
+    "810": "space bar",
+    "811": "space heater",
+    "812": "space shuttle",
+    "813": "spatula",
+    "814": "speedboat",
+    "815": "spider web, spiders web",
+    "816": "spindle",
+    "817": "sports car, sport car",
+    "818": "spotlight, spot",
+    "819": "stage",
+    "820": "steam locomotive",
+    "821": "steel arch bridge",
+    "822": "steel drum",
+    "823": "stethoscope",
+    "824": "stole",
+    "825": "stone wall",
+    "826": "stopwatch, stop watch",
+    "827": "stove",
+    "828": "strainer",
+    "829": "streetcar, tram, tramcar, trolley, trolley car",
+    "830": "stretcher",
+    "831": "studio couch, day bed",
+    "832": "stupa, tope",
+    "833": "submarine, pigboat, sub, U-boat",
+    "834": "suit, suit of clothes",
+    "835": "sundial",
+    "836": "sunglass",
+    "837": "sunglasses, dark glasses, shades",
+    "838": "sunscreen, sunblock, sun blocker",
+    "839": "suspension bridge",
+    "840": "swab, swob, mop",
+    "841": "sweatshirt",
+    "842": "swimming trunks, bathing trunks",
+    "843": "swing",
+    "844": "switch, electric switch, electrical switch",
+    "845": "syringe",
+    "846": "table lamp",
+    "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
+    "848": "tape player",
+    "849": "teapot",
+    "850": "teddy, teddy bear",
+    "851": "television, television system",
+    "852": "tennis ball",
+    "853": "thatch, thatched roof",
+    "854": "theater curtain, theatre curtain",
+    "855": "thimble",
+    "856": "thresher, thrasher, threshing machine",
+    "857": "throne",
+    "858": "tile roof",
+    "859": "toaster",
+    "860": "tobacco shop, tobacconist shop, tobacconist",
+    "861": "toilet seat",
+    "862": "torch",
+    "863": "totem pole",
+    "864": "tow truck, tow car, wrecker",
+    "865": "toyshop",
+    "866": "tractor",
+    "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
+    "868": "tray",
+    "869": "trench coat",
+    "870": "tricycle, trike, velocipede",
+    "871": "trimaran",
+    "872": "tripod",
+    "873": "triumphal arch",
+    "874": "trolleybus, trolley coach, trackless trolley",
+    "875": "trombone",
+    "876": "tub, vat",
+    "877": "turnstile",
+    "878": "typewriter keyboard",
+    "879": "umbrella",
+    "880": "unicycle, monocycle",
+    "881": "upright, upright piano",
+    "882": "vacuum, vacuum cleaner",
+    "883": "vase",
+    "884": "vault",
+    "885": "velvet",
+    "886": "vending machine",
+    "887": "vestment",
+    "888": "viaduct",
+    "889": "violin, fiddle",
+    "890": "volleyball",
+    "891": "waffle iron",
+    "892": "wall clock",
+    "893": "wallet, billfold, notecase, pocketbook",
+    "894": "wardrobe, closet, press",
+    "895": "warplane, military plane",
+    "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
+    "897": "washer, automatic washer, washing machine",
+    "898": "water bottle",
+    "899": "water jug",
+    "900": "water tower",
+    "901": "whiskey jug",
+    "902": "whistle",
+    "903": "wig",
+    "904": "window screen",
+    "905": "window shade",
+    "906": "Windsor tie",
+    "907": "wine bottle",
+    "908": "wing",
+    "909": "wok",
+    "910": "wooden spoon",
+    "911": "wool, woolen, woollen",
+    "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
+    "913": "wreck",
+    "914": "yawl",
+    "915": "yurt",
+    "916": "web site, website, internet site, site",
+    "917": "comic book",
+    "918": "crossword puzzle, crossword",
+    "919": "street sign",
+    "920": "traffic light, traffic signal, stoplight",
+    "921": "book jacket, dust cover, dust jacket, dust wrapper",
+    "922": "menu",
+    "923": "plate",
+    "924": "guacamole",
+    "925": "consomme",
+    "926": "hot pot, hotpot",
+    "927": "trifle",
+    "928": "ice cream, icecream",
+    "929": "ice lolly, lolly, lollipop, popsicle",
+    "930": "French loaf",
+    "931": "bagel, beigel",
+    "932": "pretzel",
+    "933": "cheeseburger",
+    "934": "hotdog, hot dog, red hot",
+    "935": "mashed potato",
+    "936": "head cabbage",
+    "937": "broccoli",
+    "938": "cauliflower",
+    "939": "zucchini, courgette",
+    "940": "spaghetti squash",
+    "941": "acorn squash",
+    "942": "butternut squash",
+    "943": "cucumber, cuke",
+    "944": "artichoke, globe artichoke",
+    "945": "bell pepper",
+    "946": "cardoon",
+    "947": "mushroom",
+    "948": "Granny Smith",
+    "949": "strawberry",
+    "950": "orange",
+    "951": "lemon",
+    "952": "fig",
+    "953": "pineapple, ananas",
+    "954": "banana",
+    "955": "jackfruit, jak, jack",
+    "956": "custard apple",
+    "957": "pomegranate",
+    "958": "hay",
+    "959": "carbonara",
+    "960": "chocolate sauce, chocolate syrup",
+    "961": "dough",
+    "962": "meat loaf, meatloaf",
+    "963": "pizza, pizza pie",
+    "964": "potpie",
+    "965": "burrito",
+    "966": "red wine",
+    "967": "espresso",
+    "968": "cup",
+    "969": "eggnog",
+    "970": "alp",
+    "971": "bubble",
+    "972": "cliff, drop, drop-off",
+    "973": "coral reef",
+    "974": "geyser",
+    "975": "lakeside, lakeshore",
+    "976": "promontory, headland, head, foreland",
+    "977": "sandbar, sand bar",
+    "978": "seashore, coast, seacoast, sea-coast",
+    "979": "valley, vale",
+    "980": "volcano",
+    "981": "ballplayer, baseball player",
+    "982": "groom, bridegroom",
+    "983": "scuba diver",
+    "984": "rapeseed",
+    "985": "daisy",
+    "986": "yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
+    "987": "corn",
+    "988": "acorn",
+    "989": "hip, rose hip, rosehip",
+    "990": "buckeye, horse chestnut, conker",
+    "991": "coral fungus",
+    "992": "agaric",
+    "993": "gyromitra",
+    "994": "stinkhorn, carrion fungus",
+    "995": "earthstar",
+    "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
+    "997": "bolete",
+    "998": "ear, spike, capitulum",
+    "999": "toilet tissue, toilet paper, bathroom tissue"
+  }
 }

PixelFlow-256/pipeline.py CHANGED Viewed

@@ -1,16 +1,23 @@
-"""Hub custom pipeline: PixelFlowPipeline.
-Load with native Hugging Face diffusers and `trust_remote_code=True`.
-"""
-from __future__ import annotations
 import importlib
 import math
 import sys
-from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
@@ -19,25 +26,83 @@ from einops import rearrange
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.embeddings import get_2d_rotary_pos_embed
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.utils import BaseOutput
 from diffusers.utils.torch_utils import randn_tensor
-@dataclass
-class PixelFlowPipelineOutput(BaseOutput):
-    images: Union[torch.Tensor, List, np.ndarray]
 class PixelFlowPipeline(DiffusionPipeline):
-    """Pipeline for PixelFlow pixel-space flow generation (class-conditional or text-to-image)."""
-    model_cpu_offload_seq = "text_encoder->transformer"
-    _optional_components = ["text_encoder", "tokenizer"]
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
         """Load a self-contained variant folder locally or from the Hub."""
         repo_root = Path(__file__).resolve().parent
         if pretrained_model_name_or_path in (None, "", "."):
@@ -62,109 +127,78 @@ class PixelFlowPipeline(DiffusionPipeline):
             if subfolder:
                 variant = variant / subfolder
         model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        def _load_text_components():
-            text_encoder = None
-            tokenizer = None
-            te_dir = variant / "text_encoder"
-            tok_dir = variant / "tokenizer"
-            if te_dir.exists() and (te_dir / "config.json").exists():
-                from transformers import T5EncoderModel, T5Tokenizer
-                text_encoder = T5EncoderModel.from_pretrained(str(te_dir), **model_kwargs)
-                tokenizer = T5Tokenizer.from_pretrained(str(tok_dir))
-            return text_encoder, tokenizer
         try:
-            transformer = _load_component("transformer", "transformer_pixelflow", "PixelFlowTransformer2DModel")
-            scheduler = _load_component("scheduler", "scheduling_pixelflow", "PixelFlowScheduler")
-            text_encoder, tokenizer = _load_text_components()
-            if scheduler is None:
-                sched_dir = variant / "scheduler"
-                if (sched_dir / "scheduling_pixelflow.py").exists():
-                    sched_path = str(sched_dir)
-                    if sched_path not in sys.path:
-                        sys.path.insert(0, sched_path)
-                        inserted.append(sched_path)
-                    scheduler = importlib.import_module("scheduling_pixelflow").PixelFlowScheduler()
-            if transformer is None:
                 raise ValueError(f"No loadable transformer found under {variant}")
-            id2label = None
-            id2label_cn = None
-            labels_dir = variant.parent / "labels"
-            if labels_dir.is_dir():
-                labels_path = str(labels_dir)
-                if labels_path not in sys.path:
-                    sys.path.insert(0, labels_path)
-                    inserted.append(labels_path)
-                from imagenet_labels import load_id2label
-                id2label = load_id2label(labels_dir, lang="en")
-                id2label_cn = load_id2label(labels_dir, lang="cn")
-            return cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                text_encoder=text_encoder,
-                tokenizer=tokenizer,
-                id2label=id2label,
-                id2label_cn=id2label_cn,
-            )
         finally:
             for comp_path in inserted:
                 if comp_path in sys.path:
                     sys.path.remove(comp_path)
-    def __init__(
-        self,
-        transformer,
-        scheduler,
-        text_encoder=None,
-        tokenizer=None,
-        max_token_length: int = 512,
-        id2label: Optional[dict[int, str]] = None,
-        id2label_cn: Optional[dict[int, str]] = None,
-    ):
-        super().__init__()
-        self.register_modules(
-            transformer=transformer,
-            scheduler=scheduler,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-        )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
-        self.class_cond = transformer.config.num_classes > 0
-        self.max_token_length = max_token_length
-        self._id2label = id2label or {}
-        self._id2label_cn = id2label_cn or {}
-        self.labels = self._build_label2id(self._id2label)
-        self.labels_cn = self._build_label2id(self._id2label_cn)
     @staticmethod
-    def _build_label2id(id2label: dict[int, str]) -> dict[str, int]:
-        label2id: dict[str, int] = {}
         for class_id, value in id2label.items():
             for synonym in value.split(","):
                 synonym = synonym.strip()
@@ -173,37 +207,23 @@ class PixelFlowPipeline(DiffusionPipeline):
         return dict(sorted(label2id.items()))
     @property
-    def id2label(self) -> dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
-    @property
-    def id2label_cn(self) -> dict[int, str]:
-        """ImageNet class id to Chinese label string (comma-separated synonyms)."""
-        return self._id2label_cn
-    def get_label_ids(self, label: Union[str, List[str]], lang: str = "en") -> List[int]:
         r"""
         Map ImageNet label strings to class ids.
         Args:
             label (`str` or `list[str]`):
-                One or more label strings. Each string must match a synonym in `id2label` (English)
-                or `id2label_cn` (Chinese).
-            lang (`str`, *optional*, defaults to `"en"`):
-                `"en"` uses English synonyms; `"cn"` uses Chinese synonyms.
-        Returns:
-            `list[int]`: Class ids for [`~PixelFlowPipeline.__call__`].
         """
-        if lang not in ("en", "cn"):
-            raise ValueError(f"`lang` must be 'en' or 'cn', got {lang!r}.")
-        label2id = self.labels if lang == "en" else self.labels_cn
         if not label2id:
-            raise ValueError(
-                f"No {lang} labels loaded. Ensure `labels/id2label_{lang}.json` exists next to the variant folder."
-            )
         if isinstance(label, str):
             label = [label]
@@ -211,279 +231,246 @@ class PixelFlowPipeline(DiffusionPipeline):
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown label(s) for lang={lang!r}: {missing}. Example valid labels: {preview}, ..."
-            )
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         self,
-        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]],
-    ) -> Optional[Union[int, List[int], torch.Tensor]]:
-        if class_labels is None:
-            return None
-        if isinstance(class_labels, str):
-            return self.get_label_ids(class_labels)[0]
-        if isinstance(class_labels, list) and class_labels and isinstance(class_labels[0], str):
-            if all(label in self.labels for label in class_labels):
-                return self.get_label_ids(class_labels, lang="en")
-            if all(label in self.labels_cn for label in class_labels):
-                return self.get_label_ids(class_labels, lang="cn")
             raise ValueError(
-                "Could not resolve string `class_labels`. Use English synonyms from `pipe.labels` "
-                "or Chinese synonyms from `pipe.labels_cn`."
             )
-        return class_labels
-    def sample_block_noise(self, bs, ch, height, width, eps=1e-6):
         gamma = self.scheduler.gamma
         dist = torch.distributions.multivariate_normal.MultivariateNormal(
             torch.zeros(4),
             torch.eye(4) * (1 - gamma) + torch.ones(4, 4) * gamma + eps * torch.eye(4),
         )
-        block_number = bs * ch * (height // 2) * (width // 2)
         noise = torch.stack([dist.sample() for _ in range(block_number)])
-        noise = rearrange(
             noise,
             "(b c h w) (p q) -> b c (h p) (w q)",
-            b=bs,
-            c=ch,
             h=height // 2,
             w=width // 2,
             p=2,
             q=2,
         )
-        return noise
-    def _stage_guidance_scale(self, stage_idx: int) -> float:
-        if not self.class_cond:
-            return self._guidance_scale_value
-        scale_dict = {0: 0, 1: 1 / 6, 2: 2 / 3, 3: 1}
-        return (self._guidance_scale_value - 1) * scale_dict[stage_idx] + 1
-    @property
-    def do_classifier_free_guidance(self) -> bool:
-        return self._guidance_scale_value > 0
-    @torch.no_grad()
-    def encode_prompt(
         self,
-        prompt: Union[str, List[str]],
         device: torch.device,
-        num_images_per_prompt: int = 1,
-        do_classifier_free_guidance: bool = True,
-        negative_prompt: Union[str, List[str]] = "",
-        max_length: Optional[int] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if self.text_encoder is None or self.tokenizer is None:
-            raise ValueError("Text-to-image generation requires `text_encoder` and `tokenizer`.")
-        if isinstance(prompt, str):
-            prompt = [prompt]
-        batch_size = len(prompt)
-        max_length = max_length or self.max_token_length
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=max_length,
-            truncation=True,
-            add_special_tokens=True,
-            return_tensors="pt",
         )
-        text_input_ids = text_inputs.input_ids.to(device)
-        prompt_attention_mask = text_inputs.attention_mask.to(device)
-        prompt_embeds = self.text_encoder(
-            text_input_ids,
-            attention_mask=prompt_attention_mask,
-        )[0]
-        dtype = self.text_encoder.dtype
-        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1).repeat(num_images_per_prompt, 1)
-        if do_classifier_free_guidance:
-            if isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt] * batch_size
-            elif isinstance(negative_prompt, list):
-                if len(negative_prompt) != batch_size:
-                    raise ValueError(
-                        f"Negative prompt list length ({len(negative_prompt)}) must match prompt batch ({batch_size})."
-                    )
-                uncond_tokens = negative_prompt
-            else:
-                raise ValueError("Negative prompt must be a string or list of strings.")
-            uncond_inputs = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=prompt_embeds.shape[1],
-                truncation=True,
-                return_attention_mask=True,
-                add_special_tokens=True,
-                return_tensors="pt",
-            )
-            negative_input_ids = uncond_inputs.input_ids.to(device)
-            negative_prompt_attention_mask = uncond_inputs.attention_mask.to(device)
-            negative_prompt_embeds = self.text_encoder(
-                negative_input_ids,
-                attention_mask=negative_prompt_attention_mask,
-            )[0]
-            seq_len_neg = negative_prompt_embeds.shape[1]
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len_neg, -1)
-            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1).repeat(num_images_per_prompt, 1)
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
-        return prompt_embeds, prompt_attention_mask
-    @torch.no_grad()
     def __call__(
         self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: Union[int, List[int]] = 10,
         guidance_scale: float = 4.0,
         shift: float = 1.0,
-        negative_prompt: Union[str, List[str]] = "",
-        num_images_per_prompt: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: str = "pil",
         return_dict: bool = True,
-    ) -> Union[PixelFlowPipelineOutput, Tuple]:
-        if height is None:
-            height = int(self.transformer.config.sample_size)
-        if width is None:
-            width = int(self.transformer.config.sample_size)
-        device = self._execution_device
-        self._guidance_scale_value = guidance_scale
-        if isinstance(num_inference_steps, int):
-            num_inference_steps = [num_inference_steps] * self.scheduler.num_stages
-        prompt_attention_mask = None
-        if self.class_cond:
-            if class_labels is None:
-                raise ValueError("`class_labels` are required for class-conditional PixelFlow checkpoints.")
-            class_labels = self._normalize_class_labels(class_labels)
-            if isinstance(class_labels, int):
-                class_labels = [class_labels]
-            if not torch.is_tensor(class_labels):
-                class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
-            else:
-                class_labels = class_labels.to(device=device, dtype=torch.long)
-            batch_size = class_labels.shape[0]
-            prompt_embeds = class_labels
-            negative_prompt_embeds = torch.full_like(prompt_embeds, self.transformer.config.num_classes)
-            if self.do_classifier_free_guidance:
-                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-        else:
-            if prompt is None:
-                raise ValueError("`prompt` is required for text-to-image PixelFlow checkpoints.")
-            if isinstance(prompt, str):
-                prompt = [prompt]
-            batch_size = len(prompt)
-            prompt_embeds, prompt_attention_mask = self.encode_prompt(
-                prompt,
-                device,
-                num_images_per_prompt=num_images_per_prompt,
-                do_classifier_free_guidance=self.do_classifier_free_guidance and guidance_scale > 1.0,
-                negative_prompt=negative_prompt,
-            )
-        init_factor = 2 ** (self.scheduler.num_stages - 1)
-        height, width = height // init_factor, width // init_factor
-        latents = randn_tensor(
-            (batch_size * num_images_per_prompt, 3, height, width),
-            generator=generator,
-            device=device,
-            dtype=torch.float32,
-        )
-        for stage_idx in range(self.scheduler.num_stages):
-            self.scheduler.set_timesteps(num_inference_steps[stage_idx], stage_idx, device=device, shift=shift)
-            timesteps = self.scheduler.Timesteps
-            if stage_idx > 0:
-                height, width = height * 2, width * 2
-                latents = F.interpolate(latents, size=(height, width), mode="nearest")
-                original_start_t = self.scheduler.original_start_t[stage_idx]
-                gamma = self.scheduler.gamma
-                alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
-                beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
-                noise = self.sample_block_noise(*latents.shape)
-                noise = noise.to(device=device, dtype=latents.dtype)
-                latents = alpha * latents + beta * noise
-            size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
-            pos_embed = get_2d_rotary_pos_embed(
-                embed_dim=self.transformer.attention_head_dim,
-                crops_coords=((0, 0), (latents.shape[-1] // self.transformer.patch_size, latents.shape[-1] // self.transformer.patch_size)),
-                grid_size=(latents.shape[-1] // self.transformer.patch_size, latents.shape[-1] // self.transformer.patch_size),
-                device=device,
-                output_type="pt",
-            )
-            rope_pos = torch.stack(pos_embed, -1)
-            autocast_enabled = device.type == "cuda"
-            autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
-            for timestep in timesteps:
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
-                with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
-                    if self.class_cond:
-                        noise_pred = self.transformer(
-                            latent_model_input,
-                            timestep=timestep_batch,
-                            class_labels=prompt_embeds,
-                            latent_size=size_tensor,
-                            pos_embed=rope_pos,
-                        ).sample
-                    else:
                         noise_pred = self.transformer(
                             latent_model_input,
-                            encoder_hidden_states=prompt_embeds,
-                            encoder_attention_mask=prompt_attention_mask,
                             timestep=timestep_batch,
                             latent_size=size_tensor,
                             pos_embed=rope_pos,
                         ).sample
-                if self.do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + self._stage_guidance_scale(stage_idx) * (
-                        noise_pred_text - noise_pred_uncond
-                    )
-                latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample
-        image = (latents / 2 + 0.5).clamp(0, 1)
-        if output_type == "pt":
-            pass
-        elif output_type in ("pil", "np"):
-            image = self.image_processor.postprocess(image, output_type=output_type)
-        else:
-            raise ValueError(f"Unsupported output_type: {output_type}")
         self.maybe_free_model_hooks()
         if not return_dict:
             return (image,)
-        return PixelFlowPipelineOutput(images=image)

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import importlib
+import json
 import math
 import sys
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.embeddings import get_2d_rotary_pos_embed
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
+DEFAULT_NATIVE_RESOLUTION = 256
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from pathlib import Path
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> model_dir = Path("./PixelFlow-256").resolve()
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     trust_remote_code=True,
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> print(pipe.id2label[207])
+        >>> print(pipe.get_label_ids("golden retriever"))
+        >>> generator = torch.Generator(device="cuda").manual_seed(42)
+        >>> image = pipe(
+        ...     class_labels="golden retriever",
+        ...     height=256,
+        ...     width=256,
+        ...     num_inference_steps=[10, 10, 10, 10],
+        ...     guidance_scale=4.0,
+        ...     generator=generator,
+        ... ).images[0]
+        >>> image.save("demo.png")
+        ```
+"""
 class PixelFlowPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for class-conditional PixelFlow pixel-space cascade generation.
+    Parameters:
+        transformer ([`PixelFlowTransformer2DModel`]):
+            Class-conditional PixelFlow transformer operating in pixel space.
+        scheduler ([`PixelFlowScheduler`] or [`KarrasDiffusionSchedulers`]):
+            Multi-stage flow scheduler used by PixelFlow cascade denoising.
+        id2label (`dict[int, str]`, *optional*):
+            ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
+    """
+    model_cpu_offload_seq = "transformer"
+    def __init__(
+        self,
+        transformer: Any,
+        scheduler: Any,
+        id2label: Optional[Dict[Union[int, str], str]] = None,
+    ):
+        super().__init__()
+        self.register_modules(transformer=transformer, scheduler=scheduler)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
+        self._id2label = self._normalize_id2label(id2label)
+        self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = bool(self._id2label)
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
         """Load a self-contained variant folder locally or from the Hub."""
+        import importlib
+        import sys
         repo_root = Path(__file__).resolve().parent
         if pretrained_model_name_or_path in (None, "", "."):
             if subfolder:
                 variant = variant / subfolder
+        id2label_override = kwargs.pop("id2label", None)
+        kwargs.pop("trust_remote_code", None)
         model_kwargs = dict(kwargs)
+        scheduler_kwargs = model_kwargs.pop("scheduler_kwargs", {})
+        inserted = []
+        def _ensure_path(path: str) -> None:
+            if path not in sys.path:
+                sys.path.insert(0, path)
+                inserted.append(path)
         try:
+            transformer_dir = variant / "transformer"
+            if not (transformer_dir / "transformer_pixelflow.py").exists() or not (transformer_dir / "config.json").exists():
                 raise ValueError(f"No loadable transformer found under {variant}")
+            _ensure_path(str(transformer_dir))
+            transformer_cls = getattr(importlib.import_module("transformer_pixelflow"), "PixelFlowTransformer2DModel")
+            transformer = transformer_cls.from_pretrained(str(transformer_dir), **model_kwargs)
+            scheduler_dir = variant / "scheduler"
+            if not (scheduler_dir / "scheduler_config.json").exists():
+                raise FileNotFoundError(f"Expected scheduler config in {scheduler_dir}")
+            _ensure_path(str(scheduler_dir))
+            scheduler_cls = getattr(importlib.import_module("scheduling_pixelflow"), "PixelFlowScheduler")
+            try:
+                scheduler = scheduler_cls.from_pretrained(str(scheduler_dir), **scheduler_kwargs)
+            except Exception:
+                scheduler = scheduler_cls(**scheduler_kwargs)
+            id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
+            pipe = cls(transformer=transformer, scheduler=scheduler, id2label=id2label)
+            if hasattr(pipe, "register_to_config"):
+                pipe.register_to_config(_name_or_path=str(variant))
+            return pipe
         finally:
             for comp_path in inserted:
                 if comp_path in sys.path:
                     sys.path.remove(comp_path)
+    @staticmethod
+    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
+        if not id2label:
+            return {}
+        return {int(key): value for key, value in id2label.items()}
+    def _ensure_labels_loaded(self) -> None:
+        if self._labels_loaded_from_model_index:
+            return
+        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
+        if loaded:
+            self._id2label = loaded
+            self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = True
+    @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        model_index_path = Path(variant_path).resolve() / "model_index.json"
+        if not model_index_path.exists():
+            return {}
+        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
+        id2label = raw.get("id2label")
+        if not isinstance(id2label, dict):
+            return {}
+        return {int(key): value for key, value in id2label.items()}
     @staticmethod
+    def _build_label2id(id2label: Dict[int, str]) -> Dict[str, int]:
+        label2id: Dict[str, int] = {}
         for class_id, value in id2label.items():
             for synonym in value.split(","):
                 synonym = synonym.strip()
         return dict(sorted(label2id.items()))
     @property
     def id2label(self) -> Dict[int, str]:
         r"""Mapping from ImageNet class id to its English label string.

         Each value is a comma-separated list of synonyms. The table is loaded lazily
         through `_ensure_labels_loaded` before being returned.
         """
         self._ensure_labels_loaded()
         mapping = self._id2label
         return mapping
+    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
         r"""
         Map ImageNet label strings to class ids.
         Args:
             label (`str` or `list[str]`):
+                One or more English label strings. Each string must match a synonym in `id2label`.
         """
+        self._ensure_labels_loaded()
+        label2id = self.labels
         if not label2id:
+            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
         if isinstance(label, str):
             label = [label]
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
+    ) -> torch.LongTensor:
+        if torch.is_tensor(class_labels):
+            return class_labels.to(device=self._execution_device, dtype=torch.long).reshape(-1)
+        if isinstance(class_labels, int):
+            class_label_ids = [class_labels]
+        elif isinstance(class_labels, str):
+            class_label_ids = self.get_label_ids(class_labels)
+        elif class_labels and isinstance(class_labels[0], str):
+            class_label_ids = self.get_label_ids(class_labels)
+        else:
+            class_label_ids = list(class_labels)
+        return torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
+    def check_inputs(
+        self,
+        height: int,
+        width: int,
+        num_inference_steps: Union[int, List[int]],
+        output_type: str,
+    ) -> None:
+        if output_type not in {"pil", "np", "pt", "latent"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt', 'latent'.")
+        stage_steps = self._normalize_stage_steps(num_inference_steps)
+        if any(steps < 1 for steps in stage_steps):
+            raise ValueError("Each stage in num_inference_steps must be >= 1.")
+        if height <= 0 or width <= 0:
+            raise ValueError("height and width must be positive integers.")
+    def _normalize_stage_steps(self, num_inference_steps: Union[int, List[int]]) -> List[int]:
+        if isinstance(num_inference_steps, int):
+            return [num_inference_steps] * self.scheduler.num_stages
+        if len(num_inference_steps) != self.scheduler.num_stages:
             raise ValueError(
+                f"num_inference_steps must have length {self.scheduler.num_stages} "
+                f"(one value per stage), got {len(num_inference_steps)}."
             )
+        return list(num_inference_steps)
+    def prepare_latents(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        device: torch.device,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    ) -> Tuple[torch.Tensor, int, int]:
+        init_factor = 2 ** (self.scheduler.num_stages - 1)
+        coarse_height = height // init_factor
+        coarse_width = width // init_factor
+        latents = randn_tensor(
+            (batch_size, 3, coarse_height, coarse_width),
+            generator=generator,
+            device=device,
+            dtype=torch.float32,
+        )
+        return latents, coarse_height, coarse_width
+    def _sample_block_noise(
+        self,
+        batch_size: int,
+        channels: int,
+        height: int,
+        width: int,
+        eps: float = 1e-6,
+    ) -> torch.Tensor:
         gamma = self.scheduler.gamma
         dist = torch.distributions.multivariate_normal.MultivariateNormal(
             torch.zeros(4),
             torch.eye(4) * (1 - gamma) + torch.ones(4, 4) * gamma + eps * torch.eye(4),
         )
+        block_number = batch_size * channels * (height // 2) * (width // 2)
         noise = torch.stack([dist.sample() for _ in range(block_number)])
+        return rearrange(
             noise,
             "(b c h w) (p q) -> b c (h p) (w q)",
+            b=batch_size,
+            c=channels,
             h=height // 2,
             w=width // 2,
             p=2,
             q=2,
         )
+    def _upsample_latents_for_stage(
         self,
+        latents: torch.Tensor,
+        stage_idx: int,
+        height: int,
+        width: int,
         device: torch.device,
+    ) -> torch.Tensor:
+        latents = F.interpolate(latents, size=(height, width), mode="nearest")
+        original_start_t = self.scheduler.original_start_t[stage_idx]
+        gamma = self.scheduler.gamma
+        alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
+        beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
+        noise = self._sample_block_noise(*latents.shape)
+        noise = noise.to(device=device, dtype=latents.dtype)
+        return alpha * latents + beta * noise
+    def _prepare_rope_pos_embed(self, latents: torch.Tensor, device: torch.device) -> torch.Tensor:
+        grid_size = latents.shape[-1] // self.transformer.patch_size
+        pos_embed = get_2d_rotary_pos_embed(
+            embed_dim=self.transformer.attention_head_dim,
+            crops_coords=((0, 0), (grid_size, grid_size)),
+            grid_size=(grid_size, grid_size),
+            device=device,
+            output_type="pt",
         )
+        return torch.stack(pos_embed, -1)
+    def _stage_guidance_scale(self, stage_idx: int, guidance_scale: float) -> float:
+        scale_dict = {0: 0, 1: 1 / 6, 2: 2 / 3, 3: 1}
+        return (guidance_scale - 1) * scale_dict[stage_idx] + 1
+    def _encode_class_condition(
+        self,
+        class_labels_tensor: torch.LongTensor,
+        guidance_scale: float,
+    ) -> torch.LongTensor:
+        null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
+        if guidance_scale > 0:
+            return torch.cat([null_labels, class_labels_tensor], dim=0)
+        return class_labels_tensor
+    def decode_latents(self, latents: torch.Tensor, output_type: str = "pil"):
+        image = (latents / 2 + 0.5).clamp(0, 1)
+        if output_type == "latent":
+            return latents
+        if output_type == "pt":
+            return image
+        if output_type in {"pil", "np"}:
+            return self.image_processor.postprocess(image, output_type=output_type)
+        raise ValueError(f"output_type must be one of: 'pil', 'np', 'pt', 'latent'. Got {output_type}.")
+    @torch.inference_mode()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: Union[int, List[int]] = 10,
         guidance_scale: float = 4.0,
         shift: float = 1.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Generate class-conditional images with PixelFlow.
+        Examples:
+            <!-- this section is replaced by replace_example_docstring -->
+        Args:
+            class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.LongTensor`):
+                ImageNet class indices or human-readable English label strings.
+            height (`int`, *optional*):
+                Output image height in pixels. Defaults to the transformer's native resolution.
+            width (`int`, *optional*):
+                Output image width in pixels. Defaults to the transformer's native resolution.
+            num_inference_steps (`int` or `list[int]`, defaults to `10`):
+                Number of denoising steps per cascade stage.
+            guidance_scale (`float`, defaults to `4.0`):
+                Classifier-free guidance scale. Guidance is stage-weighted for PixelFlow cascades.
+            shift (`float`, defaults to `1.0`):
+                Noise shift applied by the scheduler when building stage timesteps.
+            generator (`torch.Generator`, *optional*):
+                RNG for reproducibility.
+            output_type (`str`, defaults to `"pil"`):
+                `"pil"`, `"np"`, `"pt"`, or `"latent"`.
+            return_dict (`bool`, defaults to `True`):
+                Return [`ImagePipelineOutput`] if True.
+        """
         # Use the transformer's configured `sample_size` (or DEFAULT_NATIVE_RESOLUTION when the
         # config has none) for any dimension the caller left unset.
+        default_size = int(getattr(self.transformer.config, "sample_size", DEFAULT_NATIVE_RESOLUTION))
+        height = int(height or default_size)
+        width = int(width or default_size)
+        self.check_inputs(height, width, num_inference_steps, output_type)
+        device = self._execution_device
         # Guidance is enabled for any positive scale; `_encode_class_condition` applies the same
         # `> 0` test, so the label batch and the latent batch are doubled together.
+        do_classifier_free_guidance = guidance_scale > 0
+        stage_steps = self._normalize_stage_steps(num_inference_steps)
+        class_labels_tensor = self._normalize_class_labels(class_labels)
         # One image is generated per class label.
+        batch_size = class_labels_tensor.numel()
+        conditioning = self._encode_class_condition(class_labels_tensor, guidance_scale)
         # NOTE: `height` and `width` are rebound here to the coarse first-stage size returned by
         # `prepare_latents`; they are doubled again at the start of every later stage.
+        latents, height, width = self.prepare_latents(batch_size, height, width, device, generator)
         # Token-grid side length (latent width // patch size), passed to the transformer as `latent_size`.
+        size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
         # Autocast to bfloat16 only on CUDA; on any other device autocast is disabled.
+        autocast_enabled = device.type == "cuda"
+        autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
+        with self.progress_bar(total=sum(stage_steps)) as progress_bar:
+            for stage_idx in range(self.scheduler.num_stages):
+                self.scheduler.set_timesteps(stage_steps[stage_idx], stage_idx, device=device, shift=shift)
                 # NOTE(review): the capitalised attribute `Timesteps` is read from the scheduler as-is;
                 # confirm it matches the scheduler implementation.
+                timesteps = self.scheduler.Timesteps
+                if stage_idx > 0:
                     # Every stage after the first works at twice the previous resolution.
+                    height, width = height * 2, width * 2
+                    latents = self._upsample_latents_for_stage(latents, stage_idx, height, width, device)
+                    size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
                 # Rotary position table depends on the resolution, so it is rebuilt once per stage.
+                rope_pos = self._prepare_rope_pos_embed(latents, device)
+                for timestep in timesteps:
                     # With guidance the latents are duplicated to line up with the
                     # [null labels, class labels] layout of `conditioning`.
+                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                    timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+                    with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
                         noise_pred = self.transformer(
                             latent_model_input,
                             timestep=timestep_batch,
+                            class_labels=conditioning,
                             latent_size=size_tensor,
                             pos_embed=rope_pos,
                         ).sample
+                    if do_classifier_free_guidance:
                         # First half: null-label (unconditional) prediction; second half: class-conditional.
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        stage_scale = self._stage_guidance_scale(stage_idx, guidance_scale)
+                        noise_pred = noise_pred_uncond + stage_scale * (noise_pred_text - noise_pred_uncond)
+                    latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample
+                    progress_bar.update()
+        image = self.decode_latents(latents, output_type=output_type)
         self.maybe_free_model_hooks()
         if not return_dict:
             return (image,)
+        return ImagePipelineOutput(images=image)

PixelFlow-256/scheduler/scheduler_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "_class_name": "PixelFlowScheduler",
   "_diffusers_version": "0.36.0",
-  "gamma": -0.3333333333333333,
   "num_stages": 4,
-  "num_train_timesteps": 1000
 }

 {
   "_class_name": "PixelFlowScheduler",
   "_diffusers_version": "0.36.0",
+  "num_train_timesteps": 1000,
   "num_stages": 4,
+  "gamma": -0.3333333333333333
 }

PixelFlow-256/scheduler/scheduling_pixelflow.py CHANGED Viewed

@@ -1,3 +1,17 @@
 import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union

PixelFlow-256/transformer/transformer_pixelflow.py CHANGED Viewed

@@ -1,14 +1,470 @@
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
-from modeling_pixelflow import PixelFlowModel
 @dataclass

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import LabelEmbedding, TimestepEmbedding, Timesteps
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
+try:
+    from flash_attn import flash_attn_varlen_func
+except ImportError:
+    warnings.warn("`flash-attn` is not installed. Training mode may not work properly.", UserWarning)
+    flash_attn_varlen_func = None
def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """Apply rotary position embeddings to a 4-D query or key tensor.

    Adjacent channel pairs ``(x[2i], x[2i + 1])`` are treated as the real and
    imaginary parts of a complex number and rotated by the matching cos/sin entry.

    Args:
        x: Tensor with four dimensions whose last dimension is even. Only the last
            two dimensions must broadcast against the cos/sin tables.
        freqs_cis: Tensor whose last axis has size 2 and holds ``(cos, sin)``; the
            remaining axes must broadcast against the last two dimensions of ``x``.

    Returns:
        A single tensor with the shape and dtype of ``x`` (the maths is done in
        float32 and cast back).
    """
    cos, sin = freqs_cis.unbind(-1)
    # Add two leading broadcast axes and move the tables to the input's device.
    cos = cos[None, None].to(x.device)
    sin = sin[None, None].to(x.device)
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    # Multiplying each (real, imag) pair by i gives (-imag, real).
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
class PatchEmbed(nn.Module):
    """Project non-overlapping image patches to embedding vectors.

    The projection is a strided convolution whose kernel size and stride both equal
    ``patch_size``.
    """

    def __init__(self, patch_size, in_channels, embed_dim, bias=True):
        super().__init__()
        self.proj = nn.Conv2d(in_channels, embed_dim, patch_size, patch_size, bias=bias)

    def forward_unfold(self, x):
        """Embed patches that are already flattened by reusing the conv weights as a matmul."""
        kernel = self.proj.weight
        flat_kernel = kernel.view(kernel.size(0), -1)
        out_unfold = x.matmul(flat_kernel.t())
        bias = self.proj.bias
        if bias is not None:
            out_unfold += bias.to(out_unfold.dtype)
        return out_unfold

    def forward(self, x):
        """Return patch embeddings.

        In training mode ``x`` is passed to `forward_unfold`. In eval mode ``x`` goes
        through the convolution and the spatial grid is flattened into a token axis,
        giving ``(batch, tokens, embed_dim)``.
        """
        if self.training:
            return self.forward_unfold(x)
        return self.proj(x).flatten(2).transpose(1, 2)
class AdaLayerNorm(nn.Module):
    """Adaptive layer norm that also produces attention and MLP modulation terms.

    A conditioning vector is mapped to six chunks of width ``embedding_dim``:
    shift/scale/gate for attention and shift/scale/gate for the MLP.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, timestep, seqlen_list=None):
        """Normalise ``x`` and apply the attention shift/scale.

        Args:
            x: Hidden states to normalise.
            timestep: Conditioning embedding, one row per sample.
            seqlen_list: Optional per-sample token counts. When given, each sample's
                modulation row is repeated that many times; otherwise a broadcast
                axis is inserted at dim 1.

        Returns:
            ``(modulated_x, gate_msa, shift_mlp, scale_mlp, gate_mlp)``;
            ``modulated_x`` is cast back to the dtype of ``x``, the rest are float32.
        """
        input_dtype = x.dtype
        modulation = self.linear(self.silu(timestep))
        if seqlen_list is None:
            modulation = modulation.unsqueeze(1)
        else:
            per_token = [row[None].expand(count, -1) for row, count in zip(modulation, seqlen_list)]
            modulation = torch.cat(per_token)
        chunks = modulation.float().chunk(6, dim=-1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks
        modulated = self.norm(x).float() * (1 + scale_msa) + shift_msa
        return modulated.to(input_dtype), gate_msa, shift_mlp, scale_mlp, gate_mlp
class FeedForward(nn.Module):
    """Two-layer MLP with a tanh-approximated GELU in between.

    Args:
        dim: Input width.
        dim_out: Output width; defaults to ``dim``.
        mult: Hidden width multiplier, used when ``inner_dim`` is not given.
        inner_dim: Explicit hidden width.
        bias: Whether both linear layers carry a bias.
    """

    def __init__(self, dim, dim_out=None, mult=4, inner_dim=None, bias=True):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim
        self.fc1 = nn.Linear(dim, inner_dim, bias=bias)
        self.fc2 = nn.Linear(inner_dim, dim_out, bias=bias)

    def forward(self, hidden_states):
        """Apply ``fc2(gelu_tanh(fc1(hidden_states)))``."""
        expanded = self.fc1(hidden_states)
        activated = F.gelu(expanded, approximate="tanh")
        return self.fc2(activated)
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain."""

    def __init__(self, dim: int, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` in float32 and return the result in the dtype of ``x``."""
        x_fp32 = x.float()
        inv_rms = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + self.eps)
        normalised = x_fp32 * inv_rms
        return (self.weight * normalised).to(x.dtype)
 # Multi-head attention with RMS-normalised queries/keys and rotary position embeddings.
 # It serves as self-attention (inputs_kv=None) or cross-attention, and has two execution paths:
 # a dense path built on F.scaled_dot_product_attention and a packed variable-length path
 # built on flash_attn_varlen_func.
+class Attention(nn.Module):
+    def __init__(self, q_dim, kv_dim=None, heads=8, head_dim=64, dropout=0.0, bias=False):
+        super().__init__()
+        self.q_dim = q_dim
         # Keys/values come from the query stream unless a separate kv_dim is given.
+        self.kv_dim = kv_dim if kv_dim is not None else q_dim
+        self.inner_dim = head_dim * heads
+        self.dropout = dropout
+        self.head_dim = head_dim
+        self.num_heads = heads
+        self.q_proj = nn.Linear(self.q_dim, self.inner_dim, bias=bias)
+        self.k_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
+        self.v_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
+        self.o_proj = nn.Linear(self.inner_dim, self.q_dim, bias=bias)
         # The norms act on the full projected width (all heads together), before the head split.
+        self.q_norm = RMSNorm(self.inner_dim)
+        self.k_norm = RMSNorm(self.inner_dim)
     # Pad and repeat an attention mask so it can be applied per head.
     # out_dim == 3 repeats along dim 0; out_dim == 4 inserts a head axis at dim 1 and repeats there.
+    def prepare_attention_mask(self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3):
+        head_size = self.num_heads
+        if attention_mask is None:
+            return attention_mask
+        current_length: int = attention_mask.shape[-1]
+        if current_length != target_length:
             # NOTE(review): this appends `target_length` zeros on the right rather than the
             # difference (target_length - current_length) — confirm this is intended.
+            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+        if out_dim == 3:
+            if attention_mask.shape[0] < batch_size * head_size:
+                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+        return attention_mask
     # Compute attention output projected back to q_dim.
     # Dense path (max_seqlen_q is None): inputs are unpacked as (batch, tokens, channels).
     # Packed path (max_seqlen_q given): tokens of all samples are concatenated along dim 0 and
     # delimited by cu_seqlens_q / cu_seqlens_k.
+    def forward(
+        self,
+        inputs_q,
+        inputs_kv,
+        attention_mask=None,
+        cross_attention=False,
+        rope_pos_embed=None,
+        cu_seqlens_q=None,
+        cu_seqlens_k=None,
+        max_seqlen_q=None,
+        max_seqlen_k=None,
+    ):
+        inputs_kv = inputs_q if inputs_kv is None else inputs_kv
+        query_states = self.q_proj(inputs_q)
+        key_states = self.k_proj(inputs_kv)
+        value_states = self.v_proj(inputs_kv)
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+        if max_seqlen_q is None:
             # Dense path; only allowed outside training mode.
+            assert not self.training, "PixelFlow needs sequence packing for training"
+            bsz, q_len, _ = inputs_q.shape
+            _, kv_len, _ = inputs_kv.shape
             # (batch, tokens, inner_dim) -> (batch, heads, tokens, head_dim)
+            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+            key_states = key_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
             # Queries always get rotary embeddings; keys only in self-attention.
+            query_states = apply_rotary_emb(query_states, rope_pos_embed)
+            if not cross_attention:
+                key_states = apply_rotary_emb(key_states, rope_pos_embed)
+            if attention_mask is not None:
+                attention_mask = self.prepare_attention_mask(attention_mask, kv_len, bsz)
+                attention_mask = attention_mask.view(bsz, self.num_heads, -1, attention_mask.shape[-1])
+            attn_output = F.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                dropout_p=self.dropout if self.training else 0.0,
+                is_causal=False,
+            )
             # (batch, heads, tokens, head_dim) -> (batch, tokens, inner_dim)
+            attn_output = attn_output.transpose(1, 2).contiguous()
+            attn_output = attn_output.view(bsz, q_len, self.inner_dim)
+            attn_output = self.o_proj(attn_output)
+            return attn_output
         # Packed path: (total_tokens, inner_dim) -> (total_tokens, heads, head_dim)
+        query_states = query_states.view(-1, self.num_heads, self.head_dim)
+        key_states = key_states.view(-1, self.num_heads, self.head_dim)
+        value_states = value_states.view(-1, self.num_heads, self.head_dim)
         # apply_rotary_emb expects a 4-D input, so add a leading axis with heads first, then undo it.
+        query_states = apply_rotary_emb(query_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
+        if not cross_attention:
+            key_states = apply_rotary_emb(key_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
         # NOTE: flash_attn_varlen_func is None when `flash-attn` is not installed (see the guarded
         # import at module level), in which case this call fails. The attention_mask argument
         # is not used on this path.
+        attn_output = flash_attn_varlen_func(
+            query_states,
+            key_states,
+            value_states,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_k,
+        )
+        attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
+        attn_output = self.o_proj(attn_output)
+        return attn_output
 # One transformer layer: adaptive-norm self-attention, optional cross-attention, then an MLP.
 # The conditioning vector (`timestep`) supplies shift/scale/gate terms for both the
 # self-attention and the MLP branch through AdaLayerNorm.
+class TransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_attention_heads,
+        attention_head_dim,
+        dropout=0.0,
+        cross_attention_dim=None,
+        attention_bias=False,
+    ):
+        super().__init__()
+        self.norm1 = AdaLayerNorm(dim)
+        self.attn1 = Attention(
+            q_dim=dim,
+            kv_dim=None,
+            heads=num_attention_heads,
+            head_dim=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+        )
         # Cross-attention (and its norm2) is only built when a cross_attention_dim is given.
+        if cross_attention_dim is not None:
+            self.norm2 = RMSNorm(dim, eps=1e-6)
+            self.attn2 = Attention(
+                q_dim=dim,
+                kv_dim=cross_attention_dim,
+                heads=num_attention_heads,
+                head_dim=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+            )
+        else:
+            self.attn2 = None
+        self.norm3 = RMSNorm(dim, eps=1e-6)
+        self.mlp = FeedForward(dim)
     # Run the block and return the updated hidden states.
     # seqlen_list_q / seqlen_list_k hold per-sample token counts for the packed path; when
     # seqlen_list_q is None the attention layers take their dense path (max_seqlen_q=None).
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        timestep=None,
+        rope_pos_embed=None,
+        cu_seqlens_q=None,
+        cu_seqlens_k=None,
+        seqlen_list_q=None,
+        seqlen_list_k=None,
+    ):
         # norm1 returns the modulated input for self-attention plus the gate for it and the
         # shift/scale/gate used later by the MLP branch.
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, timestep, seqlen_list_q)
         # Self-attention: keys share the query sequence, so the query cu_seqlens/max length
         # are passed for both sides.
+        attn_output = self.attn1(
+            inputs_q=norm_hidden_states,
+            inputs_kv=None,
+            attention_mask=None,
+            cross_attention=False,
+            rope_pos_embed=rope_pos_embed,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_q,
+            max_seqlen_q=max(seqlen_list_q) if seqlen_list_q is not None else None,
+            max_seqlen_k=max(seqlen_list_q) if seqlen_list_q is not None else None,
+        )
         # Gate in float32, cast back, then add the residual.
+        attn_output = (gate_msa * attn_output.float()).to(attn_output.dtype)
+        hidden_states = attn_output + hidden_states
+        if self.attn2 is not None:
             # Cross-attention over encoder_hidden_states; added to the residual without a gate.
+            norm_hidden_states = self.norm2(hidden_states)
+            attn_output = self.attn2(
+                inputs_q=norm_hidden_states,
+                inputs_kv=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                cross_attention=True,
+                rope_pos_embed=rope_pos_embed,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max(seqlen_list_q) if seqlen_list_q is not None else None,
+                max_seqlen_k=max(seqlen_list_k) if seqlen_list_k is not None else None,
+            )
+            hidden_states = hidden_states + attn_output
         # MLP branch: RMS-normalise, apply the MLP shift/scale from norm1, run the MLP, gate, add residual.
+        norm_hidden_states = self.norm3(hidden_states)
+        norm_hidden_states = (norm_hidden_states.float() * (1 + scale_mlp) + shift_mlp).to(norm_hidden_states.dtype)
+        ff_output = self.mlp(norm_hidden_states)
+        ff_output = (gate_mlp * ff_output.float()).to(ff_output.dtype)
+        hidden_states = ff_output + hidden_states
+        return hidden_states
 # PixelFlow backbone: patch embedding, a stack of TransformerBlock layers conditioned on
 # timestep + latent size (+ optional class label), and an adaptive output projection that
 # maps tokens back to pixel patches.
+class PixelFlowModel(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        num_attention_heads,
+        attention_head_dim,
+        depth,
+        patch_size,
+        dropout=0.0,
+        cross_attention_dim=None,
+        attention_bias=True,
+        num_classes=0,
+        init_weights=True,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.attention_head_dim = attention_head_dim
+        self.num_classes = num_classes
+        self.out_channels = out_channels
+        embed_dim = num_attention_heads * attention_head_dim
+        self.patch_embed = PatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)
         # A single sinusoidal projection is shared by the timestep and the latent-size embedders.
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
+        self.latent_size_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
         # The class embedder exists only for class-conditional models (num_classes > 0).
+        if self.num_classes > 0:
+            self.class_embedder = LabelEmbedding(num_classes, embed_dim, dropout_prob=0.1)
+        self.transformer_blocks = nn.ModuleList(
+            [
+                TransformerBlock(
+                    embed_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout,
+                    cross_attention_dim,
+                    attention_bias,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.norm_out = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
         # proj_out_1 produces the final shift/scale; proj_out_2 maps each token to one pixel patch.
+        self.proj_out_1 = nn.Linear(embed_dim, 2 * embed_dim)
+        self.proj_out_2 = nn.Linear(embed_dim, patch_size * patch_size * out_channels)
+        if init_weights:
+            self.initialize_from_scratch()
     # Initialise all weights for training from scratch: Xavier for linear layers and the
     # patch projection, N(0, 0.02) for the embedders, and zeros for every modulation
     # and output projection.
+    def initialize_from_scratch(self):
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
         # Initialise the conv kernel through a flattened 2-D view of the same storage.
+        w = self.patch_embed.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.patch_embed.proj.bias, 0)
+        nn.init.normal_(self.timestep_embedder.linear_1.weight, std=0.02)
+        nn.init.normal_(self.timestep_embedder.linear_2.weight, std=0.02)
+        nn.init.normal_(self.latent_size_embedder.linear_1.weight, std=0.02)
+        nn.init.normal_(self.latent_size_embedder.linear_2.weight, std=0.02)
+        if self.num_classes > 0:
+            nn.init.normal_(self.class_embedder.embedding_table.weight, std=0.02)
         # Zeroed modulation layers give zero shift, scale and gate in every block at initialisation.
+        for block in self.transformer_blocks:
+            nn.init.constant_(block.norm1.linear.weight, 0)
+            nn.init.constant_(block.norm1.linear.bias, 0)
         # Zeroed output projections make the freshly initialised model output zeros.
+        nn.init.constant_(self.proj_out_1.weight, 0)
+        nn.init.constant_(self.proj_out_1.bias, 0)
+        nn.init.constant_(self.proj_out_2.weight, 0)
+        nn.init.constant_(self.proj_out_2.bias, 0)
     # Run the backbone.
     # In eval mode the result is an image-shaped tensor (batch, out_channels, H, W);
     # in training mode it is a per-token tensor flattened to (tokens, out_channels * patch_size**2).
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states=None,
+        class_labels=None,
+        timestep=None,
+        latent_size=None,
+        encoder_attention_mask=None,
+        pos_embed=None,
+        cu_seqlens_q=None,
+        cu_seqlens_k=None,
+        seqlen_list_q=None,
+        seqlen_list_k=None,
+    ):
         # Turn a 2-D keep-mask (1 = attend, 0 = ignore) into an additive bias with a broadcast axis.
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
         # Remember the input's last two dims; they are used to un-patchify in eval mode.
+        orig_height, orig_width = hidden_states.shape[-2], hidden_states.shape[-1]
+        hidden_states = hidden_states.to(torch.float32)
+        hidden_states = self.patch_embed(hidden_states)
         # Conditioning = timestep embedding (+ class embedding) + latent-size embedding.
+        timesteps_proj = self.time_proj(timestep)
+        conditioning = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))
+        if self.num_classes > 0:
+            class_embed = self.class_embedder(class_labels)
+            conditioning += class_embed
+        latent_size_proj = self.time_proj(latent_size)
+        latent_size_embed = self.latent_size_embedder(latent_size_proj.to(dtype=hidden_states.dtype))
+        conditioning += latent_size_embed
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                timestep=conditioning,
+                rope_pos_embed=pos_embed,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                seqlen_list_q=seqlen_list_q,
+                seqlen_list_k=seqlen_list_k,
+            )
         # Final adaptive modulation: one (shift, scale) pair per sample, broadcast over tokens
         # (dense path) or repeated per token (packed path).
+        shift, scale = self.proj_out_1(F.silu(conditioning)).float().chunk(2, dim=1)
+        if seqlen_list_q is None:
+            shift = shift.unsqueeze(1)
+            scale = scale.unsqueeze(1)
+        else:
+            shift = torch.cat([shift_i[None].expand(ri, -1) for shift_i, ri in zip(shift, seqlen_list_q)])
+            scale = torch.cat([scale_i[None].expand(ri, -1) for scale_i, ri in zip(scale, seqlen_list_q)])
+        hidden_states = (self.norm_out(hidden_states).float() * (1 + scale) + shift).to(hidden_states.dtype)
+        hidden_states = self.proj_out_2(hidden_states)
+        if self.training:
             # Training: reorder each token's patch from (p, q, c) to (c, p, q) and flatten it.
+            hidden_states = hidden_states.reshape(hidden_states.shape[0], self.patch_size, self.patch_size, self.out_channels)
+            hidden_states = hidden_states.permute(0, 3, 1, 2).flatten(1)
+            return hidden_states
         # Eval: un-patchify the token grid back into an image.
+        height, width = orig_height // self.patch_size, orig_width // self.patch_size
+        hidden_states = hidden_states.reshape(shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels))
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size))
+        return output
 @dataclass

PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc CHANGED Viewed

Binary files a/PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc and b/PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc differ

PixelFlow-T2I/model_index.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_class_name": "PixelFlowPipeline",
   "_diffusers_version": "0.36.0",
   "scheduler": [
     "scheduling_pixelflow",

 {
+  "_class_name": "PixelFlowT2IPipeline",
   "_diffusers_version": "0.36.0",
   "scheduler": [
     "scheduling_pixelflow",

PixelFlow-T2I/pipeline.py CHANGED Viewed

@@ -1,16 +1,23 @@
-"""Hub custom pipeline: PixelFlowPipeline.
-Load with native Hugging Face diffusers and `trust_remote_code=True`.
-"""
-from __future__ import annotations
 import importlib
 import math
 import sys
-from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
@@ -19,25 +26,91 @@ from einops import rearrange
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.embeddings import get_2d_rotary_pos_embed
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.utils import BaseOutput
 from diffusers.utils.torch_utils import randn_tensor
-@dataclass
-class PixelFlowPipelineOutput(BaseOutput):
-    images: Union[torch.Tensor, List, np.ndarray]
-class PixelFlowPipeline(DiffusionPipeline):
-    """Pipeline for PixelFlow pixel-space flow generation (class-conditional or text-to-image)."""
     model_cpu_offload_seq = "text_encoder->transformer"
     _optional_components = ["text_encoder", "tokenizer"]
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
         """Load a self-contained variant folder locally or from the Hub."""
         repo_root = Path(__file__).resolve().parent
         if pretrained_model_name_or_path in (None, "", "."):
@@ -63,129 +136,187 @@ class PixelFlowPipeline(DiffusionPipeline):
                 variant = variant / subfolder
         model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        def _load_text_components():
-            text_encoder = None
-            tokenizer = None
-            te_dir = variant / "text_encoder"
-            tok_dir = variant / "tokenizer"
-            if te_dir.exists() and (te_dir / "config.json").exists():
-                from transformers import T5EncoderModel, T5Tokenizer
-                text_encoder = T5EncoderModel.from_pretrained(str(te_dir), **model_kwargs)
-                tokenizer = T5Tokenizer.from_pretrained(str(tok_dir))
-            return text_encoder, tokenizer
-        def _load_text_encoder_name() -> str:
-            metadata_path = variant / "conversion_metadata.json"
-            if metadata_path.exists():
-                import json
-                metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
-                if metadata.get("text_encoder"):
-                    return metadata["text_encoder"]
-            return "google/flan-t5-xl"
         try:
-            transformer = _load_component("transformer", "transformer_pixelflow", "PixelFlowTransformer2DModel")
-            scheduler = _load_component("scheduler", "scheduling_pixelflow", "PixelFlowScheduler")
-            text_encoder, tokenizer = _load_text_components()
-            if scheduler is None:
-                sched_dir = variant / "scheduler"
-                if (sched_dir / "scheduling_pixelflow.py").exists():
-                    sched_path = str(sched_dir)
-                    if sched_path not in sys.path:
-                        sys.path.insert(0, sched_path)
-                        inserted.append(sched_path)
-                    scheduler = importlib.import_module("scheduling_pixelflow").PixelFlowScheduler()
-            if transformer is None:
                 raise ValueError(f"No loadable transformer found under {variant}")
-            if (
-                text_encoder is None
-                and tokenizer is None
-                and transformer.config.num_classes == 0
-                and transformer.config.cross_attention_dim is not None
-            ):
-                from transformers import T5EncoderModel, T5Tokenizer
-                text_encoder_name = _load_text_encoder_name()
                 text_encoder = T5EncoderModel.from_pretrained(text_encoder_name, **model_kwargs)
                 tokenizer = T5Tokenizer.from_pretrained(text_encoder_name)
-            return cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                text_encoder=text_encoder,
-                tokenizer=tokenizer,
-            )
         finally:
             for comp_path in inserted:
                 if comp_path in sys.path:
                     sys.path.remove(comp_path)
-    def __init__(self, transformer, scheduler, text_encoder=None, tokenizer=None, max_token_length: int = 512):
-        super().__init__()
-        self.register_modules(
-            transformer=transformer,
-            scheduler=scheduler,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
         )
-        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
-        self.class_cond = transformer.config.num_classes > 0
-        self.max_token_length = max_token_length
-    def sample_block_noise(self, bs, ch, height, width, eps=1e-6):
         gamma = self.scheduler.gamma
         dist = torch.distributions.multivariate_normal.MultivariateNormal(
             torch.zeros(4),
             torch.eye(4) * (1 - gamma) + torch.ones(4, 4) * gamma + eps * torch.eye(4),
         )
-        block_number = bs * ch * (height // 2) * (width // 2)
         noise = torch.stack([dist.sample() for _ in range(block_number)])
-        noise = rearrange(
             noise,
             "(b c h w) (p q) -> b c (h p) (w q)",
-            b=bs,
-            c=ch,
             h=height // 2,
             w=width // 2,
             p=2,
             q=2,
         )
-        return noise
-    def _stage_guidance_scale(self, stage_idx: int) -> float:
-        if not self.class_cond:
-            return self._guidance_scale_value
-        scale_dict = {0: 0, 1: 1 / 6, 2: 2 / 3, 3: 1}
-        return (self._guidance_scale_value - 1) * scale_dict[stage_idx] + 1
-    @property
-    def do_classifier_free_guidance(self) -> bool:
-        return self._guidance_scale_value > 0
-    @torch.no_grad()
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
@@ -195,6 +326,23 @@ class PixelFlowPipeline(DiffusionPipeline):
         negative_prompt: Union[str, List[str]] = "",
         max_length: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         if self.text_encoder is None or self.tokenizer is None:
             raise ValueError("Text-to-image generation requires `text_encoder` and `tokenizer`.")
@@ -257,18 +405,20 @@ class PixelFlowPipeline(DiffusionPipeline):
             negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len_neg, -1)
-            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1).repeat(num_images_per_prompt, 1)
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
             prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
         return prompt_embeds, prompt_attention_mask
-    @torch.no_grad()
     def __call__(
         self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        class_labels: Optional[Union[int, List[int], torch.Tensor]] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: Union[int, List[int]] = 10,
@@ -279,98 +429,91 @@ class PixelFlowPipeline(DiffusionPipeline):
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: str = "pil",
         return_dict: bool = True,
-    ) -> Union[PixelFlowPipelineOutput, Tuple]:
-        if height is None:
-            height = int(self.transformer.config.sample_size)
-        if width is None:
-            width = int(self.transformer.config.sample_size)
-        device = self._execution_device
-        self._guidance_scale_value = guidance_scale
-        if isinstance(num_inference_steps, int):
-            num_inference_steps = [num_inference_steps] * self.scheduler.num_stages
-        prompt_attention_mask = None
-        if self.class_cond:
-            if class_labels is None:
-                raise ValueError("`class_labels` are required for class-conditional PixelFlow checkpoints.")
-            if isinstance(class_labels, int):
-                class_labels = [class_labels]
-            if not torch.is_tensor(class_labels):
-                class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
-            else:
-                class_labels = class_labels.to(device=device, dtype=torch.long)
-            batch_size = class_labels.shape[0]
-            prompt_embeds = class_labels
-            negative_prompt_embeds = torch.full_like(prompt_embeds, self.transformer.config.num_classes)
-            if self.do_classifier_free_guidance:
-                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-        else:
-            if prompt is None:
-                raise ValueError("`prompt` is required for text-to-image PixelFlow checkpoints.")
-            if isinstance(prompt, str):
-                prompt = [prompt]
-            batch_size = len(prompt)
-            prompt_embeds, prompt_attention_mask = self.encode_prompt(
-                prompt,
-                device,
-                num_images_per_prompt=num_images_per_prompt,
-                do_classifier_free_guidance=self.do_classifier_free_guidance and guidance_scale > 1.0,
-                negative_prompt=negative_prompt,
-            )
-        init_factor = 2 ** (self.scheduler.num_stages - 1)
-        height, width = height // init_factor, width // init_factor
-        latents = randn_tensor(
-            (batch_size * num_images_per_prompt, 3, height, width),
-            generator=generator,
-            device=device,
-            dtype=torch.float32,
-        )
-        for stage_idx in range(self.scheduler.num_stages):
-            self.scheduler.set_timesteps(num_inference_steps[stage_idx], stage_idx, device=device, shift=shift)
-            timesteps = self.scheduler.Timesteps
-            if stage_idx > 0:
-                height, width = height * 2, width * 2
-                latents = F.interpolate(latents, size=(height, width), mode="nearest")
-                original_start_t = self.scheduler.original_start_t[stage_idx]
-                gamma = self.scheduler.gamma
-                alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
-                beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
-                noise = self.sample_block_noise(*latents.shape)
-                noise = noise.to(device=device, dtype=latents.dtype)
-                latents = alpha * latents + beta * noise
-            size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
-            pos_embed = get_2d_rotary_pos_embed(
-                embed_dim=self.transformer.attention_head_dim,
-                crops_coords=((0, 0), (latents.shape[-1] // self.transformer.patch_size, latents.shape[-1] // self.transformer.patch_size)),
-                grid_size=(latents.shape[-1] // self.transformer.patch_size, latents.shape[-1] // self.transformer.patch_size),
-                device=device,
-                output_type="pt",
-            )
-            rope_pos = torch.stack(pos_embed, -1)
-            autocast_enabled = device.type == "cuda"
-            autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
-            for timestep in timesteps:
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
-                with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
-                    if self.class_cond:
-                        noise_pred = self.transformer(
-                            latent_model_input,
-                            timestep=timestep_batch,
-                            class_labels=prompt_embeds,
-                            latent_size=size_tensor,
-                            pos_embed=rope_pos,
-                        ).sample
-                    else:
                         noise_pred = self.transformer(
                             latent_model_input,
                             encoder_hidden_states=prompt_embeds,
@@ -380,26 +523,17 @@ class PixelFlowPipeline(DiffusionPipeline):
                             pos_embed=rope_pos,
                         ).sample
-                if self.do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + self._stage_guidance_scale(stage_idx) * (
-                        noise_pred_text - noise_pred_uncond
-                    )
-                latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample
-        image = (latents / 2 + 0.5).clamp(0, 1)
-        if output_type == "pt":
-            pass
-        elif output_type in ("pil", "np"):
-            image = self.image_processor.postprocess(image, output_type=output_type)
-        else:
-            raise ValueError(f"Unsupported output_type: {output_type}")
         self.maybe_free_model_hooks()
         if not return_dict:
             return (image,)
-        return PixelFlowPipelineOutput(images=image)

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import importlib
+import json
 import math
 import sys
 from pathlib import Path
+from typing import Any, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.embeddings import get_2d_rotary_pos_embed
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
+# Fallback output size in pixels, used by `__call__` when the transformer config has no `sample_size`.
+DEFAULT_NATIVE_RESOLUTION = 1024
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from pathlib import Path
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> model_dir = Path("./PixelFlow-T2I").resolve()
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     trust_remote_code=True,
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> generator = torch.Generator(device="cuda").manual_seed(42)
+        >>> image = pipe(
+        ...     prompt="A golden retriever playing in a sunny garden",
+        ...     height=1024,
+        ...     width=1024,
+        ...     num_inference_steps=[10, 10, 10, 10],
+        ...     guidance_scale=4.0,
+        ...     generator=generator,
+        ... ).images[0]
+        >>> image.save("demo.png")
+        ```
+"""
+class PixelFlowT2IPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image PixelFlow pixel-space cascade generation.
+    Parameters:
+        transformer ([`PixelFlowTransformer2DModel`]):
+            Text-conditioned PixelFlow transformer operating in pixel space.
+        scheduler ([`PixelFlowScheduler`] or [`KarrasDiffusionSchedulers`]):
+            Multi-stage flow scheduler used by PixelFlow cascade denoising.
+        text_encoder ([`T5EncoderModel`], *optional*):
+            Text encoder used to embed prompts.
+        tokenizer ([`T5Tokenizer`], *optional*):
+            Tokenizer paired with the text encoder.
+    """
     model_cpu_offload_seq = "text_encoder->transformer"
     _optional_components = ["text_encoder", "tokenizer"]
+    def __init__(
+        self,
+        transformer: Any,
+        scheduler: Any,
+        text_encoder=None,
+        tokenizer=None,
+        max_token_length: int = 512,
+    ):
+        super().__init__()
+        self.register_modules(
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
+        self.max_token_length = max_token_length
+        self.set_progress_bar_config(disable=False)
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
         """Load a self-contained variant folder locally or from the Hub."""
+        import importlib
+        import sys
+        from transformers import T5EncoderModel, T5Tokenizer
         repo_root = Path(__file__).resolve().parent
         if pretrained_model_name_or_path in (None, "", "."):
                 variant = variant / subfolder
         model_kwargs = dict(kwargs)
+        model_kwargs.pop("trust_remote_code", None)
+        scheduler_kwargs = model_kwargs.pop("scheduler_kwargs", {})
+        inserted = []
+        def _ensure_path(path: str) -> None:
+            if path not in sys.path:
+                sys.path.insert(0, path)
+                inserted.append(path)
         try:
+            transformer_dir = variant / "transformer"
+            if not (transformer_dir / "transformer_pixelflow.py").exists() or not (transformer_dir / "config.json").exists():
                 raise ValueError(f"No loadable transformer found under {variant}")
+            _ensure_path(str(transformer_dir))
+            transformer_cls = getattr(importlib.import_module("transformer_pixelflow"), "PixelFlowTransformer2DModel")
+            transformer = transformer_cls.from_pretrained(str(transformer_dir), **model_kwargs)
+            scheduler_dir = variant / "scheduler"
+            if not (scheduler_dir / "scheduler_config.json").exists():
+                raise FileNotFoundError(f"Expected scheduler config in {scheduler_dir}")
+            _ensure_path(str(scheduler_dir))
+            scheduler_cls = getattr(importlib.import_module("scheduling_pixelflow"), "PixelFlowScheduler")
+            try:
+                scheduler = scheduler_cls.from_pretrained(str(scheduler_dir), **scheduler_kwargs)
+            except Exception:
+                scheduler = scheduler_cls(**scheduler_kwargs)
+            text_encoder = None
+            tokenizer = None
+            text_encoder_dir = variant / "text_encoder"
+            tokenizer_dir = variant / "tokenizer"
+            if text_encoder_dir.exists() and (text_encoder_dir / "config.json").exists():
+                text_encoder = T5EncoderModel.from_pretrained(str(text_encoder_dir), **model_kwargs)
+                tokenizer = T5Tokenizer.from_pretrained(str(tokenizer_dir if tokenizer_dir.exists() else text_encoder_dir))
+            if text_encoder is None or tokenizer is None:
+                text_encoder_name = cls._read_text_encoder_name(variant)
                 text_encoder = T5EncoderModel.from_pretrained(text_encoder_name, **model_kwargs)
                 tokenizer = T5Tokenizer.from_pretrained(text_encoder_name)
+            pipe = cls(transformer=transformer, scheduler=scheduler, text_encoder=text_encoder, tokenizer=tokenizer)
+            if hasattr(pipe, "register_to_config"):
+                pipe.register_to_config(_name_or_path=str(variant))
+            return pipe
         finally:
             for comp_path in inserted:
                 if comp_path in sys.path:
                     sys.path.remove(comp_path)
+    @staticmethod
+    def _read_text_encoder_name(variant_path: Path) -> str:
+        metadata_path = variant_path / "conversion_metadata.json"
+        if metadata_path.exists():
+            metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+            if metadata.get("text_encoder"):
+                return metadata["text_encoder"]
+        return "google/flan-t5-xl"
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        height: int,
+        width: int,
+        num_inference_steps: Union[int, List[int]],
+        output_type: str,
+        negative_prompt: Optional[Union[str, List[str]]],
+    ) -> None:
+        if not isinstance(prompt, str) and not (isinstance(prompt, list) and all(isinstance(p, str) for p in prompt)):
+            raise TypeError("`prompt` must be a string or list of strings.")
+        if negative_prompt is not None and not isinstance(negative_prompt, str):
+            if not (isinstance(negative_prompt, list) and all(isinstance(p, str) for p in negative_prompt)):
+                raise TypeError("`negative_prompt` must be a string or list of strings.")
+        if output_type not in {"pil", "np", "pt", "latent"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt', 'latent'.")
+        stage_steps = self._normalize_stage_steps(num_inference_steps)
+        if any(steps < 1 for steps in stage_steps):
+            raise ValueError("Each stage in num_inference_steps must be >= 1.")
+        if height <= 0 or width <= 0:
+            raise ValueError("height and width must be positive integers.")
+    def _normalize_stage_steps(self, num_inference_steps: Union[int, List[int]]) -> List[int]:
+        if isinstance(num_inference_steps, int):
+            return [num_inference_steps] * self.scheduler.num_stages
+        if len(num_inference_steps) != self.scheduler.num_stages:
+            raise ValueError(
+                f"num_inference_steps must have length {self.scheduler.num_stages} "
+                f"(one value per stage), got {len(num_inference_steps)}."
+            )
+        return list(num_inference_steps)
+    def prepare_latents(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        device: torch.device,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    ) -> Tuple[torch.Tensor, int, int]:
+        """
+        Create the initial noise tensor for the first (coarsest) cascade stage.
+
+        The requested `height`/`width` are floor-divided by
+        `2 ** (num_stages - 1)`; sizes that are not multiples of that factor are
+        silently rounded down.
+
+        Returns:
+            A tuple `(latents, coarse_height, coarse_width)` where `latents` is a
+            float32 tensor of shape `(batch_size, 3, coarse_height, coarse_width)`.
+        """
+        # Each later stage doubles the resolution, so start 2**(num_stages - 1) times smaller.
+        init_factor = 2 ** (self.scheduler.num_stages - 1)
+        coarse_height = height // init_factor
+        coarse_width = width // init_factor
+        latents = randn_tensor(
+            (batch_size, 3, coarse_height, coarse_width),
+            generator=generator,
+            device=device,
+            dtype=torch.float32,
         )
+        return latents, coarse_height, coarse_width
+    def _sample_block_noise(
+        self,
+        batch_size: int,
+        channels: int,
+        height: int,
+        width: int,
+        eps: float = 1e-6,
+    ) -> torch.Tensor:
+        """
+        Sample noise whose 2x2 pixel blocks are jointly drawn.
+
+        Each block is one draw from a zero-mean 4-dimensional normal with
+        covariance `(1 - gamma) * I + gamma * ones(4, 4) + eps * I`, where `gamma`
+        is `self.scheduler.gamma`; `eps` is added to the diagonal.
+
+        Returns:
+            Tensor of shape `(batch_size, channels, 2 * (height // 2), 2 * (width // 2))`.
+
+        NOTE(review): no `generator` or device is passed here, so these draws use
+        torch's default RNG and default device -- confirm whether seeded
+        reproducibility is expected for this noise.
+        """
         gamma = self.scheduler.gamma
         dist = torch.distributions.multivariate_normal.MultivariateNormal(
             torch.zeros(4),
             torch.eye(4) * (1 - gamma) + torch.ones(4, 4) * gamma + eps * torch.eye(4),
         )
+        # One 4-vector is needed per 2x2 block, per channel, per batch item.
+        block_number = batch_size * channels * (height // 2) * (width // 2)
+        # NOTE(review): one Python-level sample() call per block; a single
+        # dist.sample((block_number,)) would draw from the same distribution but
+        # may consume the RNG stream differently -- verify before changing.
         noise = torch.stack([dist.sample() for _ in range(block_number)])
+        # Unflatten the block index into (b, c, h, w) and each 4-vector into a 2x2 (p, q) patch.
+        return rearrange(
             noise,
             "(b c h w) (p q) -> b c (h p) (w q)",
+            b=batch_size,
+            c=channels,
             h=height // 2,
             w=width // 2,
             p=2,
             q=2,
         )
+    def _upsample_latents_for_stage(
+        self,
+        latents: torch.Tensor,
+        stage_idx: int,
+        height: int,
+        width: int,
+        device: torch.device,
+    ) -> torch.Tensor:
+        latents = F.interpolate(latents, size=(height, width), mode="nearest")
+        original_start_t = self.scheduler.original_start_t[stage_idx]
+        gamma = self.scheduler.gamma
+        alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
+        beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
+        noise = self._sample_block_noise(*latents.shape)
+        noise = noise.to(device=device, dtype=latents.dtype)
+        return alpha * latents + beta * noise
+    def _prepare_rope_pos_embed(self, latents: torch.Tensor, device: torch.device) -> torch.Tensor:
+        grid_size = latents.shape[-1] // self.transformer.patch_size
+        pos_embed = get_2d_rotary_pos_embed(
+            embed_dim=self.transformer.attention_head_dim,
+            crops_coords=((0, 0), (grid_size, grid_size)),
+            grid_size=(grid_size, grid_size),
+            device=device,
+            output_type="pt",
+        )
+        return torch.stack(pos_embed, -1)
+    def decode_latents(self, latents: torch.Tensor, output_type: str = "pil"):
+        image = (latents / 2 + 0.5).clamp(0, 1)
+        if output_type == "latent":
+            return latents
+        if output_type == "pt":
+            return image
+        if output_type in {"pil", "np"}:
+            return self.image_processor.postprocess(image, output_type=output_type)
+        raise ValueError(f"output_type must be one of: 'pil', 'np', 'pt', 'latent'. Got {output_type}.")
+    @torch.inference_mode()
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Union[str, List[str]] = "",
         max_length: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        r"""
+        Encode text prompts into hidden states for the PixelFlow transformer.
+        Args:
+            prompt (`str` or `list[str]`):
+                Prompt(s) to encode.
+            device (`torch.device`):
+                Target device for encoded tensors.
+            num_images_per_prompt (`int`, defaults to `1`):
+                Number of images to generate per prompt.
+            do_classifier_free_guidance (`bool`, defaults to `True`):
+                Whether to concatenate unconditional prompt embeddings for CFG.
+            negative_prompt (`str` or `list[str]`, defaults to `""`):
+                Negative prompt(s) used for classifier-free guidance.
+            max_length (`int`, *optional*):
+                Maximum token length. Defaults to `self.max_token_length`.
+        """
         if self.text_encoder is None or self.tokenizer is None:
             raise ValueError("Text-to-image generation requires `text_encoder` and `tokenizer`.")
             negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len_neg, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1).repeat(
+                num_images_per_prompt, 1
+            )
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
             prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
         return prompt_embeds, prompt_attention_mask
+    @torch.inference_mode()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
+        prompt: Union[str, List[str]],
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: Union[int, List[int]] = 10,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Generate text-to-image samples with PixelFlow.
+        Examples:
+            <!-- this section is replaced by replace_example_docstring -->
+        Args:
+            prompt (`str` or `list[str]`):
+                Text prompt(s) describing the desired image.
+            height (`int`, *optional*):
+                Output image height in pixels. Defaults to the transformer's native resolution.
+            width (`int`, *optional*):
+                Output image width in pixels. Defaults to the transformer's native resolution.
+            num_inference_steps (`int` or `list[int]`, defaults to `10`):
+                Number of denoising steps per cascade stage.
+            guidance_scale (`float`, defaults to `4.0`):
+                Classifier-free guidance scale.
+            shift (`float`, defaults to `1.0`):
+                Noise shift applied by the scheduler when building stage timesteps.
+            negative_prompt (`str` or `list[str]`, defaults to `""`):
+                Negative prompt(s) for classifier-free guidance.
+            num_images_per_prompt (`int`, defaults to `1`):
+                Number of images to generate for each prompt.
+            generator (`torch.Generator`, *optional*):
+                RNG for reproducibility.
+            output_type (`str`, defaults to `"pil"`):
+                `"pil"`, `"np"`, `"pt"`, or `"latent"`.
+            return_dict (`bool`, defaults to `True`):
+                Return [`ImagePipelineOutput`] if True.
+        """
+        if isinstance(prompt, str):
+            prompt_list = [prompt]
+        else:
+            prompt_list = prompt
+        default_size = int(getattr(self.transformer.config, "sample_size", DEFAULT_NATIVE_RESOLUTION))
+        height = int(height or default_size)
+        width = int(width or default_size)
+        self.check_inputs(prompt_list, height, width, num_inference_steps, output_type, negative_prompt)
+        device = self.transformer.device
+        text_encoder_device = self.text_encoder.device if self.text_encoder is not None else device
+        do_classifier_free_guidance = guidance_scale > 1.0
+        stage_steps = self._normalize_stage_steps(num_inference_steps)
+        batch_size = len(prompt_list)
+        prompt_embeds, prompt_attention_mask = self.encode_prompt(
+            prompt_list,
+            text_encoder_device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+        )
+        prompt_embeds = prompt_embeds.to(device)
+        prompt_attention_mask = prompt_attention_mask.to(device)
+        latents, height, width = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            device,
+            generator,
+        )
+        size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
+        autocast_enabled = device.type == "cuda"
+        autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
+        with self.progress_bar(total=sum(stage_steps)) as progress_bar:
+            for stage_idx in range(self.scheduler.num_stages):
+                self.scheduler.set_timesteps(stage_steps[stage_idx], stage_idx, device=device, shift=shift)
+                timesteps = self.scheduler.Timesteps
+                if stage_idx > 0:
+                    height, width = height * 2, width * 2
+                    latents = self._upsample_latents_for_stage(latents, stage_idx, height, width, device)
+                    size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
+                rope_pos = self._prepare_rope_pos_embed(latents, device)
+                for timestep in timesteps:
+                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                    timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+                    with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
                         noise_pred = self.transformer(
                             latent_model_input,
                             encoder_hidden_states=prompt_embeds,
                             pos_embed=rope_pos,
                         ).sample
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample
+                    progress_bar.update()
+        image = self.decode_latents(latents, output_type=output_type)
         self.maybe_free_model_hooks()
         if not return_dict:
             return (image,)
+        return ImagePipelineOutput(images=image)

PixelFlow-T2I/scheduler/scheduler_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "_class_name": "PixelFlowScheduler",
   "_diffusers_version": "0.36.0",
-  "gamma": -0.3333333333333333,
   "num_stages": 4,
-  "num_train_timesteps": 1000
 }

 {
   "_class_name": "PixelFlowScheduler",
   "_diffusers_version": "0.36.0",
+  "num_train_timesteps": 1000,
   "num_stages": 4,
+  "gamma": -0.3333333333333333
 }

PixelFlow-T2I/scheduler/scheduling_pixelflow.py CHANGED Viewed

@@ -1,3 +1,17 @@
 import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union

PixelFlow-T2I/transformer/transformer_pixelflow.py CHANGED Viewed

@@ -1,14 +1,470 @@
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
-from modeling_pixelflow import PixelFlowModel
 @dataclass

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import LabelEmbedding, TimestepEmbedding, Timesteps
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
+try:
+    from flash_attn import flash_attn_varlen_func
+except ImportError:
+    warnings.warn("`flash-attn` is not installed. Training mode may not work properly.", UserWarning)
+    flash_attn_varlen_func = None
def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """Rotate adjacent feature pairs of ``x`` by the angles stored in ``freqs_cis``.

    Args:
        x: 4-D tensor with an even-sized last dimension. The attention layers
            in this file pass ``(batch, heads, seq, head_dim)``.
        freqs_cis: Tensor whose last dimension has size 2: index 0 holds the
            cosine table and index 1 the sine table. Each table receives two
            leading singleton axes and must then broadcast against ``x``.

    Returns:
        A single tensor in ``x``'s dtype. ``x`` is upcast with ``.float()``
        for the arithmetic and the result is cast back.
    """
    cos, sin = freqs_cis.unbind(-1)
    # Two leading singleton axes so the tables broadcast over batch and heads.
    cos = cos[None, None].to(x.device)
    sin = sin[None, None].to(x.device)
    # View the last axis as (pairs, 2) and map every pair (a, b) to (-b, a).
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
class PatchEmbed(nn.Module):
    """Project image patches to embedding vectors with a strided convolution.

    The convolution uses ``patch_size`` as both kernel size and stride. In
    training mode the module instead treats its input as already-flattened
    patches and applies the same weights as a matrix product.
    """

    def __init__(self, patch_size, in_channels, embed_dim, bias=True):
        super().__init__()
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=bias,
        )

    def forward_unfold(self, x):
        """Apply the convolution weights to pre-flattened patches via matmul."""
        kernel = self.proj.weight
        flat_kernel = kernel.view(kernel.size(0), -1)
        projected = x.matmul(flat_kernel.t())
        bias = self.proj.bias
        if bias is not None:
            projected += bias.to(projected.dtype)
        return projected

    def forward(self, x):
        """Return patch embeddings; the path taken depends on ``self.training``."""
        if self.training:
            return self.forward_unfold(x)
        # Convolve, then flatten the spatial grid and move channels last.
        return self.proj(x).flatten(2).transpose(1, 2)
class AdaLayerNorm(nn.Module):
    """Layer norm whose shift/scale come from a conditioning vector.

    A single linear layer maps the conditioning to six chunks: shift, scale
    and gate for the attention branch, then shift, scale and gate for the MLP
    branch. Only the first two are applied here; the rest are returned.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, timestep, seqlen_list=None):
        """Normalise ``x`` and modulate it with the conditioning ``timestep``.

        When ``seqlen_list`` is given, row ``i`` of the modulation is repeated
        ``seqlen_list[i]`` times and the pieces are concatenated; otherwise a
        singleton axis is inserted at position 1 for broadcasting.

        Returns:
            ``(modulated_x, gate_msa, shift_mlp, scale_mlp, gate_mlp)`` where
            ``modulated_x`` has the dtype of ``x`` and the rest are float32.
        """
        out_dtype = x.dtype
        modulation = self.linear(self.silu(timestep))
        if seqlen_list is None:
            modulation = modulation.unsqueeze(1)
        else:
            repeated_rows = [
                row[None].expand(count, -1)
                for row, count in zip(modulation, seqlen_list)
            ]
            modulation = torch.cat(repeated_rows)
        chunks = modulation.float().chunk(6, dim=-1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks
        modulated = self.norm(x).float() * (1 + scale_msa) + shift_msa
        return modulated.to(out_dtype), gate_msa, shift_mlp, scale_mlp, gate_mlp
class FeedForward(nn.Module):
    """Two-layer MLP with a tanh-approximated GELU between the layers."""

    def __init__(self, dim, dim_out=None, mult=4, inner_dim=None, bias=True):
        super().__init__()
        # The hidden width defaults to ``dim * mult``; the output width to ``dim``.
        if inner_dim is None:
            inner_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim
        self.fc1 = nn.Linear(dim, inner_dim, bias=bias)
        self.fc2 = nn.Linear(inner_dim, dim_out, bias=bias)

    def forward(self, hidden_states):
        """Return ``fc2(gelu_tanh(fc1(hidden_states)))``."""
        expanded = self.fc1(hidden_states)
        activated = F.gelu(expanded, approximate="tanh")
        return self.fc2(activated)
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned scale."""

    def __init__(self, dim: int, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` in float32 and return the result in ``x``'s dtype."""
        x32 = x.float()
        inv_rms = torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + self.eps)
        normed = x32 * inv_rms
        return (self.weight * normed).to(x.dtype)
class Attention(nn.Module):
    """Multi-head attention with RMS-normalised queries/keys and rotary embedding.

    ``forward`` has two execution paths: a dense path built on
    ``F.scaled_dot_product_attention`` (taken when ``max_seqlen_q`` is
    ``None``) and a packed variable-length path built on
    ``flash_attn_varlen_func``.
    """

    def __init__(self, q_dim, kv_dim=None, heads=8, head_dim=64, dropout=0.0, bias=False):
        super().__init__()
        self.q_dim = q_dim
        # Self-attention when no separate key/value width is given.
        self.kv_dim = kv_dim if kv_dim is not None else q_dim
        self.inner_dim = head_dim * heads
        self.dropout = dropout
        self.head_dim = head_dim
        self.num_heads = heads
        self.q_proj = nn.Linear(self.q_dim, self.inner_dim, bias=bias)
        self.k_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.v_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.o_proj = nn.Linear(self.inner_dim, self.q_dim, bias=bias)
        self.q_norm = RMSNorm(self.inner_dim)
        self.k_norm = RMSNorm(self.inner_dim)

    def prepare_attention_mask(self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3):
        """Pad an attention mask to ``target_length`` and expand it per head.

        Args:
            attention_mask: Mask whose last dimension indexes key positions,
                or ``None`` (returned unchanged).
            target_length: Desired size of the last dimension.
            batch_size: Batch size used to decide whether the mask still needs
                to be repeated per head (``out_dim == 3`` only).
            out_dim: ``3`` repeats along dim 0 when the leading dimension is
                smaller than ``batch_size * heads``; ``4`` inserts a head axis
                at dim 1 and repeats along it.

        Returns:
            The padded and expanded mask, or ``None``.
        """
        head_size = self.num_heads
        if attention_mask is None:
            return attention_mask
        current_length: int = attention_mask.shape[-1]
        if current_length < target_length:
            # Append only the missing columns. Padding by the full
            # ``target_length`` would yield ``current + target`` columns, which
            # no longer matches the key length. A mask that is already longer
            # than ``target_length`` is left untouched rather than truncated.
            attention_mask = F.pad(attention_mask, (0, target_length - current_length), value=0.0)
        if out_dim == 3:
            if attention_mask.shape[0] < batch_size * head_size:
                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
        elif out_dim == 4:
            attention_mask = attention_mask.unsqueeze(1)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
        return attention_mask

    def forward(
        self,
        inputs_q,
        inputs_kv,
        attention_mask=None,
        cross_attention=False,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        max_seqlen_q=None,
        max_seqlen_k=None,
    ):
        """Attend from ``inputs_q`` to ``inputs_kv`` (or to itself when it is ``None``).

        Rotary embedding is always applied to the queries; keys receive it
        only when ``cross_attention`` is false.

        Raises:
            AssertionError: Dense path requested while the module is in
                training mode.
            RuntimeError: Packed path requested but ``flash-attn`` could not
                be imported.
        """
        inputs_kv = inputs_q if inputs_kv is None else inputs_kv
        query_states = self.q_proj(inputs_q)
        key_states = self.k_proj(inputs_kv)
        value_states = self.v_proj(inputs_kv)
        # Normalisation is applied over the full inner dimension, before the head split.
        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)
        if max_seqlen_q is None:
            # Dense path: inputs are (batch, seq, dim).
            assert not self.training, "PixelFlow needs sequence packing for training"
            bsz, q_len, _ = inputs_q.shape
            _, kv_len, _ = inputs_kv.shape
            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
            key_states = key_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
            query_states = apply_rotary_emb(query_states, rope_pos_embed)
            if not cross_attention:
                key_states = apply_rotary_emb(key_states, rope_pos_embed)
            if attention_mask is not None:
                attention_mask = self.prepare_attention_mask(attention_mask, kv_len, bsz)
                # Fold the per-head copies back into an explicit head axis.
                attention_mask = attention_mask.view(bsz, self.num_heads, -1, attention_mask.shape[-1])
            attn_output = F.scaled_dot_product_attention(
                query_states,
                key_states,
                value_states,
                attn_mask=attention_mask,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=False,
            )
            attn_output = attn_output.transpose(1, 2).contiguous()
            attn_output = attn_output.view(bsz, q_len, self.inner_dim)
            attn_output = self.o_proj(attn_output)
            return attn_output
        # Packed path: all sequences are concatenated along the first axis.
        if flash_attn_varlen_func is None:
            # The import guard sets the name to None when flash-attn is absent;
            # fail with an actionable message instead of "'NoneType' is not callable".
            raise RuntimeError(
                "`flash-attn` is required for packed variable-length attention but is not installed."
            )
        query_states = query_states.view(-1, self.num_heads, self.head_dim)
        key_states = key_states.view(-1, self.num_heads, self.head_dim)
        value_states = value_states.view(-1, self.num_heads, self.head_dim)
        # apply_rotary_emb expects 4-D input: permute to heads-first, add a
        # leading axis, rotate, then undo both.
        query_states = apply_rotary_emb(query_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
        if not cross_attention:
            key_states = apply_rotary_emb(key_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
        )
        attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)
        return attn_output
class TransformerBlock(nn.Module):
    """Self-attention, optional cross-attention and MLP with adaptive modulation.

    ``norm1`` produces the modulation terms from the conditioning vector; the
    gates scale the attention and MLP branches before their residual adds.
    Cross-attention is built only when ``cross_attention_dim`` is given.
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=False,
    ):
        super().__init__()
        shared_attention_kwargs = dict(
            q_dim=dim,
            heads=num_attention_heads,
            head_dim=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
        )
        self.norm1 = AdaLayerNorm(dim)
        self.attn1 = Attention(kv_dim=None, **shared_attention_kwargs)
        if cross_attention_dim is None:
            # No norm2 is created in this configuration.
            self.attn2 = None
        else:
            self.norm2 = RMSNorm(dim, eps=1e-6)
            self.attn2 = Attention(kv_dim=cross_attention_dim, **shared_attention_kwargs)
        self.norm3 = RMSNorm(dim, eps=1e-6)
        self.mlp = FeedForward(dim)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Run one block and return the updated ``hidden_states``.

        ``timestep`` is the conditioning vector fed to ``norm1``. The
        ``cu_seqlens_*`` / ``seqlen_list_*`` arguments are forwarded to the
        attention layers; when ``seqlen_list_q`` is ``None`` the attention
        layers receive ``max_seqlen_q=None``.
        """
        modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
            hidden_states, timestep, seqlen_list_q
        )
        longest_q = None if seqlen_list_q is None else max(seqlen_list_q)

        # Self-attention: keys share the query sequence boundaries.
        self_attn_out = self.attn1(
            inputs_q=modulated,
            inputs_kv=None,
            attention_mask=None,
            cross_attention=False,
            rope_pos_embed=rope_pos_embed,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_q,
            max_seqlen_q=longest_q,
            max_seqlen_k=longest_q,
        )
        self_attn_out = (gate_msa * self_attn_out.float()).to(self_attn_out.dtype)
        hidden_states = self_attn_out + hidden_states

        if self.attn2 is not None:
            longest_k = None if seqlen_list_k is None else max(seqlen_list_k)
            cross_attn_out = self.attn2(
                inputs_q=self.norm2(hidden_states),
                inputs_kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                cross_attention=True,
                rope_pos_embed=rope_pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=longest_q,
                max_seqlen_k=longest_k,
            )
            hidden_states = hidden_states + cross_attn_out

        # MLP branch reuses the shift/scale/gate produced by norm1.
        mlp_input = self.norm3(hidden_states)
        mlp_input = (mlp_input.float() * (1 + scale_mlp) + shift_mlp).to(mlp_input.dtype)
        mlp_out = self.mlp(mlp_input)
        mlp_out = (gate_mlp * mlp_out.float()).to(mlp_out.dtype)
        return mlp_out + hidden_states
class PixelFlowModel(nn.Module):
    """Patch-based transformer conditioned on timestep, latent size and class label.

    The input is patch-embedded, passed through ``depth`` transformer blocks
    that all receive the same conditioning vector, modulated one last time and
    projected back to ``patch_size * patch_size * out_channels`` values per token.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        num_attention_heads,
        attention_head_dim,
        depth,
        patch_size,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=True,
        num_classes=0,
        init_weights=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.attention_head_dim = attention_head_dim
        self.num_classes = num_classes
        self.out_channels = out_channels

        embed_dim = num_attention_heads * attention_head_dim
        self.patch_embed = PatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)
        # One sinusoidal projection is shared by the timestep and latent-size embedders.
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        self.latent_size_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        if self.num_classes > 0:
            self.class_embedder = LabelEmbedding(num_classes, embed_dim, dropout_prob=0.1)

        blocks = []
        for _ in range(depth):
            blocks.append(
                TransformerBlock(
                    dim=embed_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    attention_bias=attention_bias,
                )
            )
        self.transformer_blocks = nn.ModuleList(blocks)

        self.norm_out = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out_1 = nn.Linear(embed_dim, 2 * embed_dim)
        self.proj_out_2 = nn.Linear(embed_dim, patch_size * patch_size * out_channels)
        if init_weights:
            self.initialize_from_scratch()

    def initialize_from_scratch(self):
        """Initialise weights: Xavier for linears, then targeted overrides.

        The per-block modulation layers and both output projections are set to
        zero; the embedder weights are drawn from a normal with ``std=0.02``.
        """

        def _xavier_linear(module):
            if not isinstance(module, nn.Linear):
                return
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

        self.apply(_xavier_linear)

        # Treat the conv kernel as a 2-D matrix so Xavier sees (out, in * k * k).
        conv_weight = self.patch_embed.proj.weight.data
        nn.init.xavier_uniform_(conv_weight.view([conv_weight.shape[0], -1]))
        nn.init.constant_(self.patch_embed.proj.bias, 0)

        for embedder in (self.timestep_embedder, self.latent_size_embedder):
            for layer in (embedder.linear_1, embedder.linear_2):
                nn.init.normal_(layer.weight, std=0.02)
        if self.num_classes > 0:
            nn.init.normal_(self.class_embedder.embedding_table.weight, std=0.02)

        for block in self.transformer_blocks:
            nn.init.constant_(block.norm1.linear.weight, 0)
            nn.init.constant_(block.norm1.linear.bias, 0)
        for projection in (self.proj_out_1, self.proj_out_2):
            nn.init.constant_(projection.weight, 0)
            nn.init.constant_(projection.bias, 0)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        class_labels=None,
        timestep=None,
        latent_size=None,
        encoder_attention_mask=None,
        pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Run the model on ``hidden_states``.

        A 2-D ``encoder_attention_mask`` is converted to an additive bias
        (``0`` where the mask is 1, ``-10000`` where it is 0) with a singleton
        axis inserted at position 1.

        Returns:
            In training mode, per-token patches reshaped to
            ``(tokens, out_channels * patch_size * patch_size)``. Otherwise an
            image-shaped tensor
            ``(batch, out_channels, height, width)`` rebuilt from the patches,
            where height and width are the input's last two dimensions rounded
            down to a multiple of ``patch_size``.
        """
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            keep = encoder_attention_mask.to(hidden_states.dtype)
            encoder_attention_mask = ((1 - keep) * -10000.0).unsqueeze(1)

        orig_height, orig_width = hidden_states.shape[-2], hidden_states.shape[-1]
        tokens = self.patch_embed(hidden_states.to(torch.float32))

        # Conditioning = timestep embedding (+ class embedding) + latent-size embedding.
        time_features = self.time_proj(timestep)
        conditioning = self.timestep_embedder(time_features.to(dtype=tokens.dtype))
        if self.num_classes > 0:
            conditioning += self.class_embedder(class_labels)
        size_features = self.time_proj(latent_size)
        conditioning += self.latent_size_embedder(size_features.to(dtype=tokens.dtype))

        for block in self.transformer_blocks:
            tokens = block(
                tokens,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                timestep=conditioning,
                rope_pos_embed=pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                seqlen_list_q=seqlen_list_q,
                seqlen_list_k=seqlen_list_k,
            )

        shift, scale = self.proj_out_1(F.silu(conditioning)).float().chunk(2, dim=1)
        if seqlen_list_q is not None:
            # Packed input: repeat each sample's modulation once per token.
            shift = torch.cat([row[None].expand(count, -1) for row, count in zip(shift, seqlen_list_q)])
            scale = torch.cat([row[None].expand(count, -1) for row, count in zip(scale, seqlen_list_q)])
        else:
            shift = shift.unsqueeze(1)
            scale = scale.unsqueeze(1)
        tokens = (self.norm_out(tokens).float() * (1 + scale) + shift).to(tokens.dtype)
        tokens = self.proj_out_2(tokens)

        patch = self.patch_size
        if self.training:
            patches = tokens.reshape(tokens.shape[0], patch, patch, self.out_channels)
            return patches.permute(0, 3, 1, 2).flatten(1)

        # Un-patchify: (n, h, w, p, q, c) -> (n, c, h * p, w * q).
        rows, cols = orig_height // patch, orig_width // patch
        grid = tokens.reshape(shape=(-1, rows, cols, patch, patch, self.out_channels))
        grid = torch.einsum("nhwpqc->nchpwq", grid)
        return grid.reshape(shape=(-1, self.out_channels, rows * patch, cols * patch))
 @dataclass

README.md CHANGED Viewed

@@ -1,99 +1,147 @@
----
-license: mit
-library_name: diffusers
-pipeline_tag: text-to-image
-tags:
-- diffusers
-- pixelflow
-- image-generation
-- class-conditional
-- flow-matching
-widget:
-- output:
-    url: PixelFlow-256/demo.png
-language:
-- en
----
 # BiliSakura/PixelFlow-diffusers
-Self-contained PixelFlow checkpoints for Hugging Face diffusers. Each subfolder ships its own `pipeline.py`, component modules, and weights.
 ## Available checkpoints
-| Subfolder | Task | Resolution | Params |
-| --- | --- | ---: | ---: |
-| [`PixelFlow-256/`](PixelFlow-256/) | class-to-image | 256×256 | 677M |
-| [`PixelFlow-T2I/`](PixelFlow-T2I/) | text-to-image | 1024×1024 | 882M |
-## ImageNet class labels
-For class-conditional [`PixelFlow-256/`](PixelFlow-256/), ImageNet-1k labels live in shared [`labels/`](labels/) at the repo root:
-| File | Direction | Value format |
-| --- | --- | --- |
-| `labels/id2label_en.json` | id → English | comma-separated synonyms, e.g. `"207": "golden retriever"` |
-| `labels/id2label_cn.json` | id → Chinese | comma-separated synonyms, e.g. `"207": "金毛猎犬"` |
-After `PixelFlowPipeline.from_pretrained(...)`, the pipeline exposes:
-- `pipe.id2label` / `pipe.id2label_cn` — inspect id → label correspondence
-- `pipe.labels` / `pipe.labels_cn` — reverse maps (synonym → id)
-- `pipe.get_label_ids("golden retriever")` or `pipe.get_label_ids("金毛猎犬", lang="cn")`
 - `pipe(class_labels="golden retriever", ...)` — string labels resolved automatically
 ## Demo
-![PixelFlow-256 demo](PixelFlow-256/demo.png)
-## Load from a local clone
-```python
-import sys
-from pathlib import Path
-repo = Path("BiliSakura/PixelFlow-diffusers").resolve()
-variant = "PixelFlow-256"
-sys.path.insert(0, str(repo / variant))
-from pipeline import PixelFlowPipeline
-pipe = PixelFlowPipeline.from_pretrained(".")
 pipe.to("cuda")
-images = pipe(
-    class_labels=207,
     num_inference_steps=[10, 10, 10, 10],
     guidance_scale=4.0,
-).images
-# Human-readable ImageNet labels (English or Chinese)
-print(pipe.id2label[207])          # "golden retriever"
-print(pipe.id2label_cn[207])       # "金毛猎犬"
-pipe.get_label_ids("golden retriever")  # [207]
-pipe.get_label_ids("金毛猎犬", lang="cn")  # [207]
-images = pipe(class_labels="golden retriever", num_inference_steps=[10, 10, 10, 10]).images
 ```
 ### Text-to-image (`PixelFlow-T2I`)
-Uses [`google/flan-t5-xl`](https://huggingface.co/google/flan-t5-xl) as the text encoder (loaded from Hugging Face at runtime, not bundled in the repo).
 ```python
-variant = "PixelFlow-T2I"
-sys.path.insert(0, str(repo / variant))
-from pipeline import PixelFlowPipeline
-pipe = PixelFlowPipeline.from_pretrained(".")
 pipe.to("cuda")
-images = pipe(
     prompt="A golden retriever playing in a sunny garden",
     num_inference_steps=[10, 10, 10, 10],
     guidance_scale=4.0,
-).images
 ```
 ## Conversion
 ```bash
@@ -107,4 +155,17 @@ python scripts/convert_pixelflow_to_diffusers.py \
   --config models/raw/PixelFlow/t2i/config.yaml \
   --output models/BiliSakura/PixelFlow-diffusers/PixelFlow-T2I \
   --skip-text-encoder
-```

 # BiliSakura/PixelFlow-diffusers
+Self-contained PixelFlow checkpoints for Hugging Face diffusers. Each variant folder ships its own `pipeline.py`, component modules, and weights.
 ## Available checkpoints
+| Subfolder | Pipeline | Task | Resolution | Params |
+| --- | --- | --- | ---: | ---: |
+| [`PixelFlow-256/`](PixelFlow-256/) | `PixelFlowPipeline` | class-to-image | 256×256 | 677M |
+| [`PixelFlow-T2I/`](PixelFlow-T2I/) | `PixelFlowT2IPipeline` | text-to-image | 1024×1024 | 882M |
+## Repo layout
+```text
+BiliSakura/PixelFlow-diffusers/
+├── README.md
+├── PixelFlow-256/
+│   ├── pipeline.py
+│   ├── model_index.json
+│   ├── scheduler/scheduler_config.json
+│   └── transformer/
+└── PixelFlow-T2I/
+    ├── pipeline.py
+    ├── model_index.json
+    ├── scheduler/scheduler_config.json
+    ├── text_encoder/
+    ├── tokenizer/
+    └── transformer/
+```
+Each variant is self-contained. The `scheduler/` folder contains `scheduler_config.json` and `scheduling_pixelflow.py` with [`PixelFlowScheduler`](PixelFlow-256/scheduler/scheduling_pixelflow.py).
+Inference requires no shared helper modules: only the PyPI `diffusers` package and the local variant directory.
+## ImageNet class labels
+For class-conditional [`PixelFlow-256/`](PixelFlow-256/), `id2label` is embedded in `PixelFlow-256/model_index.json` (DiT-style).
+- `pipe.id2label` — inspect id → English label correspondence
+- `pipe.labels` — reverse map (English synonym → id)
+- `pipe.get_label_ids("golden retriever")`
 - `pipe(class_labels="golden retriever", ...)` — string labels resolved automatically
 ## Demo
+Class-to-image:
+```bash
+python demo_inference_c2i.py
+```
+Text-to-image:
+```bash
+python demo_inference_t2i.py
+```
+## Load from a local clone
+### Class-to-image (`PixelFlow-256`)
+```python
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
+model_dir = Path("./PixelFlow-256").resolve()
+pipe = DiffusionPipeline.from_pretrained(
+    str(model_dir),
+    local_files_only=True,
+    custom_pipeline=str(model_dir / "pipeline.py"),
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+)
 pipe.to("cuda")
+print(pipe.id2label[207])
+print(pipe.get_label_ids("golden retriever"))
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipe(
+    class_labels="golden retriever",
+    height=256,
+    width=256,
     num_inference_steps=[10, 10, 10, 10],
     guidance_scale=4.0,
+    generator=generator,
+).images[0]
+image.save("demo.png")
 ```
 ### Text-to-image (`PixelFlow-T2I`)
+The pipeline uses [`google/flan-t5-xl`](https://huggingface.co/google/flan-t5-xl) as the text encoder when `text_encoder/` is not bundled.
 ```python
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
+model_dir = Path("./PixelFlow-T2I").resolve()
+pipe = DiffusionPipeline.from_pretrained(
+    str(model_dir),
+    local_files_only=True,
+    custom_pipeline=str(model_dir / "pipeline.py"),
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+)
 pipe.to("cuda")
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipe(
     prompt="A golden retriever playing in a sunny garden",
+    height=1024,
+    width=1024,
     num_inference_steps=[10, 10, 10, 10],
     guidance_scale=4.0,
+    generator=generator,
+).images[0]
+image.save("demo.png")
+```
+Load a **variant subfolder** (e.g. `./PixelFlow-256`), not the repo root.
+## Load from the Hub
+```python
+import torch
+from diffusers import DiffusionPipeline
+pipe = DiffusionPipeline.from_pretrained(
+    "BiliSakura/PixelFlow-diffusers",
+    subfolder="PixelFlow-256",
+    custom_pipeline="pipeline.py",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+)
+pipe.to("cuda")
+image = pipe(class_labels="golden retriever", num_inference_steps=[10, 10, 10, 10]).images[0]
 ```
+Swap `subfolder="PixelFlow-T2I"` and call with `prompt=...` for text-to-image.
 ## Conversion
 ```bash
   --config models/raw/PixelFlow/t2i/config.yaml \
   --output models/BiliSakura/PixelFlow-diffusers/PixelFlow-T2I \
   --skip-text-encoder
+```
+## Citation
+```bibtex
+@article{chen2025pixelflow,
+  title={PixelFlow: Pixel-Space Flow Matching for High-Resolution Image Synthesis},
+  author={Chen, Shoufa and others},
+  year={2025},
+  eprint={2504.07963},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+```

demo_inference_c2i.py ADDED Viewed

	@@ -0,0 +1,41 @@

+#!/usr/bin/env python3
+"""Generate a demo image with PixelFlow-256."""
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
# Paths are resolved relative to this script so it can be run from any directory.
REPO_ROOT = Path(__file__).resolve().parent
MODEL_DIR = REPO_ROOT / "PixelFlow-256"
OUTPUT_PATH = MODEL_DIR / "demo.png"


def main() -> None:
    """Load the local PixelFlow-256 pipeline, sample one image and save it."""
    load_options = {
        "local_files_only": True,
        "custom_pipeline": str(MODEL_DIR / "pipeline.py"),
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
    }
    pipe = DiffusionPipeline.from_pretrained(str(MODEL_DIR), **load_options)
    pipe.to("cuda")

    # Show the label lookup in both directions.
    print(pipe.id2label[207])
    print(pipe.get_label_ids("golden retriever"))

    # Fixed seed so repeated runs start from the same generator state.
    generator = torch.Generator(device="cuda").manual_seed(42)
    result = pipe(
        class_labels="golden retriever",
        height=256,
        width=256,
        num_inference_steps=[10, 10, 10, 10],
        guidance_scale=4.0,
        generator=generator,
    )
    image = result.images[0]
    image.save(OUTPUT_PATH)
    print(f"Saved demo image to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()

demo_inference_t2i.py ADDED Viewed

	@@ -0,0 +1,38 @@

+#!/usr/bin/env python3
+"""Generate a demo image with PixelFlow-T2I."""
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
# Paths are resolved relative to this script so it can be run from any directory.
REPO_ROOT = Path(__file__).resolve().parent
MODEL_DIR = REPO_ROOT / "PixelFlow-T2I"
OUTPUT_PATH = MODEL_DIR / "demo.png"


def main() -> None:
    """Load the local PixelFlow-T2I pipeline, sample one image and save it."""
    load_options = {
        "local_files_only": True,
        "custom_pipeline": str(MODEL_DIR / "pipeline.py"),
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
    }
    pipe = DiffusionPipeline.from_pretrained(str(MODEL_DIR), **load_options)
    pipe.to("cuda")

    # Fixed seed so repeated runs start from the same generator state.
    generator = torch.Generator(device="cuda").manual_seed(42)
    result = pipe(
        prompt="A golden retriever playing in a sunny garden",
        height=1024,
        width=1024,
        num_inference_steps=[10, 10, 10, 10],
        guidance_scale=4.0,
        generator=generator,
    )
    image = result.images[0]
    image.save(OUTPUT_PATH)
    print(f"Saved demo image to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()