benzweijia committed on
Commit 9294bc7 · verified · 1 Parent(s): b1d39cd

Upload 61 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +1 -0
  2. adv_grpo/__pycache__/conv_gradfix.cpython-310.pyc +0 -0
  3. adv_grpo/__pycache__/discriminator.cpython-310.pyc +0 -0
  4. adv_grpo/__pycache__/ema.cpython-310.pyc +0 -0
  5. adv_grpo/__pycache__/grpo_discriminator.cpython-310.pyc +0 -0
  6. adv_grpo/__pycache__/inflated_layers.cpython-310.pyc +0 -0
  7. adv_grpo/__pycache__/inflated_lib.cpython-310.pyc +0 -0
  8. adv_grpo/__pycache__/ocr.cpython-310.pyc +0 -0
  9. adv_grpo/__pycache__/patchgan_discriminator.cpython-310.pyc +0 -0
  10. adv_grpo/__pycache__/pick_score_training.cpython-310.pyc +0 -0
  11. adv_grpo/__pycache__/pickscore_scorer.cpython-310.pyc +0 -0
  12. adv_grpo/__pycache__/prompts.cpython-310.pyc +0 -0
  13. adv_grpo/__pycache__/rewards.cpython-310.pyc +0 -0
  14. adv_grpo/__pycache__/stat_tracking.cpython-310.pyc +0 -0
  15. adv_grpo/__pycache__/stylegan_discriminator.cpython-310.pyc +0 -0
  16. adv_grpo/aesthetic_scorer.py +53 -0
  17. adv_grpo/assets/activities.txt +3 -0
  18. adv_grpo/assets/activities_v0.txt +3 -0
  19. adv_grpo/assets/flow_grpo_fast.png +3 -0
  20. adv_grpo/assets/imagenet_classes.txt +1000 -0
  21. adv_grpo/assets/sac+logos+ava1-l14-linearMSE.pth +3 -0
  22. adv_grpo/assets/simple_animals.txt +45 -0
  23. adv_grpo/assets/simple_ocr_animals.txt +5 -0
  24. adv_grpo/assets/simple_ocr_animals_digit1.txt +45 -0
  25. adv_grpo/assets/simple_ocr_animals_digit3.txt +45 -0
  26. adv_grpo/assets/simple_ocr_animals_digit5.txt +50 -0
  27. adv_grpo/assets/test.jpg +0 -0
  28. adv_grpo/clip_scorer.py +97 -0
  29. adv_grpo/conv_gradfix.py +345 -0
  30. adv_grpo/diffusers_patch/__pycache__/sd3_pipeline_with_logprob_fast.cpython-310.pyc +0 -0
  31. adv_grpo/diffusers_patch/__pycache__/sd3_sde_with_logprob.cpython-310.pyc +0 -0
  32. adv_grpo/diffusers_patch/__pycache__/train_dreambooth_lora_sd3.cpython-310.pyc +0 -0
  33. adv_grpo/diffusers_patch/flux_kontext_pipeline_with_logprob.py +255 -0
  34. adv_grpo/diffusers_patch/flux_pipeline_with_logprob.py +187 -0
  35. adv_grpo/diffusers_patch/sd3_pipeline_with_logprob.py +198 -0
  36. adv_grpo/diffusers_patch/sd3_pipeline_with_logprob_fast.py +1081 -0
  37. adv_grpo/diffusers_patch/sd3_sde_with_logprob.py +139 -0
  38. adv_grpo/diffusers_patch/train_dreambooth_lora_flux.py +144 -0
  39. adv_grpo/diffusers_patch/train_dreambooth_lora_sd3.py +144 -0
  40. adv_grpo/diffusers_patch/wan_pipeline_with_logprob.py +373 -0
  41. adv_grpo/diffusers_patch/wan_prompt_embedding.py +97 -0
  42. adv_grpo/ema.py +88 -0
  43. adv_grpo/imagereward_scorer.py +40 -0
  44. adv_grpo/inflated_layers.py +305 -0
  45. adv_grpo/inflated_lib.py +346 -0
  46. adv_grpo/ocr.py +138 -0
  47. adv_grpo/pick_score_training.py +385 -0
  48. adv_grpo/pickscore_scorer.py +70 -0
  49. adv_grpo/pickscore_scorer_constractive.py +89 -0
  50. adv_grpo/pickscore_scorer_patch.py +78 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ adv_grpo/assets/flow_grpo_fast.png filter=lfs diff=lfs merge=lfs -text
adv_grpo/__pycache__/conv_gradfix.cpython-310.pyc ADDED
Binary file (9.28 kB).

adv_grpo/__pycache__/discriminator.cpython-310.pyc ADDED
Binary file (2.95 kB).

adv_grpo/__pycache__/ema.cpython-310.pyc ADDED
Binary file (3.3 kB).

adv_grpo/__pycache__/grpo_discriminator.cpython-310.pyc ADDED
Binary file (625 Bytes).

adv_grpo/__pycache__/inflated_layers.cpython-310.pyc ADDED
Binary file (7.45 kB).

adv_grpo/__pycache__/inflated_lib.cpython-310.pyc ADDED
Binary file (10.1 kB).

adv_grpo/__pycache__/ocr.cpython-310.pyc ADDED
Binary file (4.41 kB).

adv_grpo/__pycache__/patchgan_discriminator.cpython-310.pyc ADDED
Binary file (5.55 kB).

adv_grpo/__pycache__/pick_score_training.cpython-310.pyc ADDED
Binary file (9.12 kB).

adv_grpo/__pycache__/pickscore_scorer.cpython-310.pyc ADDED
Binary file (2.44 kB).

adv_grpo/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (2.93 kB).

adv_grpo/__pycache__/rewards.cpython-310.pyc ADDED
Binary file (29.7 kB).

adv_grpo/__pycache__/stat_tracking.cpython-310.pyc ADDED
Binary file (2.62 kB).

adv_grpo/__pycache__/stylegan_discriminator.cpython-310.pyc ADDED
Binary file (14.1 kB).
adv_grpo/aesthetic_scorer.py ADDED
@@ -0,0 +1,53 @@
+ # Based on https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/fe88a163f4661b4ddabba0751ff645e2e620746e/simple_inference.py
+
+ from importlib import resources
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ from transformers import CLIPModel, CLIPProcessor
+ from PIL import Image
+
+ ASSETS_PATH = resources.files("adv_grpo.assets")
+
+
+ class MLP(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.layers = nn.Sequential(
+             nn.Linear(768, 1024),
+             nn.Dropout(0.2),
+             nn.Linear(1024, 128),
+             nn.Dropout(0.2),
+             nn.Linear(128, 64),
+             nn.Dropout(0.1),
+             nn.Linear(64, 16),
+             nn.Linear(16, 1),
+         )
+
+     @torch.no_grad()
+     def forward(self, embed):
+         return self.layers(embed)
+
+
+ class AestheticScorer(torch.nn.Module):
+     def __init__(self, dtype):
+         super().__init__()
+         self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+         self.mlp = MLP()
+         state_dict = torch.load(
+             ASSETS_PATH.joinpath("sac+logos+ava1-l14-linearMSE.pth")
+         )
+         self.mlp.load_state_dict(state_dict)
+         self.dtype = dtype
+         self.eval()
+
+     @torch.no_grad()
+     def __call__(self, images):
+         device = next(self.parameters()).device
+         inputs = self.processor(images=images, return_tensors="pt")
+         inputs = {k: v.to(self.dtype).to(device) for k, v in inputs.items()}
+         embed = self.clip.get_image_features(**inputs)
+         # normalize embedding
+         embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True)
+         return self.mlp(embed).squeeze(1)
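A minimal usage sketch for the scorer above (an illustration, not one of the uploaded files; it assumes a CUDA device, that adv_grpo is importable as a package so the bundled weights resolve, and reuses the shipped adv_grpo/assets/test.jpg):

import torch
from PIL import Image
from adv_grpo.aesthetic_scorer import AestheticScorer

# CLIP ViT-L/14 backbone plus the linear aesthetic head loaded from adv_grpo/assets.
scorer = AestheticScorer(dtype=torch.float32).to("cuda")

# CLIPProcessor accepts PIL images directly; the scorer returns one score per image.
images = [Image.open("adv_grpo/assets/test.jpg")]
print(scorer(images))  # 1-D tensor of aesthetic scores, higher is better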
adv_grpo/assets/activities.txt ADDED
@@ -0,0 +1,3 @@
1
+ washing the dishes
2
+ riding a bike
3
+ playing chess
adv_grpo/assets/activities_v0.txt ADDED
@@ -0,0 +1,3 @@
1
+ washing the dishes
2
+ riding a bike
3
+ playing chess
adv_grpo/assets/flow_grpo_fast.png ADDED

Git LFS Details

  • SHA256: 35709d674818e29d39728e036479e51b8e015bcc7caf4dc54eb3e4f41cc05ab1
  • Pointer size: 131 Bytes
  • Size of remote file: 222 kB
adv_grpo/assets/imagenet_classes.txt ADDED
@@ -0,0 +1,1000 @@
1
+ tench, Tinca tinca
2
+ goldfish, Carassius auratus
3
+ great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
4
+ tiger shark, Galeocerdo cuvieri
5
+ hammerhead, hammerhead shark
6
+ electric ray, crampfish, numbfish, torpedo
7
+ stingray
8
+ cock
9
+ hen
10
+ ostrich, Struthio camelus
11
+ brambling, Fringilla montifringilla
12
+ goldfinch, Carduelis carduelis
13
+ house finch, linnet, Carpodacus mexicanus
14
+ junco, snowbird
15
+ indigo bunting, indigo finch, indigo bird, Passerina cyanea
16
+ robin, American robin, Turdus migratorius
17
+ bulbul
18
+ jay
19
+ magpie
20
+ chickadee
21
+ water ouzel, dipper
22
+ kite
23
+ bald eagle, American eagle, Haliaeetus leucocephalus
24
+ vulture
25
+ great grey owl, great gray owl, Strix nebulosa
26
+ European fire salamander, Salamandra salamandra
27
+ common newt, Triturus vulgaris
28
+ eft
29
+ spotted salamander, Ambystoma maculatum
30
+ axolotl, mud puppy, Ambystoma mexicanum
31
+ bullfrog, Rana catesbeiana
32
+ tree frog, tree-frog
33
+ tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
34
+ loggerhead, loggerhead turtle, Caretta caretta
35
+ leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
36
+ mud turtle
37
+ terrapin
38
+ box turtle, box tortoise
39
+ banded gecko
40
+ common iguana, iguana, Iguana iguana
41
+ American chameleon, anole, Anolis carolinensis
42
+ whiptail, whiptail lizard
43
+ agama
44
+ frilled lizard, Chlamydosaurus kingi
45
+ alligator lizard
46
+ Gila monster, Heloderma suspectum
47
+ green lizard, Lacerta viridis
48
+ African chameleon, Chamaeleo chamaeleon
49
+ Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
50
+ African crocodile, Nile crocodile, Crocodylus niloticus
51
+ American alligator, Alligator mississipiensis
52
+ triceratops
53
+ thunder snake, worm snake, Carphophis amoenus
54
+ ringneck snake, ring-necked snake, ring snake
55
+ hognose snake, puff adder, sand viper
56
+ green snake, grass snake
57
+ king snake, kingsnake
58
+ garter snake, grass snake
59
+ water snake
60
+ vine snake
61
+ night snake, Hypsiglena torquata
62
+ boa constrictor, Constrictor constrictor
63
+ rock python, rock snake, Python sebae
64
+ Indian cobra, Naja naja
65
+ green mamba
66
+ sea snake
67
+ horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
68
+ diamondback, diamondback rattlesnake, Crotalus adamanteus
69
+ sidewinder, horned rattlesnake, Crotalus cerastes
70
+ trilobite
71
+ harvestman, daddy longlegs, Phalangium opilio
72
+ scorpion
73
+ black and gold garden spider, Argiope aurantia
74
+ barn spider, Araneus cavaticus
75
+ garden spider, Aranea diademata
76
+ black widow, Latrodectus mactans
77
+ tarantula
78
+ wolf spider, hunting spider
79
+ tick
80
+ centipede
81
+ black grouse
82
+ ptarmigan
83
+ ruffed grouse, partridge, Bonasa umbellus
84
+ prairie chicken, prairie grouse, prairie fowl
85
+ peacock
86
+ quail
87
+ partridge
88
+ African grey, African gray, Psittacus erithacus
89
+ macaw
90
+ sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
91
+ lorikeet
92
+ coucal
93
+ bee eater
94
+ hornbill
95
+ hummingbird
96
+ jacamar
97
+ toucan
98
+ drake
99
+ red-breasted merganser, Mergus serrator
100
+ goose
101
+ black swan, Cygnus atratus
102
+ tusker
103
+ echidna, spiny anteater, anteater
104
+ platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
105
+ wallaby, brush kangaroo
106
+ koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
107
+ wombat
108
+ jellyfish
109
+ sea anemone, anemone
110
+ brain coral
111
+ flatworm, platyhelminth
112
+ nematode, nematode worm, roundworm
113
+ conch
114
+ snail
115
+ slug
116
+ sea slug, nudibranch
117
+ chiton, coat-of-mail shell, sea cradle, polyplacophore
118
+ chambered nautilus, pearly nautilus, nautilus
119
+ Dungeness crab, Cancer magister
120
+ rock crab, Cancer irroratus
121
+ fiddler crab
122
+ king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
123
+ American lobster, Northern lobster, Maine lobster, Homarus americanus
124
+ spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
125
+ crayfish, crawfish, crawdad, crawdaddy
126
+ hermit crab
127
+ isopod
128
+ white stork, Ciconia ciconia
129
+ black stork, Ciconia nigra
130
+ spoonbill
131
+ flamingo
132
+ little blue heron, Egretta caerulea
133
+ American egret, great white heron, Egretta albus
134
+ bittern
135
+ crane
136
+ limpkin, Aramus pictus
137
+ European gallinule, Porphyrio porphyrio
138
+ American coot, marsh hen, mud hen, water hen, Fulica americana
139
+ bustard
140
+ ruddy turnstone, Arenaria interpres
141
+ red-backed sandpiper, dunlin, Erolia alpina
142
+ redshank, Tringa totanus
143
+ dowitcher
144
+ oystercatcher, oyster catcher
145
+ pelican
146
+ king penguin, Aptenodytes patagonica
147
+ albatross, mollymawk
148
+ grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
149
+ killer whale, killer, orca, grampus, sea wolf, Orcinus orca
150
+ dugong, Dugong dugon
151
+ sea lion
152
+ Chihuahua
153
+ Japanese spaniel
154
+ Maltese dog, Maltese terrier, Maltese
155
+ Pekinese, Pekingese, Peke
156
+ Shih-Tzu
157
+ Blenheim spaniel
158
+ papillon
159
+ toy terrier
160
+ Rhodesian ridgeback
161
+ Afghan hound, Afghan
162
+ basset, basset hound
163
+ beagle
164
+ bloodhound, sleuthhound
165
+ bluetick
166
+ black-and-tan coonhound
167
+ Walker hound, Walker foxhound
168
+ English foxhound
169
+ redbone
170
+ borzoi, Russian wolfhound
171
+ Irish wolfhound
172
+ Italian greyhound
173
+ whippet
174
+ Ibizan hound, Ibizan Podenco
175
+ Norwegian elkhound, elkhound
176
+ otterhound, otter hound
177
+ Saluki, gazelle hound
178
+ Scottish deerhound, deerhound
179
+ Weimaraner
180
+ Staffordshire bullterrier, Staffordshire bull terrier
181
+ American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
182
+ Bedlington terrier
183
+ Border terrier
184
+ Kerry blue terrier
185
+ Irish terrier
186
+ Norfolk terrier
187
+ Norwich terrier
188
+ Yorkshire terrier
189
+ wire-haired fox terrier
190
+ Lakeland terrier
191
+ Sealyham terrier, Sealyham
192
+ Airedale, Airedale terrier
193
+ cairn, cairn terrier
194
+ Australian terrier
195
+ Dandie Dinmont, Dandie Dinmont terrier
196
+ Boston bull, Boston terrier
197
+ miniature schnauzer
198
+ giant schnauzer
199
+ standard schnauzer
200
+ Scotch terrier, Scottish terrier, Scottie
201
+ Tibetan terrier, chrysanthemum dog
202
+ silky terrier, Sydney silky
203
+ soft-coated wheaten terrier
204
+ West Highland white terrier
205
+ Lhasa, Lhasa apso
206
+ flat-coated retriever
207
+ curly-coated retriever
208
+ golden retriever
209
+ Labrador retriever
210
+ Chesapeake Bay retriever
211
+ German short-haired pointer
212
+ vizsla, Hungarian pointer
213
+ English setter
214
+ Irish setter, red setter
215
+ Gordon setter
216
+ Brittany spaniel
217
+ clumber, clumber spaniel
218
+ English springer, English springer spaniel
219
+ Welsh springer spaniel
220
+ cocker spaniel, English cocker spaniel, cocker
221
+ Sussex spaniel
222
+ Irish water spaniel
223
+ kuvasz
224
+ schipperke
225
+ groenendael
226
+ malinois
227
+ briard
228
+ kelpie
229
+ komondor
230
+ Old English sheepdog, bobtail
231
+ Shetland sheepdog, Shetland sheep dog, Shetland
232
+ collie
233
+ Border collie
234
+ Bouvier des Flandres, Bouviers des Flandres
235
+ Rottweiler
236
+ German shepherd, German shepherd dog, German police dog, alsatian
237
+ Doberman, Doberman pinscher
238
+ miniature pinscher
239
+ Greater Swiss Mountain dog
240
+ Bernese mountain dog
241
+ Appenzeller
242
+ EntleBucher
243
+ boxer
244
+ bull mastiff
245
+ Tibetan mastiff
246
+ French bulldog
247
+ Great Dane
248
+ Saint Bernard, St Bernard
249
+ Eskimo dog, husky
250
+ malamute, malemute, Alaskan malamute
251
+ Siberian husky
252
+ dalmatian, coach dog, carriage dog
253
+ affenpinscher, monkey pinscher, monkey dog
254
+ basenji
255
+ pug, pug-dog
256
+ Leonberg
257
+ Newfoundland, Newfoundland dog
258
+ Great Pyrenees
259
+ Samoyed, Samoyede
260
+ Pomeranian
261
+ chow, chow chow
262
+ keeshond
263
+ Brabancon griffon
264
+ Pembroke, Pembroke Welsh corgi
265
+ Cardigan, Cardigan Welsh corgi
266
+ toy poodle
267
+ miniature poodle
268
+ standard poodle
269
+ Mexican hairless
270
+ timber wolf, grey wolf, gray wolf, Canis lupus
271
+ white wolf, Arctic wolf, Canis lupus tundrarum
272
+ red wolf, maned wolf, Canis rufus, Canis niger
273
+ coyote, prairie wolf, brush wolf, Canis latrans
274
+ dingo, warrigal, warragal, Canis dingo
275
+ dhole, Cuon alpinus
276
+ African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
277
+ hyena, hyaena
278
+ red fox, Vulpes vulpes
279
+ kit fox, Vulpes macrotis
280
+ Arctic fox, white fox, Alopex lagopus
281
+ grey fox, gray fox, Urocyon cinereoargenteus
282
+ tabby, tabby cat
283
+ tiger cat
284
+ Persian cat
285
+ Siamese cat, Siamese
286
+ Egyptian cat
287
+ cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
288
+ lynx, catamount
289
+ leopard, Panthera pardus
290
+ snow leopard, ounce, Panthera uncia
291
+ jaguar, panther, Panthera onca, Felis onca
292
+ lion, king of beasts, Panthera leo
293
+ tiger, Panthera tigris
294
+ cheetah, chetah, Acinonyx jubatus
295
+ brown bear, bruin, Ursus arctos
296
+ American black bear, black bear, Ursus americanus, Euarctos americanus
297
+ ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
298
+ sloth bear, Melursus ursinus, Ursus ursinus
299
+ mongoose
300
+ meerkat, mierkat
301
+ tiger beetle
302
+ ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
303
+ ground beetle, carabid beetle
304
+ long-horned beetle, longicorn, longicorn beetle
305
+ leaf beetle, chrysomelid
306
+ dung beetle
307
+ rhinoceros beetle
308
+ weevil
309
+ fly
310
+ bee
311
+ ant, emmet, pismire
312
+ grasshopper, hopper
313
+ cricket
314
+ walking stick, walkingstick, stick insect
315
+ cockroach, roach
316
+ mantis, mantid
317
+ cicada, cicala
318
+ leafhopper
319
+ lacewing, lacewing fly
320
+ dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
321
+ damselfly
322
+ admiral
323
+ ringlet, ringlet butterfly
324
+ monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
325
+ cabbage butterfly
326
+ sulphur butterfly, sulfur butterfly
327
+ lycaenid, lycaenid butterfly
328
+ starfish, sea star
329
+ sea urchin
330
+ sea cucumber, holothurian
331
+ wood rabbit, cottontail, cottontail rabbit
332
+ hare
333
+ Angora, Angora rabbit
334
+ hamster
335
+ porcupine, hedgehog
336
+ fox squirrel, eastern fox squirrel, Sciurus niger
337
+ marmot
338
+ beaver
339
+ guinea pig, Cavia cobaya
340
+ sorrel
341
+ zebra
342
+ hog, pig, grunter, squealer, Sus scrofa
343
+ wild boar, boar, Sus scrofa
344
+ warthog
345
+ hippopotamus, hippo, river horse, Hippopotamus amphibius
346
+ ox
347
+ water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
348
+ bison
349
+ ram, tup
350
+ bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
351
+ ibex, Capra ibex
352
+ hartebeest
353
+ impala, Aepyceros melampus
354
+ gazelle
355
+ Arabian camel, dromedary, Camelus dromedarius
356
+ llama
357
+ weasel
358
+ mink
359
+ polecat, fitch, foulmart, foumart, Mustela putorius
360
+ black-footed ferret, ferret, Mustela nigripes
361
+ otter
362
+ skunk, polecat, wood pussy
363
+ badger
364
+ armadillo
365
+ three-toed sloth, ai, Bradypus tridactylus
366
+ orangutan, orang, orangutang, Pongo pygmaeus
367
+ gorilla, Gorilla gorilla
368
+ chimpanzee, chimp, Pan troglodytes
369
+ gibbon, Hylobates lar
370
+ siamang, Hylobates syndactylus, Symphalangus syndactylus
371
+ guenon, guenon monkey
372
+ patas, hussar monkey, Erythrocebus patas
373
+ baboon
374
+ macaque
375
+ langur
376
+ colobus, colobus monkey
377
+ proboscis monkey, Nasalis larvatus
378
+ marmoset
379
+ capuchin, ringtail, Cebus capucinus
380
+ howler monkey, howler
381
+ titi, titi monkey
382
+ spider monkey, Ateles geoffroyi
383
+ squirrel monkey, Saimiri sciureus
384
+ Madagascar cat, ring-tailed lemur, Lemur catta
385
+ indri, indris, Indri indri, Indri brevicaudatus
386
+ Indian elephant, Elephas maximus
387
+ African elephant, Loxodonta africana
388
+ lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
389
+ giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
390
+ barracouta, snoek
391
+ eel
392
+ coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
393
+ rock beauty, Holocanthus tricolor
394
+ anemone fish
395
+ sturgeon
396
+ gar, garfish, garpike, billfish, Lepisosteus osseus
397
+ lionfish
398
+ puffer, pufferfish, blowfish, globefish
399
+ abacus
400
+ abaya
401
+ academic gown, academic robe, judge's robe
402
+ accordion, piano accordion, squeeze box
403
+ acoustic guitar
404
+ aircraft carrier, carrier, flattop, attack aircraft carrier
405
+ airliner
406
+ airship, dirigible
407
+ altar
408
+ ambulance
409
+ amphibian, amphibious vehicle
410
+ analog clock
411
+ apiary, bee house
412
+ apron
413
+ ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
414
+ assault rifle, assault gun
415
+ backpack, back pack, knapsack, packsack, rucksack, haversack
416
+ bakery, bakeshop, bakehouse
417
+ balance beam, beam
418
+ balloon
419
+ ballpoint, ballpoint pen, ballpen, Biro
420
+ Band Aid
421
+ banjo
422
+ bannister, banister, balustrade, balusters, handrail
423
+ barbell
424
+ barber chair
425
+ barbershop
426
+ barn
427
+ barometer
428
+ barrel, cask
429
+ barrow, garden cart, lawn cart, wheelbarrow
430
+ baseball
431
+ basketball
432
+ bassinet
433
+ bassoon
434
+ bathing cap, swimming cap
435
+ bath towel
436
+ bathtub, bathing tub, bath, tub
437
+ beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
438
+ beacon, lighthouse, beacon light, pharos
439
+ beaker
440
+ bearskin, busby, shako
441
+ beer bottle
442
+ beer glass
443
+ bell cote, bell cot
444
+ bib
445
+ bicycle-built-for-two, tandem bicycle, tandem
446
+ bikini, two-piece
447
+ binder, ring-binder
448
+ binoculars, field glasses, opera glasses
449
+ birdhouse
450
+ boathouse
451
+ bobsled, bobsleigh, bob
452
+ bolo tie, bolo, bola tie, bola
453
+ bonnet, poke bonnet
454
+ bookcase
455
+ bookshop, bookstore, bookstall
456
+ bottlecap
457
+ bow
458
+ bow tie, bow-tie, bowtie
459
+ brass, memorial tablet, plaque
460
+ brassiere, bra, bandeau
461
+ breakwater, groin, groyne, mole, bulwark, seawall, jetty
462
+ breastplate, aegis, egis
463
+ broom
464
+ bucket, pail
465
+ buckle
466
+ bulletproof vest
467
+ bullet train, bullet
468
+ butcher shop, meat market
469
+ cab, hack, taxi, taxicab
470
+ caldron, cauldron
471
+ candle, taper, wax light
472
+ cannon
473
+ canoe
474
+ can opener, tin opener
475
+ cardigan
476
+ car mirror
477
+ carousel, carrousel, merry-go-round, roundabout, whirligig
478
+ carpenter's kit, tool kit
479
+ carton
480
+ car wheel
481
+ cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
482
+ cassette
483
+ cassette player
484
+ castle
485
+ catamaran
486
+ CD player
487
+ cello, violoncello
488
+ cellular telephone, cellular phone, cellphone, cell, mobile phone
489
+ chain
490
+ chainlink fence
491
+ chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
492
+ chain saw, chainsaw
493
+ chest
494
+ chiffonier, commode
495
+ chime, bell, gong
496
+ china cabinet, china closet
497
+ Christmas stocking
498
+ church, church building
499
+ cinema, movie theater, movie theatre, movie house, picture palace
500
+ cleaver, meat cleaver, chopper
501
+ cliff dwelling
502
+ cloak
503
+ clog, geta, patten, sabot
504
+ cocktail shaker
505
+ coffee mug
506
+ coffeepot
507
+ coil, spiral, volute, whorl, helix
508
+ combination lock
509
+ computer keyboard, keypad
510
+ confectionery, confectionary, candy store
511
+ container ship, containership, container vessel
512
+ convertible
513
+ corkscrew, bottle screw
514
+ cornet, horn, trumpet, trump
515
+ cowboy boot
516
+ cowboy hat, ten-gallon hat
517
+ cradle
518
+ crane
519
+ crash helmet
520
+ crate
521
+ crib, cot
522
+ Crock Pot
523
+ croquet ball
524
+ crutch
525
+ cuirass
526
+ dam, dike, dyke
527
+ desk
528
+ desktop computer
529
+ dial telephone, dial phone
530
+ diaper, nappy, napkin
531
+ digital clock
532
+ digital watch
533
+ dining table, board
534
+ dishrag, dishcloth
535
+ dishwasher, dish washer, dishwashing machine
536
+ disk brake, disc brake
537
+ dock, dockage, docking facility
538
+ dogsled, dog sled, dog sleigh
539
+ dome
540
+ doormat, welcome mat
541
+ drilling platform, offshore rig
542
+ drum, membranophone, tympan
543
+ drumstick
544
+ dumbbell
545
+ Dutch oven
546
+ electric fan, blower
547
+ electric guitar
548
+ electric locomotive
549
+ entertainment center
550
+ envelope
551
+ espresso maker
552
+ face powder
553
+ feather boa, boa
554
+ file, file cabinet, filing cabinet
555
+ fireboat
556
+ fire engine, fire truck
557
+ fire screen, fireguard
558
+ flagpole, flagstaff
559
+ flute, transverse flute
560
+ folding chair
561
+ football helmet
562
+ forklift
563
+ fountain
564
+ fountain pen
565
+ four-poster
566
+ freight car
567
+ French horn, horn
568
+ frying pan, frypan, skillet
569
+ fur coat
570
+ garbage truck, dustcart
571
+ gasmask, respirator, gas helmet
572
+ gas pump, gasoline pump, petrol pump, island dispenser
573
+ goblet
574
+ go-kart
575
+ golf ball
576
+ golfcart, golf cart
577
+ gondola
578
+ gong, tam-tam
579
+ gown
580
+ grand piano, grand
581
+ greenhouse, nursery, glasshouse
582
+ grille, radiator grille
583
+ grocery store, grocery, food market, market
584
+ guillotine
585
+ hair slide
586
+ hair spray
587
+ half track
588
+ hammer
589
+ hamper
590
+ hand blower, blow dryer, blow drier, hair dryer, hair drier
591
+ hand-held computer, hand-held microcomputer
592
+ handkerchief, hankie, hanky, hankey
593
+ hard disc, hard disk, fixed disk
594
+ harmonica, mouth organ, harp, mouth harp
595
+ harp
596
+ harvester, reaper
597
+ hatchet
598
+ holster
599
+ home theater, home theatre
600
+ honeycomb
601
+ hook, claw
602
+ hoopskirt, crinoline
603
+ horizontal bar, high bar
604
+ horse cart, horse-cart
605
+ hourglass
606
+ iPod
607
+ iron, smoothing iron
608
+ jack-o'-lantern
609
+ jean, blue jean, denim
610
+ jeep, landrover
611
+ jersey, T-shirt, tee shirt
612
+ jigsaw puzzle
613
+ jinrikisha, ricksha, rickshaw
614
+ joystick
615
+ kimono
616
+ knee pad
617
+ knot
618
+ lab coat, laboratory coat
619
+ ladle
620
+ lampshade, lamp shade
621
+ laptop, laptop computer
622
+ lawn mower, mower
623
+ lens cap, lens cover
624
+ letter opener, paper knife, paperknife
625
+ library
626
+ lifeboat
627
+ lighter, light, igniter, ignitor
628
+ limousine, limo
629
+ liner, ocean liner
630
+ lipstick, lip rouge
631
+ Loafer
632
+ lotion
633
+ loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
634
+ loupe, jeweler's loupe
635
+ lumbermill, sawmill
636
+ magnetic compass
637
+ mailbag, postbag
638
+ mailbox, letter box
639
+ maillot
640
+ maillot, tank suit
641
+ manhole cover
642
+ maraca
643
+ marimba, xylophone
644
+ mask
645
+ matchstick
646
+ maypole
647
+ maze, labyrinth
648
+ measuring cup
649
+ medicine chest, medicine cabinet
650
+ megalith, megalithic structure
651
+ microphone, mike
652
+ microwave, microwave oven
653
+ military uniform
654
+ milk can
655
+ minibus
656
+ miniskirt, mini
657
+ minivan
658
+ missile
659
+ mitten
660
+ mixing bowl
661
+ mobile home, manufactured home
662
+ Model T
663
+ modem
664
+ monastery
665
+ monitor
666
+ moped
667
+ mortar
668
+ mortarboard
669
+ mosque
670
+ mosquito net
671
+ motor scooter, scooter
672
+ mountain bike, all-terrain bike, off-roader
673
+ mountain tent
674
+ mouse, computer mouse
675
+ mousetrap
676
+ moving van
677
+ muzzle
678
+ nail
679
+ neck brace
680
+ necklace
681
+ nipple
682
+ notebook, notebook computer
683
+ obelisk
684
+ oboe, hautboy, hautbois
685
+ ocarina, sweet potato
686
+ odometer, hodometer, mileometer, milometer
687
+ oil filter
688
+ organ, pipe organ
689
+ oscilloscope, scope, cathode-ray oscilloscope, CRO
690
+ overskirt
691
+ oxcart
692
+ oxygen mask
693
+ packet
694
+ paddle, boat paddle
695
+ paddlewheel, paddle wheel
696
+ padlock
697
+ paintbrush
698
+ pajama, pyjama, pj's, jammies
699
+ palace
700
+ panpipe, pandean pipe, syrinx
701
+ paper towel
702
+ parachute, chute
703
+ parallel bars, bars
704
+ park bench
705
+ parking meter
706
+ passenger car, coach, carriage
707
+ patio, terrace
708
+ pay-phone, pay-station
709
+ pedestal, plinth, footstall
710
+ pencil box, pencil case
711
+ pencil sharpener
712
+ perfume, essence
713
+ Petri dish
714
+ photocopier
715
+ pick, plectrum, plectron
716
+ pickelhaube
717
+ picket fence, paling
718
+ pickup, pickup truck
719
+ pier
720
+ piggy bank, penny bank
721
+ pill bottle
722
+ pillow
723
+ ping-pong ball
724
+ pinwheel
725
+ pirate, pirate ship
726
+ pitcher, ewer
727
+ plane, carpenter's plane, woodworking plane
728
+ planetarium
729
+ plastic bag
730
+ plate rack
731
+ plow, plough
732
+ plunger, plumber's helper
733
+ Polaroid camera, Polaroid Land camera
734
+ pole
735
+ police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
736
+ poncho
737
+ pool table, billiard table, snooker table
738
+ pop bottle, soda bottle
739
+ pot, flowerpot
740
+ potter's wheel
741
+ power drill
742
+ prayer rug, prayer mat
743
+ printer
744
+ prison, prison house
745
+ projectile, missile
746
+ projector
747
+ puck, hockey puck
748
+ punching bag, punch bag, punching ball, punchball
749
+ purse
750
+ quill, quill pen
751
+ quilt, comforter, comfort, puff
752
+ racer, race car, racing car
753
+ racket, racquet
754
+ radiator
755
+ radio, wireless
756
+ radio telescope, radio reflector
757
+ rain barrel
758
+ recreational vehicle, RV, R.V.
759
+ reel
760
+ reflex camera
761
+ refrigerator, icebox
762
+ remote control, remote
763
+ restaurant, eating house, eating place, eatery
764
+ revolver, six-gun, six-shooter
765
+ rifle
766
+ rocking chair, rocker
767
+ rotisserie
768
+ rubber eraser, rubber, pencil eraser
769
+ rugby ball
770
+ rule, ruler
771
+ running shoe
772
+ safe
773
+ safety pin
774
+ saltshaker, salt shaker
775
+ sandal
776
+ sarong
777
+ sax, saxophone
778
+ scabbard
779
+ scale, weighing machine
780
+ school bus
781
+ schooner
782
+ scoreboard
783
+ screen, CRT screen
784
+ screw
785
+ screwdriver
786
+ seat belt, seatbelt
787
+ sewing machine
788
+ shield, buckler
789
+ shoe shop, shoe-shop, shoe store
790
+ shoji
791
+ shopping basket
792
+ shopping cart
793
+ shovel
794
+ shower cap
795
+ shower curtain
796
+ ski
797
+ ski mask
798
+ sleeping bag
799
+ slide rule, slipstick
800
+ sliding door
801
+ slot, one-armed bandit
802
+ snorkel
803
+ snowmobile
804
+ snowplow, snowplough
805
+ soap dispenser
806
+ soccer ball
807
+ sock
808
+ solar dish, solar collector, solar furnace
809
+ sombrero
810
+ soup bowl
811
+ space bar
812
+ space heater
813
+ space shuttle
814
+ spatula
815
+ speedboat
816
+ spider web, spider's web
817
+ spindle
818
+ sports car, sport car
819
+ spotlight, spot
820
+ stage
821
+ steam locomotive
822
+ steel arch bridge
823
+ steel drum
824
+ stethoscope
825
+ stole
826
+ stone wall
827
+ stopwatch, stop watch
828
+ stove
829
+ strainer
830
+ streetcar, tram, tramcar, trolley, trolley car
831
+ stretcher
832
+ studio couch, day bed
833
+ stupa, tope
834
+ submarine, pigboat, sub, U-boat
835
+ suit, suit of clothes
836
+ sundial
837
+ sunglass
838
+ sunglasses, dark glasses, shades
839
+ sunscreen, sunblock, sun blocker
840
+ suspension bridge
841
+ swab, swob, mop
842
+ sweatshirt
843
+ swimming trunks, bathing trunks
844
+ swing
845
+ switch, electric switch, electrical switch
846
+ syringe
847
+ table lamp
848
+ tank, army tank, armored combat vehicle, armoured combat vehicle
849
+ tape player
850
+ teapot
851
+ teddy, teddy bear
852
+ television, television system
853
+ tennis ball
854
+ thatch, thatched roof
855
+ theater curtain, theatre curtain
856
+ thimble
857
+ thresher, thrasher, threshing machine
858
+ throne
859
+ tile roof
860
+ toaster
861
+ tobacco shop, tobacconist shop, tobacconist
862
+ toilet seat
863
+ torch
864
+ totem pole
865
+ tow truck, tow car, wrecker
866
+ toyshop
867
+ tractor
868
+ trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
869
+ tray
870
+ trench coat
871
+ tricycle, trike, velocipede
872
+ trimaran
873
+ tripod
874
+ triumphal arch
875
+ trolleybus, trolley coach, trackless trolley
876
+ trombone
877
+ tub, vat
878
+ turnstile
879
+ typewriter keyboard
880
+ umbrella
881
+ unicycle, monocycle
882
+ upright, upright piano
883
+ vacuum, vacuum cleaner
884
+ vase
885
+ vault
886
+ velvet
887
+ vending machine
888
+ vestment
889
+ viaduct
890
+ violin, fiddle
891
+ volleyball
892
+ waffle iron
893
+ wall clock
894
+ wallet, billfold, notecase, pocketbook
895
+ wardrobe, closet, press
896
+ warplane, military plane
897
+ washbasin, handbasin, washbowl, lavabo, wash-hand basin
898
+ washer, automatic washer, washing machine
899
+ water bottle
900
+ water jug
901
+ water tower
902
+ whiskey jug
903
+ whistle
904
+ wig
905
+ window screen
906
+ window shade
907
+ Windsor tie
908
+ wine bottle
909
+ wing
910
+ wok
911
+ wooden spoon
912
+ wool, woolen, woollen
913
+ worm fence, snake fence, snake-rail fence, Virginia fence
914
+ wreck
915
+ yawl
916
+ yurt
917
+ web site, website, internet site, site
918
+ comic book
919
+ crossword puzzle, crossword
920
+ street sign
921
+ traffic light, traffic signal, stoplight
922
+ book jacket, dust cover, dust jacket, dust wrapper
923
+ menu
924
+ plate
925
+ guacamole
926
+ consomme
927
+ hot pot, hotpot
928
+ trifle
929
+ ice cream, icecream
930
+ ice lolly, lolly, lollipop, popsicle
931
+ French loaf
932
+ bagel, beigel
933
+ pretzel
934
+ cheeseburger
935
+ hotdog, hot dog, red hot
936
+ mashed potato
937
+ head cabbage
938
+ broccoli
939
+ cauliflower
940
+ zucchini, courgette
941
+ spaghetti squash
942
+ acorn squash
943
+ butternut squash
944
+ cucumber, cuke
945
+ artichoke, globe artichoke
946
+ bell pepper
947
+ cardoon
948
+ mushroom
949
+ Granny Smith
950
+ strawberry
951
+ orange
952
+ lemon
953
+ fig
954
+ pineapple, ananas
955
+ banana
956
+ jackfruit, jak, jack
957
+ custard apple
958
+ pomegranate
959
+ hay
960
+ carbonara
961
+ chocolate sauce, chocolate syrup
962
+ dough
963
+ meat loaf, meatloaf
964
+ pizza, pizza pie
965
+ potpie
966
+ burrito
967
+ red wine
968
+ espresso
969
+ cup
970
+ eggnog
971
+ alp
972
+ bubble
973
+ cliff, drop, drop-off
974
+ coral reef
975
+ geyser
976
+ lakeside, lakeshore
977
+ promontory, headland, head, foreland
978
+ sandbar, sand bar
979
+ seashore, coast, seacoast, sea-coast
980
+ valley, vale
981
+ volcano
982
+ ballplayer, baseball player
983
+ groom, bridegroom
984
+ scuba diver
985
+ rapeseed
986
+ daisy
987
+ yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
988
+ corn
989
+ acorn
990
+ hip, rose hip, rosehip
991
+ buckeye, horse chestnut, conker
992
+ coral fungus
993
+ agaric
994
+ gyromitra
995
+ stinkhorn, carrion fungus
996
+ earthstar
997
+ hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
998
+ bolete
999
+ ear, spike, capitulum
1000
+ toilet tissue, toilet paper, bathroom tissue
adv_grpo/assets/sac+logos+ava1-l14-linearMSE.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21dd590f3ccdc646f0d53120778b296013b096a035a2718c9cb0d511bff0f1e0
3
+ size 3714759
adv_grpo/assets/simple_animals.txt ADDED
@@ -0,0 +1,45 @@
1
+ cat
2
+ dog
3
+ horse
4
+ monkey
5
+ rabbit
6
+ zebra
7
+ spider
8
+ bird
9
+ sheep
10
+ deer
11
+ cow
12
+ goat
13
+ lion
14
+ tiger
15
+ bear
16
+ raccoon
17
+ fox
18
+ wolf
19
+ lizard
20
+ beetle
21
+ ant
22
+ butterfly
23
+ fish
24
+ shark
25
+ whale
26
+ dolphin
27
+ squirrel
28
+ mouse
29
+ rat
30
+ snake
31
+ turtle
32
+ frog
33
+ chicken
34
+ duck
35
+ goose
36
+ bee
37
+ pig
38
+ turkey
39
+ fly
40
+ llama
41
+ camel
42
+ bat
43
+ gorilla
44
+ hedgehog
45
+ kangaroo
adv_grpo/assets/simple_ocr_animals.txt ADDED
@@ -0,0 +1,5 @@
1
+ cat
2
+ dog
3
+ horse
4
+ monkey
5
+ rabbit
adv_grpo/assets/simple_ocr_animals_digit1.txt ADDED
@@ -0,0 +1,45 @@
1
+ A cat holding a sign that says '0'
2
+ A dog holding a sign that says '0'
3
+ A horse holding a sign that says '0'
4
+ A monkey holding a sign that says '0'
5
+ A rabbit holding a sign that says '0'
6
+ A cat holding a sign that says '1'
7
+ A dog holding a sign that says '1'
8
+ A horse holding a sign that says '1'
9
+ A monkey holding a sign that says '1'
10
+ A rabbit holding a sign that says '1'
11
+ A cat holding a sign that says '2'
12
+ A dog holding a sign that says '2'
13
+ A horse holding a sign that says '2'
14
+ A monkey holding a sign that says '2'
15
+ A rabbit holding a sign that says '2'
16
+ A cat holding a sign that says '3'
17
+ A dog holding a sign that says '3'
18
+ A horse holding a sign that says '3'
19
+ A monkey holding a sign that says '3'
20
+ A rabbit holding a sign that says '3'
21
+ A cat holding a sign that says '4'
22
+ A dog holding a sign that says '4'
23
+ A horse holding a sign that says '4'
24
+ A monkey holding a sign that says '4'
25
+ A rabbit holding a sign that says '4'
26
+ A cat holding a sign that says '5'
27
+ A dog holding a sign that says '5'
28
+ A horse holding a sign that says '5'
29
+ A monkey holding a sign that says '5'
30
+ A rabbit holding a sign that says '5'
31
+ A cat holding a sign that says '6'
32
+ A dog holding a sign that says '6'
33
+ A horse holding a sign that says '6'
34
+ A monkey holding a sign that says '6'
35
+ A rabbit holding a sign that says '6'
36
+ A cat holding a sign that says '7'
37
+ A dog holding a sign that says '7'
38
+ A horse holding a sign that says '7'
39
+ A monkey holding a sign that says '7'
40
+ A rabbit holding a sign that says '7'
41
+ A cat holding a sign that says '8'
42
+ A dog holding a sign that says '8'
43
+ A horse holding a sign that says '8'
44
+ A monkey holding a sign that says '8'
45
+ A rabbit holding a sign that says '8'
adv_grpo/assets/simple_ocr_animals_digit3.txt ADDED
@@ -0,0 +1,45 @@
1
+ A cat holding a sign that says '123'
2
+ A dog holding a sign that says '234'
3
+ A horse holding a sign that says '345'
4
+ A monkey holding a sign that says '456'
5
+ A rabbit holding a sign that says '567'
6
+ A cat holding a sign that says '678'
7
+ A dog holding a sign that says '789'
8
+ A horse holding a sign that says '123'
9
+ A monkey holding a sign that says '234'
10
+ A rabbit holding a sign that says '345'
11
+ A cat holding a sign that says '456'
12
+ A dog holding a sign that says '567'
13
+ A horse holding a sign that says '678'
14
+ A monkey holding a sign that says '789'
15
+ A rabbit holding a sign that says '123'
16
+ A cat holding a sign that says '234'
17
+ A dog holding a sign that says '345'
18
+ A horse holding a sign that says '456'
19
+ A monkey holding a sign that says '567'
20
+ A rabbit holding a sign that says '678'
21
+ A cat holding a sign that says '789'
22
+ A dog holding a sign that says '123'
23
+ A horse holding a sign that says '234'
24
+ A monkey holding a sign that says '345'
25
+ A rabbit holding a sign that says '456'
26
+ A cat holding a sign that says '567'
27
+ A dog holding a sign that says '678'
28
+ A horse holding a sign that says '789'
29
+ A monkey holding a sign that says '123'
30
+ A rabbit holding a sign that says '234'
31
+ A cat holding a sign that says '345'
32
+ A dog holding a sign that says '456'
33
+ A horse holding a sign that says '567'
34
+ A monkey holding a sign that says '678'
35
+ A rabbit holding a sign that says '789'
36
+ A cat holding a sign that says '123'
37
+ A dog holding a sign that says '234'
38
+ A horse holding a sign that says '345'
39
+ A monkey holding a sign that says '456'
40
+ A rabbit holding a sign that says '567'
41
+ A cat holding a sign that says '678'
42
+ A dog holding a sign that says '789'
43
+ A horse holding a sign that says '123'
44
+ A monkey holding a sign that says '234'
45
+ A rabbit holding a sign that says '345'
adv_grpo/assets/simple_ocr_animals_digit5.txt ADDED
@@ -0,0 +1,50 @@
1
+ A cat holding a sign that says '12345'
2
+ A dog holding a sign that says '23456'
3
+ A horse holding a sign that says '34567'
4
+ A monkey holding a sign that says '45678'
5
+ A rabbit holding a sign that says '56789'
6
+ A cat holding a sign that says '54321'
7
+ A dog holding a sign that says '65432'
8
+ A horse holding a sign that says '76543'
9
+ A monkey holding a sign that says '87654'
10
+ A rabbit holding a sign that says '98765'
11
+ A cat holding a sign that says '12345'
12
+ A dog holding a sign that says '23456'
13
+ A horse holding a sign that says '34567'
14
+ A monkey holding a sign that says '45678'
15
+ A rabbit holding a sign that says '56789'
16
+ A cat holding a sign that says '54321'
17
+ A dog holding a sign that says '65432'
18
+ A horse holding a sign that says '76543'
19
+ A monkey holding a sign that says '87654'
20
+ A rabbit holding a sign that says '98765'
21
+ A cat holding a sign that says '12345'
22
+ A dog holding a sign that says '23456'
23
+ A horse holding a sign that says '34567'
24
+ A monkey holding a sign that says '45678'
25
+ A rabbit holding a sign that says '56789'
26
+ A cat holding a sign that says '54321'
27
+ A dog holding a sign that says '65432'
28
+ A horse holding a sign that says '76543'
29
+ A monkey holding a sign that says '87654'
30
+ A rabbit holding a sign that says '98765'
31
+ A cat holding a sign that says '12345'
32
+ A dog holding a sign that says '23456'
33
+ A horse holding a sign that says '34567'
34
+ A monkey holding a sign that says '45678'
35
+ A rabbit holding a sign that says '56789'
36
+ A cat holding a sign that says '54321'
37
+ A dog holding a sign that says '65432'
38
+ A horse holding a sign that says '76543'
39
+ A monkey holding a sign that says '87654'
40
+ A rabbit holding a sign that says '98765'
41
+ A cat holding a sign that says '12345'
42
+ A dog holding a sign that says '23456'
43
+ A horse holding a sign that says '34567'
44
+ A monkey holding a sign that says '45678'
45
+ A rabbit holding a sign that says '56789'
46
+ A cat holding a sign that says '54321'
47
+ A dog holding a sign that says '65432'
48
+ A horse holding a sign that says '76543'
49
+ A monkey holding a sign that says '87654'
50
+ A rabbit holding a sign that says '98765'
adv_grpo/assets/test.jpg ADDED
adv_grpo/clip_scorer.py ADDED
@@ -0,0 +1,97 @@
+ # Based on https://github.com/RE-N-Y/imscore/blob/main/src/imscore/preference/model.py
+
+ from importlib import resources
+ import torch
+ import torch.nn as nn
+ import torchvision
+ import torchvision.transforms as T
+ from transformers import AutoImageProcessor, CLIPProcessor, CLIPModel
+ import numpy as np
+ from PIL import Image
+
+ def get_size(size):
+     if isinstance(size, int):
+         return (size, size)
+     elif "height" in size and "width" in size:
+         return (size["height"], size["width"])
+     elif "shortest_edge" in size:
+         return size["shortest_edge"]
+     else:
+         raise ValueError(f"Invalid size: {size}")
+
+ def get_image_transform(processor: AutoImageProcessor):
+     config = processor.to_dict()
+     resize = T.Resize(get_size(config.get("size"))) if config.get("do_resize") else nn.Identity()
+     crop = T.CenterCrop(get_size(config.get("crop_size"))) if config.get("do_center_crop") else nn.Identity()
+     normalise = T.Normalize(mean=processor.image_mean, std=processor.image_std) if config.get("do_normalize") else nn.Identity()
+
+     return T.Compose([resize, crop, normalise])
+
+ class ClipScorer(torch.nn.Module):
+     def __init__(self):
+         super().__init__()
+         # self.device="cuda"
+         self.model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+         self.tform = get_image_transform(self.processor.image_processor)
+         self.eval()
+
+     def _process(self, pixels):
+         dtype = pixels.dtype
+         pixels = self.tform(pixels)
+         pixels = pixels.to(dtype=dtype)
+
+         return pixels
+
+     @torch.no_grad()
+     def __call__(self, pixels, prompts, return_img_embedding=False):
+         device = next(self.parameters()).device
+         texts = self.processor(text=prompts, padding='max_length', truncation=True, return_tensors="pt").to(device)
+         pixels = self._process(pixels).to(device)
+         outputs = self.model(pixel_values=pixels, **texts)
+         if return_img_embedding:
+             return outputs.logits_per_image.diagonal()/30, outputs.image_embeds
+         return outputs.logits_per_image.diagonal()/30
+
+     @torch.no_grad()
+     def image_similarity(self, pixels, ref_pixels):
+         device = next(self.parameters()).device
+         pixels = self._process(pixels).to(device)
+         ref_pixels = self._process(ref_pixels).to(device)
+
+         pixel_embeds = self.model.get_image_features(pixel_values=pixels)
+         ref_embeds = self.model.get_image_features(pixel_values=ref_pixels)
+
+         pixel_embeds = pixel_embeds / pixel_embeds.norm(p=2, dim=-1, keepdim=True)
+         ref_embeds = ref_embeds / ref_embeds.norm(p=2, dim=-1, keepdim=True)
+
+         sim = pixel_embeds @ ref_embeds.T
+         # sim = torch.diagonal(sim, 0)
+         sim = sim.squeeze(-1)
+         return sim
+
+
+ def main():
+     # scorer = ClipScorer(
+     #     device='cuda'
+     # )
+     scorer = ClipScorer(
+     )
+
+     images = [
+         "assets/test.jpg",
+         "assets/test.jpg"
+     ]
+     pil_images = [Image.open(img) for img in images]
+     prompts = [
+         'an image of cat',
+         'not an image of cat'
+     ]
+     images = [np.array(img) for img in pil_images]
+     images = np.array(images)
+     images = images.transpose(0, 3, 1, 2)  # NHWC -> NCHW
+     images = torch.tensor(images, dtype=torch.uint8)/255.0
+     print(scorer(images, prompts))
+
+ if __name__ == "__main__":
+     main()
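The main() above already exercises the prompt-image score; here is a similar minimal sketch for the image_similarity path (an illustration, not one of the uploaded files; the to_batch helper is only for this sketch, which follows the same NHWC-to-NCHW float convention as main() and reuses adv_grpo/assets/test.jpg):

import numpy as np
import torch
from PIL import Image
from adv_grpo.clip_scorer import ClipScorer

scorer = ClipScorer()

def to_batch(path):
    # PIL image -> NCHW float tensor in [0, 1], batch of one, as in main() above.
    arr = np.array(Image.open(path)).transpose(2, 0, 1)[None]
    return torch.tensor(arr, dtype=torch.uint8) / 255.0

pixels = to_batch("adv_grpo/assets/test.jpg")
ref_pixels = to_batch("adv_grpo/assets/test.jpg")
print(scorer.image_similarity(pixels, ref_pixels))  # CLIP cosine similarity, ~1.0 for identical images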
adv_grpo/conv_gradfix.py ADDED
@@ -0,0 +1,345 @@
1
+ """
2
+ Custom replacement for `torch.nn.functional.convNd` and `torch.nn.functional.conv_transposeNd`
3
+ that supports arbitrarily high order gradients with zero performance penalty.
4
+ Modified from https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py
5
+ """
6
+
7
+ import contextlib
8
+ import warnings
9
+ from typing import Optional
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch import Tensor
13
+ from torch.nn import Conv2d, Conv3d
14
+
15
+ # pylint: disable=redefined-builtin
16
+ # pylint: disable=arguments-differ
17
+ # pylint: disable=protected-access
18
+
19
+ # ----------------------------------------------------------------------------
20
+
21
+ enabled = False # Enable the custom op by setting this to true.
22
+ weight_gradients_disabled = (
23
+ False # Forcefully disable computation of gradients with respect to the weights.
24
+ )
25
+
26
+
27
+ @contextlib.contextmanager
28
+ def no_weight_gradients():
29
+ global weight_gradients_disabled
30
+ old = weight_gradients_disabled
31
+ weight_gradients_disabled = True
32
+ yield
33
+ weight_gradients_disabled = old
34
+
35
+
36
+ # ----------------------------------------------------------------------------
37
+ class GradFixConv2d(Conv2d):
38
+ def __init__(self, *args, use_gradfix: bool = False, **kwargs):
39
+ self.use_gradfix = use_gradfix
40
+ super().__init__(*args, **kwargs)
41
+
42
+ def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]):
43
+ conv_fn = F.conv2d if not self.use_gradfix else convNd
44
+ if self.padding_mode != "zeros":
45
+ return conv_fn(
46
+ F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
47
+ weight,
48
+ bias,
49
+ self.stride,
50
+ (0, 0),
51
+ self.dilation,
52
+ self.groups,
53
+ )
54
+ return conv_fn(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
55
+
56
+ def forward(
57
+ self, input: Tensor, weight: Optional[Tensor] = None, bias: Optional[Tensor] = None
58
+ ) -> Tensor:
59
+ weight = self.weight if weight is None else weight
60
+ bias = self.bias if bias is None else bias
61
+ return self._conv_forward(input, weight, bias)
62
+
63
+
64
+ class GradFixConv3d(Conv3d):
65
+ def __init__(self, *args, use_gradfix: bool = False, **kwargs):
66
+ self.use_gradfix = use_gradfix
67
+ super().__init__(*args, **kwargs)
68
+
69
+ def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]):
70
+ conv_fn = F.conv3d if not self.use_gradfix else convNd
71
+ if self.padding_mode != "zeros":
72
+ return conv_fn(
73
+ F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
74
+ weight,
75
+ bias,
76
+ self.stride,
77
+ (0, 0, 0),
78
+ self.dilation,
79
+ self.groups,
80
+ )
81
+ return conv_fn(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
82
+
83
+ def forward(
84
+ self, input: Tensor, weight: Optional[Tensor] = None, bias: Optional[Tensor] = None
85
+ ) -> Tensor:
86
+ weight = self.weight if weight is None else weight
87
+ bias = self.bias if bias is None else bias
88
+ return self._conv_forward(input, weight, bias)
89
+
90
+
91
+ # ----------------------------------------------------------------------------
92
+
93
+
94
+ def convNd(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
95
+ N = weight.ndim - 2
96
+ if _should_use_custom_op(input):
97
+ return _conv_gradfix(
98
+ transpose=False,
99
+ weight_shape=weight.shape,
100
+ stride=stride,
101
+ padding=padding,
102
+ output_padding=0,
103
+ dilation=dilation,
104
+ groups=groups,
105
+ ).apply(input, weight, bias)
106
+ return getattr(torch.nn.functional, f"conv{N}d")(
107
+ input=input,
108
+ weight=weight,
109
+ bias=bias,
110
+ stride=stride,
111
+ padding=padding,
112
+ dilation=dilation,
113
+ groups=groups,
114
+ )
115
+
116
+
117
+ def conv_transposeNd(
118
+ input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1
119
+ ):
120
+ N = weight.ndim - 2
121
+ if _should_use_custom_op(input):
122
+ return _conv_gradfix(
123
+ transpose=True,
124
+ weight_shape=weight.shape,
125
+ stride=stride,
126
+ padding=padding,
127
+ output_padding=output_padding,
128
+ groups=groups,
129
+ dilation=dilation,
130
+ ).apply(input, weight, bias)
131
+ return getattr(torch.nn.functional, f"conv_transpose{N}d")(
132
+ input=input,
133
+ weight=weight,
134
+ bias=bias,
135
+ stride=stride,
136
+ padding=padding,
137
+ output_padding=output_padding,
138
+ groups=groups,
139
+ dilation=dilation,
140
+ )
141
+
142
+
143
+ # ----------------------------------------------------------------------------
144
+
145
+
146
+ def _should_use_custom_op(input):
147
+ assert isinstance(input, torch.Tensor)
148
+ if (not enabled) or (not torch.backends.cudnn.enabled):
149
+ return False
150
+ if input.device.type != "cuda":
151
+ return False
152
+ if any(torch.__version__.startswith(x) for x in ["1.7.", "1.8.", "1.9"]):
153
+ return True
154
+ if torch.__version__.startswith("2"):
155
+ return True
156
+ warnings.warn(
157
+ f"conv2d_gradfix not supported on PyTorch {torch.__version__}. "
158
+ f"Falling back to torch.nn.functional.conv2d()."
159
+ )
160
+ return False
161
+
162
+
163
+ def _tuple_of_ints(xs, ndim):
164
+ xs = tuple(xs) if isinstance(xs, (tuple, list)) else (xs,) * ndim
165
+ assert len(xs) == ndim
166
+ assert all(isinstance(x, int) for x in xs)
167
+ return xs
168
+
169
+
170
+ # ----------------------------------------------------------------------------
171
+
172
+ _conv_gradfix_cache = dict()
173
+
174
+
175
+ def _conv_gradfix(transpose, weight_shape, stride, padding, output_padding, dilation, groups):
176
+ ndim = len(weight_shape) - 2
177
+ # Parse arguments.
178
+ weight_shape = tuple(weight_shape)
179
+ stride = _tuple_of_ints(stride, ndim)
180
+ padding = _tuple_of_ints(padding, ndim)
181
+ output_padding = _tuple_of_ints(output_padding, ndim)
182
+ dilation = _tuple_of_ints(dilation, ndim)
183
+
184
+ # Lookup from cache.
185
+ key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups)
186
+ if key in _conv_gradfix_cache:
187
+ return _conv_gradfix_cache[key]
188
+
189
+ # Validate arguments.
190
+ assert groups >= 1
191
+ assert all(stride[i] >= 1 for i in range(ndim))
192
+ assert all(padding[i] >= 0 for i in range(ndim))
193
+ assert all(dilation[i] >= 0 for i in range(ndim))
194
+ if not transpose:
195
+ assert all(output_padding[i] == 0 for i in range(ndim))
196
+ else: # transpose
197
+ assert all(0 <= output_padding[i] < max(stride[i], dilation[i]) for i in range(ndim))
198
+
199
+ # Helpers.
200
+ common_kwargs = dict(stride=stride, padding=padding, dilation=dilation, groups=groups)
201
+
202
+ def calc_output_padding(input_shape, output_shape):
203
+ if transpose:
204
+ return [
205
+ 0,
206
+ ] * ndim
207
+ return [
208
+ input_shape[i + 2]
209
+ - (output_shape[i + 2] - 1) * stride[i]
210
+ - (1 - 2 * padding[i])
211
+ - dilation[i] * (weight_shape[i + 2] - 1)
212
+ for i in range(ndim)
213
+ ]
214
+
215
+ # Forward & backward.
216
+ class ConvNd(torch.autograd.Function):
217
+ @staticmethod
218
+ def forward(ctx, input, weight, bias):
219
+ """
220
+ input size: [B, C, ...]
221
+ weight size:
222
+ -> Conv: [C_out, C_in // groups, ...]
223
+ -> Transpose: [C_in, C_out // groups, ...]
224
+ """
225
+ assert weight.shape == weight_shape
226
+ ctx.save_for_backward(input, weight)
227
+
228
+ # General case => cuDNN.
229
+ if transpose:
230
+ return getattr(torch.nn.functional, f"conv_transpose{ndim}d")(
231
+ input=input,
232
+ weight=weight.to(input.dtype),
233
+ bias=bias,
234
+ output_padding=output_padding,
235
+ **common_kwargs,
236
+ )
237
+ return getattr(torch.nn.functional, f"conv{ndim}d")(
238
+ input=input, weight=weight.to(input.dtype), bias=bias, **common_kwargs
239
+ )
240
+
241
+ @staticmethod
242
+ def backward(ctx, grad_output):
243
+ input, weight = ctx.saved_tensors
244
+ grad_input = None
245
+ grad_weight = None
246
+ grad_bias = None
247
+
248
+ if ctx.needs_input_grad[0]: # Input
249
+ p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
250
+ op = _conv_gradfix(
251
+ transpose=(not transpose),
252
+ weight_shape=weight_shape,
253
+ output_padding=p,
254
+ **common_kwargs,
255
+ )
256
+ grad_input = op.apply(grad_output, weight, None)
257
+ assert grad_input.shape == input.shape
258
+
259
+ if ctx.needs_input_grad[1] and not weight_gradients_disabled: # Weight
260
+ grad_weight = ConvNdGradWeight.apply(grad_output, input)
261
+ assert grad_weight.shape == weight_shape
262
+
263
+ if ctx.needs_input_grad[2]: # Bias
264
+ grad_bias = grad_output.transpose(0, 1).flatten(1).sum(1)
265
+
266
+ return grad_input, grad_weight, grad_bias
267
+
268
+ # Gradient with respect to the weights.
269
+ class ConvNdGradWeight(torch.autograd.Function):
270
+ @staticmethod
271
+ def forward(ctx, grad_output, input):
272
+ flags = [
273
+ torch.backends.cudnn.benchmark,
274
+ torch.backends.cudnn.deterministic,
275
+ torch.backends.cudnn.allow_tf32,
276
+ ]
277
+ if torch.__version__.startswith("1"):
278
+ op = torch._C._jit_get_operation(
279
+ "aten::cudnn_convolution_backward_weight"
280
+ if not transpose
281
+ else "aten::cudnn_convolution_transpose_backward_weight"
282
+ )
283
+ grad_weight = op(
284
+ weight_shape,
285
+ grad_output,
286
+ input.to(grad_output.dtype),
287
+ padding,
288
+ stride,
289
+ dilation,
290
+ groups,
291
+ *flags,
292
+ )
293
+ elif torch.__version__.startswith("2"):
294
+ # https://github.com/pytorch/pytorch/issues/74437
295
+ op, _ = torch._C._jit_get_operation("aten::convolution_backward")
296
+ dummy_weight = torch.tensor(
297
+ 0.0, dtype=grad_output.dtype, device=input.device
298
+ ).expand(weight_shape)
299
+ grad_weight = op(
300
+ grad_output,
301
+ input.to(grad_output.dtype),
302
+ dummy_weight,
303
+ None,
304
+ stride,
305
+ padding,
306
+ dilation,
307
+ transpose,
308
+ (0,) * ndim,
309
+ groups,
310
+ [False, True, False],
311
+ )[1]
312
+ else:
313
+ raise NotImplementedError
314
+ assert grad_weight.shape == weight_shape
315
+ ctx.save_for_backward(grad_output, input)
316
+ return grad_weight
317
+
318
+ @staticmethod
319
+ def backward(ctx, grad2_grad_weight):
320
+ grad_output, input = ctx.saved_tensors
321
+ grad2_grad_output = None
322
+ grad2_input = None
323
+
324
+ if ctx.needs_input_grad[0]: # Grad of Weight
325
+ grad2_grad_output = ConvNd.apply(input, grad2_grad_weight, None)
326
+ assert grad2_grad_output.shape == grad_output.shape
327
+
328
+ if ctx.needs_input_grad[1]: # Input
329
+ p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
330
+ op = _conv_gradfix(
331
+ transpose=(not transpose),
332
+ weight_shape=weight_shape,
333
+ output_padding=p,
334
+ **common_kwargs,
335
+ )
336
+ grad2_input = op.apply(grad_output, grad2_grad_weight, None)
337
+ assert grad2_input.shape == input.shape
338
+
339
+ return grad2_grad_output, grad2_input
340
+
341
+ _conv_gradfix_cache[key] = ConvNd
342
+ return ConvNd
343
+
344
+
345
+ # ----------------------------------------------------------------------------
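The custom autograd Functions above exist so that second-order gradients through convolutions (for example an R1-style gradient penalty on a discriminator) stay on the fast cuDNN path instead of falling back to a slow reference implementation. Below is a minimal sketch of the double-backward pattern this enables; the `conv2d` wrapper and its import path are assumed to follow the upstream StyleGAN-style module layout and are not confirmed by this diff.

import torch
# Assumed module-level wrapper around _conv_gradfix, as in the upstream StyleGAN code;
# the exact import path in this repo may differ.
from adv_grpo.conv_gradfix import conv2d

x = torch.randn(4, 3, 64, 64, device="cuda", requires_grad=True)
w = torch.randn(8, 3, 3, 3, device="cuda", requires_grad=True)

out = conv2d(input=x, weight=w, padding=1)                      # forward through ConvNd
grad_x, = torch.autograd.grad(out.sum(), x, create_graph=True)  # first backward, keeps graph
r1_penalty = grad_x.pow(2).sum()                                # gradient-penalty term
r1_penalty.backward()                                           # second backward runs through ConvNdGradWeight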
adv_grpo/diffusers_patch/__pycache__/sd3_pipeline_with_logprob_fast.cpython-310.pyc ADDED
Binary file (15.5 kB).
 
adv_grpo/diffusers_patch/__pycache__/sd3_sde_with_logprob.cpython-310.pyc ADDED
Binary file (3.54 kB).
 
adv_grpo/diffusers_patch/__pycache__/train_dreambooth_lora_sd3.cpython-310.pyc ADDED
Binary file (2.48 kB).
 
adv_grpo/diffusers_patch/flux_kontext_pipeline_with_logprob.py ADDED
@@ -0,0 +1,255 @@
1
+ # Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/flux/pipeline_flux_kontext.py
2
+
3
+ from typing import Any, Dict, List, Optional, Union, Callable
4
+ import torch
5
+ import numpy as np
6
+ from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps
7
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
8
+ from diffusers.utils import logging
9
+ from .sd3_sde_with_logprob import sde_step_with_logprob
10
+
11
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
12
+
13
+ PREFERRED_KONTEXT_RESOLUTIONS = [
14
+ (672, 1568),
15
+ (688, 1504),
16
+ (720, 1456),
17
+ (752, 1392),
18
+ (800, 1328),
19
+ (832, 1248),
20
+ (880, 1184),
21
+ (944, 1104),
22
+ (1024, 1024),
23
+ (1104, 944),
24
+ (1184, 880),
25
+ (1248, 832),
26
+ (1328, 800),
27
+ (1392, 752),
28
+ (1456, 720),
29
+ (1504, 688),
30
+ (1568, 672),
31
+ ]
32
+
33
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
34
+ def retrieve_latents(
35
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
36
+ ):
37
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
38
+ return encoder_output.latent_dist.sample(generator)
39
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
40
+ return encoder_output.latent_dist.mode()
41
+ elif hasattr(encoder_output, "latents"):
42
+ return encoder_output.latents
43
+ else:
44
+ raise AttributeError("Could not access latents of provided encoder_output")
45
+
46
+ def calculate_shift(
47
+ image_seq_len,
48
+ base_seq_len: int = 256,
49
+ max_seq_len: int = 4096,
50
+ base_shift: float = 0.5,
51
+ max_shift: float = 1.15,
52
+ ):
53
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
54
+ b = base_shift - m * base_seq_len
55
+ mu = image_seq_len * m + b
56
+ return mu
57
+
58
+ @torch.no_grad()
59
+ def pipeline_with_logprob(
60
+ self,
61
+ image: Optional[PipelineImageInput] = None,
62
+ prompt: Union[str, List[str]] = None,
63
+ prompt_2: Optional[Union[str, List[str]]] = None,
64
+ negative_prompt: Union[str, List[str]] = None,
65
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
66
+ height: Optional[int] = None,
67
+ width: Optional[int] = None,
68
+ num_inference_steps: int = 28,
69
+ sigmas: Optional[List[float]] = None,
70
+ guidance_scale: float = 3.5,
71
+ num_images_per_prompt: Optional[int] = 1,
72
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
73
+ latents: Optional[torch.FloatTensor] = None,
74
+ prompt_embeds: Optional[torch.FloatTensor] = None,
75
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
76
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
77
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
78
+ output_type: Optional[str] = "pil",
79
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
80
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
81
+ max_sequence_length: int = 512,
82
+ max_area: int = 1024**2,
83
+ _auto_resize: bool = True,
84
+ noise_level: float = 0.7,
85
+ ):
86
+ height = height or self.default_sample_size * self.vae_scale_factor
87
+ width = width or self.default_sample_size * self.vae_scale_factor
88
+
89
+ original_height, original_width = height, width
90
+ aspect_ratio = width / height
91
+ width = round((max_area * aspect_ratio) ** 0.5)
92
+ height = round((max_area / aspect_ratio) ** 0.5)
93
+
94
+ multiple_of = self.vae_scale_factor * 2
95
+ width = width // multiple_of * multiple_of
96
+ height = height // multiple_of * multiple_of
97
+
98
+ if height != original_height or width != original_width:
99
+ logger.warning(
100
+ f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements."
101
+ )
102
+
103
+ # 1. Check inputs. Raise error if not correct
104
+ self.check_inputs(
105
+ prompt,
106
+ prompt_2,
107
+ height,
108
+ width,
109
+ negative_prompt=negative_prompt,
110
+ negative_prompt_2=negative_prompt_2,
111
+ prompt_embeds=prompt_embeds,
112
+ negative_prompt_embeds=negative_prompt_embeds,
113
+ pooled_prompt_embeds=pooled_prompt_embeds,
114
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
115
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
116
+ max_sequence_length=max_sequence_length,
117
+ )
118
+
119
+ self._guidance_scale = guidance_scale
120
+ self._joint_attention_kwargs = joint_attention_kwargs
121
+ self._current_timestep = None
122
+ self._interrupt = False
123
+
124
+ # 2. Define call parameters
125
+ if prompt is not None and isinstance(prompt, str):
126
+ batch_size = 1
127
+ elif prompt is not None and isinstance(prompt, list):
128
+ batch_size = len(prompt)
129
+ else:
130
+ batch_size = prompt_embeds.shape[0]
131
+
132
+ device = self._execution_device
133
+
134
+ lora_scale = (
135
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
136
+ )
137
+ (
138
+ prompt_embeds,
139
+ pooled_prompt_embeds,
140
+ text_ids,
141
+ ) = self.encode_prompt(
142
+ prompt=prompt,
143
+ prompt_2=prompt_2,
144
+ prompt_embeds=prompt_embeds,
145
+ pooled_prompt_embeds=pooled_prompt_embeds,
146
+ device=device,
147
+ num_images_per_prompt=num_images_per_prompt,
148
+ max_sequence_length=max_sequence_length,
149
+ lora_scale=lora_scale,
150
+ )
151
+
152
+ # 3. Preprocess image
153
+ if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
154
+ image = self.image_processor.resize(image, height, width)
155
+ image = self.image_processor.preprocess(image, height, width)
156
+ # 4. Prepare latent variables
157
+ num_channels_latents = self.transformer.config.in_channels // 4
158
+ latents, image_latents, latent_ids, image_ids = self.prepare_latents(
159
+ image.float(),
160
+ batch_size * num_images_per_prompt,
161
+ num_channels_latents,
162
+ height,
163
+ width,
164
+ prompt_embeds.dtype,
165
+ device,
166
+ generator,
167
+ latents,
168
+ )
169
+ if image_ids is not None:
170
+ latent_ids = torch.cat([latent_ids, image_ids], dim=0) # dim 0 is sequence dimension
171
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
172
+ image_seq_len = latents.shape[1]
173
+ mu = calculate_shift(
174
+ image_seq_len,
175
+ self.scheduler.config.get("base_image_seq_len", 256),
176
+ self.scheduler.config.get("max_image_seq_len", 4096),
177
+ self.scheduler.config.get("base_shift", 0.5),
178
+ self.scheduler.config.get("max_shift", 1.15),
179
+ )
180
+ timesteps, num_inference_steps = retrieve_timesteps(
181
+ self.scheduler,
182
+ num_inference_steps,
183
+ device,
184
+ sigmas=sigmas,
185
+ mu=mu,
186
+ )
187
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
188
+ self._num_timesteps = len(timesteps)
189
+
190
+ # handle guidance
191
+ if self.transformer.config.guidance_embeds:
192
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
193
+ guidance = guidance.expand(latents.shape[0])
194
+ else:
195
+ guidance = None
196
+
197
+ # 6. Prepare image embeddings
198
+ all_latents = [latents]
199
+ all_log_probs = []
200
+
201
+ # 7. Denoising loop
202
+ self.scheduler.set_begin_index(0)
203
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
204
+ for i, t in enumerate(timesteps):
205
+ if self.interrupt:
206
+ continue
207
+ self._current_timestep = t
208
+ latent_model_input = latents
209
+ if image_latents is not None:
210
+ latent_model_input = torch.cat([latents, image_latents], dim=1)
211
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
212
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
213
+ noise_pred = self.transformer(
214
+ hidden_states=latent_model_input,
215
+ timestep=timestep / 1000,
216
+ guidance=guidance,
217
+ pooled_projections=pooled_prompt_embeds,
218
+ encoder_hidden_states=prompt_embeds,
219
+ txt_ids=text_ids,
220
+ img_ids=latent_ids,
221
+ joint_attention_kwargs=self.joint_attention_kwargs,
222
+ return_dict=False,
223
+ )[0]
224
+ if noise_pred.isnan().any():
225
+ breakpoint()  # debug hook left in place: pauses execution when the transformer outputs NaNs
226
+ print("noise_pred contains nan")
227
+ noise_pred = noise_pred[:, : latents.size(1)]
228
+ latents_dtype = latents.dtype
229
+
230
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
231
+ self.scheduler,
232
+ noise_pred.float(),
233
+ t.unsqueeze(0).repeat(latents.shape[0]),
234
+ latents.float(),
235
+ noise_level=noise_level,
236
+ )
237
+
238
+ if latents.dtype != latents_dtype:
239
+ latents = latents.to(latents_dtype)
240
+ all_latents.append(latents)
241
+ all_log_probs.append(log_prob)
242
+ # call the callback, if provided
243
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
244
+ progress_bar.update()
245
+
246
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
247
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
248
+ latents = latents.to(dtype=self.vae.dtype)
249
+ image = self.vae.decode(latents, return_dict=False)[0]
250
+ image = self.image_processor.postprocess(image, output_type=output_type)
251
+
252
+ # Offload all models
253
+ self.maybe_free_model_hooks()
254
+
255
+ return image, all_latents, latent_ids, text_ids, all_log_probs, image_latents
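The per-step log-probs returned above are what make this sampler usable as a stochastic policy for RL-style fine-tuning. Below is a minimal, illustrative sketch of how such log-probs typically enter a clipped policy-gradient (GRPO/PPO-style) objective; the function name and clip value are illustrative and not taken from this repo's trainer.

import torch

def clipped_pg_loss(new_log_probs, old_log_probs, advantages, clip_range=1e-4):
    # ratio between the current policy and the policy that generated the samples
    ratio = torch.exp(new_log_probs - old_log_probs)
    unclipped = -advantages * ratio
    clipped = -advantages * torch.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range)
    # pessimistic (clipped) policy-gradient surrogate, averaged over the batch
    return torch.maximum(unclipped, clipped).mean()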
adv_grpo/diffusers_patch/flux_pipeline_with_logprob.py ADDED
@@ -0,0 +1,187 @@
1
+ # Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/flux/pipeline_flux.py
2
+
3
+ from typing import Any, Dict, List, Optional, Union, Callable
4
+ import torch
5
+ import numpy as np
6
+ from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps
7
+ from .sd3_sde_with_logprob import sde_step_with_logprob
8
+
9
+ def calculate_shift(
10
+ image_seq_len,
11
+ base_seq_len: int = 256,
12
+ max_seq_len: int = 4096,
13
+ base_shift: float = 0.5,
14
+ max_shift: float = 1.15,
15
+ ):
16
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
17
+ b = base_shift - m * base_seq_len
18
+ mu = image_seq_len * m + b
19
+ return mu
20
+
21
+ @torch.no_grad()
22
+ def pipeline_with_logprob(
23
+ self,
24
+ prompt: Union[str, List[str]] = None,
25
+ prompt_2: Optional[Union[str, List[str]]] = None,
26
+ negative_prompt: Union[str, List[str]] = None,
27
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
28
+ height: Optional[int] = None,
29
+ width: Optional[int] = None,
30
+ num_inference_steps: int = 28,
31
+ sigmas: Optional[List[float]] = None,
32
+ guidance_scale: float = 3.5,
33
+ num_images_per_prompt: Optional[int] = 1,
34
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
35
+ latents: Optional[torch.FloatTensor] = None,
36
+ prompt_embeds: Optional[torch.FloatTensor] = None,
37
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
38
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
39
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
40
+ output_type: Optional[str] = "pil",
41
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
42
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
43
+ max_sequence_length: int = 512,
44
+ noise_level: float = 0.7,
45
+ ):
46
+ height = height or self.default_sample_size * self.vae_scale_factor
47
+ width = width or self.default_sample_size * self.vae_scale_factor
48
+
49
+ # 1. Check inputs. Raise error if not correct
50
+ self.check_inputs(
51
+ prompt,
52
+ prompt_2,
53
+ height,
54
+ width,
55
+ negative_prompt=negative_prompt,
56
+ negative_prompt_2=negative_prompt_2,
57
+ prompt_embeds=prompt_embeds,
58
+ negative_prompt_embeds=negative_prompt_embeds,
59
+ pooled_prompt_embeds=pooled_prompt_embeds,
60
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
61
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
62
+ max_sequence_length=max_sequence_length,
63
+ )
64
+
65
+ self._guidance_scale = guidance_scale
66
+ self._joint_attention_kwargs = joint_attention_kwargs
67
+ self._current_timestep = None
68
+ self._interrupt = False
69
+
70
+ # 2. Define call parameters
71
+ if prompt is not None and isinstance(prompt, str):
72
+ batch_size = 1
73
+ elif prompt is not None and isinstance(prompt, list):
74
+ batch_size = len(prompt)
75
+ else:
76
+ batch_size = prompt_embeds.shape[0]
77
+
78
+ device = self._execution_device
79
+
80
+ lora_scale = (
81
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
82
+ )
83
+ (
84
+ prompt_embeds,
85
+ pooled_prompt_embeds,
86
+ text_ids,
87
+ ) = self.encode_prompt(
88
+ prompt=prompt,
89
+ prompt_2=prompt_2,
90
+ prompt_embeds=prompt_embeds,
91
+ pooled_prompt_embeds=pooled_prompt_embeds,
92
+ device=device,
93
+ num_images_per_prompt=num_images_per_prompt,
94
+ max_sequence_length=max_sequence_length,
95
+ lora_scale=lora_scale,
96
+ )
97
+
98
+ # 4. Prepare latent variables
99
+ num_channels_latents = self.transformer.config.in_channels // 4
100
+ latents, latent_image_ids = self.prepare_latents(
101
+ batch_size * num_images_per_prompt,
102
+ num_channels_latents,
103
+ height,
104
+ width,
105
+ prompt_embeds.dtype,
106
+ device,
107
+ generator,
108
+ latents,
109
+ )
110
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
111
+ if hasattr(self.scheduler.config, "use_flow_sigmas") and self.scheduler.config.use_flow_sigmas:
112
+ sigmas = None
113
+ image_seq_len = latents.shape[1]
114
+ mu = calculate_shift(
115
+ image_seq_len,
116
+ self.scheduler.config.get("base_image_seq_len", 256),
117
+ self.scheduler.config.get("max_image_seq_len", 4096),
118
+ self.scheduler.config.get("base_shift", 0.5),
119
+ self.scheduler.config.get("max_shift", 1.15),
120
+ )
121
+ timesteps, num_inference_steps = retrieve_timesteps(
122
+ self.scheduler,
123
+ num_inference_steps,
124
+ device,
125
+ sigmas=sigmas,
126
+ mu=mu,
127
+ )
128
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
129
+ self._num_timesteps = len(timesteps)
130
+
131
+ # handle guidance
132
+ if self.transformer.config.guidance_embeds:
133
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
134
+ guidance = guidance.expand(latents.shape[0])
135
+ else:
136
+ guidance = None
137
+
138
+ # 6. Prepare image embeddings
139
+ all_latents = [latents]
140
+ all_log_probs = []
141
+
142
+ # 7. Denoising loop
143
+ self.scheduler.set_begin_index(0)
144
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
145
+ for i, t in enumerate(timesteps):
146
+ if self.interrupt:
147
+ continue
148
+ self._current_timestep = t
149
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
150
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
151
+ noise_pred = self.transformer(
152
+ hidden_states=latents,
153
+ timestep=timestep / 1000,
154
+ guidance=guidance,
155
+ pooled_projections=pooled_prompt_embeds,
156
+ encoder_hidden_states=prompt_embeds,
157
+ txt_ids=text_ids,
158
+ img_ids=latent_image_ids,
159
+ joint_attention_kwargs=self.joint_attention_kwargs,
160
+ return_dict=False,
161
+ )[0]
162
+ latents_dtype = latents.dtype
163
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
164
+ self.scheduler,
165
+ noise_pred.float(),
166
+ t.unsqueeze(0).repeat(latents.shape[0]),
167
+ latents.float(),
168
+ noise_level=noise_level,
169
+ )
170
+ if latents.dtype != latents_dtype:
171
+ latents = latents.to(latents_dtype)
172
+ all_latents.append(latents)
173
+ all_log_probs.append(log_prob)
174
+ # call the callback, if provided
175
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
176
+ progress_bar.update()
177
+
178
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
179
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
180
+ latents = latents.to(dtype=self.vae.dtype)
181
+ image = self.vae.decode(latents, return_dict=False)[0]
182
+ image = self.image_processor.postprocess(image, output_type=output_type)
183
+
184
+ # Offload all models
185
+ self.maybe_free_model_hooks()
186
+
187
+ return image, all_latents, latent_image_ids, text_ids, all_log_probs
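`calculate_shift` above is a linear interpolation of the scheduler shift `mu` in the packed image token count: it returns `base_shift` at `base_seq_len` tokens and `max_shift` at `max_seq_len` tokens. A small self-contained check (illustrative, using the default config values) is below.

def calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096,
                    base_shift=0.5, max_shift=1.15):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b

assert abs(calculate_shift(256) - 0.5) < 1e-9    # base sequence length -> base_shift
assert abs(calculate_shift(4096) - 1.15) < 1e-9  # max sequence length  -> max_shift
print(round(calculate_shift(1024), 3))           # ~0.63 for a 1024-token packed latent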
adv_grpo/diffusers_patch/sd3_pipeline_with_logprob.py ADDED
@@ -0,0 +1,198 @@
1
+ # Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
2
+ # with the following modifications:
3
+ # - It uses the patched version of `sde_step_with_logprob` from `sd3_sde_with_logprob.py`.
4
+ # - It returns all the intermediate latents of the denoising process as well as the log probs of each denoising step.
5
+ from typing import Any, Dict, List, Optional, Union
6
+ import torch
7
+ from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps
8
+ from .sd3_sde_with_logprob import sde_step_with_logprob_new as sde_step_with_logprob
9
+
10
+ @torch.no_grad()
11
+ def pipeline_with_logprob(
12
+ self,
13
+ prompt: Union[str, List[str]] = None,
14
+ prompt_2: Optional[Union[str, List[str]]] = None,
15
+ prompt_3: Optional[Union[str, List[str]]] = None,
16
+ height: Optional[int] = None,
17
+ width: Optional[int] = None,
18
+ num_inference_steps: int = 28,
19
+ sigmas: Optional[List[float]] = None,
20
+ guidance_scale: float = 7.0,
21
+ negative_prompt: Optional[Union[str, List[str]]] = None,
22
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
23
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
24
+ num_images_per_prompt: Optional[int] = 1,
25
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
26
+ latents: Optional[torch.FloatTensor] = None,
27
+ prompt_embeds: Optional[torch.FloatTensor] = None,
28
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
29
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
30
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
31
+ output_type: Optional[str] = "pil",
32
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
33
+ clip_skip: Optional[int] = None,
34
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
35
+ max_sequence_length: int = 256,
36
+ skip_layer_guidance_scale: float = 2.8,
37
+ noise_level: float = 0.7,
38
+ ):
39
+ height = height or self.default_sample_size * self.vae_scale_factor
40
+ width = width or self.default_sample_size * self.vae_scale_factor
41
+
42
+ # 1. Check inputs. Raise error if not correct
43
+ self.check_inputs(
44
+ prompt,
45
+ prompt_2,
46
+ prompt_3,
47
+ height,
48
+ width,
49
+ negative_prompt=negative_prompt,
50
+ negative_prompt_2=negative_prompt_2,
51
+ negative_prompt_3=negative_prompt_3,
52
+ prompt_embeds=prompt_embeds,
53
+ negative_prompt_embeds=negative_prompt_embeds,
54
+ pooled_prompt_embeds=pooled_prompt_embeds,
55
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
56
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
57
+ max_sequence_length=max_sequence_length,
58
+ )
59
+
60
+ self._guidance_scale = guidance_scale
61
+ self._skip_layer_guidance_scale = skip_layer_guidance_scale
62
+ self._clip_skip = clip_skip
63
+ self._joint_attention_kwargs = joint_attention_kwargs
64
+ self._interrupt = False
65
+
66
+ # 2. Define call parameters
67
+ if prompt is not None and isinstance(prompt, str):
68
+ batch_size = 1
69
+ elif prompt is not None and isinstance(prompt, list):
70
+ batch_size = len(prompt)
71
+ else:
72
+ batch_size = prompt_embeds.shape[0]
73
+
74
+ device = self._execution_device
75
+
76
+ lora_scale = (
77
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
78
+ )
79
+ (
80
+ prompt_embeds,
81
+ negative_prompt_embeds,
82
+ pooled_prompt_embeds,
83
+ negative_pooled_prompt_embeds,
84
+ ) = self.encode_prompt(
85
+ prompt=prompt,
86
+ prompt_2=prompt_2,
87
+ prompt_3=prompt_3,
88
+ negative_prompt=negative_prompt,
89
+ negative_prompt_2=negative_prompt_2,
90
+ negative_prompt_3=negative_prompt_3,
91
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
92
+ prompt_embeds=prompt_embeds,
93
+ negative_prompt_embeds=negative_prompt_embeds,
94
+ pooled_prompt_embeds=pooled_prompt_embeds,
95
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
96
+ device=device,
97
+ clip_skip=self.clip_skip,
98
+ num_images_per_prompt=num_images_per_prompt,
99
+ max_sequence_length=max_sequence_length,
100
+ lora_scale=lora_scale,
101
+ )
102
+ if self.do_classifier_free_guidance:
103
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
104
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
105
+
106
+ # 4. Prepare latent variables
107
+ num_channels_latents = self.transformer.config.in_channels
108
+ # latents = self.prepare_latents(
109
+ # batch_size * num_images_per_prompt,
110
+ # num_channels_latents,
111
+ # height,
112
+ # width,
113
+ # prompt_embeds.dtype,
114
+ # device,
115
+ # generator,
116
+ # latents,
117
+ # ).float()
118
+ latents = self.prepare_latents(
119
+ batch_size * num_images_per_prompt,
120
+ num_channels_latents,
121
+ height,
122
+ width,
123
+ prompt_embeds.dtype,
124
+ device,
125
+ generator,
126
+ latents,
127
+ )
128
+
129
+ # 5. Prepare timesteps
130
+ scheduler_kwargs = {}
131
+ timesteps, num_inference_steps = retrieve_timesteps(
132
+ self.scheduler,
133
+ num_inference_steps,
134
+ device,
135
+ sigmas=sigmas,
136
+ **scheduler_kwargs,
137
+ )
138
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
139
+ self._num_timesteps = len(timesteps)
140
+
141
+ # 6. Prepare image embeddings
142
+ all_latents = [latents]
143
+ all_log_probs = []
144
+ # import pdb; pdb.set_trace()
145
+
146
+ # 7. Denoising loop
147
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
148
+ for i, t in enumerate(timesteps):
149
+ if self.interrupt:
150
+ continue
151
+
152
+ # expand the latents if we are doing classifier free guidance
153
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
154
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
155
+ timestep = t.expand(latent_model_input.shape[0])
156
+ # import pdb; pdb.set_trace()
157
+ noise_pred = self.transformer(
158
+ hidden_states=latent_model_input,
159
+ timestep=timestep,
160
+ encoder_hidden_states=prompt_embeds,
161
+ pooled_projections=pooled_prompt_embeds,
162
+ joint_attention_kwargs=self.joint_attention_kwargs,
163
+ return_dict=False,
164
+ )[0]
165
+ # noise_pred = noise_pred.to(prompt_embeds.dtype)
166
+ # perform guidance
167
+ if self.do_classifier_free_guidance:
168
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
169
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
170
+
171
+ latents_dtype = latents.dtype
172
+
173
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
174
+ self.scheduler,
175
+ noise_pred.float(),
176
+ t.unsqueeze(0),
177
+ latents.float(),
178
+ noise_level=noise_level,
179
+ )
180
+
181
+ all_latents.append(latents)
182
+ all_log_probs.append(log_prob)
183
+ if latents.dtype != latents_dtype:
184
+ latents = latents.to(latents_dtype)
185
+
186
+ # call the callback, if provided
187
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
188
+ progress_bar.update()
189
+
190
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
191
+ latents = latents.to(dtype=self.vae.dtype)
192
+ image = self.vae.decode(latents, return_dict=False)[0]
193
+ image = self.image_processor.postprocess(image, output_type=output_type)
194
+
195
+ # Offload all models
196
+ self.maybe_free_model_hooks()
197
+
198
+ return image, all_latents, all_log_probs
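Downstream, a trainer normally pairs consecutive entries of `all_latents` so that step t maps `all_latents[t]` to `all_latents[t + 1]`, and later re-evaluates the log-prob of that exact transition under the updated policy (typically by feeding the stored next latent back into the SDE step). The packing is sketched below; this is an illustrative helper, not code from this repo.

import torch

def pack_transitions(all_latents, all_log_probs):
    # all_latents: list of length T + 1 with tensors of shape (batch, C, H, W)
    # all_log_probs: list of length T with tensors of shape (batch,)
    latents = torch.stack(all_latents, dim=1)           # (batch, T + 1, C, H, W)
    next_latents = latents[:, 1:]                       # targets x_{t-1}
    latents = latents[:, :-1]                           # inputs  x_t
    old_log_probs = torch.stack(all_log_probs, dim=1)   # (batch, T)
    return latents, next_latents, old_log_probs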
adv_grpo/diffusers_patch/sd3_pipeline_with_logprob_fast.py ADDED
@@ -0,0 +1,1081 @@
1
+ # Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
2
+ # with the following modifications:
3
+ # - It uses the patched version of `sde_step_with_logprob` from `sd3_sde_with_logprob.py`.
4
+ # - It returns all the intermediate latents of the denoising process as well as the log probs of each denoising step.
5
+ from typing import Any, Dict, List, Optional, Union
6
+ import torch
7
+ import random
8
+ from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps
9
+ from .sd3_sde_with_logprob import sde_step_with_logprob_new as sde_step_with_logprob
10
+ from PIL import Image
11
+ from torchvision import transforms
12
+
13
+
14
+
15
+ @torch.no_grad()
16
+ def pipeline_with_logprob(
17
+ self,
18
+ prompt: Union[str, List[str]] = None,
19
+ prompt_2: Optional[Union[str, List[str]]] = None,
20
+ prompt_3: Optional[Union[str, List[str]]] = None,
21
+ height: Optional[int] = None,
22
+ width: Optional[int] = None,
23
+ num_inference_steps: int = 28,
24
+ mini_num_image_per_prompt: int = 1,
25
+ sigmas: Optional[List[float]] = None,
26
+ guidance_scale: float = 7.0,
27
+ negative_prompt: Optional[Union[str, List[str]]] = None,
28
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
29
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
30
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
31
+ latents: Optional[torch.FloatTensor] = None,
32
+ prompt_embeds: Optional[torch.FloatTensor] = None,
33
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
34
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
35
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
36
+ output_type: Optional[str] = "pil",
37
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
38
+ clip_skip: Optional[int] = None,
39
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
40
+ max_sequence_length: int = 256,
41
+ skip_layer_guidance_scale: float = 2.8,
42
+ noise_level: float = 0.7,
43
+ train_num_steps: int = 1,
44
+ process_index: int = 0,
45
+ sample_num_steps: int = 10,
46
+ random_timestep: Optional[int] = None,
47
+ ):
48
+ height = height or self.default_sample_size * self.vae_scale_factor
49
+ width = width or self.default_sample_size * self.vae_scale_factor
50
+
51
+ # 1. Check inputs. Raise error if not correct
52
+ self.check_inputs(
53
+ prompt,
54
+ prompt_2,
55
+ prompt_3,
56
+ height,
57
+ width,
58
+ negative_prompt=negative_prompt,
59
+ negative_prompt_2=negative_prompt_2,
60
+ negative_prompt_3=negative_prompt_3,
61
+ prompt_embeds=prompt_embeds,
62
+ negative_prompt_embeds=negative_prompt_embeds,
63
+ pooled_prompt_embeds=pooled_prompt_embeds,
64
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
65
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
66
+ max_sequence_length=max_sequence_length,
67
+ )
68
+
69
+ self._guidance_scale = guidance_scale
70
+ self._skip_layer_guidance_scale = skip_layer_guidance_scale
71
+ self._clip_skip = clip_skip
72
+ self._joint_attention_kwargs = joint_attention_kwargs
73
+ self._interrupt = False
74
+
75
+ # 2. Define call parameters
76
+ if prompt is not None and isinstance(prompt, str):
77
+ batch_size = 1
78
+ elif prompt is not None and isinstance(prompt, list):
79
+ batch_size = len(prompt)
80
+ else:
81
+ batch_size = prompt_embeds.shape[0]
82
+
83
+ device = self._execution_device
84
+
85
+ lora_scale = (
86
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
87
+ )
88
+ (
89
+ prompt_embeds,
90
+ negative_prompt_embeds,
91
+ pooled_prompt_embeds,
92
+ negative_pooled_prompt_embeds,
93
+ ) = self.encode_prompt(
94
+ prompt=prompt,
95
+ prompt_2=prompt_2,
96
+ prompt_3=prompt_3,
97
+ negative_prompt=negative_prompt,
98
+ negative_prompt_2=negative_prompt_2,
99
+ negative_prompt_3=negative_prompt_3,
100
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
101
+ prompt_embeds=prompt_embeds,
102
+ negative_prompt_embeds=negative_prompt_embeds,
103
+ pooled_prompt_embeds=pooled_prompt_embeds,
104
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
105
+ device=device,
106
+ clip_skip=self.clip_skip,
107
+ max_sequence_length=max_sequence_length,
108
+ lora_scale=lora_scale,
109
+ )
110
+ # import pdb; pdb.set_trace()
111
+
112
+ # 4. Prepare latent variables
113
+ num_channels_latents = self.transformer.config.in_channels
114
+ latents = self.prepare_latents(
115
+ batch_size,
116
+ num_channels_latents,
117
+ height,
118
+ width,
119
+ prompt_embeds.dtype,
120
+ device,
121
+ generator,
122
+ latents,
123
+ ).float()
124
+ # import pdb; pdb.set_trace()
125
+ # latents = latents.to(prompt_embeds.dtype)
126
+
127
+ # 5. Prepare timesteps
128
+ scheduler_kwargs = {}
129
+ timesteps, num_inference_steps = retrieve_timesteps(
130
+ self.scheduler,
131
+ num_inference_steps,
132
+ device,
133
+ sigmas=sigmas,
134
+ **scheduler_kwargs,
135
+ )
136
+ # timesteps = timesteps.to(prompt_embeds.dtype)
137
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
138
+ self._num_timesteps = len(timesteps)
139
+
140
+ random.seed(process_index)
141
+ if random_timestep is None:
142
+ random_timestep = random.randint(0, sample_num_steps//2)
143
+
144
+
145
+ # 6. Prepare image embeddings
146
+ all_latents = []
147
+ all_log_probs = []
148
+ all_timesteps = []
149
+
150
+ if self.do_classifier_free_guidance:
151
+ tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
152
+ tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
153
+ # 7. Denoising loop
154
+ # import pdb; pdb.set_trace()
155
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
156
+ # import pdb; pdb.set_trace()
157
+ for i, t in enumerate(timesteps):
158
+ if i < random_timestep:
159
+ cur_noise_level = 0
160
+ elif i == random_timestep:
161
+ cur_noise_level= noise_level
162
+ # repeat the latents mini_num_image_per_prompt times
163
+ latents = latents.repeat(mini_num_image_per_prompt, 1, 1, 1)
164
+ prompt_embeds = prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
165
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
166
+ negative_prompt_embeds = negative_prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
167
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
168
+ if self.do_classifier_free_guidance:
169
+ tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
170
+ tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
171
+ all_latents.append(latents)
172
+ elif i > random_timestep and i < random_timestep + train_num_steps:
173
+ cur_noise_level = noise_level
174
+ else:
175
+ cur_noise_level= 0
176
+ # expand the latents if we are doing classifier free guidance
177
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
178
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
179
+ timestep = t.expand(latent_model_input.shape[0])
180
+ # import pdb; pdb.set_trace()
181
+ # noise_pred = self.transformer( hidden_states=latent_model_input, timestep=timestep, encoder_hidden_states=tem_prompt_embeds,pooled_projections=tem_pooled_prompt_embeds, joint_attention_kwargs=self.joint_attention_kwargs,return_dict=False, )[0]
182
+ noise_pred = self.transformer(
183
+ hidden_states=latent_model_input,
184
+ timestep=timestep,
185
+ encoder_hidden_states=tem_prompt_embeds,
186
+ pooled_projections=tem_pooled_prompt_embeds,
187
+ joint_attention_kwargs=self.joint_attention_kwargs,
188
+ return_dict=False,
189
+ )[0]
190
+ # noise_pred = noise_pred.to(prompt_embeds.dtype)
191
+ # perform guidance
192
+ if self.do_classifier_free_guidance:
193
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
194
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
195
+
196
+ latents_dtype = latents.dtype
197
+
198
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
199
+ self.scheduler,
200
+ noise_pred.float(),
201
+ t.unsqueeze(0),
202
+ latents.float(),
203
+ noise_level=cur_noise_level,
204
+ )
205
+
206
+ # if latents.dtype != latents_dtype:
207
+ # latents = latents.to(latents_dtype)
208
+
209
+ if i >= random_timestep and i < random_timestep + train_num_steps:
210
+ all_latents.append(latents)
211
+ all_log_probs.append(log_prob)
212
+ all_timesteps.append(t.repeat(len(latents)))
213
+ # import pdb; pdb.set_trace()
214
+
215
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
216
+ progress_bar.update()
217
+
218
+
219
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
220
+ latents = latents.to(dtype=self.vae.dtype)
221
+ image = self.vae.decode(latents, return_dict=False)[0]
222
+ reconstructed_image = self.image_processor.postprocess(image, output_type="pil")
223
+ # reconstructed_image[0].save("0.png")
224
+ # import pdb; pdb.set_trace()
225
+ image = self.image_processor.postprocess(image, output_type=output_type)
226
+
227
+ # Offload all models
228
+ self.maybe_free_model_hooks()
229
+ return image, all_latents, all_log_probs, all_timesteps
230
+
231
+
232
+
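The noise schedule implemented by the branches in the loop above can be summarized as follows: SDE noise (`noise_level`) is injected only inside a window of `train_num_steps` steps starting at `random_timestep`, every step outside that window runs with `noise_level = 0` (a deterministic update), and the latents are fanned out into `mini_num_image_per_prompt` copies when the window opens. A condensed restatement (illustrative, assuming `train_num_steps >= 1`) is below.

def noise_level_for_step(i, random_timestep, train_num_steps, noise_level):
    # stochastic only inside [random_timestep, random_timestep + train_num_steps)
    in_window = random_timestep <= i < random_timestep + train_num_steps
    return noise_level if in_window else 0.0

assert noise_level_for_step(2, 3, 2, 0.7) == 0.0   # before the window: deterministic
assert noise_level_for_step(3, 3, 2, 0.7) == 0.7   # window start: stochastic
assert noise_level_for_step(4, 3, 2, 0.7) == 0.7   # inside the window
assert noise_level_for_step(5, 3, 2, 0.7) == 0.0   # after the window: deterministic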
233
+ @torch.no_grad()
234
+ def pipeline_with_logprob_new(
235
+ self,
236
+ prompt: Union[str, List[str]] = None,
237
+ prompt_2: Optional[Union[str, List[str]]] = None,
238
+ prompt_3: Optional[Union[str, List[str]]] = None,
239
+ height: Optional[int] = None,
240
+ width: Optional[int] = None,
241
+ num_inference_steps: int = 28,
242
+ mini_num_image_per_prompt: int = 1,
243
+ sigmas: Optional[List[float]] = None,
244
+ guidance_scale: float = 7.0,
245
+ negative_prompt: Optional[Union[str, List[str]]] = None,
246
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
247
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
248
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
249
+ latents: Optional[torch.FloatTensor] = None,
250
+ prompt_embeds: Optional[torch.FloatTensor] = None,
251
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
252
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
253
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
254
+ output_type: Optional[str] = "pil",
255
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
256
+ clip_skip: Optional[int] = None,
257
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
258
+ max_sequence_length: int = 256,
259
+ skip_layer_guidance_scale: float = 2.8,
260
+ noise_level: float = 0.7,
261
+ train_num_steps: int = 1,
262
+ process_index: int = 0,
263
+ sample_num_steps: int = 10,
264
+ random_timestep: Optional[int] = None,
265
+ ):
266
+ height = height or self.default_sample_size * self.vae_scale_factor
267
+ width = width or self.default_sample_size * self.vae_scale_factor
268
+ # import pdb; pdb.set_trace()
269
+
270
+ # 1. Check inputs. Raise error if not correct
271
+ self.check_inputs(
272
+ prompt,
273
+ prompt_2,
274
+ prompt_3,
275
+ height,
276
+ width,
277
+ negative_prompt=negative_prompt,
278
+ negative_prompt_2=negative_prompt_2,
279
+ negative_prompt_3=negative_prompt_3,
280
+ prompt_embeds=prompt_embeds,
281
+ negative_prompt_embeds=negative_prompt_embeds,
282
+ pooled_prompt_embeds=pooled_prompt_embeds,
283
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
284
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
285
+ max_sequence_length=max_sequence_length,
286
+ )
287
+ # import pdb; pdb.set_trace()
288
+
289
+ self._guidance_scale = guidance_scale
290
+ self._skip_layer_guidance_scale = skip_layer_guidance_scale
291
+ self._clip_skip = clip_skip
292
+ self._joint_attention_kwargs = joint_attention_kwargs
293
+ self._interrupt = False
294
+
295
+ # 2. Define call parameters
296
+ if prompt is not None and isinstance(prompt, str):
297
+ batch_size = 1
298
+ elif prompt is not None and isinstance(prompt, list):
299
+ batch_size = len(prompt)
300
+ else:
301
+ batch_size = prompt_embeds.shape[0]
302
+
303
+ device = self._execution_device
304
+
305
+ lora_scale = (
306
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
307
+ )
308
+ # import pdb; pdb.set_trace()
309
+ (
310
+ prompt_embeds,
311
+ negative_prompt_embeds,
312
+ pooled_prompt_embeds,
313
+ negative_pooled_prompt_embeds,
314
+ ) = self.encode_prompt(
315
+ prompt=prompt,
316
+ prompt_2=prompt_2,
317
+ prompt_3=prompt_3,
318
+ negative_prompt=negative_prompt,
319
+ negative_prompt_2=negative_prompt_2,
320
+ negative_prompt_3=negative_prompt_3,
321
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
322
+ prompt_embeds=prompt_embeds,
323
+ negative_prompt_embeds=negative_prompt_embeds,
324
+ pooled_prompt_embeds=pooled_prompt_embeds,
325
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
326
+ device=device,
327
+ clip_skip=self.clip_skip,
328
+ max_sequence_length=max_sequence_length,
329
+ lora_scale=lora_scale,
330
+ )
331
+ # import pdb; pdb.set_trace()
332
+
333
+ # 4. Prepare latent variables
334
+ num_channels_latents = self.transformer.config.in_channels
335
+ latents = self.prepare_latents(
336
+ batch_size,
337
+ num_channels_latents,
338
+ height,
339
+ width,
340
+ prompt_embeds.dtype,
341
+ device,
342
+ generator,
343
+ latents,
344
+ )
345
+ # import pdb; pdb.set_trace()
346
+ # latents = latents.to(prompt_embeds.dtype)
347
+
348
+ # 5. Prepare timesteps
349
+ scheduler_kwargs = {}
350
+ timesteps, num_inference_steps = retrieve_timesteps(
351
+ self.scheduler,
352
+ num_inference_steps,
353
+ device,
354
+ sigmas=sigmas,
355
+ **scheduler_kwargs,
356
+ )
357
+ # timesteps = timesteps.to(prompt_embeds.dtype)
358
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
359
+ self._num_timesteps = len(timesteps)
360
+
361
+ random.seed(process_index)
362
+ if random_timestep is None:
363
+ random_timestep = random.randint(0, sample_num_steps//2)
364
+
365
+
366
+ # 6. Prepare image embeddings
367
+ all_latents = []
368
+ all_log_probs = []
369
+ all_timesteps = []
370
+ # import pdb; pdb.set_trace()
371
+
372
+ if self.do_classifier_free_guidance:
373
+ tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
374
+ tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
375
+ # 7. Denoising loop
376
+ # import pdb; pdb.set_trace()
377
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
378
+ # import pdb; pdb.set_trace()
379
+ for i, t in enumerate(timesteps):
380
+ if i < random_timestep:
381
+ cur_noise_level = 0
382
+ elif i == random_timestep:
383
+ cur_noise_level= noise_level
384
+ # 将latents repeat mini_num_image_per_prompt次
385
+ latents = latents.repeat(mini_num_image_per_prompt, 1, 1, 1)
386
+ prompt_embeds = prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
387
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
388
+ negative_prompt_embeds = negative_prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
389
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
390
+ if self.do_classifier_free_guidance:
391
+ tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
392
+ tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
393
+ all_latents.append(latents)
394
+ elif i > random_timestep and i < random_timestep + train_num_steps:
395
+ cur_noise_level = noise_level
396
+ else:
397
+ cur_noise_level= 0
398
+ # expand the latents if we are doing classifier free guidance
399
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
400
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
401
+ timestep = t.expand(latent_model_input.shape[0])
402
+ # import pdb; pdb.set_trace()
403
+ # noise_pred = self.transformer( hidden_states=latent_model_input, timestep=timestep, encoder_hidden_states=tem_prompt_embeds,pooled_projections=tem_pooled_prompt_embeds, joint_attention_kwargs=self.joint_attention_kwargs,return_dict=False, )[0]
404
+ noise_pred = self.transformer(
405
+ hidden_states=latent_model_input,
406
+ timestep=timestep,
407
+ encoder_hidden_states=tem_prompt_embeds,
408
+ pooled_projections=tem_pooled_prompt_embeds,
409
+ joint_attention_kwargs=self.joint_attention_kwargs,
410
+ return_dict=False,
411
+ )[0]
412
+ # noise_pred = noise_pred.to(prompt_embeds.dtype)
413
+ # perform guidance
414
+ if self.do_classifier_free_guidance:
415
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
416
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
417
+
418
+ latents_dtype = latents.dtype
419
+
420
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
421
+ self.scheduler,
422
+ noise_pred.float(),
423
+ t.unsqueeze(0),
424
+ latents.float(),
425
+ noise_level=cur_noise_level,
426
+ )
427
+
428
+ if latents.dtype != latents_dtype:
429
+ latents = latents.to(latents_dtype)
430
+
431
+ if i >= random_timestep and i < random_timestep + train_num_steps:
432
+ all_latents.append(latents)
433
+ all_log_probs.append(log_prob)
434
+ all_timesteps.append(t.repeat(len(latents)))
435
+ # import pdb; pdb.set_trace()
436
+
437
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
438
+ progress_bar.update()
439
+
440
+
441
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
442
+ latents = latents.to(dtype=self.vae.dtype)
443
+ image = self.vae.decode(latents, return_dict=False)[0]
444
+ image = self.image_processor.postprocess(image, output_type=output_type)
445
+
446
+ # Offload all models
447
+ self.maybe_free_model_hooks()
448
+ return image, all_latents, all_log_probs, all_timesteps
449
+
450
+
451
+
452
+
453
+ @torch.no_grad()
454
+ def pipeline_with_logprob_random(
455
+ self,
456
+ prompt: Union[str, List[str]] = None,
457
+ prompt_2: Optional[Union[str, List[str]]] = None,
458
+ prompt_3: Optional[Union[str, List[str]]] = None,
459
+ height: Optional[int] = None,
460
+ width: Optional[int] = None,
461
+ num_inference_steps: int = 28,
462
+ mini_num_image_per_prompt: int = 1,
463
+ sigmas: Optional[List[float]] = None,
464
+ guidance_scale: float = 7.0,
465
+ negative_prompt: Optional[Union[str, List[str]]] = None,
466
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
467
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
468
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
469
+ latents: Optional[torch.FloatTensor] = None,
470
+ prompt_embeds: Optional[torch.FloatTensor] = None,
471
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
472
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
473
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
474
+ output_type: Optional[str] = "pil",
475
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
476
+ clip_skip: Optional[int] = None,
477
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
478
+ max_sequence_length: int = 256,
479
+ skip_layer_guidance_scale: float = 2.8,
480
+ noise_level: float = 0.7,
481
+ train_num_steps: int = 1,
482
+ process_index: int = 0,
483
+ sample_num_steps: int = 10,
484
+ random_timestep: Optional[int] = None,
485
+ ):
486
+ height = height or self.default_sample_size * self.vae_scale_factor
487
+ width = width or self.default_sample_size * self.vae_scale_factor
488
+ # import pdb; pdb.set_trace()
489
+
490
+ # 1. Check inputs. Raise error if not correct
491
+ self.check_inputs(
492
+ prompt,
493
+ prompt_2,
494
+ prompt_3,
495
+ height,
496
+ width,
497
+ negative_prompt=negative_prompt,
498
+ negative_prompt_2=negative_prompt_2,
499
+ negative_prompt_3=negative_prompt_3,
500
+ prompt_embeds=prompt_embeds,
501
+ negative_prompt_embeds=negative_prompt_embeds,
502
+ pooled_prompt_embeds=pooled_prompt_embeds,
503
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
504
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
505
+ max_sequence_length=max_sequence_length,
506
+ )
507
+ # import pdb; pdb.set_trace()
508
+
509
+ self._guidance_scale = guidance_scale
510
+ self._skip_layer_guidance_scale = skip_layer_guidance_scale
511
+ self._clip_skip = clip_skip
512
+ self._joint_attention_kwargs = joint_attention_kwargs
513
+ self._interrupt = False
514
+
515
+ # 2. Define call parameters
516
+ if prompt is not None and isinstance(prompt, str):
517
+ batch_size = 1
518
+ elif prompt is not None and isinstance(prompt, list):
519
+ batch_size = len(prompt)
520
+ else:
521
+ batch_size = prompt_embeds.shape[0]
522
+
523
+ device = self._execution_device
524
+
525
+ lora_scale = (
526
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
527
+ )
528
+ # import pdb; pdb.set_trace()
529
+ (
530
+ prompt_embeds,
531
+ negative_prompt_embeds,
532
+ pooled_prompt_embeds,
533
+ negative_pooled_prompt_embeds,
534
+ ) = self.encode_prompt(
535
+ prompt=prompt,
536
+ prompt_2=prompt_2,
537
+ prompt_3=prompt_3,
538
+ negative_prompt=negative_prompt,
539
+ negative_prompt_2=negative_prompt_2,
540
+ negative_prompt_3=negative_prompt_3,
541
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
542
+ prompt_embeds=prompt_embeds,
543
+ negative_prompt_embeds=negative_prompt_embeds,
544
+ pooled_prompt_embeds=pooled_prompt_embeds,
545
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
546
+ device=device,
547
+ clip_skip=self.clip_skip,
548
+ max_sequence_length=max_sequence_length,
549
+ lora_scale=lora_scale,
550
+ )
551
+ prompt_embeds = prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
552
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
553
+ negative_prompt_embeds = negative_prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
554
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
555
+ # import pdb; pdb.set_trace()
556
+
557
+ # 4. Prepare latent variables
558
+ num_channels_latents = self.transformer.config.in_channels
559
+ latents = self.prepare_latents(
560
+ prompt_embeds.shape[0],
561
+ num_channels_latents,
562
+ height,
563
+ width,
564
+ prompt_embeds.dtype,
565
+ device,
566
+ generator,
567
+ latents,
568
+ )
569
+ # import pdb; pdb.set_trace()
570
+ # latents = latents.to(prompt_embeds.dtype)
571
+
572
+ # 5. Prepare timesteps
573
+ scheduler_kwargs = {}
574
+ timesteps, num_inference_steps = retrieve_timesteps(
575
+ self.scheduler,
576
+ num_inference_steps,
577
+ device,
578
+ sigmas=sigmas,
579
+ **scheduler_kwargs,
580
+ )
581
+ # timesteps = timesteps.to(prompt_embeds.dtype)
582
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
583
+ self._num_timesteps = len(timesteps)
584
+
585
+ random.seed(process_index)
586
+ if random_timestep is None:
587
+ random_timestep = random.randint(0, sample_num_steps//2)
588
+
589
+
590
+ # 6. Prepare image embeddings
591
+ all_latents = []
592
+ all_log_probs = []
593
+ all_timesteps = []
594
+ if self.do_classifier_free_guidance:
595
+ tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
596
+ tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
597
+
601
+ # 7. Denoising loop
602
+ # import pdb; pdb.set_trace()
603
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
604
+ # import pdb; pdb.set_trace()
605
+ for i, t in enumerate(timesteps):
606
+ if i < random_timestep:
607
+ cur_noise_level = 0
608
+ elif i == random_timestep:
609
+ cur_noise_level= noise_level
610
+ # repeat the latents mini_num_image_per_prompt times
611
+ # latents = latents.repeat(mini_num_image_per_prompt, 1, 1, 1)
612
+ # prompt_embeds = prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
613
+ # pooled_prompt_embeds = pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
614
+ # negative_prompt_embeds = negative_prompt_embeds.repeat(mini_num_image_per_prompt, 1, 1)
615
+ # negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(mini_num_image_per_prompt, 1)
616
+ # if self.do_classifier_free_guidance:
617
+ # tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
618
+ # tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
619
+ all_latents.append(latents)
620
+ elif i > random_timestep and i < random_timestep + train_num_steps:
621
+ cur_noise_level = noise_level
622
+ else:
623
+ cur_noise_level= 0
624
+ # expand the latents if we are doing classifier free guidance
625
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
626
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
627
+ timestep = t.expand(latent_model_input.shape[0])
628
+ # import pdb; pdb.set_trace()
629
+ # noise_pred = self.transformer( hidden_states=latent_model_input, timestep=timestep, encoder_hidden_states=tem_prompt_embeds,pooled_projections=tem_pooled_prompt_embeds, joint_attention_kwargs=self.joint_attention_kwargs,return_dict=False, )[0]
630
+ noise_pred = self.transformer(
631
+ hidden_states=latent_model_input,
632
+ timestep=timestep,
633
+ encoder_hidden_states=tem_prompt_embeds,
634
+ pooled_projections=tem_pooled_prompt_embeds,
635
+ joint_attention_kwargs=self.joint_attention_kwargs,
636
+ return_dict=False,
637
+ )[0]
638
+ # noise_pred = noise_pred.to(prompt_embeds.dtype)
639
+ # perform guidance
640
+ if self.do_classifier_free_guidance:
641
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
642
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
643
+
644
+ latents_dtype = latents.dtype
645
+
646
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
647
+ self.scheduler,
648
+ noise_pred.float(),
649
+ t.unsqueeze(0),
650
+ latents.float(),
651
+ noise_level=cur_noise_level,
652
+ )
653
+
654
+ if latents.dtype != latents_dtype:
655
+ latents = latents.to(latents_dtype)
656
+
657
+ if i >= random_timestep and i < random_timestep + train_num_steps:
658
+ all_latents.append(latents)
659
+ all_log_probs.append(log_prob)
660
+ all_timesteps.append(t.repeat(len(latents)))
661
+ # import pdb; pdb.set_trace()
662
+
663
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
664
+ progress_bar.update()
665
+
666
+
667
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
668
+ latents = latents.to(dtype=self.vae.dtype)
669
+ image = self.vae.decode(latents, return_dict=False)[0]
670
+ image = self.image_processor.postprocess(image, output_type=output_type)
671
+
672
+ # Offload all models
673
+ self.maybe_free_model_hooks()
674
+ return image, all_latents, all_log_probs, all_timesteps
675
+
676
+
677
+
678
+ def move_scheduler_to_device(scheduler, device="cuda"):
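+ # Move every tensor attribute of the scheduler (e.g. sigmas, timesteps) onto the target device in place.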
679
+ for attr_name in dir(scheduler):
680
+ attr = getattr(scheduler, attr_name)
681
+ if isinstance(attr, torch.Tensor):
682
+ setattr(scheduler, attr_name, attr.to(device))
683
+ return scheduler
684
+
685
+
686
+ def image_to_latent(pipe, images: Union[Image.Image, List[Image.Image]], device="cuda"):
687
+ # Normalize the input to a list
688
+ if isinstance(images, Image.Image):
689
+ images = [images]
690
+
691
+ preprocess = transforms.Compose([
692
+ transforms.Resize((512, 512)),
693
+ transforms.ToTensor(), # to [0, 1]
694
+ transforms.Normalize([0.5], [0.5]) # map to [-1, 1]
695
+ ])
696
+
697
+ # Preprocess as a batch
698
+ img_tensors = [preprocess(img) for img in images] # list of [3,512,512]
699
+ img_tensor = torch.stack(img_tensors, dim=0).to(device, dtype=torch.float32) # [B,3,512,512]
700
+ # import pdb; pdb.set_trace()
701
+
702
+ # Encode through the VAE
703
+ latent = pipe.vae.encode(img_tensor).latent_dist.sample()
704
+ latent = latent * pipe.vae.config.scaling_factor
705
+ return latent.to(torch.bfloat16) # [B, C, 64, 64] for a 512x512 input (spatially downsampled 8x)
706
+
707
+
708
+ def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
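+ # Look up the sigma that corresponds to each timestep in the scheduler's schedule and reshape it to n_dim for broadcasting against the latents.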
709
+ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
710
+ schedule_timesteps = noise_scheduler.timesteps.to(device)
711
+ timesteps = timesteps.to(device)
712
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
713
+
714
+ sigma = sigmas[step_indices].flatten()
715
+ while len(sigma.shape) < n_dim:
716
+ sigma = sigma.unsqueeze(-1)
717
+ return sigma
718
+
719
+
720
+
721
+ @torch.no_grad()
722
+ def flux_to_sd3_denoise(
723
+ self,
724
+ prompt: Union[str, List[str]] = None,
725
+ prompt_2: Optional[Union[str, List[str]]] = None,
726
+ prompt_3: Optional[Union[str, List[str]]] = None,
727
+ flux_images=None,
728
+ device="cuda",
729
+ output_type: Optional[str] = "pil",
730
+ num_inference_steps: int = 20,
731
+ guidance_scale: float = 7.0,
732
+ negative_prompt: Optional[Union[str, List[str]]] = None,
733
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
734
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
735
+ prompt_embeds: Optional[torch.FloatTensor] = None,
736
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
737
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
738
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
739
+ max_sequence_length: int = 256,
740
+ noise_level: float = 0.7,
741
+ random_timestep: Optional[int] = None,
742
+ noise_timestep_ratio: float = 0.4,
743
+ clip_skip: Optional[int] = None,
744
+ ):
745
+ """
746
+ Take images generated by Flux, encode them to latents, add noise, then run multi-step denoising with SD3.
747
+ Outputs match pipeline_with_logprob: image, all_latents, all_log_probs, all_timesteps.
748
+ """
749
+ # 1. Convert the Flux images to latents
750
+ flux_latent = image_to_latent(self, flux_images, device)
751
+ self._guidance_scale = guidance_scale
752
+ self._clip_skip = clip_skip
753
+
754
+ # 2. Prepare the scheduler
755
+ noise_scheduler = self.scheduler
756
+ noise_scheduler.set_timesteps(num_inference_steps)
757
+ timesteps = noise_scheduler.timesteps.to(device)
758
+
759
+ # target_idx = torch.tensor([int(noise_timestep_ratio * (len(timesteps) - 1))], device=device)
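+ # NOTE: noise_timestep_ratio is used directly as an integer index into timesteps here (the ratio-based indexing above is commented out).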
760
+ target_idx = torch.tensor([noise_timestep_ratio], device=device)
761
+ t = timesteps[target_idx].to(device)
762
+
763
+ noise = torch.randn_like(flux_latent)
764
+ sigmas = get_sigmas(noise_scheduler, t, device, n_dim=flux_latent.ndim, dtype=flux_latent.dtype)
765
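+ # Flow-matching forward process: interpolate between the clean Flux latent and Gaussian noise at level sigma.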
+ latents = (1.0 - sigmas) * flux_latent + sigmas * noise
766
+ num_channels_latents = self.transformer.config.in_channels
767
+ if prompt is not None and isinstance(prompt, str):
768
+ batch_size = 1
769
+ elif prompt is not None and isinstance(prompt, list):
770
+ batch_size = len(prompt)
771
+ else:
772
+ batch_size = prompt_embeds.shape[0]
773
+
774
+ # latents = self.prepare_latents(
775
+ # batch_size,
776
+ # num_channels_latents,
777
+ # 512,
778
+ # 512,
779
+ # prompt_embeds.dtype,
780
+ # device,
781
+ # None,
782
+ # None,
783
+ # )
784
+
785
+
786
+
787
+ # import pdb; pdb.set_trace()
788
+
789
+ # noisy_latent_vis = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
790
+ # noisy_latent_vis = noisy_latent_vis.to(dtype=self.vae.dtype)
791
+
792
+ # noisy_image = self.vae.decode(noisy_latent_vis, return_dict=False)[0]
793
+ # noisy_image = self.image_processor.postprocess(noisy_image, output_type="pil")[0]
794
+
795
+ # Save locally (debug)
796
+ # noisy_image.save("noisy_image.png")
797
+ # import pdb; pdb.set_trace()
798
+
799
+ # 4. Encode prompts (mirrors the pipeline_with_logprob handling)
800
+ # lora_scale = (
801
+ # self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
802
+ # )
803
+ lora_scale = None
804
+ (
805
+ prompt_embeds,
806
+ negative_prompt_embeds,
807
+ pooled_prompt_embeds,
808
+ negative_pooled_prompt_embeds,
809
+ ) = self.encode_prompt(
810
+ prompt=prompt,
811
+ prompt_2=prompt_2,
812
+ prompt_3=prompt_3,
813
+ negative_prompt=negative_prompt,
814
+ negative_prompt_2=negative_prompt_2,
815
+ negative_prompt_3=negative_prompt_3,
816
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
817
+ prompt_embeds=prompt_embeds,
818
+ negative_prompt_embeds=negative_prompt_embeds,
819
+ pooled_prompt_embeds=pooled_prompt_embeds,
820
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
821
+ device=device,
822
+ clip_skip=self.clip_skip,
823
+ max_sequence_length=max_sequence_length,
824
+ lora_scale=lora_scale,
825
+ )
826
+ # import pdb; pdb.set_trace()
827
+
828
+
829
+ prompt_embeds = prompt_embeds.repeat(latents.shape[0], 1, 1)
830
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(latents.shape[0], 1)
831
+ negative_prompt_embeds = negative_prompt_embeds.repeat(latents.shape[0], 1, 1)
832
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(latents.shape[0], 1)
833
+
834
+
835
+ if self.do_classifier_free_guidance:
836
+ tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
837
+ tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
838
+ else:
839
+ tem_prompt_embeds = prompt_embeds
840
+ tem_pooled_prompt_embeds = pooled_prompt_embeds
841
+
842
+ # 5. Denoise starting from the current timestep t
843
+ noise_scheduler.set_timesteps(num_inference_steps)
844
+ timesteps = noise_scheduler.timesteps.to(device)
845
+ start_idx = (timesteps >= t[0]).nonzero()[-1].item()
846
+ timesteps = timesteps[start_idx:]
847
+
848
+ all_latents, all_log_probs, all_timesteps = [], [], []
849
+ noise_scheduler = move_scheduler_to_device(noise_scheduler, device)
850
+
851
+ for index, t_cur in enumerate(timesteps):
852
+ # import pdb; pdb.set_trace()
853
+ if index==0:
854
+ all_latents.append(latents)
855
+
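+ # The first two denoising steps form the trainable window: their latents, log-probs, and timesteps are recorded below.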
856
+ if index<2:
857
+ cur_noise_level = noise_level
858
+ else:
859
+ cur_noise_level = 0.0
860
+
861
+ latent_model_input = (
862
+ torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
863
+ )
864
+ t_input = t_cur.expand(latent_model_input.shape[0]).to(device)
865
+
866
+ latents_dtype = latents.dtype
867
+ model_pred = self.transformer(
868
+ hidden_states=latent_model_input,
869
+ timestep=t_input,
870
+ encoder_hidden_states=tem_prompt_embeds,
871
+ pooled_projections=tem_pooled_prompt_embeds,
872
+ return_dict=False,
873
+ )[0]
874
+
875
+ if self.do_classifier_free_guidance:
876
+ noise_pred_uncond, noise_pred_text = model_pred.chunk(2)
877
+ model_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
878
+ # import pdb; pdb.set_trace()
879
+
880
+
881
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
882
+ noise_scheduler,
883
+ model_pred.float(),
884
+ t_cur.repeat(len(latents)),
885
+ latents.float(),
886
+ noise_level=cur_noise_level,  # use the per-step noise level computed above (cur_noise_level was otherwise unused)
887
+ )
888
+ if latents.dtype != latents_dtype:
889
+ latents = latents.to(latents_dtype)
890
+
891
+ if index>=0 and index<2:
892
+ # if index<2:
893
+ # print(model_pred)
894
+ all_latents.append(latents)
895
+ all_log_probs.append(log_prob)
896
+ all_timesteps.append(t_cur.repeat(len(latents)))
897
+ # import pdb; pdb.set_trace()
898
+
899
+ # 6. Final decode
900
+ denoised_latent = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
901
+ denoised_latent = denoised_latent.to(dtype=self.vae.dtype)
902
+
903
+ image = self.vae.decode(denoised_latent, return_dict=False)[0]
904
+ # reconstructd_image = self.image_processor.postprocess(image, output_type="pil")[0]
905
+ image = self.image_processor.postprocess(image, output_type=output_type)
906
+
907
+ return image, all_latents, all_log_probs, all_timesteps
908
+
909
+
910
+
911
+
912
+
913
+ @torch.no_grad()
914
+ def flux_to_sd3_denoise_random(
915
+ self,
916
+ prompt: Union[str, List[str]] = None,
917
+ prompt_2: Optional[Union[str, List[str]]] = None,
918
+ prompt_3: Optional[Union[str, List[str]]] = None,
919
+ flux_images=None,
920
+ device="cuda",
921
+ output_type: Optional[str] = "pil",
922
+ num_inference_steps: int = 20,
923
+ guidance_scale: float = 7.0,
924
+ negative_prompt: Optional[Union[str, List[str]]] = None,
925
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
926
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
927
+ prompt_embeds: Optional[torch.FloatTensor] = None,
928
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
929
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
930
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
931
+ max_sequence_length: int = 256,
932
+ noise_level: float = 0.7,
933
+ random_timestep: Optional[int] = None,
934
+ noise_timestep_ratio: float = 0.4,
935
+ clip_skip: Optional[int] = None,
936
+ ):
937
+ """
938
+ Take images generated by Flux, encode them to latents, add noise at a randomly chosen timestep, then run multi-step denoising with SD3.
939
+ Outputs match pipeline_with_logprob: image, all_latents, all_log_probs, all_timesteps.
940
+ """
941
+ # 1. Convert the Flux images to latents
942
+ flux_latent = image_to_latent(self, flux_images, device)
943
+ self._guidance_scale = guidance_scale
944
+ self._clip_skip = clip_skip
945
+
946
+ # 2. Prepare the scheduler
947
+ noise_scheduler = self.scheduler
948
+ noise_scheduler.set_timesteps(num_inference_steps)
949
+ timesteps = noise_scheduler.timesteps.to(device)
950
+
951
+ # target_idx = torch.tensor([int(noise_timestep_ratio * (len(timesteps) - 1))], device=device)
952
+ # t = timesteps[target_idx].to(device)
953
+
954
+ # noise = torch.randn_like(flux_latent)
955
+ # sigmas = get_sigmas(noise_scheduler, t, device, n_dim=flux_latent.ndim, dtype=flux_latent.dtype)
956
+ # latents = (1.0 - sigmas) * flux_latent + sigmas * noise
957
+
958
+ target_idx = torch.tensor([random.randint(5, 10)], device=device)
959
+ t = timesteps[target_idx].to(device)
960
+ # Sample standard Gaussian noise
961
+ noise = torch.randn_like(flux_latent)
962
+ # Look up the corresponding sigma
963
+ sigmas = get_sigmas(noise_scheduler, t, device, n_dim=flux_latent.ndim, dtype=flux_latent.dtype)
964
+ # Add noise to the latent
965
+ latents = (1.0 - sigmas) * flux_latent + sigmas * noise
966
+
967
+ # import pdb; pdb.set_trace()
968
+
969
+ # noisy_latent_vis = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
970
+ # noisy_latent_vis = noisy_latent_vis.to(dtype=self.vae.dtype)
971
+
972
+ # noisy_image = self.vae.decode(noisy_latent_vis, return_dict=False)[0]
973
+ # noisy_image = self.image_processor.postprocess(noisy_image, output_type="pil")[0]
974
+
975
+ # Save locally (debug)
976
+ # noisy_image.save("noisy_image.png")
977
+ # import pdb; pdb.set_trace()
978
+
979
+ # 4. Encode prompts (mirrors the pipeline_with_logprob handling)
980
+ # lora_scale = (
981
+ # self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
982
+ # )
983
+ lora_scale = None
984
+ (
985
+ prompt_embeds,
986
+ negative_prompt_embeds,
987
+ pooled_prompt_embeds,
988
+ negative_pooled_prompt_embeds,
989
+ ) = self.encode_prompt(
990
+ prompt=prompt,
991
+ prompt_2=prompt_2,
992
+ prompt_3=prompt_3,
993
+ negative_prompt=negative_prompt,
994
+ negative_prompt_2=negative_prompt_2,
995
+ negative_prompt_3=negative_prompt_3,
996
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
997
+ prompt_embeds=prompt_embeds,
998
+ negative_prompt_embeds=negative_prompt_embeds,
999
+ pooled_prompt_embeds=pooled_prompt_embeds,
1000
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1001
+ device=device,
1002
+ clip_skip=self.clip_skip,
1003
+ max_sequence_length=max_sequence_length,
1004
+ lora_scale=lora_scale,
1005
+ )
1006
+ # import pdb; pdb.set_trace()
1007
+
1008
+
1009
+ prompt_embeds = prompt_embeds.repeat(latents.shape[0], 1, 1)
1010
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(latents.shape[0], 1)
1011
+ negative_prompt_embeds = negative_prompt_embeds.repeat(latents.shape[0], 1, 1)
1012
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(latents.shape[0], 1)
1013
+
1014
+
1015
+ if self.do_classifier_free_guidance:
1016
+ tem_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
1017
+ tem_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
1018
+ else:
1019
+ tem_prompt_embeds = prompt_embeds
1020
+ tem_pooled_prompt_embeds = pooled_prompt_embeds
1021
+
1022
+ # 5. Denoise starting from the current timestep t
1023
+ noise_scheduler.set_timesteps(num_inference_steps)
1024
+ timesteps = noise_scheduler.timesteps.to(device)
1025
+ start_idx = (timesteps >= t[0]).nonzero()[-1].item()
1026
+ timesteps = timesteps[start_idx:]
1027
+
1028
+ all_latents, all_log_probs, all_timesteps = [], [], []
1029
+ noise_scheduler = move_scheduler_to_device(noise_scheduler, device)
1030
+
1031
+ for index, t_cur in enumerate(timesteps):
1032
+ if index==0:
1033
+ all_latents.append(latents)
1034
+
1035
+ latent_model_input = (
1036
+ torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
1037
+ )
1038
+ t_input = t_cur.expand(latent_model_input.shape[0]).to(device)
1039
+
1040
+ latents_dtype = latents.dtype
1041
+ model_pred = self.transformer(
1042
+ hidden_states=latent_model_input,
1043
+ timestep=t_input,
1044
+ encoder_hidden_states=tem_prompt_embeds,
1045
+ pooled_projections=tem_pooled_prompt_embeds,
1046
+ return_dict=False,
1047
+ )[0]
1048
+
1049
+ if self.do_classifier_free_guidance:
1050
+ noise_pred_uncond, noise_pred_text = model_pred.chunk(2)
1051
+ model_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1052
+ # import pdb; pdb.set_trace()
1053
+
1054
+
1055
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
1056
+ noise_scheduler,
1057
+ model_pred.float(),
1058
+ t_cur.repeat(len(latents)),
1059
+ latents.float(),
1060
+ noise_level=noise_level,
1061
+ )
1062
+ if latents.dtype != latents_dtype:
1063
+ latents = latents.to(latents_dtype)
1064
+
1065
+ # if index>=2 and index<4:
1066
+ if index<2:
1067
+ # print(model_pred)
1068
+ all_latents.append(latents)
1069
+ all_log_probs.append(log_prob)
1070
+ all_timesteps.append(t_cur.repeat(len(latents)))
1071
+ # import pdb; pdb.set_trace()
1072
+
1073
+ # 6. Final decode
1074
+ denoised_latent = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1075
+ denoised_latent = denoised_latent.to(dtype=self.vae.dtype)
1076
+
1077
+ image = self.vae.decode(denoised_latent, return_dict=False)[0]
1078
+ # reconstructd_image = self.image_processor.postprocess(image, output_type="pil")[0]
1079
+ image = self.image_processor.postprocess(image, output_type=output_type)
1080
+
1081
+ return image, all_latents, all_log_probs, all_timesteps
adv_grpo/diffusers_patch/sd3_sde_with_logprob.py ADDED
@@ -0,0 +1,139 @@
1
+ # Copied from https://github.com/kvablack/ddpo-pytorch/blob/main/ddpo_pytorch/diffusers_patch/ddim_with_logprob.py
2
+ # We adapt it from DDIM to flow matching.
3
+
4
+ import math
5
+ from typing import Optional, Union
6
+ import torch
7
+
8
+ from diffusers.utils.torch_utils import randn_tensor
9
+ from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
10
+
11
+
12
+
13
+ def sde_step_with_logprob(
14
+ self: FlowMatchEulerDiscreteScheduler,
15
+ model_output: torch.FloatTensor,
16
+ timestep: Union[float, torch.FloatTensor],
17
+ sample: torch.FloatTensor,
18
+ noise_level: float = 0.7,
19
+ prev_sample: Optional[torch.FloatTensor] = None,
20
+ generator: Optional[torch.Generator] = None,
21
+ ):
22
+ """
23
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
24
+ process from the learned model outputs (most often the predicted velocity).
25
+
26
+ Args:
27
+ model_output (`torch.FloatTensor`):
28
+ The direct output from learned flow model.
29
+ timestep (`float`):
30
+ The current discrete timestep in the diffusion chain.
31
+ sample (`torch.FloatTensor`):
32
+ A current instance of a sample created by the diffusion process.
33
+ generator (`torch.Generator`, *optional*):
34
+ A random number generator.
35
+ """
36
+ # bf16 can overflow here when computing prev_sample_mean, so we must convert all variables to fp32
37
+ model_output=model_output.float()
38
+ sample=sample.float()
39
+ if prev_sample is not None:
40
+ prev_sample=prev_sample.float()
41
+
42
+ step_index = [self.index_for_timestep(t) for t in timestep]
43
+ prev_step_index = [step+1 for step in step_index]
44
+ sigma = self.sigmas[step_index].view(-1, *([1] * (len(sample.shape) - 1)))
45
+ sigma_prev = self.sigmas[prev_step_index].view(-1, *([1] * (len(sample.shape) - 1)))
46
+ sigma_max = self.sigmas[1].item()
47
+ dt = sigma_prev - sigma
48
+
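+ # Diffusion coefficient of the reverse-time SDE, scaled by noise_level; with noise_level = 0 the update reduces to the deterministic Euler step.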
49
+ std_dev_t = torch.sqrt(sigma / (1 - torch.where(sigma == 1, sigma_max, sigma)))*noise_level
50
+ # import pdb; pdb.set_trace()
51
+
52
+ # our sde
53
+ prev_sample_mean = sample*(1+std_dev_t**2/(2*sigma)*dt)+model_output*(1+std_dev_t**2*(1-sigma)/(2*sigma))*dt
54
+
55
+ if prev_sample is None:
56
+ variance_noise = randn_tensor(
57
+ model_output.shape,
58
+ generator=generator,
59
+ device=model_output.device,
60
+ dtype=model_output.dtype,
61
+ )
62
+ prev_sample = prev_sample_mean + std_dev_t * torch.sqrt(-1*dt) * variance_noise
63
+
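+ # Gaussian log-density of prev_sample under N(prev_sample_mean, (std_dev_t * sqrt(-dt))^2).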
64
+ log_prob = (
65
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * ((std_dev_t * torch.sqrt(-1*dt))**2))
66
+ - torch.log(std_dev_t * torch.sqrt(-1*dt))
67
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
68
+ )
69
+
70
+ # mean along all but batch dimension
71
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
72
+
73
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t
74
+
75
+
76
+
77
+ def sde_step_with_logprob_new(
78
+ self: FlowMatchEulerDiscreteScheduler,
79
+ model_output: torch.FloatTensor,
80
+ timestep: Union[float, torch.FloatTensor],
81
+ sample: torch.FloatTensor,
82
+ noise_level: float = 0.7,
83
+ prev_sample: Optional[torch.FloatTensor] = None,
84
+ generator: Optional[torch.Generator] = None,
85
+ ):
86
+ """
87
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
88
+ process from the learned model outputs (most often the predicted velocity).
89
+
90
+ Args:
91
+ model_output (`torch.FloatTensor`):
92
+ The direct output from learned flow model.
93
+ timestep (`float`):
94
+ The current discrete timestep in the diffusion chain.
95
+ sample (`torch.FloatTensor`):
96
+ A current instance of a sample created by the diffusion process.
97
+ generator (`torch.Generator`, *optional*):
98
+ A random number generator.
99
+ """
100
+ # bf16 can overflow here when computing prev_sample_mean, so we must convert all variables to fp32
101
+ model_output=model_output.float()
102
+ sample=sample.float()
103
+ if prev_sample is not None:
104
+ prev_sample=prev_sample.float()
105
+
106
+ step_index = [self.index_for_timestep(t) for t in timestep]
107
+ prev_step_index = [step+1 for step in step_index]
108
+ sigma = self.sigmas[step_index].view(-1, *([1] * (len(sample.shape) - 1)))
109
+ sigma_prev = self.sigmas[prev_step_index].view(-1, *([1] * (len(sample.shape) - 1)))
110
+ sigma_max = self.sigmas[1].item()
111
+ dt = sigma_prev - sigma
112
+
113
+ # Flow-SDE
114
+ #std_dev_t = torch.sqrt(sigma / (1 - torch.where(sigma == 1, sigma_max, sigma)))*noise_level * torch.sqrt(-1*dt)
115
+ # prev_sample_mean = sample*(1+std_dev_t**2/(2*sigma)*dt)+model_output*(1+std_dev_t**2*(1-sigma)/(2*sigma))*dt
116
+
117
+ # Flow-CPS
118
+ std_dev_t = sigma_prev * math.sin(noise_level * math.pi / 2) # sigma_t in paper
119
+ pred_original_sample = sample - sigma * model_output # predicted x_0 in paper
120
+ noise_estimate = sample + model_output * (1 - sigma) # predicted x_1 in paper
121
+ prev_sample_mean = pred_original_sample * (1 - sigma_prev) + noise_estimate * torch.sqrt(sigma_prev**2 - std_dev_t**2)
122
+ # import pdb; pdb.set_trace()
123
+
124
+ if prev_sample is None:
125
+ variance_noise = randn_tensor(
126
+ model_output.shape,
127
+ generator=generator,
128
+ device=model_output.device,
129
+ dtype=model_output.dtype,
130
+ )
131
+ prev_sample = prev_sample_mean + std_dev_t * variance_noise
132
+
133
+ # remove all constants
134
+ log_prob = -((prev_sample.detach() - prev_sample_mean) ** 2)
135
+
136
+ # mean along all but batch dimension
137
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
138
+
139
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t
adv_grpo/diffusers_patch/train_dreambooth_lora_flux.py ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+
16
+ import torch
17
+
18
+
19
+ def _encode_prompt_with_t5(
20
+ text_encoder,
21
+ tokenizer,
22
+ max_sequence_length=512,
23
+ prompt=None,
24
+ num_images_per_prompt=1,
25
+ device=None,
26
+ text_input_ids=None,
27
+ ):
28
+ prompt = [prompt] if isinstance(prompt, str) else prompt
29
+ batch_size = len(prompt)
30
+
31
+ if tokenizer is not None:
32
+ text_inputs = tokenizer(
33
+ prompt,
34
+ padding="max_length",
35
+ max_length=max_sequence_length,
36
+ truncation=True,
37
+ return_length=False,
38
+ return_overflowing_tokens=False,
39
+ return_tensors="pt",
40
+ )
41
+ text_input_ids = text_inputs.input_ids
42
+ else:
43
+ if text_input_ids is None:
44
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
45
+
46
+ prompt_embeds = text_encoder(text_input_ids.to(device))[0]
47
+
48
+ if hasattr(text_encoder, "module"):
49
+ dtype = text_encoder.module.dtype
50
+ else:
51
+ dtype = text_encoder.dtype
52
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
53
+
54
+ _, seq_len, _ = prompt_embeds.shape
55
+
56
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
57
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
58
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
59
+
60
+ return prompt_embeds
61
+
62
+
63
+ def _encode_prompt_with_clip(
64
+ text_encoder,
65
+ tokenizer,
66
+ prompt: str,
67
+ device=None,
68
+ text_input_ids=None,
69
+ num_images_per_prompt: int = 1,
70
+ ):
71
+ prompt = [prompt] if isinstance(prompt, str) else prompt
72
+ batch_size = len(prompt)
73
+
74
+ if tokenizer is not None:
75
+ text_inputs = tokenizer(
76
+ prompt,
77
+ padding="max_length",
78
+ max_length=77,
79
+ truncation=True,
80
+ return_overflowing_tokens=False,
81
+ return_length=False,
82
+ return_tensors="pt",
83
+ )
84
+
85
+ text_input_ids = text_inputs.input_ids
86
+ else:
87
+ if text_input_ids is None:
88
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
89
+
90
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
91
+
92
+ if hasattr(text_encoder, "module"):
93
+ dtype = text_encoder.module.dtype
94
+ else:
95
+ dtype = text_encoder.dtype
96
+ # Use pooled output of CLIPTextModel
97
+ prompt_embeds = prompt_embeds.pooler_output
98
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
99
+
100
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
101
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
102
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
103
+
104
+ return prompt_embeds
105
+
106
+
107
+ def encode_prompt(
108
+ text_encoders,
109
+ tokenizers,
110
+ prompt: str,
111
+ max_sequence_length,
112
+ device=None,
113
+ num_images_per_prompt: int = 1,
114
+ text_input_ids_list=None,
115
+ ):
116
+ prompt = [prompt] if isinstance(prompt, str) else prompt
117
+
118
+ if hasattr(text_encoders[0], "module"):
119
+ dtype = text_encoders[0].module.dtype
120
+ else:
121
+ dtype = text_encoders[0].dtype
122
+
123
+ pooled_prompt_embeds = _encode_prompt_with_clip(
124
+ text_encoder=text_encoders[0],
125
+ tokenizer=tokenizers[0],
126
+ prompt=prompt,
127
+ device=device if device is not None else text_encoders[0].device,
128
+ num_images_per_prompt=num_images_per_prompt,
129
+ text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
130
+ )
131
+
132
+ prompt_embeds = _encode_prompt_with_t5(
133
+ text_encoder=text_encoders[1],
134
+ tokenizer=tokenizers[1],
135
+ max_sequence_length=max_sequence_length,
136
+ prompt=prompt,
137
+ num_images_per_prompt=num_images_per_prompt,
138
+ device=device if device is not None else text_encoders[1].device,
139
+ text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
140
+ )
141
+
142
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
143
+
144
+ return prompt_embeds, pooled_prompt_embeds, text_ids
adv_grpo/diffusers_patch/train_dreambooth_lora_sd3.py ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+
16
+ import torch
17
+
18
+
19
+ def _encode_prompt_with_t5(
20
+ text_encoder,
21
+ tokenizer,
22
+ max_sequence_length,
23
+ prompt=None,
24
+ num_images_per_prompt=1,
25
+ device=None,
26
+ text_input_ids=None,
27
+ ):
28
+ prompt = [prompt] if isinstance(prompt, str) else prompt
29
+ batch_size = len(prompt)
30
+
31
+ if tokenizer is not None:
32
+ text_inputs = tokenizer(
33
+ prompt,
34
+ padding="max_length",
35
+ max_length=max_sequence_length,
36
+ truncation=True,
37
+ add_special_tokens=True,
38
+ return_tensors="pt",
39
+ )
40
+ text_input_ids = text_inputs.input_ids
41
+ else:
42
+ if text_input_ids is None:
43
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
44
+
45
+ prompt_embeds = text_encoder(text_input_ids.to(device))[0]
46
+
47
+ dtype = text_encoder.dtype
48
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
49
+
50
+ _, seq_len, _ = prompt_embeds.shape
51
+
52
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
53
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
54
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
55
+
56
+ return prompt_embeds
57
+
58
+
59
+ def _encode_prompt_with_clip(
60
+ text_encoder,
61
+ tokenizer,
62
+ prompt: str,
63
+ device=None,
64
+ text_input_ids=None,
65
+ num_images_per_prompt: int = 1,
66
+ ):
67
+ prompt = [prompt] if isinstance(prompt, str) else prompt
68
+ batch_size = len(prompt)
69
+
70
+ if tokenizer is not None:
71
+ text_inputs = tokenizer(
72
+ prompt,
73
+ padding="max_length",
74
+ max_length=77,
75
+ truncation=True,
76
+ return_tensors="pt",
77
+ )
78
+
79
+ text_input_ids = text_inputs.input_ids
80
+ else:
81
+ if text_input_ids is None:
82
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
83
+
84
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
85
+
86
+ pooled_prompt_embeds = prompt_embeds[0]
87
+ prompt_embeds = prompt_embeds.hidden_states[-2]
88
+ prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
89
+
90
+ _, seq_len, _ = prompt_embeds.shape
91
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
92
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
93
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
94
+
95
+ return prompt_embeds, pooled_prompt_embeds
96
+
97
+
98
+ def encode_prompt(
99
+ text_encoders,
100
+ tokenizers,
101
+ prompt: str,
102
+ max_sequence_length,
103
+ device=None,
104
+ num_images_per_prompt: int = 1,
105
+ text_input_ids_list=None,
106
+ ):
107
+ prompt = [prompt] if isinstance(prompt, str) else prompt
108
+
109
+ clip_tokenizers = tokenizers[:2]
110
+ clip_text_encoders = text_encoders[:2]
111
+
112
+ clip_prompt_embeds_list = []
113
+ clip_pooled_prompt_embeds_list = []
114
+ for i, (tokenizer, text_encoder) in enumerate(zip(clip_tokenizers, clip_text_encoders)):
115
+ prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip(
116
+ text_encoder=text_encoder,
117
+ tokenizer=tokenizer,
118
+ prompt=prompt,
119
+ device=device if device is not None else text_encoder.device,
120
+ num_images_per_prompt=num_images_per_prompt,
121
+ text_input_ids=text_input_ids_list[i] if text_input_ids_list else None,
122
+ )
123
+ clip_prompt_embeds_list.append(prompt_embeds)
124
+ clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds)
125
+
126
+ clip_prompt_embeds = torch.cat(clip_prompt_embeds_list, dim=-1)
127
+ pooled_prompt_embeds = torch.cat(clip_pooled_prompt_embeds_list, dim=-1)
128
+
129
+ t5_prompt_embed = _encode_prompt_with_t5(
130
+ text_encoders[-1],
131
+ tokenizers[-1],
132
+ max_sequence_length,
133
+ prompt=prompt,
134
+ num_images_per_prompt=num_images_per_prompt,
135
+ text_input_ids=text_input_ids_list[-1] if text_input_ids_list else None,
136
+ device=device if device is not None else text_encoders[-1].device,
137
+ )
138
+
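+ # Zero-pad the concatenated CLIP embeddings to the T5 hidden width, then stack CLIP and T5 tokens along the sequence dimension (SD3 joint text conditioning).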
139
+ clip_prompt_embeds = torch.nn.functional.pad(
140
+ clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
141
+ )
142
+ prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
143
+
144
+ return prompt_embeds, pooled_prompt_embeds
adv_grpo/diffusers_patch/wan_pipeline_with_logprob.py ADDED
@@ -0,0 +1,373 @@
1
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
2
+ import torch
3
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
4
+ from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
5
+ from diffusers.utils.torch_utils import randn_tensor
6
+ import math
7
+ import numpy as np
8
+ # import logger
+ # WanPipelineOutput is constructed in the return below; this import path assumes a recent diffusers layout.
+ from diffusers.pipelines.wan.pipeline_output import WanPipelineOutput
9
+
10
+ def sde_step_with_logprob(
11
+ self: UniPCMultistepScheduler,
12
+ model_output: torch.FloatTensor,
13
+ timestep: Union[float, torch.FloatTensor],
14
+ sample: torch.FloatTensor,
15
+ prev_sample: Optional[torch.FloatTensor] = None,
16
+ generator: Optional[torch.Generator] = None,
17
+ determistic: bool = False,
18
+ return_pixel_log_prob: bool = False,
19
+ return_dt_and_std_dev_t: bool = False
20
+ ):
21
+ """
22
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
23
+ process from the learned model outputs (most often the predicted velocity).
24
+
25
+ Args:
26
+ model_output (`torch.FloatTensor`):
27
+ The direct output from learned flow model.
28
+ timestep (`float`):
29
+ The current discrete timestep in the diffusion chain.
30
+ sample (`torch.FloatTensor`):
31
+ A current instance of a sample created by the diffusion process.
32
+ generator (`torch.Generator`, *optional*):
33
+ A random number generator.
34
+ """
35
+ # bf16 can overflow when computing prev_sample_mean, so we must convert all variables to fp32
36
+ model_output=model_output.float()
37
+ sample=sample.float()
38
+ if prev_sample is not None:
39
+ prev_sample=prev_sample.float()
40
+
41
+ step_index = [self.index_for_timestep(t) for t in timestep]
42
+ prev_step_index = [step+1 for step in step_index]
43
+
44
+ self.sigmas = self.sigmas.to(sample.device)
45
+ sigma = self.sigmas[step_index].view(-1, 1, 1, 1, 1)
46
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1, 1, 1)
47
+ sigma_max = self.sigmas[1].item()
48
+ sigma_min = self.sigmas[-1].item()
49
+ dt = sigma_prev - sigma
50
+
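+ # Noise scale for this SDE step: interpolates linearly between sigma_min and sigma_max as a function of the current sigma.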
51
+ std_dev_t = sigma_min + (sigma_max - sigma_min) * sigma
52
+ prev_sample_mean = sample*(1+std_dev_t**2/(2*sigma)*dt)+model_output*(1+std_dev_t**2*(1-sigma)/(2*sigma))*dt
53
+
54
+ if prev_sample is not None and generator is not None:
55
+ raise ValueError(
56
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
57
+ " `prev_sample` stays `None`."
58
+ )
59
+
60
+ if prev_sample is None:
61
+ variance_noise = randn_tensor(
62
+ model_output.shape,
63
+ generator=generator,
64
+ device=model_output.device,
65
+ dtype=model_output.dtype,
66
+ )
67
+ prev_sample = prev_sample_mean + std_dev_t * torch.sqrt(-1*dt) * variance_noise
68
+
69
+ # No noise is added during evaluation
70
+ if determistic:
71
+ prev_sample = sample + dt * model_output
72
+
73
+ log_prob = (
74
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * ((std_dev_t * torch.sqrt(-1*dt))**2))
75
+ - torch.log(std_dev_t * torch.sqrt(-1*dt))
76
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
77
+ )
78
+
79
+ # mean along all but batch dimension
80
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
81
+
82
+ if return_dt_and_std_dev_t:
83
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t, torch.sqrt(-1*dt)
84
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t * torch.sqrt(-1*dt)
85
+
86
+ def wan_pipeline_with_logprob(
87
+ self,
88
+ prompt: Union[str, List[str]] = None,
89
+ negative_prompt: Union[str, List[str]] = None,
90
+ height: int = 480,
91
+ width: int = 832,
92
+ num_frames: int = 81,
93
+ num_inference_steps: int = 50,
94
+ guidance_scale: float = 5.0,
95
+ num_videos_per_prompt: Optional[int] = 1,
96
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
97
+ latents: Optional[torch.Tensor] = None,
98
+ prompt_embeds: Optional[torch.Tensor] = None,
99
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
100
+ output_type: Optional[str] = "np",
101
+ return_dict: bool = True,
102
+ attention_kwargs: Optional[Dict[str, Any]] = None,
103
+ callback_on_step_end: Optional[
104
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
105
+ ] = None,
106
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
107
+ max_sequence_length: int = 512,
108
+ determistic: bool = False,
109
+ kl_reward: float = 0.0,
110
+ return_pixel_log_prob: bool = False,
111
+ ):
112
+ r"""
113
+ The call function to the pipeline for generation.
114
+
115
+ Args:
116
+ prompt (`str` or `List[str]`, *optional*):
117
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
118
+ instead.
119
+ height (`int`, defaults to `480`):
120
+ The height in pixels of the generated image.
121
+ width (`int`, defaults to `832`):
122
+ The width in pixels of the generated image.
123
+ num_frames (`int`, defaults to `81`):
124
+ The number of frames in the generated video.
125
+ num_inference_steps (`int`, defaults to `50`):
126
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
127
+ expense of slower inference.
128
+ guidance_scale (`float`, defaults to `5.0`):
129
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
130
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
131
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
132
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
133
+ usually at the expense of lower image quality.
134
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
135
+ The number of images to generate per prompt.
136
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
137
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
138
+ generation deterministic.
139
+ latents (`torch.Tensor`, *optional*):
140
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
141
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
142
+ tensor is generated by sampling using the supplied random `generator`.
143
+ prompt_embeds (`torch.Tensor`, *optional*):
144
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
145
+ provided, text embeddings are generated from the `prompt` input argument.
146
+ output_type (`str`, *optional*, defaults to `"pil"`):
147
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
148
+ return_dict (`bool`, *optional*, defaults to `True`):
149
+ Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
150
+ attention_kwargs (`dict`, *optional*):
151
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
152
+ `self.processor` in
153
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
154
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
155
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
156
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
157
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
158
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
159
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
160
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
161
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
162
+ `._callback_tensor_inputs` attribute of your pipeline class.
163
+ autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
164
+ The dtype to use for the torch.amp.autocast.
165
+
166
+ Examples:
167
+
168
+ Returns:
169
+ [`~WanPipelineOutput`] or `tuple`:
170
+ If `return_dict` is `True`, [`WanPipelineOutput`] is returned, otherwise a `tuple` is returned where
171
+ the first element is the generated video frames and the remaining elements are the collected
172
+ latents, per-step log probabilities, and per-step KL terms.
173
+ """
174
+
175
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
176
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
177
+
178
+ # 1. Check inputs. Raise error if not correct
179
+ self.check_inputs(
180
+ prompt,
181
+ negative_prompt,
182
+ height,
183
+ width,
184
+ prompt_embeds,
185
+ negative_prompt_embeds,
186
+ callback_on_step_end_tensor_inputs,
187
+ )
188
+
189
+ if num_frames % self.vae_scale_factor_temporal != 1:
190
+ print(
191
+ f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
192
+ )
193
+ num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
194
+ num_frames = max(num_frames, 1)
195
+
196
+ self._guidance_scale = guidance_scale
197
+ self._attention_kwargs = attention_kwargs
198
+ self._current_timestep = None
199
+ self._interrupt = False
200
+
201
+ device = self._execution_device
202
+
203
+ # 2. Define call parameters
204
+ if prompt is not None and isinstance(prompt, str):
205
+ batch_size = 1
206
+ elif prompt is not None and isinstance(prompt, list):
207
+ batch_size = len(prompt)
208
+ else:
209
+ batch_size = prompt_embeds.shape[0]
210
+
211
+ # 3. Encode input prompt
212
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
213
+ prompt=prompt,
214
+ negative_prompt=negative_prompt,
215
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
216
+ num_videos_per_prompt=num_videos_per_prompt,
217
+ prompt_embeds=prompt_embeds,
218
+ negative_prompt_embeds=negative_prompt_embeds,
219
+ max_sequence_length=max_sequence_length,
220
+ device=device,
221
+ )
222
+
223
+ transformer_dtype = self.transformer.dtype
224
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
225
+ if negative_prompt_embeds is not None:
226
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
227
+
228
+ # 4. Prepare timesteps
229
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
230
+ timesteps = self.scheduler.timesteps
231
+
232
+ # 5. Prepare latent variables
233
+ num_channels_latents = self.transformer.config.in_channels
234
+ latents = self.prepare_latents(
235
+ batch_size * num_videos_per_prompt,
236
+ num_channels_latents,
237
+ height,
238
+ width,
239
+ num_frames,
240
+ torch.float32,
241
+ device,
242
+ generator,
243
+ latents,
244
+ )
245
+
246
+ all_latents = [latents]
247
+ all_log_probs = []
248
+ all_kl = []
249
+
250
+ # 6. Denoising loop
251
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
252
+ self._num_timesteps = len(timesteps)
253
+ # print(timesteps)
254
+
255
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
256
+ for i, t in enumerate(timesteps):
257
+ if self.interrupt:
258
+ continue
259
+
260
+ latents_ori = latents.clone()
261
+ self._current_timestep = t
262
+ latent_model_input = latents.to(transformer_dtype)
263
+ timestep = t.expand(latents.shape[0])
264
+
265
+ noise_pred = self.transformer(
266
+ hidden_states=latent_model_input,
267
+ timestep=timestep,
268
+ encoder_hidden_states=prompt_embeds,
269
+ attention_kwargs=attention_kwargs,
270
+ return_dict=False,
271
+ )[0]
272
+ noise_pred = noise_pred.to(prompt_embeds.dtype)
273
+
274
+ if self.do_classifier_free_guidance:
275
+ noise_uncond = self.transformer(
276
+ hidden_states=latent_model_input,
277
+ timestep=timestep,
278
+ encoder_hidden_states=negative_prompt_embeds,
279
+ attention_kwargs=attention_kwargs,
280
+ return_dict=False,
281
+ )[0]
282
+ noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
283
+
284
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(
285
+ self.scheduler,
286
+ noise_pred.float(),
287
+ t.unsqueeze(0),
288
+ latents.float(),
289
+ determistic=determistic,
290
+ return_pixel_log_prob=return_pixel_log_prob
291
+ )
292
+ prev_latents = latents.clone()
293
+
294
+ all_latents.append(latents)
295
+ all_log_probs.append(log_prob)
296
+
297
+ # compute the previous noisy sample x_t -> x_t-1
298
+ # latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
299
+
300
+ if callback_on_step_end is not None:
301
+ callback_kwargs = {}
302
+ for k in callback_on_step_end_tensor_inputs:
303
+ callback_kwargs[k] = locals()[k]
304
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
305
+
306
+ latents = callback_outputs.pop("latents", latents)
307
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
308
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
309
+
310
+ # KL reward enabled during sampling: re-run this step with the LoRA adapter disabled to get the reference policy, then accumulate the per-step KL between the two Gaussian transitions.
311
+ if kl_reward>0 and not determistic:
312
+ latent_model_input = torch.cat([latents_ori] * 2) if self.do_classifier_free_guidance else latents_ori
313
+ with self.transformer.disable_adapter():
314
+ noise_pred = self.transformer(
315
+ hidden_states=latent_model_input,
316
+ timestep=timestep,
317
+ encoder_hidden_states=prompt_embeds,
318
+ attention_kwargs=attention_kwargs,
319
+ return_dict=False,
320
+ )[0]
321
+ noise_pred = noise_pred.to(prompt_embeds.dtype)
322
+ # perform guidance
323
+ if self.do_classifier_free_guidance:
324
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
325
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
326
+
327
+ _, ref_log_prob, ref_prev_latents_mean, ref_std_dev_t = sde_step_with_logprob(
328
+ self.scheduler,
329
+ noise_pred.float(),
330
+ t.unsqueeze(0),
331
+ latents_ori.float(),
332
+ prev_sample=prev_latents.float(),
333
+ determistic=determistic,
334
+ )
335
+ assert torch.equal(std_dev_t, ref_std_dev_t)  # element-wise check; a bare tensor assert fails for batch size > 1
336
+ kl = (prev_latents_mean - ref_prev_latents_mean)**2 / (2 * std_dev_t**2)
337
+ kl = kl.mean(dim=tuple(range(1, kl.ndim)))
338
+ all_kl.append(kl)
339
+ else:
340
+ # No KL reward: skip the reference computation and store a placeholder of zeros.
341
+ all_kl.append(torch.zeros(len(latents), device=latents.device))
342
+
343
+ # call the callback, if provided
344
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
345
+ progress_bar.update()
346
+
347
+ # if XLA_AVAILABLE:
348
+ # xm.mark_step()
349
+
350
+ self._current_timestep = None
351
+
352
+ if not output_type == "latent":
353
+ latents = latents.to(self.vae.dtype)
354
+ latents_mean = (
355
+ torch.tensor(self.vae.config.latents_mean)
356
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
357
+ .to(latents.device, latents.dtype)
358
+ )
359
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
360
+ latents.device, latents.dtype
361
+ )
362
+ latents = latents / latents_std + latents_mean
363
+ video = self.vae.decode(latents, return_dict=False)[0]
364
+ video = self.video_processor.postprocess_video(video, output_type=output_type)
365
+ else:
366
+ video = latents
367
+
368
+ self.maybe_free_model_hooks()
369
+
370
+ if not return_dict:
371
+ return (video, all_latents, all_log_probs, all_kl)
372
+
373
+ return WanPipelineOutput(frames=video), all_latents, all_log_probs, all_kl
adv_grpo/diffusers_patch/wan_prompt_embedding.py ADDED
@@ -0,0 +1,97 @@
1
+ import torch
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ def _get_t5_prompt_embeds(
5
+ text_encoder,
6
+ tokenizer,
7
+ prompt: Union[str, List[str]] = None,
8
+ max_sequence_length: int = 226,
9
+ num_videos_per_prompt: int = 1,
10
+ device: Optional[torch.device] = None,
11
+ dtype: Optional[torch.dtype] = None,
12
+ ):
13
+
14
+ prompt = [prompt] if isinstance(prompt, str) else prompt
15
+ batch_size = len(prompt)
16
+
17
+ text_inputs = tokenizer(
18
+ prompt,
19
+ padding="max_length",
20
+ max_length=max_sequence_length,
21
+ truncation=True,
22
+ add_special_tokens=True,
23
+ return_attention_mask=True,
24
+ return_tensors="pt",
25
+ )
26
+ text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
27
+ seq_lens = mask.gt(0).sum(dim=1).long()
28
+
29
+ prompt_embeds = text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
30
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
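+ # Trim each sequence to its true token length, then zero-pad back to max_sequence_length so padded positions contribute nothing.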
31
+ prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
32
+ prompt_embeds = torch.stack(
33
+ [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
34
+ )
35
+
36
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
37
+ _, seq_len, _ = prompt_embeds.shape
38
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
39
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
40
+
41
+ return prompt_embeds
42
+
43
+ def encode_prompt(
44
+ text_encoder,
45
+ tokenizer,
46
+ prompt: Union[str, List[str]],
47
+ max_sequence_length: int = 226,
48
+ num_videos_per_prompt: int = 1,
49
+ device: Optional[torch.device] = None,
50
+ dtype: Optional[torch.dtype] = None,
51
+ ):
52
+ r"""
53
+ Encodes the prompt into text encoder hidden states.
54
+
55
+ Args:
56
+ prompt (`str` or `List[str]`, *optional*):
57
+ prompt to be encoded
58
+ negative_prompt (`str` or `List[str]`, *optional*):
59
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
60
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
61
+ less than `1`).
62
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
63
+ Whether to use classifier free guidance or not.
64
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
65
+ Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
66
+ prompt_embeds (`torch.Tensor`, *optional*):
67
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
68
+ provided, text embeddings will be generated from `prompt` input argument.
69
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
70
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
71
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
72
+ argument.
73
+ device: (`torch.device`, *optional*):
74
+ torch device
75
+ dtype: (`torch.dtype`, *optional*):
76
+ torch dtype
77
+ """
78
+ device = text_encoder[0].device
79
+ dtype = text_encoder[0].dtype
80
+
81
+ prompt = [prompt] if isinstance(prompt, str) else prompt
82
+ if prompt is not None:
83
+ batch_size = len(prompt)
84
+ else:
85
+ batch_size = prompt_embeds.shape[0]
86
+
87
+ prompt_embeds = _get_t5_prompt_embeds(
88
+ text_encoder=text_encoder[0],
89
+ tokenizer=tokenizer[0],
90
+ prompt=prompt,
91
+ max_sequence_length=max_sequence_length,
92
+ num_videos_per_prompt=num_videos_per_prompt,
93
+ device=device,
94
+ dtype=dtype,
95
+ )
96
+
97
+ return prompt_embeds
adv_grpo/ema.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copied from another repo, but I can't remember exactly which one.
2
+
3
+ from collections.abc import Iterable
4
+
5
+ import torch
6
+
7
+
8
+ class EMAModuleWrapper:
9
+ def __init__(
10
+ self,
11
+ parameters: Iterable[torch.nn.Parameter],
12
+ decay: float = 0.9999,
13
+ update_step_interval: int = 1,
14
+ device: torch.device | None = None,
15
+ ):
16
+ parameters = list(parameters)
17
+ self.ema_parameters = [p.clone().detach().to(device) for p in parameters]
18
+
19
+ self.temp_stored_parameters = None
20
+
21
+ self.decay = decay
22
+ self.update_step_interval = update_step_interval
23
+ self.device = device
24
+
25
+ # TODO: add an automatic decay calculation based on this formula:
26
+ # The impact of the last n steps can be calculated as:
27
+ # impact = 1-(decay^n)
28
+ # The number of steps needed to reach a specific impact is:
29
+ # n = log_decay(1-impact)
30
+ # The decay needed to reach a specific impact after n steps is:
31
+ # decay = (1-impact)^(1/n)
32
+
33
+ def get_current_decay(self, optimization_step) -> float:
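+ # Warm-up schedule: early in training the EMA tracks the live weights closely, then the decay approaches the configured value.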
34
+ return min(
35
+ (1 + optimization_step) / (10 + optimization_step),
36
+ self.decay
37
+ )
38
+
39
+ @torch.no_grad()
40
+ def step(self, parameters: Iterable[torch.nn.Parameter], optimization_step):
41
+ parameters = list(parameters)
42
+
43
+ one_minus_decay = 1 - self.get_current_decay(optimization_step)
44
+
45
+ if (optimization_step + 1) % self.update_step_interval == 0:
46
+ for ema_parameter, parameter in zip(self.ema_parameters, parameters, strict=True):
47
+ if parameter.requires_grad:
48
+ if ema_parameter.device == parameter.device:
49
+ ema_parameter.add_(one_minus_decay * (parameter - ema_parameter))
50
+ else:
51
+ # in place calculations to save memory
52
+ parameter_copy = parameter.detach().to(ema_parameter.device)
53
+ parameter_copy.sub_(ema_parameter)
54
+ parameter_copy.mul_(one_minus_decay)
55
+ ema_parameter.add_(parameter_copy)
56
+ del parameter_copy
57
+
58
+ def to(self, device: torch.device = None, dtype: torch.dtype = None) -> None:
59
+ self.device = device
60
+ self.ema_parameters = [
61
+ p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device)
62
+ for p in self.ema_parameters
63
+ ]
64
+
65
+ def copy_ema_to(self, parameters: Iterable[torch.nn.Parameter], store_temp: bool = True) -> None:
66
+ parameters = list(parameters)
+ if store_temp:
+ self.temp_stored_parameters = [parameter.detach().cpu() for parameter in parameters]
+
70
+ for ema_parameter, parameter in zip(self.ema_parameters, parameters, strict=True):
71
+ parameter.data.copy_(ema_parameter.to(parameter.device).data)
72
+
73
+ def copy_temp_to(self, parameters: Iterable[torch.nn.Parameter]) -> None:
74
+ for temp_parameter, parameter in zip(self.temp_stored_parameters, parameters, strict=True):
75
+ parameter.data.copy_(temp_parameter.data)
76
+
77
+ self.temp_stored_parameters = None
78
+
79
+ def load_state_dict(self, state_dict: dict) -> None:
80
+ self.decay = self.decay if self.decay else state_dict.get("decay", self.decay)
81
+ self.ema_parameters = state_dict.get("ema_parameters")
82
+ self.to(self.device)
83
+
84
+ def state_dict(self) -> dict:
85
+ return {
86
+ "decay": self.decay,
87
+ "ema_parameters": self.ema_parameters,
88
+ }
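The decay formula in the TODO comment above can be sanity-checked in isolation. A minimal standalone sketch (not part of the file; `decay_for_impact` and `steps_for_impact` are illustrative names):

import math

def decay_for_impact(impact: float, n: int) -> float:
    # decay = (1 - impact) ** (1 / n): decay needed so the last n steps carry `impact`
    return (1.0 - impact) ** (1.0 / n)

def steps_for_impact(impact: float, decay: float) -> int:
    # n = log_decay(1 - impact)
    return math.ceil(math.log(1.0 - impact) / math.log(decay))

print(decay_for_impact(0.9, 1000))    # ~0.9977: the last 1000 steps carry 90% of the EMA
print(steps_for_impact(0.9, 0.9999))  # ~23026 steps needed at the default decay of 0.9999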
adv_grpo/imagereward_scorer.py ADDED
@@ -0,0 +1,40 @@
1
+ from transformers import AutoProcessor, AutoModel
2
+ from PIL import Image
3
+ import torch
4
+ import ImageReward as RM
5
+
6
+ class ImageRewardScorer(torch.nn.Module):
7
+ def __init__(self, device="cuda", dtype=torch.float32):
8
+ super().__init__()
9
+ self.model_path = "ImageReward-v1.0"
10
+ self.device = device
11
+ self.dtype = dtype
12
+ self.model = RM.load(self.model_path, device=device).eval().to(dtype=dtype)
13
+ self.model.requires_grad_(False)
14
+
15
+ @torch.no_grad()
16
+ def __call__(self, prompts, images):
17
+ rewards = []
18
+ for prompt,image in zip(prompts, images):
19
+ _, reward = self.model.inference_rank(prompt, [image])
20
+ rewards.append(reward)
21
+ return rewards
22
+
23
+ # Usage example
24
+ def main():
25
+ scorer = ImageRewardScorer(
26
+ device="cuda",
27
+ dtype=torch.float32
28
+ )
29
+
30
+ images=[
31
+ "astronaut.jpg",
32
+ ]
33
+ pil_images = [Image.open(img) for img in images]
34
+ prompts=[
35
+ 'An astronaut’s glove floating in zero-g with "NASA 2049" on the wrist',
36
+ ]
37
+ print(scorer(prompts, pil_images))
38
+
39
+ if __name__ == "__main__":
40
+ main()
adv_grpo/inflated_layers.py ADDED
@@ -0,0 +1,305 @@
1
+ from functools import partial
2
+ from typing import Literal
3
+ from einops import rearrange
4
+ from torch import Tensor
5
+ from torch.nn import ConvTranspose2d, ConvTranspose3d
6
+
7
+ from flow_grpo.inflated_lib import (
8
+ MemoryState,
9
+ extend_head,
10
+ inflate_bias,
11
+ inflate_distribution_bias,
12
+ inflate_distribution_weight,
13
+ inflate_weight,
14
+ modify_state_dict,
15
+ )
16
+ from flow_grpo.conv_gradfix import GradFixConv2d, GradFixConv3d
17
+
18
+ VERBOSE = False
19
+
20
+ _inflation_mode_t = Literal["none", "flatten", "partial_flatten", "pad", "tile"]
21
+ _direction_t = Literal["", "out", "in"]
22
+
23
+
24
+ class InflatedCausalConv3d(GradFixConv3d):
25
+ """
26
+ Note:
27
+ To align the behavior of pretrained 2D models,
28
+ if you compose a video clip from a single image by:
29
+ - duplicating: set shape_norm = True
30
+ - padding zeros: set shape_norm = False
31
+ to avoid gaps at the beginning of the training process.
32
+ """
33
+
34
+ def __init__(
35
+ self, *args, inflation_mode: _inflation_mode_t, shape_norm: bool = True, **kwargs
36
+ ):
37
+ self.shape_norm = shape_norm
38
+ self.inflation_mode = inflation_mode
39
+ self.padding_bank = None
40
+ super().__init__(*args, **kwargs)
41
+ self.temporal_padding = self.padding[0]
42
+ self.padding = (0, *self.padding[1:]) # Remove temporal pad to keep causal.
43
+
44
+ def forward(self, input: Tensor, memory_state: MemoryState = MemoryState.DISABLED) -> Tensor:
45
+ bank_size = self.stride[0] - self.kernel_size[0]
46
+ padding_bank = (
47
+ input[:, :, bank_size:].detach()
48
+ if (bank_size != 0 and memory_state != MemoryState.DISABLED)
49
+ else None
50
+ )
51
+ if (self.padding_bank is not None) and (memory_state == MemoryState.ACTIVE):
52
+ input = extend_head(input, memory=self.padding_bank)
53
+ else:
54
+ input = extend_head(input, times=self.temporal_padding * 2)
55
+ if memory_state != MemoryState.DISABLED and not self.training:
56
+ self.padding_bank = padding_bank
57
+ return super().forward(input)
58
+
59
+ def _load_from_state_dict(
60
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
61
+ ):
62
+ if self.inflation_mode == "none":
63
+ super()._load_from_state_dict(
64
+ state_dict,
65
+ prefix,
66
+ local_metadata,
67
+ strict,
68
+ missing_keys,
69
+ unexpected_keys,
70
+ error_msgs,
71
+ )
72
+ else:
73
+ # NOTE: need to switch off strict
74
+ super()._load_from_state_dict(
75
+ modify_state_dict(
76
+ self,
77
+ state_dict,
78
+ prefix,
79
+ verbose=VERBOSE,
80
+ inflate_weight_fn=partial(inflate_weight, position="tail"),
81
+ inflate_bias_fn=partial(inflate_bias, position="tail"),
82
+ ),
83
+ prefix,
84
+ local_metadata,
85
+ False,
86
+ missing_keys,
87
+ unexpected_keys,
88
+ error_msgs,
89
+ )
90
+
91
+
92
+ class InflatedDistributionCausalConv3d(GradFixConv3d):
93
+ """
94
+ Note:
95
+ Direction:
96
+ - out: this layer generates mean/std of some distribution;
97
+ - in: this layer takes tensors sampled from output of `out` layer as input.
98
+ """
99
+
100
+ def __init__(
101
+ self,
102
+ *args,
103
+ direction: _direction_t,
104
+ inflation_mode: _inflation_mode_t,
105
+ shape_norm: bool = True,
106
+ **kwargs,
107
+ ):
108
+ self.shape_norm = shape_norm
109
+ self.inflation_mode = inflation_mode
110
+ self.direction = direction
111
+ self.padding_bank = None
112
+ super().__init__(*args, **kwargs)
113
+ self.temporal_padding = self.padding[0]
114
+ self.padding = (0, *self.padding[1:]) # Remove temporal pad to keep causal.
115
+
116
+ def forward(self, input: Tensor, memory_state: MemoryState = MemoryState.DISABLED) -> Tensor:
117
+ bank_size = self.stride[0] - self.kernel_size[0]
118
+ padding_bank = (
119
+ input[:, :, bank_size:].detach()
120
+ if (bank_size != 0 and memory_state != MemoryState.DISABLED)
121
+ else None
122
+ )
123
+ if (self.padding_bank is not None) and (memory_state == MemoryState.ACTIVE):
124
+ input = extend_head(input, memory=self.padding_bank)
125
+ else:
126
+ input = extend_head(input, times=self.temporal_padding * 2)
127
+ if memory_state != MemoryState.DISABLED and not self.training:
128
+ self.padding_bank = padding_bank
129
+ return super().forward(input)
130
+
131
+ def _load_from_state_dict(
132
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
133
+ ):
134
+ if self.inflation_mode == "none":
135
+ super()._load_from_state_dict(
136
+ state_dict,
137
+ prefix,
138
+ local_metadata,
139
+ strict,
140
+ missing_keys,
141
+ unexpected_keys,
142
+ error_msgs,
143
+ )
144
+ else:
145
+ super()._load_from_state_dict(
146
+ modify_state_dict(
147
+ self,
148
+ state_dict,
149
+ prefix,
150
+ verbose=VERBOSE,
151
+ inflate_weight_fn=partial(
152
+ inflate_distribution_weight, direction=self.direction, position="tail"
153
+ ),
154
+ inflate_bias_fn=partial(
155
+ inflate_distribution_bias, direction=self.direction, position="tail"
156
+ ),
157
+ ),
158
+ prefix,
159
+ local_metadata,
160
+ False,
161
+ missing_keys,
162
+ unexpected_keys,
163
+ error_msgs,
164
+ )
165
+
166
+
167
+ class InflatedConvTranspose3d(ConvTranspose3d):
168
+ # Note: It's not a causal one.
169
+ def __init__(
170
+ self, *args, inflation_mode: _inflation_mode_t, shape_norm: bool = True, **kwargs
171
+ ):
172
+ self.shape_norm = shape_norm
173
+ self.inflation_mode = inflation_mode
174
+ super().__init__(*args, **kwargs)
175
+
176
+ def _load_from_state_dict(
177
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
178
+ ):
179
+ if self.inflation_mode == "none":
180
+ super()._load_from_state_dict(
181
+ state_dict,
182
+ prefix,
183
+ local_metadata,
184
+ strict,
185
+ missing_keys,
186
+ unexpected_keys,
187
+ error_msgs,
188
+ )
189
+ else:
190
+ # NOTE: need to switch off strict
191
+ super()._load_from_state_dict(
192
+ modify_state_dict(
193
+ self,
194
+ state_dict,
195
+ prefix,
196
+ verbose=VERBOSE,
197
+ inflate_weight_fn=partial(inflate_weight, position="center"),
198
+ inflate_bias_fn=partial(inflate_bias, position="center"),
199
+ ),
200
+ prefix,
201
+ local_metadata,
202
+ False,
203
+ missing_keys,
204
+ unexpected_keys,
205
+ error_msgs,
206
+ )
207
+
208
+
209
+ class FlattenedConvTranspose3d(ConvTranspose2d):
210
+ def forward(self, input: Tensor, **kwargs) -> Tensor:
211
+ output = rearrange(input, "b c f h w -> (b f) c h w")
212
+ output = super().forward(output)
213
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=input.size(2))
214
+ return output
215
+
216
+
217
+ class FlattenedConv3d(GradFixConv2d):
218
+ def forward(self, input: Tensor, **kwargs) -> Tensor:
219
+ output = rearrange(input, "b c f h w -> (b f) c h w")
220
+ output = super().forward(output)
221
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=input.size(2))
222
+ return output
223
+
224
+
225
+ def init_causal_conv3d(
226
+ *args,
227
+ inflation_mode: _inflation_mode_t,
228
+ direction: _direction_t = "",
229
+ partial_switch: bool = False,
230
+ **kwargs,
231
+ ):
232
+ """
233
+ Initialize a Causal-3D convolution layer.
234
+ Parameters:
235
+ inflation_mode: Listed as below. It's compatible with all the 3D-VAE checkpoints we have.
236
+ - none: No inflation will be conducted.
237
+ The loading logic of state dict will fall back to default.
238
+ - flatten: It will produce a `fake` 3D layer,
239
+ which simply squeeze the axis of batch size and depth together,
240
+ and then conduct 2D convolution.
241
+ - partial_flatten:
242
+ - layers with `partial_switch` on: using `none` mode.
243
+ - layers with `partial_switch` off: using `flatten` mode.
244
+ - pad / tile: Refer to the definition of `InflatedCausalConv3d`.
245
+ direction:
246
+ - empty string: Ordinary causal convolution layer.
247
+ - out / in: Refer to the definition of `InflatedDistributionCausalConv3d`.
248
+ partial_switch: Only works when `inflation_mode` is `partial_flatten`.
249
+ """
250
+ stride = kwargs.get("stride", args[3] if len(args) > 3 else None)
251
+ padding = kwargs.get("padding", args[4] if len(args) > 4 else None)
252
+ if "flatten" in inflation_mode:
253
+ if (
254
+ (
255
+ (not stride)
256
+ or isinstance(stride, int)
257
+ or (isinstance(stride, (list, tuple)) and len(stride) < 3)
258
+ ) # if the config of stride can be used for 2D conv
259
+ and (
260
+ (not padding)
261
+ or isinstance(padding, int)
262
+ or (isinstance(padding, (list, tuple)) and len(padding) < 3)
263
+ ) # if the config of padding can be used for 2D conv
264
+ and (("partial" not in inflation_mode) or (not partial_switch))
265
+ # if it's fully-flatten mode, or with `partial_switch` off
266
+ ):
267
+ return FlattenedConv3d(*args, **kwargs)
268
+ else:
269
+ return InflatedCausalConv3d(*args, inflation_mode="none", **kwargs)
270
+ # Force-override
271
+ else:
272
+ if direction:
273
+ return InflatedDistributionCausalConv3d(
274
+ *args, direction=direction, inflation_mode=inflation_mode, **kwargs
275
+ )
276
+ else:
277
+ return InflatedCausalConv3d(*args, inflation_mode=inflation_mode, **kwargs)
278
+
279
+
280
+ def init_transposed_conv3d(
281
+ *args, inflation_mode: _inflation_mode_t, partial_switch: bool = False, **kwargs
282
+ ):
283
+ stride = kwargs.get("stride", args[3] if len(args) > 3 else None)
284
+ padding = kwargs.get("padding", args[4] if len(args) > 4 else None)
285
+ if "flatten" in inflation_mode:
286
+ if (
287
+ (
288
+ (not stride)
289
+ or isinstance(stride, int)
290
+ or (isinstance(stride, (list, tuple)) and len(stride) < 3)
291
+ )
292
+ and (
293
+ (not padding)
294
+ or isinstance(padding, int)
295
+ or (isinstance(padding, (list, tuple)) and len(padding) < 3)
296
+ )
297
+ or (("partial" in inflation_mode) and not partial_switch)
298
+ ):
299
+ return FlattenedConvTranspose3d(*args, **kwargs)
300
+ else:
301
+ return InflatedConvTranspose3d(
302
+ *args, inflation_mode="none", **kwargs
303
+ ) # Force-override
304
+ else:
305
+ return InflatedConvTranspose3d(*args, inflation_mode=inflation_mode, **kwargs)
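A hedged usage sketch of `init_causal_conv3d` defined above. It assumes `GradFixConv3d` keeps the standard `Conv3d` constructor and that the `flow_grpo` package (the import path used in this file) is importable; the argument values are illustrative only:

import torch
from flow_grpo.inflated_layers import init_causal_conv3d
from flow_grpo.inflated_lib import MemoryState

# "pad" mode inflates a pretrained 2D kernel into a causal 3D one when a state dict is loaded
conv = init_causal_conv3d(64, 64, kernel_size=3, stride=1, padding=1, inflation_mode="pad")
x = torch.randn(1, 64, 9, 32, 32)               # (batch, channels, frames, height, width)
y = conv(x, memory_state=MemoryState.DISABLED)  # duplicated head frames replace the temporal padding
print(y.shape)                                   # expected (1, 64, 9, 32, 32): the frame count is preserved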
adv_grpo/inflated_lib.py ADDED
@@ -0,0 +1,346 @@
1
+ import math
2
+ from enum import Enum
3
+ from typing import Optional
4
+ import numpy as np
5
+ import torch
6
+ from diffusers.models.attention_processor import SpatialNorm
7
+ from diffusers.models.normalization import RMSNorm
8
+ from einops import rearrange
9
+ from torch import Tensor, nn
10
+
11
+ # from common.logger import get_logger
12
+
13
+ # logger = get_logger(__name__)
14
+
15
+
16
+ class MemoryState(Enum):
17
+ """
18
+ State[Disabled]: No memory bank will be enabled.
19
+ State[Initializing]: The model is handling the first clip,
20
+ need to reset / initialize the memory bank.
21
+ State[Active]: There has been some data in the memory bank.
22
+ """
23
+
24
+ DISABLED = 0
25
+ INITIALIZING = 1
26
+ ACTIVE = 2
27
+
28
+
29
+ def norm_wrapper(
30
+ norm_layer: nn.Module,
31
+ x: torch.Tensor,
32
+ y: Optional[torch.Tensor] = None,
33
+ keep_causal: bool = False,
34
+ ) -> torch.Tensor:
35
+ if isinstance(norm_layer, (nn.LayerNorm, RMSNorm)):
36
+ if x.ndim == 4:
37
+ x = rearrange(x, "b c h w -> b h w c")
38
+ x = norm_layer(x)
39
+ x = rearrange(x, "b h w c -> b c h w")
40
+ return x
41
+ if x.ndim == 5:
42
+ x = rearrange(x, "b c t h w -> b t h w c")
43
+ x = norm_layer(x)
44
+ x = rearrange(x, "b t h w c -> b c t h w")
45
+ return x
46
+ if isinstance(norm_layer, (nn.GroupNorm, nn.BatchNorm2d, nn.SyncBatchNorm)):
47
+ if x.ndim <= 4 or (not keep_causal and not isinstance(norm_layer, nn.BatchNorm2d)):
48
+ return norm_layer(x)
49
+ if x.ndim == 5:
50
+ t = x.size(2)
51
+ x = rearrange(x, "b c t h w -> (b t) c h w")
52
+ x = norm_layer(x)
53
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
54
+ return x
55
+ if isinstance(norm_layer, SpatialNorm):
56
+ t = -1
57
+ if x.ndim == 5:
58
+ t = x.size(2)
59
+ x = rearrange(x, "b c t h w -> (b t) c h w")
60
+ if y.ndim == 5:
61
+ y = rearrange(y, "b c t h w -> (b t) c h w")
62
+ if x.ndim != 4 or y.ndim != 4:
63
+ raise NotImplementedError
64
+ x = norm_layer(x, y)
65
+ if t != -1:
66
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
67
+ return x
68
+ raise NotImplementedError
69
+
70
+
71
+ def remove_head(tensor: Tensor, times: int = 1) -> Tensor:
72
+ """
73
+ Remove duplicated first frame features in the up-sampling process.
74
+ """
75
+ if times == 0:
76
+ return tensor
77
+ return torch.cat(tensors=(tensor[:, :, :1], tensor[:, :, times + 1 :]), dim=2)
78
+
79
+
80
+ def extend_head(
81
+ tensor: Tensor, times: Optional[int] = 2, memory: Optional[Tensor] = None
82
+ ) -> Tensor:
83
+ """
84
+ When memory is None:
85
+ - Duplicate first frame features in the down-sampling process.
86
+ When memory is not None:
87
+ - Concatenate memory features with the input features to keep temporal consistency.
88
+ """
89
+ if times == 0:
90
+ return tensor
91
+ if memory is not None:
92
+ return torch.cat((memory.to(tensor), tensor), dim=2)
93
+ else:
94
+ tile_repeat = np.ones(tensor.ndim).astype(int)
95
+ tile_repeat[2] = times
96
+ return torch.cat(tensors=(torch.tile(tensor[:, :, :1], list(tile_repeat)), tensor), dim=2)
97
+
98
+
99
+ def fill_weight_in_depth(weight: torch.Tensor, source: torch.Tensor, position: str):
100
+ """
101
+ Inflate a 2D convolution weight matrix to a 3D one by padding zeros in the channel of depth.
102
+ Parameters:
103
+ weight: The weight parameters of 3D conv kernel to be initialized.
104
+ source: The weight parameters of 2D conv kernel to be inflated.
105
+ position: Where to insert the 2D weights, can be chosen from
106
+ - tail: Pad zeros in the front of the 2D kernel. Used for casual inflation.
107
+ - center: Pad zeros around the 2D kernel. Used for normal inflation.
108
+ """
109
+ assert position in ["tail", "center"], "Unsupported fill-in position for weight inflation."
110
+ depth = weight.size(2)
111
+ weight.fill_(0.0)
112
+ if position == "center":
113
+ if depth % 2 == 1:
114
+ weight[:, :, depth // 2].copy_(source.squeeze(2))
115
+ else:
116
+ weight[:, :, depth // 2].copy_(source.squeeze(2) / 2.0)
117
+ weight[:, :, depth // 2 - 1].copy_(source.squeeze(2) / 2.0)
118
+ else:
119
+ if depth % 2 == 1:
120
+ weight[:, :, -1].copy_(source.squeeze(2))
121
+ else:
122
+ weight[:, :, -1].copy_(source.squeeze(2) / 2.0)
123
+ weight[:, :, -2].copy_(source.squeeze(2) / 2.0)
124
+ return weight
125
+
126
+
127
+ def inflate_weight(
128
+ weight_2d: torch.Tensor,
129
+ weight_3d: torch.Tensor,
130
+ shape_norm: bool,
131
+ name: str,
132
+ inflation_mode: str,
133
+ position: str,
134
+ verbose: bool = True,
135
+ ):
136
+ """
137
+ Inflate a 2D convolution weight matrix to a 3D one.
138
+ Parameters:
139
+ weight_2d: The weight matrix of 2D conv to be inflated.
140
+ weight_3d: The weight matrix of 3D conv to be initialized.
141
+ inflation_mode: the mode of inflation
142
+ - pad: pad zeros around 2D kernel.
143
+ - tile: tile 2D kernel along the depth axis.
144
+
145
+ shape_norm: Whether to scale the parameters of 2D kernel so that the untrained
146
+ inflated model behaves exactly the same as the original 2D model
147
+ in the reconstruction of images and videos. It is recommended to switch this on.
148
+
149
+ name: The name of inflated module. Only be used in logging.
150
+ position: Refer to the doc of `fill_weight_in_depth`.
151
+ Only works when `inflation_mode` is `pad`.
152
+ verbose: Whether to log information about inflation.
153
+ """
154
+ assert inflation_mode in ["pad", "tile"]
155
+ depth = weight_3d.size(2)
156
+ tgt_out, tgt_in = weight_3d.size()[:2]
157
+ src_out, src_in = weight_2d.size()[:2]
158
+ assert (tgt_out % src_out == 0) and (tgt_in % src_in == 0)
159
+ out_fan, in_fan = tgt_out // src_out, tgt_in // src_in
160
+ depth_factor = 1 if inflation_mode == "pad" else depth
161
+ factor = (depth_factor * math.sqrt(out_fan) * math.sqrt(in_fan)) if shape_norm else 1
162
+ with torch.no_grad():
163
+ channel_inflation = weight_2d.unsqueeze(2).repeat(out_fan, in_fan, 1, 1, 1) / factor
164
+ if inflation_mode == "tile":
165
+ weight_3d.copy_(channel_inflation.repeat(1, 1, depth, 1, 1))
166
+ else:
167
+ weight_3d = fill_weight_in_depth(weight_3d, channel_inflation, position)
168
+ if verbose:
169
+ print(
170
+ f"*** {name}weight {weight_2d.size()} is inflated to {weight_3d.size()} ***"
171
+ )
172
+ return weight_3d
173
+
174
+
175
+ def inflate_bias(
176
+ bias_2d: torch.Tensor,
177
+ bias_3d: torch.Tensor,
178
+ shape_norm: bool,
179
+ name: str,
180
+ inflation_mode: str,
181
+ position: str,
182
+ verbose: bool = True,
183
+ ):
184
+ """
185
+ Inflate a 2D convolution bias tensor to a 3D one
186
+ Parameters:
187
+ bias_2d: The bias tensor of 2D conv to be inflated.
188
+ bias_3d: The bias tensor of 3D conv to be initialized.
189
+ shape_norm: Refer to `inflate_weight` function.
190
+ name: The name of inflated module. Only be used in logging.
191
+ inflation_mode: Placeholder to align `inflate_weight`.
192
+ position: Placeholder to align `inflate_weight`.
193
+ verbose: Whether to log information about inflation.
194
+ """
195
+ tgt_ch, src_ch = bias_3d.size(0), bias_2d.size(0)
196
+ assert tgt_ch % src_ch == 0
197
+ fan = tgt_ch // src_ch
198
+ factor = math.sqrt(fan) if shape_norm else 1
199
+ with torch.no_grad():
200
+ bias_3d.copy_(bias_2d.repeat(fan) / factor)
201
+ if (tgt_ch != src_ch) and verbose:
202
+ print(f"*** {name}bias {bias_2d.size()} is inflated to {bias_3d.size()} ***")
203
+ return bias_3d
204
+
205
+
206
+ def inflate_distribution_weight(
207
+ weight_2d: torch.Tensor,
208
+ weight_3d: torch.Tensor,
209
+ shape_norm: bool,
210
+ name: str,
211
+ direction: str,
212
+ inflation_mode: str,
213
+ position: str,
214
+ verbose: bool = True,
215
+ ):
216
+ """
217
+ Inflate a 2D convolution weight matrix to a 3D one.
218
+ Note: Different from `inflate_weight`,
219
+ it's designed for `quant_conv` or `post_quant_conv` layers.
220
+ i.e., a convolution layer used to produce `mean` and `std` of some distribution,
221
+ or its subsequent layer.
222
+ Parameters: Refer to `inflate_weight`.
223
+ direction:
224
+ - out: this layer generates `mean` and `std` of some distribution.
225
+ - in: this layer takes tensors sampled from output of `out` layer as input.
226
+ """
227
+ assert inflation_mode in ["pad", "tile"]
228
+ depth = weight_3d.size(2)
229
+ tgt_out, tgt_in = weight_3d.size()[:2]
230
+ src_out, src_in = weight_2d.size()[:2]
231
+ assert (tgt_out % src_out == 0) and (tgt_in % src_in == 0)
232
+ out_fan, in_fan = tgt_out // src_out, tgt_in // src_in
233
+ depth_factor = 1 if inflation_mode == "pad" else depth
234
+ if direction == "out":
235
+ factor = (depth_factor * math.sqrt(in_fan)) if shape_norm else 1
236
+ with torch.no_grad():
237
+ in_inflation = weight_2d.unsqueeze(2).repeat(1, in_fan, 1, 1, 1) / factor
238
+ # [src_out, src_in, k_h, k_w] -> [src_out, tgt_in, 1, k_h, k_w]
239
+ out_mean_weight, out_std_weight = torch.chunk(in_inflation, 2, dim=0)
240
+ mean_slice = slice(src_out // 2)
241
+ std_slice = slice(tgt_out // 2, tgt_out // 2 + src_out // 2)
242
+ if inflation_mode == "tile":
243
+ weight_3d[mean_slice] = out_mean_weight
244
+ weight_3d[std_slice] = out_std_weight
245
+ # Other part will be randomly initialized.
246
+ else:
247
+ weight_3d[mean_slice] = fill_weight_in_depth(
248
+ weight_3d[mean_slice], out_mean_weight, position
249
+ )
250
+ weight_3d[std_slice] = fill_weight_in_depth(
251
+ weight_3d[std_slice], out_std_weight, position
252
+ )
253
+ # Other part will be randomly initialized.
254
+ elif direction == "in":
255
+ factor = (depth_factor * math.sqrt(out_fan)) if shape_norm else 1
256
+ with torch.no_grad():
257
+ out_inflation = weight_2d.unsqueeze(2).repeat(out_fan, 1, 1, 1, 1) / factor
258
+ # [src_out, src_in, k_h, k_w] -> [tgt_out, src_in, 1, k_h, k_w]
259
+ if inflation_mode == "tile":
260
+ weight_3d[:, :src_in] = out_inflation
261
+ else:
262
+ weight_3d[:, :src_in] = fill_weight_in_depth(
263
+ weight_3d[:, :src_in], out_inflation, position
264
+ )
265
+ weight_3d[:, src_in:].fill_(0.0)
266
+ else:
267
+ raise NotImplementedError
268
+ if verbose:
269
+ print(
270
+ f"*** [Distribution] {name}weight {weight_2d.size()} "
271
+ f"is inflated to {weight_3d.size()} ***"
272
+ )
273
+ return weight_3d
274
+
275
+
276
+ def inflate_distribution_bias(
277
+ bias_2d: torch.Tensor,
278
+ bias_3d: torch.Tensor,
279
+ shape_norm: bool,
280
+ name: str,
281
+ direction: str,
282
+ inflation_mode: str,
283
+ position: str,
284
+ verbose: bool = True,
285
+ ):
286
+ """
287
+ The combination of `inflate_distribution_weight` and `inflate_bias`.
288
+ """
289
+ tgt_ch, src_ch = bias_3d.size(0), bias_2d.size(0)
290
+ assert tgt_ch % src_ch == 0
291
+ if direction == "out":
292
+ with torch.no_grad():
293
+ out_mean_bias, out_std_bias = torch.chunk(bias_2d, 2, dim=0)
294
+ bias_3d[: src_ch // 2] = out_mean_bias
295
+ bias_3d[tgt_ch // 2 : tgt_ch // 2 + src_ch // 2] = out_std_bias
296
+ elif direction == "in":
297
+ with torch.no_grad():
298
+ bias_3d[:src_ch] = bias_2d
299
+ bias_3d[src_ch:].fill_(0.0)
300
+ else:
301
+ raise NotImplementedError
302
+ if verbose:
303
+ print(
304
+ f"*** [Distribution] {name}bias {bias_2d.size()} is inflated to {bias_3d.size()} ***"
305
+ )
306
+ return bias_3d
307
+
308
+
309
+ def modify_state_dict(
310
+ layer, state_dict, prefix, inflate_weight_fn, inflate_bias_fn, verbose=False
311
+ ):
312
+ """
313
+ The main function to inflate 2D parameters to 3D.
314
+ """
315
+ weight_name = prefix + "weight"
316
+ bias_name = prefix + "bias"
317
+ if weight_name in state_dict:
318
+ weight_2d = state_dict[weight_name]
319
+ if (
320
+ weight_2d.dim() == 4
321
+ ): # Assuming the 2D weights are 4D tensors (out_channels, in_channels, h, w)
322
+ weight_3d = inflate_weight_fn(
323
+ weight_2d=weight_2d,
324
+ weight_3d=layer.weight,
325
+ shape_norm=layer.shape_norm,
326
+ name=prefix,
327
+ verbose=verbose,
328
+ inflation_mode=layer.inflation_mode,
329
+ )
330
+ state_dict[weight_name] = weight_3d
331
+ else:
332
+ return state_dict
333
+ # It's a 3d state dict, should not do inflation on both bias and weight.
334
+ if bias_name in state_dict:
335
+ bias_2d = state_dict[bias_name]
336
+ if bias_2d.dim() == 1: # Assuming the 2D biases are 1D tensors (out_channels,)
337
+ bias_3d = inflate_bias_fn(
338
+ bias_2d=bias_2d,
339
+ bias_3d=layer.bias,
340
+ shape_norm=layer.shape_norm,
341
+ name=prefix,
342
+ verbose=verbose,
343
+ inflation_mode=layer.inflation_mode,
344
+ )
345
+ state_dict[bias_name] = bias_3d
346
+ return state_dict
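A small self-contained sanity check of the `tile` inflation normalization described in `inflate_weight` above: tiling a 2D kernel across depth `d` and dividing by `d` (the `shape_norm` factor when channel counts are unchanged) keeps the per-frame response of a time-constant input identical to the original 2D convolution. Shapes are chosen purely for illustration:

import torch
import torch.nn.functional as F

out_c, in_c, k, d = 4, 3, 3, 3
w2d = torch.randn(out_c, in_c, k, k)
w3d = w2d.unsqueeze(2).repeat(1, 1, d, 1, 1) / d   # "tile" inflation with shape_norm

x2d = torch.randn(1, in_c, 8, 8)
x3d = x2d.unsqueeze(2).repeat(1, 1, d, 1, 1)       # the same frame repeated d times

y2d = F.conv2d(x2d, w2d, padding=1)
y3d = F.conv3d(x3d, w3d, padding=(0, 1, 1))
print(torch.allclose(y2d, y3d[:, :, 0], atol=1e-4))  # True: the inflated conv matches the 2D one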
adv_grpo/ocr.py ADDED
@@ -0,0 +1,138 @@
1
+ from paddleocr import PaddleOCR
2
+ import torch
3
+ import numpy as np
4
+ from Levenshtein import distance
5
+ from typing import List, Union, Tuple
6
+ from PIL import Image
7
+
8
+ class OcrScorer:
9
+ def __init__(self, use_gpu: bool = False):
10
+ """
11
+ OCR reward calculator
12
+ :param use_gpu: Whether to use GPU acceleration for PaddleOCR
13
+ """
14
+ self.ocr = PaddleOCR(
15
+ use_angle_cls=False,
16
+ lang="en",
17
+ use_gpu=use_gpu,
18
+ show_log=False # Disable unnecessary log output
19
+ )
20
+
21
+ @torch.no_grad()
22
+ def __call__(self,
23
+ images: Union[List[Image.Image], List[np.ndarray]],
24
+ prompts: List[str]) -> List[float]:
25
+ """
26
+ Calculate OCR reward
27
+ :param images: List of input images (PIL or numpy format)
28
+ :param prompts: Corresponding target text list
29
+ :return: List of per-image rewards in [0, 1]
30
+ """
31
+ # import pdb; pdb.set_trace()
32
+ prompts = [prompt.split('"')[1] for prompt in prompts]
33
+ rewards = []
34
+ # Ensure input lengths are consistent
35
+ assert len(images) == len(prompts), "Images and prompts must have the same length"
36
+ for img, prompt in zip(images, prompts):
37
+ # Convert image format
38
+ if isinstance(img, Image.Image):
39
+ img = np.array(img)
40
+
41
+ try:
42
+ # OCR recognition
43
+ result = self.ocr.ocr(img, cls=False)
44
+ # Extract recognized text (handle possible multi-line results)
45
+ recognized_text = ''.join([res[1][0] if res[1][1] > 0 else '' for res in result[0]]) if result[0] else ''
46
+
47
+ recognized_text = recognized_text.replace(' ', '').lower()
48
+ prompt = prompt.replace(' ', '').lower()
49
+ if prompt in recognized_text:
50
+ dist = 0
51
+ else:
52
+ dist = distance(recognized_text, prompt)
53
+ # import pdb; pdb.set_trace()
54
+ # If many unrelated characters were recognized, cap the penalty at the prompt length
55
+ if dist > len(prompt):
56
+ dist = len(prompt)
57
+
58
+ except Exception as e:
59
+ # Error handling (e.g., OCR parsing failure)
60
+ print(f"OCR processing failed: {str(e)}")
61
+ dist = len(prompt) # Maximum penalty
62
+ reward = 1 - dist / len(prompt)
63
+ rewards.append(reward)
64
+
65
+ return rewards
66
+
67
+ class OcrScorer_video_or_image:
68
+ def __init__(self, use_gpu: bool = False):
69
+ """
70
+ OCR reward calculator
71
+ :param use_gpu: Whether to use GPU acceleration for PaddleOCR
72
+ """
73
+ self.ocr = PaddleOCR(
74
+ use_angle_cls=False,
75
+ lang="en",
76
+ use_gpu=use_gpu,
77
+ show_log=False # Disable unnecessary log output
78
+ )
79
+ self.frame_interval = 4
80
+
81
+ @torch.no_grad()
82
+ def __call__(self, images: Union[List[Image.Image], List[np.ndarray]], prompts: List[str]) -> List[float]:
83
+ """
84
+ :param images: List of images or videos (each video as np.ndarray of shape [F, H, W, C])
85
+ :param prompts: List of prompts containing target text
86
+ :return: List of OCR rewards, averaged over the sampled frames
87
+ """
88
+ prompts = [prompt.split('"')[1] for prompt in prompts]
89
+ assert len(images) == len(prompts), "Mismatch between images and prompts."
90
+
91
+ rewards = []
92
+ for img, prompt in zip(images, prompts):
93
+ prompt = prompt.replace(' ', '').lower()
94
+ frame_rewards = []
95
+
96
+ # Handle video: shape (F, H, W, C)
97
+ if isinstance(img, np.ndarray) and img.ndim == 4:
98
+ sampled_frames = img[::self.frame_interval]
99
+ else:
100
+ sampled_frames = [img]
101
+
102
+ for frame in sampled_frames:
103
+ region = None
104
+ if isinstance(frame, Image.Image):
105
+ frame = np.array(frame)
106
+ try:
107
+ result = self.ocr.ocr(frame, cls=False)
108
+ text = ''.join([res[1][0] if res[1][1] > 0 else '' for res in result[0]]) if result[0] else ''
109
+ text = text.replace(' ', '').lower()
110
+
111
+ dist = distance(text, prompt)
112
+ dist = min(dist, len(prompt))
113
+
114
+ except Exception as e:
115
+ print(f"OCR failed on frame: {e}")
116
+ dist = len(prompt)
117
+
118
+ reward = 1 - dist / len(prompt)
119
+ if reward > 0:
120
+ frame_rewards.append(reward)
121
+
122
+ if frame_rewards:
123
+ rewards.append(sum(frame_rewards) / len(frame_rewards))
124
+ else:
125
+ rewards.append(0.0)
126
+
127
+ return rewards
128
+
129
+ if __name__ == "__main__":
130
+ example_image_path = "media_images_eval_images_499_ef42de47b8ec98892954.jpg"
131
+ example_image = Image.open(example_image_path)
132
+ example_prompt = 'New York Skyline with "Hello World" written with fireworks on the sky'
133
+ # Instantiate scorer
134
+ scorer = OcrScorer(use_gpu=False)
135
+
136
+ # Call scorer and print result
137
+ reward = scorer([example_image], [example_prompt])
138
+ print(f"OCR Reward: {reward}")
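A short, hedged illustration of the reward rule used by `OcrScorer` above, decoupled from PaddleOCR so it runs anywhere `Levenshtein` is installed (the strings are made up):

from Levenshtein import distance

def ocr_reward(recognized: str, target: str) -> float:
    recognized = recognized.replace(" ", "").lower()
    target = target.replace(" ", "").lower()
    dist = 0 if target in recognized else min(distance(recognized, target), len(target))
    return 1 - dist / len(target)

print(ocr_reward("Hello Wor1d", "Hello World"))                # 0.9: one wrong character out of ten
print(ocr_reward("completely unrelated text", "Hello World"))  # 0.0: the penalty is capped at len(target)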
adv_grpo/pick_score_training.py ADDED
@@ -0,0 +1,385 @@
1
+ import torch
2
+ from transformers import CLIPProcessor, CLIPModel
3
+ from PIL import Image
4
+ from torch.utils.data import DataLoader
5
+
6
+ # ====== Use the CLIPCriterion you found ======
7
+ from dataclasses import dataclass
8
+ from torch.nn.modules.loss import _Loss
9
+ from torch.utils.data import Dataset, DataLoader
10
+ import os
11
+ import json
12
+ import torch
13
+ import torch.distributed as dist
+ import torch.distributed.nn  # needed by gather_features (torch.distributed.nn.all_gather)
14
+ from torch.utils.data import DataLoader, DistributedSampler
15
+ from torch.nn.parallel import DistributedDataParallel as DDP
16
+
17
+
18
+
19
+ def evaluate_pickscore(model, processor, json_file, qwen_dir, sd3_dir, device, max_eval=100):
20
+ """
21
+ Simple evaluation: take the first max_eval Qwen vs SD3 pairs and compute the average score for each side
22
+ """
23
+ model.eval()
24
+ if hasattr(model, "module"): # DDP case
25
+ model = model.module
26
+
27
+ with open(json_file, "r") as f:
28
+ prompt2img = json.load(f)
29
+
30
+ prompts = list(prompt2img.keys())[:max_eval]
31
+
32
+ qwen_scores, sd3_scores = [], []
33
+
34
+ for prompt in prompts:
35
+ filename = prompt2img[prompt]
36
+ qwen_img_path = os.path.join(qwen_dir, filename)
37
+ sd3_img_path = os.path.join(sd3_dir, filename)
38
+
39
+ if not (os.path.exists(qwen_img_path) and os.path.exists(sd3_img_path)):
40
+ continue
41
+
42
+ qwen_img = Image.open(qwen_img_path).convert("RGB")
43
+ sd3_img = Image.open(sd3_img_path).convert("RGB")
44
+
45
+ # Text & image inputs
46
+ text_inputs = processor.tokenizer(
47
+ prompt, return_tensors="pt", padding="max_length", truncation=True, max_length=77
48
+ ).to(device)
49
+ qwen_inputs = processor(images=qwen_img, return_tensors="pt").to(device)
50
+ sd3_inputs = processor(images=sd3_img, return_tensors="pt").to(device)
51
+
52
+ with torch.no_grad():
53
+ text_features = model.get_text_features(**text_inputs)
54
+ qwen_features = model.get_image_features(**qwen_inputs)
55
+ sd3_features = model.get_image_features(**sd3_inputs)
56
+
57
+ # 归一化
58
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
59
+ qwen_features = qwen_features / qwen_features.norm(dim=-1, keepdim=True)
60
+ sd3_features = sd3_features / sd3_features.norm(dim=-1, keepdim=True)
61
+
62
+ # 相似度分数
63
+ logit_scale = model.logit_scale.exp()
64
+ qwen_score = (logit_scale * (text_features @ qwen_features.T)).item()
65
+ sd3_score = (logit_scale * (text_features @ sd3_features.T)).item()
66
+
67
+ qwen_scores.append(qwen_score)
68
+ sd3_scores.append(sd3_score)
69
+
70
+ model.train()
71
+ if len(qwen_scores) > 0:
72
+ print(f"[Eval] Qwen avg={sum(qwen_scores)/len(qwen_scores):.4f} "
73
+ f"| SD3 avg={sum(sd3_scores)/len(sd3_scores):.4f}")
74
+
75
+
76
+ @dataclass
77
+ class CLIPCriterionConfig:
78
+ _target_: str = "trainer.criterions.clip_criterion.CLIPCriterion"
79
+ is_distributed: bool = False # turned off for local runs for now
80
+ label_0_column_name: str = "label_0"
81
+ label_1_column_name: str = "label_1"
82
+ input_ids_column_name: str = "input_ids"
83
+ pixels_0_column_name: str = "pixels_0"
84
+ pixels_1_column_name: str = "pixels_1"
85
+ num_examples_per_prompt_column_name: str = "num_examples_per_prompt"
86
+ in_batch_negatives: bool = False
87
+
88
+
89
+ class CLIPCriterion(_Loss):
90
+ def __init__(self, cfg: CLIPCriterionConfig):
91
+ super().__init__()
92
+ self.cfg = cfg
93
+
94
+ @staticmethod
95
+ def get_features(model, input_ids, pixels_0_values, pixels_1_values):
96
+ # import pdb; pdb.set_trace()
97
+ # if hasattr(model, "module"):
98
+ # model = model.module
99
+ all_pixel_values = torch.cat([pixels_0_values, pixels_1_values], dim=0)
100
+ # text_features, all_image_features = model(text_inputs=input_ids, image_inputs=all_pixel_values)
101
+ text_features = model.get_text_features(input_ids=input_ids)
102
+ all_image_features = model.get_image_features(pixel_values=all_pixel_values)
103
+ all_image_features = all_image_features / all_image_features.norm(dim=-1, keepdim=True)
104
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
105
+ image_0_features, image_1_features = all_image_features.chunk(2, dim=0)
106
+ return image_0_features, image_1_features, text_features
107
+
108
+ @staticmethod
109
+ def gather_features(features):
110
+ all_features = torch.cat(torch.distributed.nn.all_gather(features), dim=0)
111
+ return all_features
112
+
113
+ # def safe_sync(self, msg):
114
+ # torch.cuda.synchronize()
115
+ # print(f"[Rank {dist.get_rank()}] OK at {msg}")
116
+
117
+ def calc_loss(
118
+ self,
119
+ text_features,
120
+ image_0_features,
121
+ image_1_features,
122
+ logit_scale,
123
+ label_0,
124
+ label_1,
125
+ num_examples_per_prompt,
126
+ *args,
127
+ **kwargs
128
+ ):
129
+ # self.safe_sync("start")
130
+
131
+ device = image_0_features.device
132
+
133
+ # gather features
134
+ if self.cfg.is_distributed:
135
+ image_0_features = self.gather_features(image_0_features)
136
+ image_1_features = self.gather_features(image_1_features)
137
+ text_features = self.gather_features(text_features)
138
+ label_0 = self.gather_features(label_0)
139
+ label_1 = self.gather_features(label_1)
140
+ num_examples_per_prompt = self.gather_features(num_examples_per_prompt)
141
+
142
+ # calc logits # TODO use local loss as open-clip does
143
+ all_image_features = torch.cat([image_0_features, image_1_features], dim=0) # (2 * batch_size, dim)
144
+ logits_per_image = logit_scale * all_image_features @ text_features.T
145
+ image_0_logits, image_1_logits = logits_per_image.chunk(2, dim=0)
146
+ text_logits = logit_scale * text_features @ all_image_features.T
147
+
148
+ if self.cfg.in_batch_negatives:
149
+ # get labels
150
+ num_images = all_image_features.shape[0]
151
+ image_labels = torch.arange(num_images, device=device, dtype=torch.long)
152
+ image_0_labels, image_1_labels = image_labels.chunk(2, dim=0)
153
+ num_texts = text_features.shape[0]
154
+ text_labels = torch.arange(num_texts, device=device, dtype=torch.long)
155
+
156
+ # image loss - we want to increase the logits of the preferred image to the text
157
+ image_0_loss = torch.nn.functional.cross_entropy(image_0_logits, text_labels, reduction="none")
158
+ image_1_loss = torch.nn.functional.cross_entropy(image_1_logits, text_labels, reduction="none")
159
+ # if we have a tie, we will increase both images equally, and average so the image loss of each example is
160
+ # proportional
161
+ image_loss = label_0 * image_0_loss + label_1 * image_1_loss
162
+
163
+ # text loss - we want to increase the logits of the text to the preferred image
164
+ text_0_loss = torch.nn.functional.cross_entropy(text_logits, image_0_labels, reduction="none")
165
+ text_1_loss = torch.nn.functional.cross_entropy(text_logits, image_1_labels, reduction="none")
166
+
167
+ else:
168
+ text_0_logits, text_1_logits = text_logits.chunk(2, dim=-1)
169
+ index = torch.arange(text_0_logits.shape[0], device=device, dtype=torch.long)
170
+
171
+ text_0_logits = text_0_logits[index, index]
172
+ text_1_logits = text_1_logits[index, index]
173
+ text_logits = torch.stack([text_0_logits, text_1_logits], dim=-1)
174
+ text_0_labels = torch.zeros(text_logits.shape[0], device=device, dtype=torch.long)
175
+ text_1_labels = text_0_labels + 1
176
+ text_0_loss = torch.nn.functional.cross_entropy(text_logits, text_0_labels, reduction="none")
177
+ text_1_loss = torch.nn.functional.cross_entropy(text_logits, text_1_labels, reduction="none")
178
+
179
+ # if we have a tie we want the logits of for each image to be equal
180
+ text_loss = label_0 * text_0_loss + label_1 * text_1_loss
181
+ # we want the ideal loss to be 0, currently, if there is a tie, it is 0.5 * log(0.5) + 0.5 * log(0.5)
182
+ # so we add log(0.5) to the loss
183
+ is_tie = (label_0 == label_1).float()
184
+ is_tie *= torch.log(torch.tensor(0.5, device=device))
185
+ text_loss += is_tie
186
+
187
+ # we average the image and text loss
188
+ if self.cfg.in_batch_negatives:
189
+ loss = (image_loss + text_loss) / 2
190
+ else:
191
+ loss = text_loss
192
+ # import pdb; pdb.set_trace()
193
+
194
+ # some prompts have lots of interactions, we want weight them accordingly
195
+ # absolute_example_weight = 1 / num_examples_per_prompt
196
+ # denominator = absolute_example_weight.sum()
197
+ # weight_per_example = absolute_example_weight / denominator
198
+ # loss *= weight_per_example
199
+ loss = loss.mean()
200
+ # import pdb; pdb.set_trace()
201
+
202
+ # loss = loss.sum()
203
+ return loss
204
+
205
+ def forward(self, model, batch):
206
+ # import pdb; pdb.set_trace()
207
+ image_0_features, image_1_features, text_features = self.get_features(
208
+ model,
209
+ batch[self.cfg.input_ids_column_name],
210
+ batch[self.cfg.pixels_0_column_name],
211
+ batch[self.cfg.pixels_1_column_name]
212
+ )
213
+ # print("text_features:", text_features.shape)
214
+
215
+ loss = self.calc_loss(
216
+ text_features,
217
+ image_0_features,
218
+ image_1_features,
219
+ model.logit_scale.exp(),
220
+ batch[self.cfg.label_0_column_name],
221
+ batch[self.cfg.label_1_column_name],
222
+ batch[self.cfg.num_examples_per_prompt_column_name],
223
+ )
224
+ return loss
225
+
226
+
227
+ # ====== Data preparation ======
228
+ class QwenSD3JsonDataset(Dataset):
229
+ def __init__(self, processor, json_file, qwen_dir, sd3_dir):
230
+ """
231
+ json_file: prompt2img.json {prompt: filename}
232
+ qwen_dir: directory containing the Qwen images
233
+ sd3_dir: directory containing the SD3 images
234
+ """
235
+ self.processor = processor
236
+
237
+ with open(json_file, "r") as f:
238
+ self.prompt2img = json.load(f)
239
+
240
+ self.prompts = list(self.prompt2img.keys())
241
+ self.qwen_dir = qwen_dir
242
+ self.sd3_dir = sd3_dir
243
+
244
+ def __len__(self):
245
+ return len(self.prompts)
246
+
247
+ def __getitem__(self, idx):
248
+ prompt = self.prompts[idx]
249
+ filename = self.prompt2img[prompt]
250
+
251
+ qwen_img_path = os.path.join(self.qwen_dir, filename)
252
+ sd3_img_path = os.path.join(self.sd3_dir, filename)
253
+
254
+ if os.path.exists(qwen_img_path) and os.path.exists(sd3_img_path):
255
+ qwen_img = Image.open(qwen_img_path).convert("RGB")
256
+ sd3_img = Image.open(sd3_img_path).convert("RGB")
257
+ else:
258
+ qwen_img = Image.open(sd3_img_path).convert("RGB")
259
+ sd3_img = Image.open(sd3_img_path).convert("RGB")
260
+
261
+ # Text tokens
262
+ text_inputs = self.processor.tokenizer(
263
+ prompt,
264
+ padding="max_length",
265
+ truncation=True,
266
+ max_length=77,
267
+ return_tensors="pt"
268
+ )
269
+ input_ids = text_inputs["input_ids"].squeeze(0)
270
+
271
+ # Image preprocessing
272
+ pixels_0 = self.processor(images=qwen_img, return_tensors="pt")["pixel_values"].squeeze(0)
273
+ pixels_1 = self.processor(images=sd3_img, return_tensors="pt")["pixel_values"].squeeze(0)
274
+
275
+ return {
276
+ "input_ids": input_ids,
277
+ "pixels_0": pixels_0, # positive sample (Qwen)
278
+ "pixels_1": pixels_1, # negative sample (SD3)
279
+ "label_0": torch.tensor(1.0),
280
+ "label_1": torch.tensor(0.0),
281
+ "num_examples_per_prompt": torch.tensor(1.0)
282
+ }
283
+
284
+
285
+ # ====== Training loop ======
286
+ # def finetune_pickscore(json_file, qwen_dir, sd3_dir, epochs=2, batch_size=4, lr=1e-6, device="cuda"):
287
+ # processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
288
+ # model = CLIPModel.from_pretrained("yuvalkirstain/PickScore_v1").to(device)
289
+
290
+ # dataset = QwenSD3JsonDataset(processor,json_file, qwen_dir, sd3_dir)
291
+ # dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
292
+
293
+ # criterion = CLIPCriterion(CLIPCriterionConfig())
294
+ # optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
295
+ # # import pdb; pdb.set_trace()
296
+
297
+ # model.train()
298
+ # for epoch in range(epochs):
299
+ # total_loss = 0.0
300
+ # for batch in dataloader:
301
+ # batch = {k: v.to(device) for k, v in batch.items()}
302
+ # loss = criterion(model, batch)
303
+
304
+ # optimizer.zero_grad()
305
+ # loss.backward()
306
+ # optimizer.step()
307
+
308
+ # total_loss += loss.item()
309
+ # print(f"Epoch {epoch} | Loss {total_loss/len(dataloader):.4f}")
310
+
311
+ # model.save_pretrained("pickscore_qwen_finetuned")
312
+ # return model
313
+
314
+ def finetune_pickscore_distributed(json_file, qwen_dir, sd3_dir, epochs=2, batch_size=4, lr=1e-6):
315
+ # 1. Initialize the distributed process group
316
+ dist.init_process_group(backend="nccl")
317
+ local_rank = int(os.environ["LOCAL_RANK"])
318
+ torch.cuda.set_device(local_rank)
319
+ device = torch.device("cuda", local_rank)
320
+
321
+ # 2. Prepare the data
322
+ processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
323
+ dataset = QwenSD3JsonDataset(processor, json_file, qwen_dir, sd3_dir)
324
+ sampler = DistributedSampler(dataset)
325
+ dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
326
+
327
+ # 3. Model + DDP
328
+ model = CLIPModel.from_pretrained("yuvalkirstain/PickScore_v1").to(device)
329
+ model = DDP(model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True)
330
+
331
+ criterion = CLIPCriterion(CLIPCriterionConfig())
332
+ optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
333
+
334
+ # 4. Train
335
+ model.train()
336
+ if dist.get_rank() == 0:
337
+ evaluate_pickscore(model, processor, json_file, qwen_dir, sd3_dir, device)
338
+ for epoch in range(epochs):
339
+ sampler.set_epoch(epoch) # keep the shuffle consistent across ranks for each epoch
340
+ total_loss = 0.0
341
+
342
+ for step, batch in enumerate(dataloader):
343
+ batch = {k: v.to(device) for k, v in batch.items()}
344
+ loss = criterion(model.module, batch)
345
+
346
+ optimizer.zero_grad()
347
+ loss.backward()
348
+ optimizer.step()
349
+
350
+ # accumulate the loss (locally first)
351
+ total_loss += loss.item()
352
+
353
+ # print every few steps (rank 0 only)
354
+ if step % 50 == 0: # can be changed to 10 or 100
355
+ # all_reduce averages the loss across all GPUs
356
+ avg_loss = torch.tensor(loss.item(), device=device)
357
+ dist.all_reduce(avg_loss, op=dist.ReduceOp.AVG)
358
+ if dist.get_rank() == 0:
359
+ print(f"[Epoch {epoch} | Step {step}/{len(dataloader)}] "
360
+ f"local_loss={loss.item():.4f} | avg_loss={avg_loss.item():.4f}")
361
+
362
+ # print the average loss for each epoch
363
+ epoch_loss = torch.tensor(total_loss / len(dataloader), device=device)
364
+ dist.all_reduce(epoch_loss, op=dist.ReduceOp.AVG)
365
+ if dist.get_rank() == 0:
366
+ print(f"===> Epoch {epoch} done | avg_epoch_loss={epoch_loss.item():.4f}")
367
+ evaluate_pickscore(model, processor, json_file, qwen_dir, sd3_dir, device)
368
+
369
+ # 5. Save the model (rank 0 only)
370
+ if dist.get_rank() == 0:
371
+ model.module.save_pretrained("pickscore_qwen_finetuned")
372
+
373
+ dist.destroy_process_group()
374
+
375
+
376
+ # ====== Usage example ======
377
+ if __name__ == "__main__":
378
+ finetune_pickscore_distributed(
379
+ json_file="/mnt/bn/vgfm2/test_dit/weijia/outputs/sd3_images/prompt2img.json",
380
+ qwen_dir="/mnt/bn/vgfm2/test_dit/weijia/outputs/qwen_images",
381
+ sd3_dir="/mnt/bn/vgfm2/test_dit/weijia/outputs/sd3_images",
382
+ epochs=2,
383
+ batch_size=4,
384
+ lr=1e-6,
385
+ )
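A minimal, self-contained sketch of the pairwise branch of `calc_loss` above (`in_batch_negatives=False`), using toy logits instead of CLIP features to show how the tie correction drives the ideal loss to zero:

import torch
import torch.nn.functional as F

# toy text-to-image logits for two prompts: a clear preference for image_0, then a tie
text_logits = torch.tensor([[3.0, 1.0], [2.0, 2.0]])
label_0 = torch.tensor([1.0, 0.5])
label_1 = torch.tensor([0.0, 0.5])

loss_0 = F.cross_entropy(text_logits, torch.zeros(2, dtype=torch.long), reduction="none")
loss_1 = F.cross_entropy(text_logits, torch.ones(2, dtype=torch.long), reduction="none")
loss = label_0 * loss_0 + label_1 * loss_1
loss += (label_0 == label_1).float() * torch.log(torch.tensor(0.5))  # tie correction
print(loss)  # ~[0.13, 0.00]: the tied pair contributes no loss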
adv_grpo/pickscore_scorer.py ADDED
@@ -0,0 +1,70 @@
1
+ from transformers import CLIPProcessor, CLIPModel
2
+ from PIL import Image
3
+ import torch
4
+
5
+ class PickScoreScorer(torch.nn.Module):
6
+ def __init__(self, device="cuda", dtype=torch.float32):
7
+ super().__init__()
8
+ processor_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
9
+ model_path = "yuvalkirstain/PickScore_v1"
10
+ self.device = device
11
+ self.dtype = dtype
12
+ self.processor = CLIPProcessor.from_pretrained(processor_path)
13
+ self.model = CLIPModel.from_pretrained(model_path).eval().to(device)
14
+ self.model = self.model.to(dtype=dtype)
15
+
16
+ @torch.no_grad()
17
+ def __call__(self, prompt, images):
18
+ # Preprocess images
19
+ if hasattr(self.model, "module"):
20
+ self.model = self.model.module
21
+ image_inputs = self.processor(
22
+ images=images,
23
+ padding=True,
24
+ truncation=True,
25
+ max_length=77,
26
+ return_tensors="pt",
27
+ )
28
+ image_inputs = {k: v.to(device=self.device) for k, v in image_inputs.items()}
29
+ # Preprocess text
30
+ text_inputs = self.processor(
31
+ text=prompt,
32
+ padding=True,
33
+ truncation=True,
34
+ max_length=77,
35
+ return_tensors="pt",
36
+ )
37
+ text_inputs = {k: v.to(device=self.device) for k, v in text_inputs.items()}
38
+
39
+ # Get embeddings
40
+ image_embs = self.model.get_image_features(**image_inputs)
41
+ image_embs = image_embs / image_embs.norm(p=2, dim=-1, keepdim=True)
42
+
43
+ text_embs = self.model.get_text_features(**text_inputs)
44
+ text_embs = text_embs / text_embs.norm(p=2, dim=-1, keepdim=True)
45
+
46
+ # Calculate scores
47
+ logit_scale = self.model.logit_scale.exp()
48
+ scores = logit_scale * (text_embs @ image_embs.T)
49
+ scores = scores.diag()
50
+ # norm to 0-1
51
+ scores = scores/26
52
+ return scores
53
+
54
+ # Usage example
55
+ def main():
56
+ scorer = PickScoreScorer(
57
+ device="cuda",
58
+ dtype=torch.float32
59
+ )
60
+ images=[
61
+ "nasa.jpg",
62
+ ]
63
+ pil_images = [Image.open(img) for img in images]
64
+ prompts=[
65
+ 'An astronaut’s glove floating in zero-g with "NASA 2049" on the wrist',
66
+ ]
67
+ print(scorer(prompts, pil_images))
68
+
69
+ if __name__ == "__main__":
70
+ main()
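A small, hedged illustration of the `scores.diag()` step in `PickScoreScorer.__call__` above: with N prompts and N images, the diagonal of the N x N similarity matrix pairs prompt i with image i (toy embeddings stand in for real CLIP features):

import torch
import torch.nn.functional as F

text_embs = F.normalize(torch.randn(3, 8), dim=-1)
image_embs = F.normalize(torch.randn(3, 8), dim=-1)
logit_scale = torch.tensor(100.0)                   # stand-in for model.logit_scale.exp()
scores = logit_scale * (text_embs @ image_embs.T)   # (3, 3): every prompt against every image
paired = scores.diag()                              # prompt i scored against image i only
print(paired.shape)                                 # torch.Size([3])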
adv_grpo/pickscore_scorer_constractive.py ADDED
@@ -0,0 +1,89 @@
1
+ from transformers import CLIPProcessor, CLIPModel
2
+ from PIL import Image
3
+ import torch
4
+
5
+ class PickScoreScorerConstractive(torch.nn.Module):
6
+ def __init__(self, device="cuda", dtype=torch.float32):
7
+ super().__init__()
8
+ processor_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
9
+ model_path = "yuvalkirstain/PickScore_v1"
10
+ self.device = device
11
+ self.dtype = dtype
12
+ self.processor = CLIPProcessor.from_pretrained(processor_path)
13
+ self.model = CLIPModel.from_pretrained(model_path).eval().to(device)
14
+ self.model = self.model.to(dtype=dtype)
15
+
16
+ @torch.no_grad()
17
+ def __call__(self, prompt, images, ref_images):
18
+ # Preprocess images
19
+ image_inputs = self.processor(
20
+ images=images,
21
+ padding=True,
22
+ truncation=True,
23
+ max_length=77,
24
+ return_tensors="pt",
25
+ )
26
+ image_inputs = {k: v.to(device=self.device) for k, v in image_inputs.items()}
27
+
28
+ ref_image_inputs = self.processor(
29
+ images=ref_images,
30
+ padding=True,
31
+ truncation=True,
32
+ max_length=77,
33
+ return_tensors="pt",
34
+ )
35
+ ref_image_inputs = {k: v.to(device=self.device) for k, v in ref_image_inputs.items()}
36
+
37
+
38
+
39
+ # Preprocess text
40
+ text_inputs = self.processor(
41
+ text=prompt,
42
+ padding=True,
43
+ truncation=True,
44
+ max_length=77,
45
+ return_tensors="pt",
46
+ )
47
+ text_inputs = {k: v.to(device=self.device) for k, v in text_inputs.items()}
48
+
49
+ # Get embeddings
50
+ image_embs = self.model.get_image_features(**image_inputs)
51
+ image_embs = image_embs / image_embs.norm(p=2, dim=-1, keepdim=True)
52
+
53
+ ref_image_embs = self.model.get_image_features(**ref_image_inputs)
54
+ ref_image_embs = ref_image_embs / ref_image_embs.norm(p=2, dim=-1, keepdim=True)
55
+
56
+ text_embs = self.model.get_text_features(**text_inputs)
57
+ text_embs = text_embs / text_embs.norm(p=2, dim=-1, keepdim=True)
58
+
59
+ # Calculate scores
60
+ logit_scale = self.model.logit_scale.exp()
61
+ scores = logit_scale * (text_embs @ image_embs.T)
62
+ scores = scores.diag()
63
+ # norm to 0-1
64
+ scores = scores/26
65
+
66
+ ref_scores = logit_scale * (text_embs @ ref_image_embs.T)
67
+ ref_scores = ref_scores.diag()
68
+ ref_scores = ref_scores/26
69
+
70
+
71
+ return scores, ref_scores, image_embs, ref_image_embs
72
+
73
+ # Usage example
74
+ def main():
75
+ scorer = PickScoreScorerConstractive(
76
+ device="cuda",
77
+ dtype=torch.float32
78
+ )
79
+ images=[
80
+ "nasa.jpg",
81
+ ]
82
+ pil_images = [Image.open(img) for img in images]
83
+ prompts=[
84
+ 'An astronaut’s glove floating in zero-g with "NASA 2049" on the wrist',
85
+ ]
86
+ print(scorer(prompts, pil_images, pil_images)) # the contrastive scorer also needs reference images; reuse the same ones here
87
+
88
+ if __name__ == "__main__":
89
+ main()
adv_grpo/pickscore_scorer_patch.py ADDED
@@ -0,0 +1,78 @@
1
+ from transformers import CLIPProcessor, CLIPModel
2
+ from PIL import Image
3
+ import torch
4
+
5
+ class PickScoreScorer(torch.nn.Module):
6
+ def __init__(self, device="cuda", dtype=torch.float32):
7
+ super().__init__()
8
+ processor_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
9
+ model_path = "yuvalkirstain/PickScore_v1"
10
+ self.device = device
11
+ self.dtype = dtype
12
+ self.processor = CLIPProcessor.from_pretrained(processor_path)
13
+ self.model = CLIPModel.from_pretrained(model_path).eval().to(device)
14
+ self.model = self.model.to(dtype=dtype)
15
+
16
+ @torch.no_grad()
17
+ def __call__(self, prompt, images):
18
+ # Preprocess images
19
+ if hasattr(self.model, "module"):
20
+ self.model = self.model.module
21
+ image_inputs = self.processor(
22
+ images=images,
23
+ padding=True,
24
+ truncation=True,
25
+ max_length=77,
26
+ return_tensors="pt",
27
+ )
28
+ image_inputs = {k: v.to(device=self.device) for k, v in image_inputs.items()}
29
+ # Preprocess text
30
+ text_inputs = self.processor(
31
+ text=prompt,
32
+ padding=True,
33
+ truncation=True,
34
+ max_length=77,
35
+ return_tensors="pt",
36
+ )
37
+ text_inputs = {k: v.to(device=self.device) for k, v in text_inputs.items()}
38
+
39
+ # Get embeddings
40
+ # image_embs = self.model.get_image_features(**image_inputs)
41
+ # import pdb; pdb.set_trace()
42
+ image_embs = self.model.vision_model(image_inputs["pixel_values"],output_hidden_states=True)
43
+ image_embs = image_embs.last_hidden_state
44
+
45
+ image_embs = self.model.visual_projection(image_embs) # [B, N, 1024]
46
+ image_embs = image_embs / image_embs.norm(p=2, dim=-1, keepdim=True)
47
+
48
+ text_embs = self.model.get_text_features(**text_inputs)
49
+ text_embs = text_embs / text_embs.norm(p=2, dim=-1, keepdim=True)
50
+
51
+ # Calculate scores
52
+ logit_scale = self.model.logit_scale.exp()
53
+ # scores = logit_scale * (text_embs @ image_embs.T)
54
+ patch_scores = torch.einsum("bd,bnd->bn", text_embs, image_embs) # [B, N]
55
+ scores = logit_scale * patch_scores.mean(dim=1) # average over all patches
56
+ # scores = scores.diag()
57
+ # norm to 0-1
58
+ scores = scores/26
59
+ # import pdb; pdb.set_trace()
60
+ return scores
61
+
62
+ # Usage example
63
+ def main():
64
+ scorer = PickScoreScorer(
65
+ device="cuda",
66
+ dtype=torch.float32
67
+ )
68
+ images=[
69
+ "nasa.jpg",
70
+ ]
71
+ pil_images = [Image.open(img) for img in images]
72
+ prompts=[
73
+ 'An astronaut’s glove floating in zero-g with "NASA 2049" on the wrist',
74
+ ]
75
+ print(scorer(prompts, pil_images))
76
+
77
+ if __name__ == "__main__":
78
+ main()