Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +6 -0
- Tipsomaly/model/big_vision/configs/proj/givt/givt_overview.png +3 -0
- Tipsomaly/model/big_vision/configs/proj/jetformer/jetformer_overview.png +3 -0
- Tipsomaly/model/big_vision/configs/proj/paligemma/paligemma.png +3 -0
- Tipsomaly/model/big_vision/datasets/countbenchqa/data/countbench_paired_questions.json +1 -0
- Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-311.pyc +3 -0
- Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-312.pyc +3 -0
- Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-39.pyc +3 -0
- Tipsomaly/model/big_vision/datasets/nocaps/nocaps.py +160 -0
- Tipsomaly/model/big_vision/datasets/refcoco/refcoco.py +448 -0
- Tipsomaly/model/big_vision/datasets/rsvqa_hr/rsvqa_hr.py +193 -0
- Tipsomaly/model/big_vision/datasets/rsvqa_lr/rsvqa_lr.py +198 -0
- Tipsomaly/model/big_vision/datasets/scicap/scicap.py +205 -0
- Tipsomaly/model/big_vision/datasets/science_qa/science_qa.py +156 -0
- Tipsomaly/model/big_vision/datasets/screen2words/screen2words.py +120 -0
- Tipsomaly/model/big_vision/datasets/stvqa/stvqa.py +134 -0
- Tipsomaly/model/big_vision/datasets/tallyqa/tallyqa.py +146 -0
- Tipsomaly/model/big_vision/datasets/textcaps/textcaps.py +152 -0
- Tipsomaly/model/big_vision/datasets/textvqa/textvqa.py +186 -0
- Tipsomaly/model/big_vision/datasets/vizwizvqa/vizwizvqa.py +128 -0
- Tipsomaly/model/big_vision/datasets/vqa/vqa.py +147 -0
- Tipsomaly/model/big_vision/datasets/widgetcap/widgetcap.py +151 -0
- Tipsomaly/model/big_vision/datasets/xgqa/xgqa.py +145 -0
- Tipsomaly/model/big_vision/datasets/xm3600/xm3600.py +136 -0
- Tipsomaly/model/big_vision/evaluators/proj/cappa/perplexity.py +50 -0
- Tipsomaly/model/big_vision/evaluators/proj/cappa/scoring_classifier.py +63 -0
- Tipsomaly/model/big_vision/evaluators/proj/distill/distance.py +151 -0
- Tipsomaly/model/big_vision/evaluators/proj/givt/coco_panoptic.py +401 -0
- Tipsomaly/model/big_vision/evaluators/proj/givt/nyu_depth.py +191 -0
- Tipsomaly/model/big_vision/evaluators/proj/givt/save_predictions.py +118 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/contrastive.py +99 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/discriminative_classifier.py +440 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/discriminative_classifier_test.py +237 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/image_text_retrieval.py +85 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/image_text_retrieval_test.py +86 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/prompt_engineering.py +112 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/prompt_engineering_constants.py +110 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/prompt_engineering_test.py +48 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/retrieval.py +306 -0
- Tipsomaly/model/big_vision/evaluators/proj/image_text/retrieval_test.py +178 -0
- Tipsomaly/model/big_vision/evaluators/proj/paligemma/perplexity.py +63 -0
- Tipsomaly/model/big_vision/evaluators/proj/paligemma/transfers/chartqa.py +139 -0
- Tipsomaly/model/big_vision/evaluators/proj/paligemma/transfers/pope.py +135 -0
- Tipsomaly/model/big_vision/evaluators/proj/uvim/coco_panoptic.py +324 -0
- Tipsomaly/model/big_vision/evaluators/proj/uvim/coltran_fid.py +242 -0
- Tipsomaly/model/big_vision/evaluators/proj/uvim/coltran_fid_data/eval_file_names.txt +0 -0
- Tipsomaly/model/big_vision/evaluators/proj/uvim/coltran_fid_data/reference_file_names.txt +0 -0
- Tipsomaly/model/big_vision/evaluators/proj/uvim/common.py +64 -0
- Tipsomaly/model/big_vision/evaluators/proj/uvim/compute_mean.py +79 -0
- Tipsomaly/model/big_vision/evaluators/proj/uvim/nyu_depth.py +154 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
Tipsomaly/imgs/Models_Architecture_page-0001.jpg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
Tipsomaly/imgs/results-table.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
Tipsomaly/imgs/Qualitative_results_page-0001.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
Tipsomaly/imgs/Models_Architecture_page-0001.jpg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
Tipsomaly/imgs/results-table.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
Tipsomaly/imgs/Qualitative_results_page-0001.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-39.pyc filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
Tipsomaly/model/big_vision/configs/proj/givt/givt_overview.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
Tipsomaly/model/big_vision/configs/proj/jetformer/jetformer_overview.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
Tipsomaly/model/big_vision/configs/proj/paligemma/paligemma.png filter=lfs diff=lfs merge=lfs -text
|
Tipsomaly/model/big_vision/configs/proj/givt/givt_overview.png
ADDED
|
Git LFS Details
|
Tipsomaly/model/big_vision/configs/proj/jetformer/jetformer_overview.png
ADDED
|
Git LFS Details
|
Tipsomaly/model/big_vision/configs/proj/paligemma/paligemma.png
ADDED
|
Git LFS Details
|
Tipsomaly/model/big_vision/datasets/countbenchqa/data/countbench_paired_questions.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[{"question": "How many headsets are there in the image?"}, {"question": "How many light bulbs are there in the image?"}, {"question": "How many prints are there in the image?"}, {"question": "How many arrows are there in the image?"}, {"question": "How many spoons are there in the image?"}, {"question": "How many girls are there in the image?"}, {"question": "How many parrots are there in the image?"}, {"question": "How many coloring pages are there in the image?"}, {"question": "How many food containers are there in the image?"}, {"question": "How many birdhouse patterns are there in the image?"}, {"question": "How many sofas are there in the image?"}, {"question": "How many waterlilies are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many golfers are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many outfits are there in the image?"}, {"question": "How many pigs are there in the image?"}, {"question": "How many cars are there in the image?"}, {"question": "How many aum symbols are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many buttons are there in the image?"}, {"question": "How many rackets are there in the image?"}, {"question": "How many pots are there in the image?"}, {"question": "How many stars are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many silhouettes are there in the image?"}, {"question": "How many kids are there in the image?"}, {"question": "How many moth silhouettes are there in the image?"}, {"question": "How many pumpkin candles are there in the image?"}, {"question": "How many essential oils are there in the image?"}, {"question": "How many stencils are there in the 
image?"}, {"question": "How many text boxes are there in the image?"}, {"question": "How many basketball players are there in the image?"}, {"question": "How many photos are there in the image?"}, {"question": "How many forks are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many bags are there in the image?"}, {"question": "How many couples are there in the image?"}, {"question": "How many weights are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many fish are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many clocks are there in the image?"}, {"question": "How many banners are there in the image?"}, {"question": "How many chests are there in the image?"}, {"question": "How many stars are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many globe icons are there in the image?"}, {"question": "How many posters are there in the image?"}, {"question": "How many socks are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many snails are there in the image?"}, {"question": "How many crochet potholders are there in the image?"}, {"question": "How many christmas cards are there in the image?"}, {"question": "How many double beds are there in the image?"}, {"question": "How many baseball players are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many baseball players are there in the image?"}, {"question": "How many cars are there in the image?"}, {"question": "How many trees are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many tree trunk cuts are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many chairs are there in the 
image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many banners are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many individual earrings are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many tomatoes are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many prints are there in the image?"}, {"question": "How many ice creams are there in the image?"}, {"question": "How many plates are there in the image?"}, {"question": "How many sumo wrestlers are there in the image?"}, {"question": "How many compositions are there in the image?"}, {"question": "How many PVC vinyls are there in the image?"}, {"question": "How many photos of fruit are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many mirrors are there in the image?"}, {"question": "How many groomsmen are there in the image?"}, {"question": "How many posters are there in the image?"}, {"question": "How many pumpkins are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many kittens are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many boots are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many figurines are there in the image?"}, {"question": "How many moais are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many cats are there in the image?"}, {"question": "How many wallpaper variants are there in the image?"}, {"question": "How many nail polishes are there in the image?"}, {"question": "How many bumble bees are there in the image?"}, {"question": "How many tickets are 
there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many wine bottles are there in the image?"}, {"question": "How many silhouettes of couples are there in the image?"}, {"question": "How many owls are there in the image?"}, {"question": "How many candles are there in the image?"}, {"question": "How many flower pots are there in the image?"}, {"question": "How many coins are there in the image?"}, {"question": "How many placemats are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many cars are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many candles are there in the image?"}, {"question": "How many pairs of socks are there in the image?"}, {"question": "How many blinds are there in the image?"}, {"question": "How many floral patterns are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many bicycles are there in the image?"}, {"question": "How many dwarfs are there in the image?"}, {"question": "How many stickers are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many stamps are there in the image?"}, {"question": "How many pumpkins are there in the image?"}, {"question": "How many game cartridges are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many glasses are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many coins are there in the image?"}, {"question": "How many gears are there in the image?"}, {"question": "How many flowers are there in the image?"}, {"question": "How many crip packages are there in the image?"}, {"question": "How many bridesmaids are there in the image?"}, {"question": "How many 
apples are there in the image?"}, {"question": "How many bowls are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many photographs are there in the image?"}, {"question": "How many warning signs are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many cyclists are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many giraffes are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many male nurses are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many couples are there in the image?"}, {"question": "How many cars are there in the image?"}, {"question": "How many apples are there in the image?"}, {"question": "How many smartphones are there in the image?"}, {"question": "How many roses are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many greeting cards are there in the image?"}, {"question": "How many guitars are there in the image?"}, {"question": "How many ironman suits are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many CDs are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many pot holders are there in the image?"}, {"question": "How many stamps are there in the image?"}, {"question": "How many bookmarks are there in the image?"}, {"question": "How many portraits are there in the image?"}, {"question": "How many girls are there in the image?"}, {"question": "How many labels are there in the image?"}, {"question": "How many mandalas are there in the image?"}, {"question": "How many silhouettes are 
there in the image?"}, {"question": "How many peacocks are there in the image?"}, {"question": "How many roses are there in the image?"}, {"question": "How many cars are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many canvases are there in the image?"}, {"question": "How many cards are there in the image?"}, {"question": "How many bell pepper halves are there in the image?"}, {"question": "How many pigs are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many newspapers are there in the image?"}, {"question": "How many leggings are there in the image?"}, {"question": "How many medals are there in the image?"}, {"question": "How many patterns are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many doors are there in the image?"}, {"question": "How many pairs of socks are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many backgrounds are there in the image?"}, {"question": "How many images of dogs are there in the image?"}, {"question": "How many broadheads are there in the image?"}, {"question": "How many figurines are there in the image?"}, {"question": "How many candles are there in the image?"}, {"question": "How many petals does each flower have in this image?"}, {"question": "How many crayons are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many caps are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many frames are there in the image?"}, {"question": "How many animals are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many cats are there in the image?"}, {"question": "How many sconces are 
there in the image?"}, {"question": "How many spoons are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many planes are there in the image?"}, {"question": "How many cats are there in the image?"}, {"question": "How many sketches are there in the image?"}, {"question": "How many trees are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many figurines are there in the image?"}, {"question": "How many napkins are there in the image?"}, {"question": "How many dogs are there in the image?"}, {"question": "How many kids are there in the image?"}, {"question": "How many hearts are there in the image?"}, {"question": "How many apples are there in the image?"}, {"question": "How many post-its are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many books are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many sofas are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many girls are there in the image?"}, {"question": "How many sketches are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many moons are there in the image?"}, {"question": "How many labels are there in the image?"}, {"question": "How many cylinders are there in the image?"}, {"question": "How many silhouettes of couples are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many dogs are there in the image?"}, {"question": "How many people on stage are there in the image?"}, {"question": "How many lambs are there in the image?"}, {"question": "How many violins are there in the image?"}, {"question": "How many 
armchairs are there in the image?"}, {"question": "How many silhouettes are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many photos are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many figurines are there in the image?"}, {"question": "How many moais are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many glasses are there in the image?"}, {"question": "How many sinks are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many buildings are there in the image?"}, {"question": "How many flyers are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many pizza slices are there in the image?"}, {"question": "How many stamps are there in the image?"}, {"question": "How many stickers are there in the image?"}, {"question": "How many gifts are there in the image?"}, {"question": "How many bowls are there in the image?"}, {"question": "How many onesies are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many posters are there in the image?"}, {"question": "How many cards are there in the image?"}, {"question": "How many portraits are there in the image?"}, {"question": "How many poinsettias are there in the image?"}, {"question": "How many chicken thighs are there in the image?"}, {"question": "How many glass windows are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many baskets are there in the image?"}, {"question": "How many tulips are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many stuffed animals are there in the image?"}, {"question": "How many keychains are there in the image?"}, {"question": "How many photos are there in the image?"}, {"question": "How many people 
are there in the image?"}, {"question": "How many adults are there in the image?"}, {"question": "How many tulips are there in the image?"}, {"question": "How many frames are there in the image?"}, {"question": "How many samosas are there in the image?"}, {"question": "How many strawberries are there in the image?"}, {"question": "How many cocktails are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many wineglasses are there in the image?"}, {"question": "How many goblets are there in the image?"}, {"question": "How many prints are there in the image?"}, {"question": "How many flowers are there in the image?"}, {"question": "How many zebras are there in the image?"}, {"question": "How many paint brushes are there in the image?"}, {"question": "How many prints are there in the image?"}, {"question": "How many figurines are there in the image?"}, {"question": "How many glasses are there in the image?"}, {"question": "How many sunglasses are there in the image?"}, {"question": "How many banners are there in the image?"}, {"question": "How many bottle caps are there in the image?"}, {"question": "How many prints are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many books are there in the image?"}, {"question": "How many animals are there in the image?"}, {"question": "How many eyeshadows are there in the image?"}, {"question": "How many keychains are there in the image?"}, {"question": "How many pairs of earrings are there in the image?"}, {"question": "How many canisters are there in the image?"}, {"question": "How many bags are there in the image?"}, {"question": "How many baking trays are there in the image?"}, {"question": "How many diamonds are there in the image?"}, {"question": "How many portraits are there in the image?"}, {"question": "How many 
framed images are there in the image?"}, {"question": "How many flags are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many framed pictures are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many croissants are there in the image?"}, {"question": "How many Manikins are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many labels are there in the image?"}, {"question": "How many people in the foreground are there in the image?"}, {"question": "How many armchairs are there in the image?"}, {"question": "How many cups are there in the image?"}, {"question": "How many helicopters are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many buttons are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many Cookies are there in the image?"}, {"question": "How many sculptures are there in the image?"}, {"question": "How many school uniforms are there in the image?"}, {"question": "How many sculptures are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many packages are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many stories does this cottage have?"}, {"question": "How many gift cards are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many beers are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many planes are there in the image?"}, {"question": "How many cactus pots are there in the image?"}, {"question": "How many smartphones are there in the image?"}, {"question": "How many picture frames are there in the 
image?"}, {"question": "How many elephants are there in the image?"}, {"question": "How many guitars are there in the image?"}, {"question": "How many samurai are there in the image?"}, {"question": "How many ghosts are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many candles are there in the image?"}, {"question": "How many vases are there in the image?"}, {"question": "How many sets of headphones are there in the image?"}, {"question": "How many pandas are there in the image?"}, {"question": "How many books are there in the image?"}, {"question": "How many stickers are there in the image?"}, {"question": "How many rings are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many dragon balls are there in the image?"}, {"question": "How many tanks are there in the image?"}, {"question": "How many students are there in the image?"}, {"question": "How many cups are there in the image?"}, {"question": "How many cubs are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many cushions are there in the image?"}, {"question": "How many shoes are there in the image?"}, {"question": "How many beers are there in the image?"}, {"question": "How many wine glasses are there in the image?"}, {"question": "How many cards are there in the image?"}, {"question": "How many boots are there in the image?"}, {"question": "How many stickers are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many butterflies are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many butterflies are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many hexagons are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How 
many football players are there in the image?"}, {"question": "How many gifts are there in the image?"}, {"question": "How many light bulbs are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many candles are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many silhouettes are there in the image?"}, {"question": "How many pots are there in the image?"}, {"question": "How many pumpkins are there in the image?"}, {"question": "How many owls are there in the image?"}, {"question": "How many doctors are there in the image?"}, {"question": "How many pigs are there in the image?"}, {"question": "How many pillars are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many pumpkins are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many roses are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many illustrations are there in the image?"}, {"question": "How many photos are there in the image?"}, {"question": "How many butterflies are there in the image?"}, {"question": "How many spoons are there in the image?"}, {"question": "How many potato spreads are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many wall arts are there in the image?"}, {"question": "How many covers are there in the image?"}, {"question": "How many people are there 
in the image?"}, {"question": "How many soldiers are there in the image?"}, {"question": "How many posters are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many patterns are there in the image?"}, {"question": "How many dogs are there in the image?"}, {"question": "How many pennies are there in the image?"}, {"question": "How many windows are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many geese are there in the image?"}, {"question": "How many tickets are there in the image?"}, {"question": "How many figurines are there in the image?"}, {"question": "How many cases are there in the image?"}, {"question": "How many people are in the foreground of this image?"}, {"question": "How many glasses are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many cups are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many kittens are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many scones are there in the image?"}, {"question": "How many schoolgirls are there in the image?"}, {"question": "How many padlocks are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many windows are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many banners are there in the image?"}, {"question": "How many dogs are there in the image?"}, {"question": "How many groomsmen are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many glasses are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many coins are there in the image?"}, {"question": "How many trees are there in the image?"}, {"question": "How 
many turtles are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many frames are there in the image?"}, {"question": "How many firescreens are there in the image?"}, {"question": "How many bowls are there in the image?"}, {"question": "How many stickers are there in the image?"}, {"question": "How many archaic mirrors are there in the image?"}, {"question": "How many dogs are there in the image?"}, {"question": "How many candles are there in the image?"}, {"question": "How many paint brushes are there in the image?"}, {"question": "How many forks are there in the image?"}, {"question": "How many figurines are there in the image?"}, {"question": "How many owls are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many silhouettes are there in the image?"}, {"question": "How many fried eggs are there in the image?"}, {"question": "How many guitars are there in the image?"}, {"question": "How many signs are there in the image?"}, {"question": "How many watches are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many kittens are there in the image?"}, {"question": "How many kids are there in the image?"}, {"question": "How many kittens are there in the image?"}, {"question": "How many toys are there in the image?"}, {"question": "How many pairs of socks are there in the image?"}, {"question": "How many pairs of socks are there in the image?"}, {"question": "How many men are there in the image?"}, {"question": "How many stars are there in the image?"}, {"question": "How many quarters are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many diamonds are there in the image?"}, {"question": "How many moais are there in the image?"}, {"question": "How many banners are 
there in the image?"}, {"question": "How many silhouettes are there in the image?"}, {"question": "How many coins are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many banners are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many flags are there in the image?"}, {"question": "How many frames are there in the image?"}, {"question": "How many contestants are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many globes are there in the image?"}, {"question": "How many animals are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many buttons are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many piles of candy are there in the image?"}, {"question": "How many pictures are there in the image?"}, {"question": "How many plates are there in the image?"}, {"question": "How many calendars are there in the image?"}, {"question": "How many oranges are there in the image?"}, {"question": "How many puppies are there in the image?"}, {"question": "How many buffalos are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many christmas balls are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many armchairs are there in the image?"}, {"question": "How many frames are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many soldiers are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many cards are there in the image?"}, {"question": "How many colored tiles are there in the 
image?"}, {"question": "How many trapezoids are there in the image?"}, {"question": "How many pastries are there in the image?"}, {"question": "How many plants are there in the image?"}, {"question": "How many place mats are there in the image?"}, {"question": "How many symbols are there in the image?"}, {"question": "How many mazes are there in the image?"}, {"question": "How many tea bags are there in the image?"}, {"question": "How many photos are there in the image?"}, {"question": "How many children are there in the image?"}, {"question": "How many starfish are there in the image?"}, {"question": "How many mugs are there in the image?"}, {"question": "How many bottles are there in the image?"}, {"question": "How many eggs are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many balls are there in the image?"}, {"question": "How many paper bags are there in the image?"}, {"question": "How many garage doors are there in the image?"}, {"question": "How many silhouettes are there in the image?"}, {"question": "How many couples are there in the image?"}, {"question": "How many dice are there in the image?"}, {"question": "How many women are there in the image?"}, {"question": "How many icons are there in the image?"}, {"question": "How many wooden spoons are there in the image?"}, {"question": "How many people are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many chairs are there in the image?"}, {"question": "How many beds are there in the image?"}, {"question": "How many chairs are there in the image?"}]
|
Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60133d938ba72366456dec978d305ff978b143577b047fe46aee4b50b104eef6
|
| 3 |
+
size 272122
|
Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-312.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a565009d6d842a49ccfc7f31d9945d1916fa9669913290d3483a4b9d360acbd7
|
| 3 |
+
size 272065
|
Tipsomaly/model/big_vision/datasets/imagenet/__pycache__/class_names.cpython-39.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fba8a44d65d0f986bf5562ae0290a3cec1db9af239f9c0f91876099c173b760b
|
| 3 |
+
size 272046
|
Tipsomaly/model/big_vision/datasets/nocaps/nocaps.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements nocaps val/test set in TFDS structure.
|
| 17 |
+
|
| 18 |
+
It's small data, so simple to run locally. First, copy the data to local disk:
|
| 19 |
+
|
| 20 |
+
mkdir -p /tmp/data/nocaps_data
|
| 21 |
+
cd /tmp/data/nocaps_data
|
| 22 |
+
wget https://s3.amazonaws.com/open-images-dataset/tar/test.tar.gz
|
| 23 |
+
wget https://s3.amazonaws.com/open-images-dataset/tar/validation.tar.gz
|
| 24 |
+
curl -O https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
|
| 25 |
+
curl -O https://s3.amazonaws.com/nocaps/nocaps_test_image_info.json
|
| 26 |
+
|
| 27 |
+
mkdir -p /tmp/data/nocaps_data/Images
|
| 28 |
+
tar -xf validation.tar.gz -C Images
|
| 29 |
+
rm validation.tar.gz
|
| 30 |
+
tar -xf test.tar.gz -C Images
|
| 31 |
+
rm test.tar.gz
|
| 32 |
+
|
| 33 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 34 |
+
|
| 35 |
+
cd big_vision/datasets
|
| 36 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=nocaps
|
| 37 |
+
|
| 38 |
+
Example to load:
|
| 39 |
+
|
| 40 |
+
import tensorflow_datasets as tfds
|
| 41 |
+
dataset = tfds.load('nocaps', split='val', data_dir='/tmp/tfds')
|
| 42 |
+
"""
|
| 43 |
+
import collections
|
| 44 |
+
import json
|
| 45 |
+
import os
|
| 46 |
+
|
| 47 |
+
from absl import logging
|
| 48 |
+
import numpy as np
|
| 49 |
+
import tensorflow as tf
|
| 50 |
+
import tensorflow_datasets as tfds
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
_DESCRIPTION = """Nocaps dataset."""
|
| 54 |
+
|
| 55 |
+
_CITATION = (
|
| 56 |
+
'@inproceedings{agrawal2019nocaps,'
|
| 57 |
+
'title={nocaps: novel object captioning at scale},'
|
| 58 |
+
'author={Agrawal, Harsh and Desai, Karan and Wang, Yufei and Chen, Xinlei'
|
| 59 |
+
'and Jain, Rishabh and Johnson, Mark and Batra, Dhruv and Parikh, Devi'
|
| 60 |
+
'and Lee, Stefan and Anderson, Peter},'
|
| 61 |
+
'booktitle={ICCV},'
|
| 62 |
+
'pages={8948--8957},'
|
| 63 |
+
'year={2019}}')
|
| 64 |
+
|
| 65 |
+
# When running locally (recommended), copy files as above and use these:
|
| 66 |
+
_FILEPATH = '/tmp/data/nocaps_data/Images/'
|
| 67 |
+
_VAL_FILES = '/tmp/data/nocaps_data/nocaps_val_4500_captions.json'
|
| 68 |
+
_TEST_FILES = '/tmp/data/nocaps_data/nocaps_test_image_info.json'
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class NoCaps(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the nocaps captioning dataset (val/test splits)."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata (features, homepage, citation)."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'image/id': tf.int64,
            'image_filepath': tfds.features.Text(),
            'url': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'texts': tfds.features.Sequence(tfds.features.Text()),
        }),
        # No (input, target) tuple: disable `as_supervised=True` usage.
        supervised_keys=None,
        homepage='https://nocaps.org/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns a dict mapping split names to example generators."""

    def group_by_id(data, image_dir):
      """Groups annotation captions by image id and builds per-image examples.

      Args:
        data: parsed nocaps JSON; test data has no 'annotations' key.
        image_dir: subdirectory of _FILEPATH holding the images
          ('validation' or 'test').

      Returns:
        Dict of image id -> example dict matching the FeaturesDict above.
      """
      id2caps = collections.defaultdict(list)
      for ex in data.get('annotations', []):
        id2caps[ex['image_id']].append(ex['caption'])

      id_to_example = {}
      for ex in data['images']:
        # Compute the image path once; it is used both as metadata and as the
        # source for the encoded image feature.
        filepath = os.path.join(_FILEPATH, image_dir, ex['file_name'])
        id_to_example[ex['id']] = {
            'image/id': ex['id'],
            'image_filepath': filepath,
            'url': ex['coco_url'],
            'image': filepath,
            # Test images have no reference captions; use a placeholder.
            'texts': id2caps.get(ex['id'], ['N/A']),
        }
      return id_to_example

    # Returns the Dict[split names, Iterator[Key, Example]]
    with open(_VAL_FILES) as f:
      val_data = group_by_id(json.load(f), 'validation')
    with open(_TEST_FILES) as f:
      test_data = group_by_id(json.load(f), 'test')
    return {
        'val': self._generate_examples(val_data),
        'test': self._generate_examples(test_data),
    }

  def _generate_examples(self, data):
    """Yields (key, example) tuples, skipping images that fail to decode.

    Args:
      data: dict of image id -> example dict (see `group_by_id`).

    Yields:
      (key, example) tuples in the format specified by the DatasetInfo.
    """
    for k, v in data.items():
      try:
        # Jpeg decode test to check early errors. The decoded images are not
        # used, instead we rely on the default tfds.features.Image function.
        unused_image = tf.io.read_file(v['image_filepath'])
        unused_image = np.array(tf.image.decode_jpeg(unused_image))
      except tf.errors.InvalidArgumentError:
        # Unable to decode image; skip it and log the re-download command.
        logging.error('Unable to decode: curl -O %s', v['url'])
        continue
      except tf.errors.NotFoundError:
        # Image file missing on disk; skip it and log the download command.
        logging.error('File not found: curl -O %s', v['url'])
        continue

      yield k, v
|
Tipsomaly/model/big_vision/datasets/refcoco/refcoco.py
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Unbatch RefCOCO, RefCOCO+, RefCOCOg datasets in TFDS structure."""
|
| 17 |
+
|
| 18 |
+
# Based on tensorflow_datasets/datasets/ref_coco
|
| 19 |
+
|
| 20 |
+
import io
|
| 21 |
+
import os
|
| 22 |
+
import pickle
|
| 23 |
+
|
| 24 |
+
import numpy as np
|
| 25 |
+
import PIL.Image
|
| 26 |
+
import pycocotools.coco
|
| 27 |
+
import tensorflow_datasets as tfds
|
| 28 |
+
|
| 29 |
+
_ROOT_PATH = '/tmp/data/'
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class RefCocoConfig(tfds.core.BuilderConfig):
  """BuilderConfig identifying one RefCoco dataset variant/partition pair."""

  def __init__(self, dataset, dataset_partition, **kwargs):
    # The config name is derived from the variant and its split partition,
    # e.g. 'refcoco_unc' or 'refcocog_umd'.
    super().__init__(name=f'{dataset}_{dataset_partition}', **kwargs)
    self.dataset = dataset
    self.dataset_partition = dataset_partition
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
_DESCRIPTION = """RefCOCO, RefCOCO+, RefCOCOg datasets.
|
| 43 |
+
|
| 44 |
+
Images, boxes and segmentations are from the original COCO dataset
|
| 45 |
+
(Lin et al, ECCV 2014). The referential segmentations are from two different
|
| 46 |
+
sources:
|
| 47 |
+
|
| 48 |
+
1) RefCOCOg (Mao et al, CVPR 2016):
|
| 49 |
+
- https://github.com/mjhucla/Google_Refexp_toolbox
|
| 50 |
+
- This is the split used in the "refcocog_google" dataset. Note that this
|
| 51 |
+
split has overlapping images in train/validation. The same split is also
|
| 52 |
+
provided in 2).
|
| 53 |
+
|
| 54 |
+
2) Source of RefCOCO and RefCOCO+ (Yu et al, ECCV 2016):
|
| 55 |
+
- https://github.com/lichengunc/refer
|
| 56 |
+
- Apache License 2.0
|
| 57 |
+
- Provides all the splits used for generation of these datasets, including the
|
| 58 |
+
"refcocog_google" split that is identical with the split from 1).
|
| 59 |
+
|
| 60 |
+
For convenience, we provide an additional dataset "refcocox_combined" that
|
| 61 |
+
combines the datasets "refcoco_unc", "refcocoplus_unc", and "refcocog_umd",
|
| 62 |
+
unifying "testA" and "testB" into a single "test" split, and removing any images
|
| 63 |
+
from "train" that appear either in "validation" or "test".
|
| 64 |
+
|
| 65 |
+
Also for convenience, every split is unrolled twice (at the "objects" level and
|
| 66 |
+
at the "object/refs" level) and saved as "{split}_flat".
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
# pylint: disable=line-too-long
|
| 70 |
+
_CITATION = r"""
|
| 71 |
+
@inproceedings{DBLP:conf/cvpr/MaoHTCY016,
|
| 72 |
+
author = {Junhua Mao and
|
| 73 |
+
Jonathan Huang and
|
| 74 |
+
Alexander Toshev and
|
| 75 |
+
Oana Camburu and
|
| 76 |
+
Alan L. Yuille and
|
| 77 |
+
Kevin Murphy},
|
| 78 |
+
title = {Generation and Comprehension of Unambiguous Object Descriptions},
|
| 79 |
+
booktitle = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
|
| 80 |
+
{CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
|
| 81 |
+
pages = {11--20},
|
| 82 |
+
publisher = {{IEEE} Computer Society},
|
| 83 |
+
year = {2016},
|
| 84 |
+
url = {https://doi.org/10.1109/CVPR.2016.9},
|
| 85 |
+
doi = {10.1109/CVPR.2016.9},
|
| 86 |
+
timestamp = {Fri, 24 Mar 2023 00:02:52 +0100},
|
| 87 |
+
biburl = {https://dblp.org/rec/conf/cvpr/MaoHTCY016.bib},
|
| 88 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
@inproceedings{DBLP:conf/eccv/YuPYBB16,
|
| 92 |
+
author = {Licheng Yu and
|
| 93 |
+
Patrick Poirson and
|
| 94 |
+
Shan Yang and
|
| 95 |
+
Alexander C. Berg and
|
| 96 |
+
Tamara L. Berg},
|
| 97 |
+
editor = {Bastian Leibe and
|
| 98 |
+
Jiri Matas and
|
| 99 |
+
Nicu Sebe and
|
| 100 |
+
Max Welling},
|
| 101 |
+
title = {Modeling Context in Referring Expressions},
|
| 102 |
+
booktitle = {Computer Vision - {ECCV} 2016 - 14th European Conference, Amsterdam,
|
| 103 |
+
The Netherlands, October 11-14, 2016, Proceedings, Part {II}},
|
| 104 |
+
series = {Lecture Notes in Computer Science},
|
| 105 |
+
volume = {9906},
|
| 106 |
+
pages = {69--85},
|
| 107 |
+
publisher = {Springer},
|
| 108 |
+
year = {2016},
|
| 109 |
+
url = {https://doi.org/10.1007/978-3-319-46475-6\_5},
|
| 110 |
+
doi = {10.1007/978-3-319-46475-6\_5},
|
| 111 |
+
timestamp = {Wed, 07 Dec 2022 23:10:23 +0100},
|
| 112 |
+
biburl = {https://dblp.org/rec/conf/eccv/YuPYBB16.bib},
|
| 113 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
@article{DBLP:journals/corr/LinMBHPRDZ14,
|
| 117 |
+
author = {Tsung{-}Yi Lin and
|
| 118 |
+
Michael Maire and
|
| 119 |
+
Serge J. Belongie and
|
| 120 |
+
Lubomir D. Bourdev and
|
| 121 |
+
Ross B. Girshick and
|
| 122 |
+
James Hays and
|
| 123 |
+
Pietro Perona and
|
| 124 |
+
Deva Ramanan and
|
| 125 |
+
Piotr Doll{\'{a}}r and
|
| 126 |
+
C. Lawrence Zitnick},
|
| 127 |
+
title = {Microsoft {COCO:} Common Objects in Context},
|
| 128 |
+
journal = {CoRR},
|
| 129 |
+
volume = {abs/1405.0312},
|
| 130 |
+
year = {2014},
|
| 131 |
+
url = {http://arxiv.org/abs/1405.0312},
|
| 132 |
+
archivePrefix = {arXiv},
|
| 133 |
+
eprint = {1405.0312},
|
| 134 |
+
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
|
| 135 |
+
biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
|
| 136 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
| 137 |
+
}
|
| 138 |
+
"""
|
| 139 |
+
|
| 140 |
+
# coco_data = json.load(open('annotations/instances_train2017.json'))
|
| 141 |
+
# [l['name'] for l in coco_data['licenses']]
|
| 142 |
+
LICENSES = [
|
| 143 |
+
'Attribution-NonCommercial-ShareAlike License',
|
| 144 |
+
'Attribution-NonCommercial License',
|
| 145 |
+
'Attribution-NonCommercial-NoDerivs License',
|
| 146 |
+
'Attribution License',
|
| 147 |
+
'Attribution-ShareAlike License',
|
| 148 |
+
'Attribution-NoDerivs License',
|
| 149 |
+
'No known copyright restrictions',
|
| 150 |
+
'United States Government Work',
|
| 151 |
+
]
|
| 152 |
+
# _licenses_map = {l['id']: i for i, l in enumerate(coco_data['licenses'])}
|
| 153 |
+
_licenses_map = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
|
| 154 |
+
|
| 155 |
+
# pyformat: disable
|
| 156 |
+
# [c['name'] for c in coco_data['categories']]
|
| 157 |
+
CATEGORIES = [
|
| 158 |
+
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
|
| 159 |
+
'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
|
| 160 |
+
'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
|
| 161 |
+
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
|
| 162 |
+
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
|
| 163 |
+
'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
|
| 164 |
+
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
|
| 165 |
+
'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
|
| 166 |
+
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
|
| 167 |
+
'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
|
| 168 |
+
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
|
| 169 |
+
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
|
| 170 |
+
'hair drier', 'toothbrush',
|
| 171 |
+
]
|
| 172 |
+
# sorted(set(c['supercategory'] for c in coco_data['categories']))
|
| 173 |
+
SUPERCATEGORIES = [
|
| 174 |
+
'accessory', 'animal', 'appliance', 'electronic', 'food', 'furniture',
|
| 175 |
+
'indoor', 'kitchen', 'outdoor', 'person', 'sports', 'vehicle',
|
| 176 |
+
]
|
| 177 |
+
# pyformat: enable
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# Will be exported into directory `$TFDS_DATA_DIR/ref_coco_bv`
|
| 181 |
+
# If the class name was `RefCOCO` then it would be exported into
|
| 182 |
+
# `$TFDS_DATA_DIR/ref_coco`, which would collide with the default TFDS dataset
|
| 183 |
+
# also named `ref_coco` (which has precedence over `data_dir` builder arg).
|
| 184 |
+
# Will be exported into directory `$TFDS_DATA_DIR/ref_coco_bv`
# If the class name was `RefCOCO` then it would be exported into
# `$TFDS_DATA_DIR/ref_coco`, which would collide with the default TFDS dataset
# also named `ref_coco` (which has precedence over `data_dir` builder arg).
class RefCocoBv(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for RefCoco datasets."""

  VERSION = tfds.core.Version('1.4.0')
  RELEASE_NOTES = {
      '1.4.0': 'Added flat versions of all dataset splits.',
      '1.3.0': 'Added "refcocox_combined" dataset.',
      '1.2.0': 'Added "train_flat" splits.',
      '1.1.0': 'Added more features (mask etc), nested "refs" in "objects".',
      '1.0.0': 'Initial release.',
  }

  MANUAL_DOWNLOAD_INSTRUCTIONS = """
  1. Install https://pypi.org/project/pycocotools/.

  2. Download data (requires ~20G for COCO images):

  (mkdir -p /tmp/tfds/downloads/manual &&
  cd /tmp/tfds/downloads/manual &&
  wget http://images.cocodataset.org/zips/train2017.zip &&
  wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip &&
  wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip &&
  wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip &&
  wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip &&
  for zip in *.zip; do unzip $zip; done
  )

  3. Run the generation script with `TFDS_DATA_DIR=/tmp/tfds`
  """

  BUILDER_CONFIGS = [
      RefCocoConfig(dataset='refcoco', dataset_partition='unc'),
      RefCocoConfig(dataset='refcoco', dataset_partition='google'),
      RefCocoConfig(dataset='refcocoplus', dataset_partition='unc'),
      RefCocoConfig(dataset='refcocog', dataset_partition='google'),
      RefCocoConfig(dataset='refcocog', dataset_partition='umd'),
      RefCocoConfig(dataset='refcocox', dataset_partition='combined'),
  ]

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata (per-image features with nested objects)."""
    return tfds.core.DatasetInfo(
        builder=self,
        features=tfds.features.FeaturesDict({
            'id': tfds.features.Scalar(np.int32),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'height': tfds.features.Scalar(np.int32),
            'width': tfds.features.Scalar(np.int32),
            'license': tfds.features.ClassLabel(names=LICENSES),
            'file_name': tfds.features.Text(),
            'flickr_url': tfds.features.Text(),
            'coco_url': tfds.features.Text(),
            'objects': tfds.features.Sequence({
                'id': tfds.features.Scalar(np.int64),
                'area': tfds.features.Scalar(np.float32),
                'bbox': tfds.features.BBoxFeature(),
                'mask': tfds.features.Image(encoding_format='png'),
                'category': tfds.features.ClassLabel(names=CATEGORIES),
                'supercategory': tfds.features.ClassLabel(
                    names=SUPERCATEGORIES
                ),
                'iscrowd': tfds.features.Scalar(np.bool_),
                # refcoco, refcoco+, refcocog features:
                'refs': tfds.features.Sequence({
                    'id': tfds.features.Scalar(np.int32),
                    'sentence': tfds.features.Text(),
                }),
            }),
        }),
        supervised_keys=None,  # Set to `None` to disable
        citation=_CITATION,
        description=_DESCRIPTION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Validates manually-downloaded data and returns per-split generators.

    Raises:
      FileNotFoundError: if any required download is missing from manual_dir.
    """
    # Which splits each (dataset, partition) combination provides.
    allowed_splits = {
        ('refcoco', 'google'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            tfds.Split.TEST,
        ],
        ('refcoco', 'unc'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            'testA',
            'testB',
        ],
        ('refcocoplus', 'unc'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            'testA',
            'testB',
        ],
        # Verified manually that image and annotation IDs match the ones in
        # https://storage.googleapis.com/refexp/google_refexp_dataset_release.zip
        ('refcocog', 'google'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
        ],
        ('refcocog', 'umd'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            tfds.Split.TEST,
        ],
        ('refcocox', 'combined'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            tfds.Split.TEST,
        ],
    }
    bc = self.builder_config
    splits = allowed_splits[(bc.dataset, bc.dataset_partition)]

    data_dir = dl_manager.manual_dir
    for url, components in (
        # pylint: disable=line-too-long
        # pyformat: disable
        ('http://images.cocodataset.org/zips/train2017.zip', ('train2017', '000000147328.jpg')),
        ('http://images.cocodataset.org/annotations/annotations_trainval2017.zip', ('annotations', 'instances_train2017.json')),
        ('https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip', ('refcoco', 'refs(unc).p')),
        ('https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip', ('refcoco+', 'refs(unc).p')),
        ('https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip', ('refcocog', 'refs(umd).p')),
        # pyformat: enable
        # pylint: enable=line-too-long
    ):
      # BUG FIX: previously `path` was assigned the boolean result of
      # os.path.exists(...), so the error message printed "Could not find
      # False". Bind the joined path first, then test for existence.
      path = os.path.join(data_dir, *components)
      if not os.path.exists(path):
        raise FileNotFoundError(
            f'Could not find {path}: please download {url} and unzip into'
            f' {data_dir}'
        )

    coco = pycocotools.coco.COCO(
        os.path.join(data_dir, 'annotations', 'instances_train2017.json')
    )

    # Every split is also emitted in an unrolled "{split}_flat" variant.
    return {
        split + suffix: self._generate_examples(
            coco, data_dir, bc.dataset, bc.dataset_partition, split + suffix,
        )
        for split in splits
        for suffix in ('', '_flat')
    }

  # Builder must overwrite all abstract methods.
  def _generate_examples(
      self, coco, data_dir, dataset, dataset_partition, split):
    """Delegates to the module-level generator function."""
    return _generate_examples(coco, data_dir, dataset, dataset_partition, split)
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
def _get_ids(data_dir, dataset, dataset_partition, split):
  """Returns `img_ids, ann_to_refs` for specified dataset/partition/split.

  Args:
    data_dir: directory containing the unzipped refcoco* downloads.
    dataset: one of 'refcoco', 'refcocoplus', 'refcocog', 'refcocox'.
    dataset_partition: partition name, e.g. 'unc', 'google', 'umd', 'combined'.
    split: split name ('train', 'val'/VALIDATION, 'test', 'testA', 'testB').

  Returns:
    img_ids: set of COCO image ids belonging to the split.
    ann_to_refs: dict of annotation id -> list of {'id', 'sentence'} refs.
  """

  def load(dataset, dataset_partition):
    fname = f'refs({dataset_partition}).p'
    path = os.path.join(data_dir, dataset, fname)
    # BUG FIX: previously the file handle from `open(path, 'rb')` was never
    # closed; use a context manager to release it deterministically.
    with open(path, 'rb') as f:
      return pickle.load(f)

  # The pickle files use 'val' where TFDS uses 'validation'.
  if split == tfds.Split.VALIDATION:
    split = 'val'

  if (dataset, dataset_partition) == ('refcocox', 'combined'):
    # "combined" merges refcocog_umd, refcoco_unc, and refcoco+_unc.
    refcoco = (
        load('refcocog', 'umd')
        + load('refcoco', 'unc')
        + load('refcoco+', 'unc')
    )
    # Unify 'testA'/'testB' into a single 'test' split.
    if split == 'test':
      splits = ('test', 'testA', 'testB')
    else:
      splits = (split,)

    exclude_img_ids = set()
    if split == 'train':
      # Exclude all images with val/test annotations from train set.
      exclude_img_ids = {
          r['image_id'] for r in refcoco if r['split'] != 'train'
      }
    refcoco = [
        r
        for r in refcoco
        if r['split'] in splits and r['image_id'] not in exclude_img_ids
    ]

  else:
    # The download directory for refcocoplus is named 'refcoco+'.
    if dataset == 'refcocoplus':
      dataset = 'refcoco+'
    refcoco = load(dataset, dataset_partition)
    refcoco = [r for r in refcoco if r['split'] == split]

  img_ids = {r['image_id'] for r in refcoco}
  ann_to_refs = {}
  for r in refcoco:
    for sent in r['sentences']:
      ann_to_refs.setdefault(r['ann_id'], []).append(dict(
          id=sent['sent_id'],
          sentence=sent['sent']
      ))

  return img_ids, ann_to_refs
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def _generate_examples(coco, data_dir, dataset, dataset_partition, split):
  """Generates examples for a given split.

  If the split name contains '_flat', one example is yielded per referring
  expression (with a single object holding a single ref); otherwise one
  example is yielded per image, with all referred objects attached.
  """

  # '_flat' is a pseudo-split suffix, not part of the underlying split name.
  flat = '_flat' in split
  split = split.replace('_flat', '')
  img_ids, ann_to_refs = _get_ids(data_dir, dataset, dataset_partition, split)

  for img_id in coco.getImgIds():

    if img_id not in img_ids:
      continue
    # Trailing comma unpacks the single-element list returned by loadImgs.
    img, = coco.loadImgs([img_id])

    example = {
        'id': img_id,
        'image': os.path.join(data_dir, 'train2017', img['file_name']),
        'height': img['height'],
        'width': img['width'],
        'license': LICENSES[_licenses_map[img['license']]],
        'file_name': img['file_name'],
        'flickr_url': img['flickr_url'],
        'coco_url': img['coco_url'],
        'objects': [],
    }
    for ann in coco.loadAnns(coco.getAnnIds(img_id)):
      # Only keep annotations that carry referring expressions for this split.
      refs = ann_to_refs.get(ann['id'])
      if not refs:
        continue
      cat, = coco.loadCats([ann['category_id']])
      # Encode the binary segmentation mask as an in-memory PNG.
      mask = coco.annToMask(ann).astype(np.bool_)
      mask_buf = io.BytesIO()
      PIL.Image.fromarray(mask).save(mask_buf, 'png')
      mask_buf.seek(0)
      object_ = {
          'id': ann['id'],
          'mask': mask_buf,
          'category': cat['name'],
          'supercategory': cat['supercategory'],
          'iscrowd': ann['iscrowd'],
          'area': ann['area'],
          'bbox': _convert_bbox(img, *ann['bbox']),
          'refs': refs,
      }
      if flat:
        # Flat mode: reuse (mutate) the same `example`/`object_` dicts and
        # yield once per ref. TFDS serializes each yield immediately, so the
        # in-place mutation between yields is safe here.
        example['objects'] = [object_]
        for ref_i, ref in enumerate(refs):
          object_['refs'] = [ref]
          # Rewind the PNG buffer so it can be read again for each yield.
          mask_buf.seek(0)
          yield f'{img_id}_{ann["id"]}_{ref_i}', example
      else:
        example['objects'].append(object_)

    if not flat:
      yield img_id, example
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def _convert_bbox(img, x, y, w, h):
  """Converts an absolute COCO box (x, y, w, h) to a relative tfds BBox."""
  height = img['height']
  width = img['width']
  return tfds.features.BBox(
      ymin=y / height,
      xmin=x / width,
      ymax=(y + h) / height,
      xmax=(x + w) / width,
  )
|
Tipsomaly/model/big_vision/datasets/rsvqa_hr/rsvqa_hr.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements RSVQA-HR dataset in TFDS.
|
| 17 |
+
|
| 18 |
+
Remote sensing visual question answering task, using high-resolution airborne
|
| 19 |
+
image data at 15cm resolution per pixel.
|
| 20 |
+
|
| 21 |
+
It's a small dataset at source (14G), so simple to run locally.
|
| 22 |
+
First, download and unzip the dataset from https://zenodo.org/records/6344367
|
| 23 |
+
and place it in /tmp/data/rsvqa_hr.
|
| 24 |
+
|
| 25 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 26 |
+
|
| 27 |
+
cd third_party/py/big_vision/datasets
|
| 28 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=rsvqa_hr
|
| 29 |
+
|
| 30 |
+
Example to load:
|
| 31 |
+
|
| 32 |
+
import tensorflow_datasets as tfds
|
| 33 |
+
dataset = tfds.load('rsvqa_hr', split='train', data_dir='/tmp/tfds')
|
| 34 |
+
|
| 35 |
+
Dataset splits (all):
|
| 36 |
+
train: 625,340 examples/questions
|
| 37 |
+
val: 102,843 examples/questions
|
| 38 |
+
test: 222,684 examples/questions
|
| 39 |
+
test_2: 105,647 examples/questions (other area, unknown instrument)
|
| 40 |
+
Non-numeric data splits (nonum):
|
| 41 |
+
train: 371,834 examples/questions
|
| 42 |
+
val: 60,405 examples/questions
|
| 43 |
+
test: 131,468 examples/questions
|
| 44 |
+
test_2: 62,554 examples/questions
|
| 45 |
+
|
| 46 |
+
Note: due to image duplication with each question, the dataset size is
|
| 47 |
+
significantly increased by the number of questions per image.
|
| 48 |
+
|
| 49 |
+
Recommended training splits:
|
| 50 |
+
train: train
|
| 51 |
+
minitrain: train[:5%]
|
| 52 |
+
eval: val
|
| 53 |
+
full_train: train+val
|
| 54 |
+
test: test
|
| 55 |
+
|
| 56 |
+
Image sizes: 512x512
|
| 57 |
+
Number of answers per question: 1
|
| 58 |
+
Question types distribution in train split:
|
| 59 |
+
- Area (area): 14.6% (integers, binned into {0m2, 1-10m2, 11-100m2, 101-1000m2, >1000m2})
|
| 60 |
+
- Comparison(comp): 33.5%
|
| 61 |
+
- Count (count): 26.0% (integers, not binned, maximum number of objects is 89)
|
| 62 |
+
- Presence (presence): 26.0%
|
| 63 |
+
"""
|
| 64 |
+
import json
|
| 65 |
+
import os
|
| 66 |
+
|
| 67 |
+
import numpy as np
|
| 68 |
+
import tensorflow_datasets as tfds
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
_DESCRIPTION = """RSVQA-HR dataset."""
|
| 72 |
+
|
| 73 |
+
# pylint: disable=line-too-long
|
| 74 |
+
_CITATION = """
|
| 75 |
+
@article{Lobry_2020,
|
| 76 |
+
title={RSVQA: Visual Question Answering for Remote Sensing Data},
|
| 77 |
+
volume={58},
|
| 78 |
+
ISSN={1558-0644},
|
| 79 |
+
url={http://dx.doi.org/10.1109/TGRS.2020.2988782},
|
| 80 |
+
DOI={10.1109/tgrs.2020.2988782},
|
| 81 |
+
number={12},
|
| 82 |
+
journal={IEEE Transactions on Geoscience and Remote Sensing},
|
| 83 |
+
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
|
| 84 |
+
author={Lobry, Sylvain and Marcos, Diego and Murray, Jesse and Tuia, Devis},
|
| 85 |
+
year={2020},
|
| 86 |
+
month=dec, pages={8555-8566} }
|
| 87 |
+
"""
|
| 88 |
+
# pylint: enable=line-too-long
|
| 89 |
+
|
| 90 |
+
# When running locally (recommended), copy files as above and use these:
|
| 91 |
+
PATH = '/tmp/data/rsvqa_hr/'
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class RsvqaHrConfig(tfds.core.BuilderConfig):
  """Config to specify each variant.

  The 'nonum' variant drops numeric question types; 'all' keeps everything.
  """

  def __init__(self, nonum, **kwargs):
    super().__init__(name='nonum' if nonum else 'all', **kwargs)
    self.nonum = nonum
|
| 102 |
+
|
| 103 |
+
class RsvqaHr(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for RSVQA-HR dataset."""

  VERSION = tfds.core.Version('1.0.2')
  RELEASE_NOTES = {
      '1.0.0': 'First release.',
      '1.0.1': 'Rename binned values.',
      '1.0.2': 'Removed explicit png image encoding.',
  }

  BUILDER_CONFIGS = [
      RsvqaHrConfig(nonum=False),
      RsvqaHrConfig(nonum=True),
  ]

  def _info(self):
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'question_id': tfds.features.Scalar(np.int32),
            'filename': tfds.features.Text(),
            'image': tfds.features.Image(),
            'question': tfds.features.Text(),
            'question_type': tfds.features.Text(),
            # 'answers' holds binned values; 'raw_answers' the originals.
            'answers': tfds.features.Sequence(tfds.features.Text()),
            'raw_answers': tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,
        homepage='https://rsvqa.sylvainlobry.com/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {
        split: self._generate_examples(split)
        for split in ('train', 'val', 'test', 'test_2')
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples for one split."""
    if split == 'test_2':
      # The on-disk files name the second test area 'test_phili'.
      split = 'test_phili'
    # Fixed: pass path components as separate os.path.join arguments instead
    # of pre-concatenating with '+' (which worked only because PATH happens
    # to end with a slash).
    questions_path = os.path.join(PATH, f'USGS_split_{split}_questions.json')
    answers_path = os.path.join(PATH, f'USGS_split_{split}_answers.json')
    images_path = os.path.join(PATH, 'Data')

    # JSON is UTF-8 by spec; be explicit so decoding is locale-independent.
    with open(questions_path, 'r', encoding='utf-8') as f:
      questions = json.load(f)['questions']
    with open(answers_path, 'r', encoding='utf-8') as f:
      answers = json.load(f)['answers']

    # Questions and answers are aligned 1:1 by position; the asserts below
    # verify that alignment on ids.
    for q, a in zip(questions, answers):
      assert q['active'] == a['active']
      if not q['active']:
        continue
      if self.builder_config.nonum and q['type'] in ('area', 'count'):
        continue
      assert q['answers_ids'][0] == a['id']
      assert q['id'] == a['question_id']

      filename = f'{q["img_id"]}.png'
      yield q['id'], {
          'question_id': q['id'],
          'filename': filename,
          'image': os.path.join(images_path, filename),
          'question': q['question'],
          'question_type': q['type'],
          'answers': [bin_answer(a['answer'], q['type'])],
          'raw_answers': [a['answer']],
      }
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def bin_answer(answer, question_type):
  """Bins 'area' answers into coarse ranges; other answers pass through.

  Area answers look like '<int>m2'; the numeric part is bucketed into
  {0 m2, 1-10 m2, 11-100 m2, 101-1000 m2, >1000 m2}.
  """
  if question_type != 'area':
    return answer
  area = int(answer[:-2])  # Strip the trailing 'm2' unit.
  if area == 0:
    return '0 m2'
  if area <= 10:
    return 'between 1 m2 and 10 m2'
  if area <= 100:
    return 'between 11 m2 and 100 m2'
  if area <= 1000:
    return 'between 101 m2 and 1000 m2'
  return 'more than 1000 m2'
|
Tipsomaly/model/big_vision/datasets/rsvqa_lr/rsvqa_lr.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements RSVQA-LR dataset in TFDS.
|
| 17 |
+
|
| 18 |
+
Remote sensing visual question answering task, using low-resolution satellite
|
| 19 |
+
(Sentinel-2) RGB channels data at 10m resolution per pixel.
|
| 20 |
+
|
| 21 |
+
It's small dataset at source (200M), so simple to run locally.
|
| 22 |
+
First, download and unzip the dataset from https://zenodo.org/records/6344334
|
| 23 |
+
and place it in /tmp/data/rsvqa_lr.
|
| 24 |
+
|
| 25 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 26 |
+
|
| 27 |
+
cd third_party/py/big_vision/datasets
|
| 28 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=rsvqa_lr
|
| 29 |
+
|
| 30 |
+
Example to load:
|
| 31 |
+
|
| 32 |
+
import tensorflow_datasets as tfds
|
| 33 |
+
dataset = tfds.load('rsvqa_lr', split='train', data_dir='/tmp/tfds')
|
| 34 |
+
|
| 35 |
+
Dataset splits:
|
| 36 |
+
train: 57223 examples/questions
|
| 37 |
+
val: 10005 examples/questions
|
| 38 |
+
test: 10004 examples/questions
|
| 39 |
+
And the same splits are available excluding numeric questions:
|
| 40 |
+
train_nonum: 39441 examples/questions
|
| 41 |
+
val_nonum: 6782 examples/questions
|
| 42 |
+
test_nonum: 6782 examples/questions
|
| 43 |
+
|
| 44 |
+
Note: due to image duplication with each question, the dataset size is
|
| 45 |
+
significantly increased by the number of questions per image.
|
| 46 |
+
|
| 47 |
+
Recommended training splits:
|
| 48 |
+
train: train
|
| 49 |
+
minitrain: train[:5%]
|
| 50 |
+
eval: val
|
| 51 |
+
full_train: train+val
|
| 52 |
+
test: test
|
| 53 |
+
|
| 54 |
+
Image sizes: 256x256
|
| 55 |
+
Number of answers per question: 1
|
| 56 |
+
Question types distribution in train split:
|
| 57 |
+
- Comparison(comp): 39.4%
|
| 58 |
+
- Count (count): 29.9% (integers, binned at evaluation into
|
| 59 |
+
{0, 1-10, 11-100, 101-1000, >1000})
|
| 60 |
+
- Presence (presence): 29.7%
|
| 61 |
+
- Rural/Urban (rural_urban): 1%
|
| 62 |
+
"""
|
| 63 |
+
import io
|
| 64 |
+
import json
|
| 65 |
+
import os
|
| 66 |
+
|
| 67 |
+
import numpy as np
|
| 68 |
+
import tensorflow_datasets as tfds
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
_DESCRIPTION = """RSVQA-LR dataset."""
|
| 72 |
+
|
| 73 |
+
# pylint: disable=line-too-long
|
| 74 |
+
_CITATION = """
|
| 75 |
+
@article{Lobry_2020,
|
| 76 |
+
title={RSVQA: Visual Question Answering for Remote Sensing Data},
|
| 77 |
+
volume={58},
|
| 78 |
+
ISSN={1558-0644},
|
| 79 |
+
url={http://dx.doi.org/10.1109/TGRS.2020.2988782},
|
| 80 |
+
DOI={10.1109/tgrs.2020.2988782},
|
| 81 |
+
number={12},
|
| 82 |
+
journal={IEEE Transactions on Geoscience and Remote Sensing},
|
| 83 |
+
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
|
| 84 |
+
author={Lobry, Sylvain and Marcos, Diego and Murray, Jesse and Tuia, Devis},
|
| 85 |
+
year={2020},
|
| 86 |
+
month=dec, pages={8555–8566} }
|
| 87 |
+
"""
|
| 88 |
+
# pylint: enable=line-too-long
|
| 89 |
+
|
| 90 |
+
# When running locally (recommended), copy files as above and use these:
|
| 91 |
+
PATH = '/tmp/data/rsvqa_lr/'
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class RsvqaLrConfig(tfds.core.BuilderConfig):
  """Config to specify each variant.

  The 'nonum' variant drops numeric (count) questions; 'all' keeps everything.
  """

  def __init__(self, nonum, **kwargs):
    super().__init__(name='nonum' if nonum else 'all', **kwargs)
    self.nonum = nonum
|
| 102 |
+
|
| 103 |
+
class RsvqaLr(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for RSVQA-LR dataset."""

  VERSION = tfds.core.Version('1.0.2')
  RELEASE_NOTES = {
      '1.0.0': 'First release.',
      '1.0.1': 'Rename binned values.',
      '1.0.2': 'Removed explicit png image encoding.',
  }

  BUILDER_CONFIGS = [
      RsvqaLrConfig(nonum=False),
      RsvqaLrConfig(nonum=True),
  ]

  def _info(self):
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'question_id': tfds.features.Scalar(np.int32),
            'filename': tfds.features.Text(),
            'image': tfds.features.Image(),
            'question': tfds.features.Text(),
            'question_type': tfds.features.Text(),
            # 'answers' holds binned values; 'raw_answers' the originals.
            'answers': tfds.features.Sequence(tfds.features.Text()),
            'raw_answers': tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,
        homepage='https://rsvqa.sylvainlobry.com/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {
        split: self._generate_examples(split)
        for split in ('train', 'val', 'test')
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples for one split."""
    # Fixed: pass path components as separate os.path.join arguments instead
    # of pre-concatenating with '+' (which worked only because PATH happens
    # to end with a slash).
    questions_path = os.path.join(PATH, f'LR_split_{split}_questions.json')
    answers_path = os.path.join(PATH, f'LR_split_{split}_answers.json')
    images_path = os.path.join(PATH, 'Images_LR')

    # JSON is UTF-8 by spec; be explicit so decoding is locale-independent.
    with open(questions_path, 'r', encoding='utf-8') as f:
      questions = json.load(f)['questions']
    with open(answers_path, 'r', encoding='utf-8') as f:
      answers = json.load(f)['answers']

    # Questions and answers are aligned 1:1 by position; the asserts below
    # verify that alignment on ids.
    for q, a in zip(questions, answers):
      assert q['active'] == a['active']
      if not q['active']:
        continue
      if self.builder_config.nonum and q['type'] == 'count':
        continue
      assert q['answers_ids'] == [a['id']]
      assert q['id'] == a['question_id']

      filename = f'{q["img_id"]}.tif'
      # TIFF is not natively decoded by tfds.features.Image, so decode here.
      img = read_tif(os.path.join(images_path, filename))
      yield q['id'], {
          'question_id': q['id'],
          'filename': filename,
          'image': img,
          'question': q['question'],
          'question_type': q['type'],
          'answers': [bin_answer(a['answer'], q['type'])],
          'raw_answers': [a['answer']],
      }
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def bin_answer(answer, question_type):
  """Bins 'count' answers into coarse ranges; other answers pass through.

  The integer count is bucketed into {0, 1-10, 11-100, 101-1000, >1000}.
  """
  if question_type != 'count':
    return answer
  count = int(answer)
  if count == 0:
    return '0'
  if count <= 10:
    return 'between 1 and 10'
  if count <= 100:
    return 'between 11 and 100'
  if count <= 1000:
    return 'between 101 and 1000'
  return 'more than 1000'
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def read_tif(path):
  """Reads a TIFF file from `path` and returns it as a uint8 numpy array."""
  with open(path, 'rb') as f:
    raw = f.read()
  img = tfds.core.lazy_imports.tifffile.imread(io.BytesIO(raw))
  return img.astype(np.uint8)
|
Tipsomaly/model/big_vision/datasets/scicap/scicap.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Creates TFDS dataset for SciCap.
|
| 17 |
+
|
| 18 |
+
Preparing the data:
|
| 19 |
+
1) mkdir /tmp/data/scicap && cd /tmp/data/scicap
|
| 20 |
+
2) wget 'https://www.dropbox.com/s/t1sjqesl0pynaxo/scicap_data.zip?dl=0'
|
| 21 |
+
3) unzip -UU 'scicap_data.zip?dl=0' && rm 'scicap_data.zip?dl=0'
|
| 22 |
+
|
| 23 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 24 |
+
|
| 25 |
+
cd big_vision/datasets
|
| 26 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=scicap
|
| 27 |
+
|
| 28 |
+
Example to load:
|
| 29 |
+
|
| 30 |
+
import tensorflow_datasets as tfds
|
| 31 |
+
dataset = tfds.load('scicap', split='train', data_dir='/tmp/tfds')
|
| 32 |
+
"""
|
| 33 |
+
# pylint: enable=line-too-long
|
| 34 |
+
import enum
|
| 35 |
+
import functools
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
|
| 39 |
+
import tensorflow_datasets as tfds
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
_DESCRIPTION = """SciCap dataset."""
|
| 43 |
+
_CITATION = """
|
| 44 |
+
@article{hsu2021scicap,
|
| 45 |
+
title={SciCap: Generating captions for scientific figures},
|
| 46 |
+
author={Hsu, Ting-Yao and Giles, C Lee and Huang, Ting-Hao'Kenneth'},
|
| 47 |
+
journal={arXiv preprint arXiv:2110.11624},
|
| 48 |
+
year={2021}
|
| 49 |
+
}
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
# When running locally (recommended), copy files as above and use these:
|
| 53 |
+
_SCICAP_DIR = "/tmp/data/scicap/scicap_data"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class ScicapSubset(enum.Enum):
  """Versions of the SciCap dataset."""
  # Captions that consist of a single sentence.
  SINGLE_SENTENCE = "single_sentence"
  # Only the first sentence of each caption.
  FIRST_SENTENCE = "first_sentence"
  # Captions with at most 100 tokens.
  LEQ_100_TOKENS = "leq_100_tokens"
|
| 62 |
+
# Splits produced by the builder.
_SPLITS_TO_GENERATE = ["train", "test", "val"]
# Maps (subset, include_subfigures) to the sub-directory (under
# "List-of-Files-for-Each-Experiments") that lists the file ids belonging to
# that configuration.
_CONFIG_TO_IDS_PATH = {
    (ScicapSubset.SINGLE_SENTENCE, True): "Single-Sentence-Caption/Yes-Subfig",
    (ScicapSubset.SINGLE_SENTENCE, False): "Single-Sentence-Caption/No-Subfig",
    (ScicapSubset.FIRST_SENTENCE, True): "First-Sentence/Yes-Subfig",
    (ScicapSubset.FIRST_SENTENCE, False): "First-Sentence/No-Subfig",
    (ScicapSubset.LEQ_100_TOKENS, True):
        "Caption-No-More-Than-100-Tokens/Yes-Subfig",
    (ScicapSubset.LEQ_100_TOKENS, False):
        "Caption-No-More-Than-100-Tokens/No-Subfig",
}
# Maps include_subfigures to the image directory name.
_SUBFIG_TO_PATH = {
    True: "SciCap-Yes-Subfig-Img", False: "SciCap-No-Subfig-Img"
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class ScicapConfig(tfds.core.BuilderConfig):
  """"Configuration for SciCap caption length and subfigure inclusion."""

  def __init__(self, *, subset: ScicapSubset, subfig: bool, **kwargs):
    """Parameters specifying how the dataset will be processed.

    Args:
      subset: Subset of the Scicap data (see enum above).
      subfig: Whether or not figure with subfigures are included.
      **kwargs: Passed on to the constructor of `BuilderConfig`.
    """
    super().__init__(**kwargs)
    self.subset = subset
    self.subfig = subfig
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@functools.cache
def _read_annotations(split: str, image_id: str):
  """Reads the annotation JSON for a single figure.

  Cached because the same annotation file is read once per builder config
  (six configs share the caption files). NOTE(review): the cache is unbounded
  and keeps every annotation dict alive for the process lifetime — acceptable
  for a one-shot dataset build, but worth revisiting if reused elsewhere.

  Args:
    split: Split name ('train', 'val' or 'test').
    image_id: Figure id (file name without the '.png' suffix).

  Returns:
    The parsed annotation dict.
  """
  path = os.path.join(_SCICAP_DIR, "SciCap-Caption-All", split)
  fname = os.path.join(path, image_id + ".json")
  # Fixed: JSON is UTF-8 by spec; an explicit encoding avoids depending on
  # the machine's locale default.
  with open(fname, "r", encoding="utf-8") as fin:
    return json.load(fin)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class Scicap(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the SciCap dataset."""

  VERSION = tfds.core.Version("1.0.0")
  RELEASE_NOTES = {"1.0.0": "First release."}

  # One config per (caption subset, subfigures allowed) combination.
  BUILDER_CONFIGS = [
      ScicapConfig(
          name="single_sentence_subfig_yes",
          description="Single sentence caption with subfigures allowed.",
          subset=ScicapSubset.SINGLE_SENTENCE,
          subfig=True
      ),
      ScicapConfig(
          name="single_sentence_subfig_no",
          description="Single sentence caption with subfigures not allowed.",
          subset=ScicapSubset.SINGLE_SENTENCE,
          subfig=False
      ),
      ScicapConfig(
          name="first_sentence_subfig_yes",
          description="First sentence of captions with subfigures allowed.",
          subset=ScicapSubset.FIRST_SENTENCE,
          subfig=True
      ),
      ScicapConfig(
          name="first_sentence_subfig_no",
          description="First sentence of captions with subfigures not allowed.",
          subset=ScicapSubset.FIRST_SENTENCE,
          subfig=False
      ),
      ScicapConfig(
          name="leq_100_tokens_subfig_yes",
          description="Captions with <= 100 tokens with subfigures allowed.",
          subset=ScicapSubset.LEQ_100_TOKENS,
          subfig=True
      ),
      ScicapConfig(
          name="leq_100_tokens_subfig_no",
          description=("Captions with <= 100 tokens with subfigures"
                       " not allowed."),
          subset=ScicapSubset.LEQ_100_TOKENS,
          subfig=False
      ),
  ]

  def _info(self):
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "image/id": tfds.features.Text(),
            "image/filename": tfds.features.Text(),
            "image": tfds.features.Image(encoding_format="png"),
            # The three caption variants below correspond to the upstream
            # normalization stages (0, 1 and 2).
            "caption/originally_extracted": tfds.features.Text(),
            "caption/lowercase_and_token_and_remove_figure_index":
                tfds.features.Text(),
            "caption/normalized/basic_num": tfds.features.Text(),
            "caption/normalized/advanced_equation_bracket":
                tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage="https://github.com/tingyaohsu/SciCap",
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {split: self._generate_examples(split)
            for split in _SPLITS_TO_GENERATE}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples for the given split."""
    # Directory (under "List-of-Files-for-Each-Experiments") that lists the
    # file ids belonging to this builder config.
    config_path = _CONFIG_TO_IDS_PATH[
        (self.builder_config.subset, self.builder_config.subfig)]
    image_path = os.path.join(
        _SCICAP_DIR, _SUBFIG_TO_PATH[self.builder_config.subfig], split)
    id_list_fname = os.path.join(
        _SCICAP_DIR, "List-of-Files-for-Each-Experiments",
        config_path, split, "file_idx.json")
    with open(id_list_fname, "r") as fin:
      split_images = json.load(fin)

    for fname in split_images:
      assert fname.endswith(".png")
      image_id = fname[:-len(".png")]
      annotations = _read_annotations(split, image_id)
      yield fname, {
          "image/id": image_id,
          "image/filename": fname,
          "image": os.path.join(image_path, fname),
          "caption/originally_extracted": annotations["0-originally-extracted"],
          "caption/lowercase_and_token_and_remove_figure_index":
              annotations["1-lowercase-and-token-and-remove-figure-index"][
                  "caption"],
          "caption/normalized/basic_num": annotations["2-normalized"][
              "2-1-basic-num"]["caption"],
          # NOTE: "euqation" is misspelled in the upstream SciCap JSON keys;
          # it must stay misspelled here to match the data on disk.
          "caption/normalized/advanced_equation_bracket":
              annotations["2-normalized"][
                  "2-2-advanced-euqation-bracket"]["caption"]
      }
|
Tipsomaly/model/big_vision/datasets/science_qa/science_qa.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements ScienceQA train/val/test-set in TFDS structure.
|
| 17 |
+
|
| 18 |
+
First, download the science QA dataset from their website https://scienceqa.github.io/#download
|
| 19 |
+
- mkdir -p /tmp/data/ScienceQA_DATA
|
| 20 |
+
- From Google Drive: https://drive.google.com/corp/drive/folders/1w8imCXWYn2LxajmGeGH_g5DaL2rabHev
|
| 21 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 22 |
+
- cd big_vision/datasets
|
| 23 |
+
- env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=science_qa
|
| 24 |
+
|
| 25 |
+
Example to load:
|
| 26 |
+
|
| 27 |
+
import tensorflow_datasets as tfds
|
| 28 |
+
dataset = tfds.load(
|
| 29 |
+
'science_qa', split='train',
|
| 30 |
+
data_dir='/tmp/tfds')
|
| 31 |
+
|
| 32 |
+
"""
|
| 33 |
+
import json
|
| 34 |
+
import os
|
| 35 |
+
|
| 36 |
+
import numpy as np
|
| 37 |
+
import tensorflow_datasets as tfds
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
_DESCRIPTION = """Sci QA test-set."""
|
| 41 |
+
|
| 42 |
+
# pylint: disable=line-too-long
|
| 43 |
+
_CITATION = """
|
| 44 |
+
@inproceedings{lu2022learn,
|
| 45 |
+
title={Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering},
|
| 46 |
+
author={Lu, Pan and Mishra, Swaroop and Xia, Tony and Qiu, Liang and Chang, Kai-Wei and Zhu, Song-Chun and Tafjord, Oyvind and Clark, Peter and Ashwin Kalyan},
|
| 47 |
+
booktitle={The 36th Conference on Neural Information Processing Systems (NeurIPS)},
|
| 48 |
+
year={2022}
|
| 49 |
+
}
|
| 50 |
+
"""
|
| 51 |
+
# pylint: enable=line-too-long
|
| 52 |
+
|
| 53 |
+
# When running locally (recommended), copy files as above and use these:
|
| 54 |
+
_SCIQA_PATH = '/tmp/data/ScienceQA_DATA/'
|
| 55 |
+
# _IMAGE_COCO_PATH = '/tmp/data/val2014'
|
| 56 |
+
|
| 57 |
+
_ALPHABETS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class ScienceQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the ScienceQA dataset.

  Reads `problems.json` from `_SCIQA_PATH` and yields one example per
  question that has an associated image. Examples without an image are
  skipped because this conversion targets VQA tasks.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the dataset metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'question': tfds.features.Text(),
            'choices': tfds.features.Sequence(tfds.features.Text()),
            'answer': tfds.features.Scalar(np.int32),
            'hint': tfds.features.Text(),
            'task': tfds.features.Text(),
            'grade': tfds.features.Text(),
            'subject': tfds.features.Text(),
            'topic': tfds.features.Text(),
            'category': tfds.features.Text(),
            'skill': tfds.features.Text(),
            'lecture': tfds.features.Text(),
            'solution': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='png'),
            'indexed_choices': tfds.features.Text(),
            'indexed_answer': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/lupantech/ScienceQA/tree/main',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {
        split: self._generate_examples(split)
        for split in ('train', 'test', 'val')
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples for the requested split.

    Args:
      split: One of 'train', 'val', 'test'; matched against each problem's
        own 'split' field in `problems.json`.

    Yields:
      (int key, dict example) tuples; only problems with an image.
    """
    annot_fname = os.path.join(_SCIQA_PATH, 'problems.json')

    with open(annot_fname, 'r') as f:
      data = json.load(f)

    for k, v in data.items():
      if v['split'] != split:  # "split":"train"
        continue
      # ScienceQA also contains examples without an image. As this
      # conversion is for VQA tasks, we drop the examples without image.
      # TODO: Include the examples without image, and update the
      # downstream pipeline to skip the examples without image, instead of
      # doing it at pre-processing.
      image = v['image']
      if not image:
        continue
      image = os.path.join(_SCIQA_PATH, split, k, image)
      # Align with the original github implementation: empty hint -> 'N/A'.
      hint = v['hint'] or 'N/A'
      choices = v['choices']
      answer = v['answer']
      # Render choices as "(A) foo, (B) bar, ..." and the answer as its
      # letter, for prompt-style training.
      indexed_choices = ', '.join(
          f'({_ALPHABETS[i]}) {c}' for i, c in enumerate(choices)
      )
      indexed_answer = _ALPHABETS[int(answer)]
      yield int(k), {
          'question': v['question'],
          'choices': choices,
          'answer': answer,
          'hint': hint,
          'task': v['task'],
          'grade': v['grade'],
          'subject': v['subject'],
          'topic': v['topic'],
          'category': v['category'],
          'skill': v['skill'],
          'lecture': v['lecture'],
          'solution': v['solution'],
          'image': image,
          'indexed_choices': indexed_choices,
          'indexed_answer': indexed_answer,
      }
|
Tipsomaly/model/big_vision/datasets/screen2words/screen2words.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Creates TFDS dataset for Screen2words.
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
Preparing the data:
|
| 20 |
+
1) mkdir /tmp/data/rico && cd /tmp/data/rico
|
| 21 |
+
2) wget https://storage.googleapis.com/crowdstf-rico-uiuc-4540/rico_dataset_v0.1/unique_uis.tar.gz
|
| 22 |
+
3) tar xvfz unique_uis.tar.gz && rm unique_uis.tar.gz
|
| 23 |
+
4) git clone https://github.com/google-research-datasets/screen2words.git
|
| 24 |
+
|
| 25 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 26 |
+
|
| 27 |
+
cd big_vision/datasets
|
| 28 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=screen2words
|
| 29 |
+
|
| 30 |
+
Example to load:
|
| 31 |
+
|
| 32 |
+
import tensorflow_datasets as tfds
|
| 33 |
+
dataset = tfds.load('screen2_words', split='train', data_dir='/tmp/tfds')
|
| 34 |
+
"""
|
| 35 |
+
# pylint: enable=line-too-long
|
| 36 |
+
import collections
|
| 37 |
+
import csv
|
| 38 |
+
import os
|
| 39 |
+
|
| 40 |
+
import numpy as np
|
| 41 |
+
import tensorflow_datasets as tfds
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
_DESCRIPTION = """Screen2words dataset."""
|
| 45 |
+
_CITATION = """
|
| 46 |
+
@inproceedings{wang2021screen2words,
|
| 47 |
+
title={Screen2words: Automatic mobile UI summarization with multimodal
|
| 48 |
+
learning},
|
| 49 |
+
author={Wang, Bryan and
|
| 50 |
+
Li, Gang and
|
| 51 |
+
Zhou, Xin and
|
| 52 |
+
Chen, Zhourong and
|
| 53 |
+
Grossman, Tovi and
|
| 54 |
+
Li, Yang},
|
| 55 |
+
booktitle={The 34th Annual ACM Symposium on User Interface Software
|
| 56 |
+
and Technology},
|
| 57 |
+
pages={498--510},
|
| 58 |
+
year={2021}
|
| 59 |
+
}
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
# When running locally (recommended), copy files as above and use these:
|
| 63 |
+
_SCREEN2WORDS_DIR = "/tmp/data/rico/screen2words"
|
| 64 |
+
_RICO_DIR = "/tmp/data/rico/combined"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# (name, path) tuples for splits to be generated.
|
| 68 |
+
_SPLITS_TO_GENERATE = ["train", "dev", "test"]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class Screen2Words(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the Screen2words dataset."""

  VERSION = tfds.core.Version("1.0.0")
  RELEASE_NOTES = {"1.0.0": "First release."}

  def _info(self):
    """Returns the dataset metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "image/id": tfds.features.Scalar(np.int32),
            "image/filename": tfds.features.Text(),
            "image": tfds.features.Image(encoding_format="jpeg"),
            "summary": tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,
        homepage="https://github.com/google-research-datasets/screen2words",
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {name: self._generate_examples(name)
            for name in _SPLITS_TO_GENERATE}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples for one split."""
    # Screen ids belonging to this split, one id per line.
    ids_path = os.path.join(_SCREEN2WORDS_DIR, "split", f"{split}_screens.txt")
    with open(ids_path, "r") as ids_file:
      raw_ids = ids_file.readlines()

    # Collect all human-written summaries, grouped by screen id.
    csv_path = os.path.join(_SCREEN2WORDS_DIR, "screen_summaries.csv")
    per_screen_summaries = collections.defaultdict(list)
    with open(csv_path, "r") as csv_file:
      for row in csv.DictReader(csv_file):
        per_screen_summaries[int(row["screenId"])].append(row["summary"])

    for raw_id in raw_ids:
      screen_id = int(raw_id.strip())
      filename = f"{screen_id}.jpg"
      yield screen_id, {
          "image/id": screen_id,
          "image/filename": filename,
          "image": os.path.join(_RICO_DIR, filename),
          "summary": per_screen_summaries[screen_id],
      }
|
Tipsomaly/model/big_vision/datasets/stvqa/stvqa.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements ST-VQA dataset in TFDS.
|
| 17 |
+
|
| 18 |
+
It's small data, so simple to run locally.
|
| 19 |
+
First, download and unzip the dataset from https://rrc.cvc.uab.es/?ch=11
|
| 20 |
+
and place it in /tmp/data/stvqa.
|
| 21 |
+
|
| 22 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 23 |
+
|
| 24 |
+
cd third_party/py/big_vision/datasets
|
| 25 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=stvqa
|
| 26 |
+
|
| 27 |
+
Example to load:
|
| 28 |
+
|
| 29 |
+
import tensorflow_datasets as tfds
|
| 30 |
+
dataset = tfds.load('stvqa', split='train', data_dir='/tmp/tfds')
|
| 31 |
+
|
| 32 |
+
Dataset splits:
|
| 33 |
+
train: 23446 examples/questions (subset of original train)
|
| 34 |
+
val: 2628 examples/questions (subset of original train)
|
| 35 |
+
test: 4070 examples/questions (no answers)
|
| 36 |
+
|
| 37 |
+
Note: original source data has no val/holdout split, and we therefore split the
|
| 38 |
+
original train split (26074 examples/questions) by ourselves into train & val
|
| 39 |
+
splits.
|
| 40 |
+
|
| 41 |
+
Recommended training splits:
|
| 42 |
+
train: train
|
| 43 |
+
minitrain: train[:5%]
|
| 44 |
+
eval: val
|
| 45 |
+
fulltrain: train+val
|
| 46 |
+
"""
|
| 47 |
+
import json
|
| 48 |
+
import os
|
| 49 |
+
|
| 50 |
+
from big_vision.datasets.stvqa import val_ids
|
| 51 |
+
import numpy as np
|
| 52 |
+
import tensorflow_datasets as tfds
|
| 53 |
+
|
| 54 |
+
_VAL_IDS = val_ids.PSEUDO_VAL_IMAGE_PATHS
|
| 55 |
+
|
| 56 |
+
_DESCRIPTION = """ST-VQA dataset."""
|
| 57 |
+
|
| 58 |
+
# pylint: disable=line-too-long
|
| 59 |
+
_CITATION = """
|
| 60 |
+
@inproceedings{Biten_2019,
|
| 61 |
+
title={Scene Text Visual Question Answering},
|
| 62 |
+
url={http://dx.doi.org/10.1109/ICCV.2019.00439},
|
| 63 |
+
DOI={10.1109/iccv.2019.00439},
|
| 64 |
+
booktitle={2019 IEEE/CVF International Conference on Computer Vision (ICCV)},
|
| 65 |
+
publisher={IEEE},
|
| 66 |
+
author={Biten, Ali Furkan and Tito, Ruben and Mafla, Andres and Gomez, Lluis and Rusinol, Marcal and Jawahar, C.V. and Valveny, Ernest and Karatzas, Dimosthenis},
|
| 67 |
+
year={2019},
|
| 68 |
+
month=oct }
|
| 69 |
+
"""
|
| 70 |
+
# pylint: enable=line-too-long
|
| 71 |
+
|
| 72 |
+
# When running locally (recommended), copy files as above and use these:
|
| 73 |
+
_STVQA_PATH = '/tmp/data/stvqa/'
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class Stvqa(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for ST-VQA dataset."""

  VERSION = tfds.core.Version('1.2.0')
  RELEASE_NOTES = {
      '1.0.0': 'First release.',
      '1.1.0': 'Switch to COCO high-res images and lower-case answers.',
      '1.2.0': 'Rename pseudo splits and remove lower-case answers.',
  }

  def _info(self):
    """Returns the dataset metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'question_id': tfds.features.Scalar(np.int32),
            'filename': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question': tfds.features.Text(),
            'answers': tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,
        homepage='https://rrc.cvc.uab.es/?ch=11',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {name: self._generate_examples(name)
            for name in ('train', 'val', 'test')}

  def _generate_examples(self, split):
    """Yields (key, example) tuples."""
    # Test examples have their own annotation file and image directory;
    # the 'train' and 'val' pseudo splits are both carved out of the
    # original train split via the _VAL_IDS image-path list.
    source = 'test' if split == 'test' else 'train'
    image_dir = f'{source}{"_task3" if source == "test" else ""}_images'

    with open(os.path.join(_STVQA_PATH, f'{source}_task_3.json'), 'r') as f:
      annotations = json.load(f)

    for entry in annotations['data']:
      file_path = entry['file_path']
      in_val = file_path in _VAL_IDS
      # Keep only entries that belong to the requested pseudo split.
      if (split == 'val' and not in_val) or (split == 'train' and in_val):
        continue
      image_path = os.path.join(_STVQA_PATH, image_dir, file_path)
      # Always use high-res COCO images from train2014 directory.
      if file_path.startswith('coco-text'):
        image_path = image_path.replace(
            os.path.join(image_dir, 'coco-text'), 'train2014')
      yield entry['question_id'], {
          'question_id': entry['question_id'],
          'filename': file_path,
          'image': image_path,
          'question': entry['question'],
          'answers': entry.get('answers', []),
      }
|
Tipsomaly/model/big_vision/datasets/tallyqa/tallyqa.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Import TallyQA into TFDS format. Uses Visual Genome and COCO images.
|
| 16 |
+
|
| 17 |
+
It's small data, so simple to run locally. First, download all the data:
|
| 18 |
+
|
| 19 |
+
mkdir /tmp/data/ ; cd /tmp/data
|
| 20 |
+
wget http://images.cocodataset.org/zips/{train2014,val2014}.zip
|
| 21 |
+
wget https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip
|
| 22 |
+
wget https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip
|
| 23 |
+
wget https://github.com/manoja328/tallyqa/blob/master/tallyqa.zip?raw=true
|
| 24 |
+
unzip *.zip
|
| 25 |
+
|
| 26 |
+
Then, update the PATHs below and run conversion locally like so (make sure to
|
| 27 |
+
install tensorflow-datasets for the `tfds` util):
|
| 28 |
+
|
| 29 |
+
cd big_vision/datasets
|
| 30 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=tallyqa
|
| 31 |
+
|
| 32 |
+
Example to load:
|
| 33 |
+
import tensorflow_datasets as tfds
|
| 34 |
+
dataset = tfds.load('tallyqa', split='train', data_dir='/tmp/tfds')
|
| 35 |
+
|
| 36 |
+
The test split distinguishes between simple and complex questions. The train
|
| 37 |
+
split does not contain this information. We therefore set issimple to `-1` in
|
| 38 |
+
the train split to indicate it is not known.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
import json
|
| 42 |
+
|
| 43 |
+
import numpy as np
|
| 44 |
+
import tensorflow_datasets as tfds
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
_TALLYQA_PATH = '/tmp/data/tallyQA/'
|
| 48 |
+
_VISUAL_GENOME_PATH = '/tmp/data/visual_genome/'
|
| 49 |
+
|
| 50 |
+
_COCO_PATH = '/tmp/data/coco/'
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
_DESCRIPTION = """
|
| 54 |
+
TallyQA: Answering Complex Counting Questions
|
| 55 |
+
Most counting questions in visual question answering (VQA) datasets are simple
|
| 56 |
+
and require no more than object detection. Here, we study algorithms for complex
|
| 57 |
+
counting questions that involve relationships between objects, attribute
|
| 58 |
+
identification, reasoning, and more. To do this, we created TallyQA, the world's
|
| 59 |
+
largest dataset for open-ended counting.
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
_CITATION = """
|
| 63 |
+
@inproceedings{acharya2019tallyqa,
|
| 64 |
+
title={TallyQA: Answering Complex Counting Questions},
|
| 65 |
+
author={Acharya, Manoj and Kafle, Kushal and Kanan, Christopher},
|
| 66 |
+
booktitle={AAAI},
|
| 67 |
+
year={2019}
|
| 68 |
+
}
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
_HOMEPAGE = 'https://github.com/manoja328/TallyQA_dataset'
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class TallyQA(tfds.core.GeneratorBasedBuilder):
  """Import TallyQA dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'Initial release.'}
  MANUAL_DOWNLOAD_INSTRUCTIONS = """
  There are three parts which should be downloaded:
  * TallyQA (train / test json files)
  * Visual Genome images (needed for train and test split)
  * COCO (2014) train / val images (only needed for train split)
  """

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        features=tfds.features.FeaturesDict({
            'image': tfds.features.Image(shape=(None, None, 3)),
            'image_id': tfds.features.Scalar(dtype=np.int32),
            'image_source': tfds.features.Text(),
            'question': tfds.features.Text(),
            'question_id': tfds.features.Scalar(dtype=np.int32),
            'answer': tfds.features.Scalar(dtype=np.int32),
            'issimple': tfds.features.Scalar(dtype=np.int32),
        }),
        description=_DESCRIPTION,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager) -> ...:
    """Call the function which defines the splits."""
    del dl_manager
    return {name: self._generate_examples(split=name)
            for name in ('train', 'test')}

  def _generate_examples(self, split: str) -> ...:
    """Yields (key, example) tuples for one split."""
    with open(f'{_TALLYQA_PATH}/{split}.json', 'r') as f:
      entries = json.load(f)

    for entry in entries:
      # The TallyQA images come from two sources: Visual Genome and COCO.
      # Determine the correct dataset by inspecting the path prefix.
      rel_path = entry['image']
      if rel_path.startswith('VG_100K'):
        full_path = _VISUAL_GENOME_PATH + rel_path
      elif rel_path.startswith('train2014') or rel_path.startswith('val2014'):
        full_path = _COCO_PATH + rel_path
      else:
        raise ValueError(f'Unknown image path: {rel_path}')

      example = {
          'image': full_path,
          'image_id': entry['image_id'],
          'image_source': entry['data_source'],
          'question': entry['question'],
          'question_id': entry['question_id'],
          'answer': int(entry['answer']),
          # 'issimple' is only present in the test split; -1 in the train
          # split indicates it is not known.
          'issimple': entry['issimple'] if split == 'test' else -1,
      }
      key = f'{example["image_id"]} / {example["question_id"]}'
      yield key, example
|
Tipsomaly/model/big_vision/datasets/textcaps/textcaps.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements textcaps val-set in TFDS structure.
|
| 17 |
+
|
| 18 |
+
It's small data, so simple to run locally. First, copy the data to local disk:
|
| 19 |
+
|
| 20 |
+
mkdir -p /tmp/data/textcaps
|
| 21 |
+
cd /tmp/data/textcaps
|
| 22 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json
|
| 23 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_val.json
|
| 24 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_test.json
|
| 25 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip
|
| 26 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/images/test_images.zip
|
| 27 |
+
unzip train_val_images.zip
|
| 28 |
+
rm train_val_images.zip
|
| 29 |
+
unzip test_images.zip
|
| 30 |
+
rm test_images.zip
|
| 31 |
+
|
| 32 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the
|
| 33 |
+
`tfds` util):
|
| 34 |
+
|
| 35 |
+
cd big_vision/datasets
|
| 36 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=textcaps
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
Example to load:
|
| 40 |
+
|
| 41 |
+
import tensorflow_datasets as tfds
|
| 42 |
+
dataset = tfds.load('text_caps', split='val', data_dir='/tmp/tfds')
|
| 43 |
+
"""
|
| 44 |
+
import collections
|
| 45 |
+
import json
|
| 46 |
+
import os
|
| 47 |
+
|
| 48 |
+
from absl import logging
|
| 49 |
+
import numpy as np
|
| 50 |
+
import tensorflow_datasets as tfds
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
_DESCRIPTION = """TextCaps dataset."""
|
| 54 |
+
|
| 55 |
+
# pylint: disable=line-too-long
|
| 56 |
+
_CITATION = (
|
| 57 |
+
'@inproceedings{sidorov2019textcaps,'
|
| 58 |
+
'title={TextCaps: a Dataset for Image Captioningwith Reading Comprehension},'
|
| 59 |
+
'author={Sidorov, Oleksii and Hu, Ronghang and Rohrbach, Marcus and Singh, Amanpreet},'
|
| 60 |
+
'journal={European Conference on Computer Vision},'
|
| 61 |
+
'year={2020}}')
|
| 62 |
+
# pylint: enable=line-too-long
|
| 63 |
+
|
| 64 |
+
# When running locally (recommended), copy files as above and use these:
|
| 65 |
+
_FILEPATH = '/tmp/data/textcaps/'
|
| 66 |
+
_TRAIN_FILES = '/tmp/data/textcaps/TextCaps_0.1_train.json'
|
| 67 |
+
_VAL_FILES = '/tmp/data/textcaps/TextCaps_0.1_val.json'
|
| 68 |
+
_TEST_FILES = '/tmp/data/textcaps/TextCaps_0.1_test.json'
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class TextCaps(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for TextCaps dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata.

    (tfds.core.DatasetInfo object)
    These are the features of your dataset like images, labels, etc.
    """
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'image/id': tfds.features.Text(),
            'image_filepath': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'texts': tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,  # Set to `None` to disable
        homepage='https://textvqa.org/textcaps/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""

    def group_by_id(data, image_dir):
      # Bucket the raw caption entries per image id, then collapse each
      # bucket into a single example with all its captions.
      buckets = collections.defaultdict(list)
      for entry in data:
        buckets[entry['image_id']].append(entry)

      grouped = {}
      for image_id, entries in buckets.items():
        # Every entry in a bucket must refer to the same image.
        assert len({e['image_id'] for e in entries}) == 1
        assert len({e['image_name'] for e in entries}) == 1
        # The test split has no 'caption_str' fields; keep only non-empty.
        captions = [e.get('caption_str') for e in entries
                    if e.get('caption_str')]
        image_filepath = os.path.join(
            _FILEPATH, image_dir, str(entries[0]['image_name']) + '.jpg')
        grouped[image_id] = {
            'image/id': entries[0]['image_id'],
            'image_filepath': image_filepath,
            'image': image_filepath,
            'texts': captions,
        }
      return grouped

    # Returns the Dict[split names, Iterator[Key, Example]].
    # Note: val images also live in the train_images directory.
    with open(_TRAIN_FILES) as f:
      train_data = group_by_id(json.load(f)['data'], 'train_images')
    with open(_VAL_FILES) as f:
      val_data = group_by_id(json.load(f)['data'], 'train_images')
    with open(_TEST_FILES) as f:
      test_data = group_by_id(json.load(f)['data'], 'test_images')
    return {
        'train': self._generate_examples(train_data),
        'val': self._generate_examples(val_data),
        'test': self._generate_examples(test_data),
    }

  def _generate_examples(self, data):
    """Generate a tf.Example object.

    This contains the image, objects, attributes, regions and relationships.

    Args:
      data: a dictionary with the image/id.

    Yields:
      (key, example) tuples from dataset. The example has format specified in
      the above DatasetInfo.
    """
    yield from data.items()
|
Tipsomaly/model/big_vision/datasets/textvqa/textvqa.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements textvqa in TFDS structure.
|
| 17 |
+
|
| 18 |
+
It's small data, so simple to run locally. First, copy the data to local disk:
|
| 19 |
+
|
| 20 |
+
mkdir -p /tmp/data/textvqa
|
| 21 |
+
cd /tmp/data/textvqa
|
| 22 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip
|
| 23 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/images/test_images.zip
|
| 24 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_train.json
|
| 25 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
|
| 26 |
+
curl -O https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_test.json
|
| 27 |
+
# The Rosetta_OCR files are probably not needed.
|
| 28 |
+
# curl -O https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_Rosetta_OCR_v0.2_train.json
|
| 29 |
+
# curl -O https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_Rosetta_OCR_v0.2_val.json
|
| 30 |
+
# curl -O https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_Rosetta_OCR_v0.2_test.json
|
| 31 |
+
unzip train_val_images.zip
|
| 32 |
+
rm train_val_images.zip
|
| 33 |
+
unzip test_images.zip
|
| 34 |
+
rm test_images.zip
|
| 35 |
+
# Background: at https://textvqa.org/dataset/ it says:
|
| 36 |
+
# "Note: Some of the images in OpenImages are rotated,
|
| 37 |
+
# please make sure to check the Rotation field in the Image IDs files
|
| 38 |
+
# for train and test."
|
| 39 |
+
curl -O https://storage.googleapis.com/openimages/2018_04/train/train-images-boxable-with-rotation.csv
|
| 40 |
+
curl -O https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv
|
| 41 |
+
mv train-images-boxable-with-rotation.csv train_images/rotation.csv
|
| 42 |
+
mv test-images-with-rotation.csv test_images/rotation.csv
|
| 43 |
+
|
| 44 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 45 |
+
|
| 46 |
+
cd big_vision/datasets
|
| 47 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=textvqa
|
| 48 |
+
|
| 49 |
+
Example to load:
|
| 50 |
+
|
| 51 |
+
import tensorflow_datasets as tfds
|
| 52 |
+
dataset = tfds.load('textvqa', split='train', data_dir='/tmp/tfds')
|
| 53 |
+
"""
|
| 54 |
+
import json
|
| 55 |
+
import os
|
| 56 |
+
|
| 57 |
+
from absl import logging
|
| 58 |
+
import numpy as np
|
| 59 |
+
import pandas as pd
|
| 60 |
+
import tensorflow as tf
|
| 61 |
+
import tensorflow_datasets as tfds
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
_DESCRIPTION = """TextVqa dataset."""
|
| 65 |
+
|
| 66 |
+
# pylint: disable=line-too-long
|
| 67 |
+
_CITATION = (
|
| 68 |
+
'@inproceedings{singh2019towards,'
|
| 69 |
+
'title={Towards VQA Models That Can Read},'
|
| 70 |
+
'author={Singh, Amanpreet and Natarjan, Vivek and Shah, Meet and Jiang, Yu and Chen, Xinlei and Parikh, Devi and Rohrbach, Marcus},'
|
| 71 |
+
'booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},'
|
| 72 |
+
'pages={8317-8326},'
|
| 73 |
+
'year={2019}}'
|
| 74 |
+
)
|
| 75 |
+
# pylint: enable=line-too-long
|
| 76 |
+
|
| 77 |
+
# When running locally (recommended), copy files as above and use these:
|
| 78 |
+
_FILEPATH = '/tmp/data/textvqa/'
|
| 79 |
+
_TRAIN_FILES = '/tmp/data/textvqa/TextVQA_0.5.1_train.json'
|
| 80 |
+
_VAL_FILES = '/tmp/data/textvqa/TextVQA_0.5.1_val.json'
|
| 81 |
+
_TEST_FILES = '/tmp/data/textvqa/TextVQA_0.5.1_test.json'
|
| 82 |
+
_ROTATION_CSV = 'rotation.csv'
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class TextVqa(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for textvqa dataset."""

  VERSION = tfds.core.Version('1.0.1')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
      '1.0.1': 'Undo rotation for known rotated images.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata.

    (tfds.core.DatasetInfo object)
    These are the features of your dataset like images, labels, etc.
    """
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'image/id': tfds.features.Scalar(np.int32),
            'image_filepath': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question_id': tfds.features.Scalar(np.int32),
            'question': tfds.features.Text(),
            'answers': tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,  # Set to `None` to disable.
        homepage='https://textvqa.org/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""

    def json_to_examples(data, image_dir):
      """Converts raw question records to example dicts keyed by question id."""
      logging.info('Processing %d items in %s', len(data), image_dir)
      # OpenImages rotation metadata: some source images are stored rotated
      # and must be un-rotated before use (see module docstring).
      rot = pd.read_csv(os.path.join(_FILEPATH, image_dir, _ROTATION_CSV))
      rotation_by_id = {}
      for row in rot.itertuples():
        rotation = int(row.Rotation) if not np.isnan(row.Rotation) else 0
        rotation_by_id[row.ImageID] = rotation

      examples = {}
      for v in data:
        image_id = str(v['image_id'])
        image_filepath = os.path.join(_FILEPATH, image_dir, image_id + '.jpg')
        question_id = v['question_id']
        examples[question_id] = {
            'image/id': question_id,
            'image_filepath': image_filepath,
            'image': image_filepath,
            # 'rotation' is consumed (and removed) in _generate_examples.
            'rotation': rotation_by_id[image_id],
            'question_id': question_id,
            'question': v['question'],
            'answers': v.get('answers', []),  # No answers in test set.
        }
      return examples

    # Returns the Dict[split names, Iterator[Key, Example]].
    with open(_TRAIN_FILES) as f:
      train_data = json_to_examples(json.load(f)['data'], 'train_images')
    with open(_VAL_FILES) as f:
      # Validation images are stored in the train_images folder.
      val_data = json_to_examples(json.load(f)['data'], 'train_images')
    with open(_TEST_FILES) as f:
      test_data = json_to_examples(json.load(f)['data'], 'test_images')
    return {
        'train': self._generate_examples(train_data),
        'val': self._generate_examples(val_data),
        'test': self._generate_examples(test_data),
    }

  def _generate_examples(self, data):
    """Generate a tf.Example object.

    Args:
      data: dict mapping question id to an example dict (with a transient
        'rotation' entry added by _split_generators).

    Yields:
      (key, example) tuples from dataset. The example has format specified in
      the above DatasetInfo.
    """
    for k, v in data.items():
      # Fix: read via a context manager so the file handle is closed promptly
      # (the original `open(...).read()` leaked one fd per example until GC).
      with open(v['image_filepath'], 'rb') as f:
        image_bytes = f.read()
      # If the image is rotated, we undo the rotation here and re-encode.
      if v['rotation'] != 0:
        rotation = v['rotation']
        assert rotation % 90 == 0
        turns = rotation // 90
        image = tf.image.decode_jpeg(image_bytes)
        image_bytes = tf.io.encode_jpeg(
            tf.image.rot90(image, turns), quality=100
        ).numpy()
      # If no rotation was needed, we just pass along the unchanged bytes.
      v['image'] = image_bytes

      # Now all rotation should have been accounted for. And we don't want to
      # pass on the (now obsolete) rotation info as features.
      del v['rotation']

      yield k, v
|
Tipsomaly/model/big_vision/datasets/vizwizvqa/vizwizvqa.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Implements VizWizVQA dataset in TFDS structure.
|
| 17 |
+
|
| 18 |
+
It's small data, so simple to run locally. First, copy the data to local disk:
|
| 19 |
+
|
| 20 |
+
mkdir -p /tmp/data/vizwizvqa
|
| 21 |
+
|
| 22 |
+
  wget -P /tmp/data/vizwizvqa https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip
|
| 23 |
+
  wget -P /tmp/data/vizwizvqa https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip
|
| 24 |
+
  wget -P /tmp/data/vizwizvqa https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip
|
| 25 |
+
|
| 26 |
+
Then, run conversion locally
|
| 27 |
+
(make sure to install tensorflow-datasets for the `tfds` util):
|
| 28 |
+
|
| 29 |
+
cd big_vision/datasets
|
| 30 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=vizwizvqa
|
| 31 |
+
|
| 32 |
+
Example to load:
|
| 33 |
+
|
| 34 |
+
import tensorflow_datasets as tfds
|
| 35 |
+
dataset = tfds.load('vizwizvqa', split='train', data_dir='/tmp/tfds')
|
| 36 |
+
"""
|
| 37 |
+
import json
|
| 38 |
+
import os
|
| 39 |
+
|
| 40 |
+
import numpy as np
|
| 41 |
+
import tensorflow_datasets as tfds
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
_DESCRIPTION = """VizWiz VQA Dataset."""
|
| 45 |
+
|
| 46 |
+
# pylint: disable=line-too-long
|
| 47 |
+
_CITATION = """
|
| 48 |
+
@inproceedings{gurari2018vizwiz,
|
| 49 |
+
title={Vizwiz grand challenge: Answering visual questions from blind people},
|
| 50 |
+
author={Gurari, Danna and Li, Qing and Stangl, Abigale J and Guo, Anhong and Lin, Chi and Grauman, Kristen and Luo, Jiebo and Bigham, Jeffrey P},
|
| 51 |
+
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
|
| 52 |
+
pages={3608--3617},
|
| 53 |
+
year={2018}
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
"""
|
| 57 |
+
# pylint: enable=line-too-long
|
| 58 |
+
|
| 59 |
+
# When running locally (recommended), copy files as above and use these:
|
| 60 |
+
_VIZWIZVQA_PATH = '/tmp/data/vizwizvqa/'
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class VizWizVQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for VizWizVQA dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the metadata."""
    features = tfds.features.FeaturesDict({
        'question': tfds.features.Text(),
        'image/filename': tfds.features.Text(),
        'image': tfds.features.Image(encoding_format='jpeg'),
        'answers': tfds.features.Sequence(tfds.features.Text()),
        # Each confidence is one of the strings "yes", "no" or "maybe".
        'answer_confidences': tfds.features.Sequence(tfds.features.Text()),
        'answerable': tfds.features.Scalar(np.int32),
        'question_id': tfds.features.Scalar(np.int32),
    })
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage='https://vizwiz.org/tasks-and-datasets/vqa/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {name: self._generate_examples(name)
            for name in ('val', 'train', 'test')}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples for the given split."""
    annot_fname = os.path.join(_VIZWIZVQA_PATH, 'annotations', f'{split}.json')

    with open(annot_fname, 'r') as f:
      data = json.load(f)

    for entry in data:

      answers = []
      confidences = []

      image_file = entry['image']
      answerable = -1  # Sentinel: the test split has no answer annotations.
      if split != 'test':
        for answer in entry['answers']:
          # A couple of answers in the train set are empty strings.
          if not answer['answer']:
            continue
          answers.append(answer['answer'])
          confidences.append(answer['answer_confidence'])
        answerable = entry['answerable']

      # The question id is the numeric suffix of the image filename
      # (e.g. "VizWiz_train_00001234.jpg" -> 1234).
      question_id = int(image_file[:-4].split('_')[-1])

      yield entry['image'], {
          'question': entry['question'],
          'image/filename': image_file,
          'question_id': question_id,
          'image': os.path.join(_VIZWIZVQA_PATH, split, image_file),
          'answers': answers,
          'answer_confidences': confidences,
          'answerable': answerable,
      }
|
Tipsomaly/model/big_vision/datasets/vqa/vqa.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Import VQAv2 into TFDS format. Uses coco-2014 images.
|
| 17 |
+
|
| 18 |
+
It's small data, so simple to run locally. First, download all the data:
|
| 19 |
+
|
| 20 |
+
mkdir /tmp/data/ ; cd /tmp/data
|
| 21 |
+
wget http://images.cocodataset.org/zips/{train2014,val2014,test2015}.zip
|
| 22 |
+
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_{Train,Val,Test}_mscoco.zip
|
| 23 |
+
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_{Train,Val}_mscoco.zip
|
| 24 |
+
unzip '*.zip'
|
| 25 |
+
|
| 26 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 27 |
+
|
| 28 |
+
cd big_vision/datasets
|
| 29 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=vqa
|
| 30 |
+
|
| 31 |
+
It runs at around 750 examples/sec, so takes around 25min for the 1.2M questions.
|
| 32 |
+
Each question is an example; images are repeated, a bit wasteful, but disk is cheap.
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
Example to load:
|
| 36 |
+
|
| 37 |
+
import tensorflow_datasets as tfds
|
| 38 |
+
dataset = tfds.load('vqa', split='train', data_dir='/tmp/tfds')
|
| 39 |
+
"""
|
| 40 |
+
import json
|
| 41 |
+
import os
|
| 42 |
+
|
| 43 |
+
import numpy as np
|
| 44 |
+
import tensorflow_datasets as tfds
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
_VQAV2_PATH = '/tmp/data'
|
| 48 |
+
_IMAGE_PATH = '/tmp/data'
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
_CITATION = (
|
| 52 |
+
'@InProceedings{balanced_vqa_v2,'
|
| 53 |
+
'author = {Yash Goyal and Tejas Khot and '
|
| 54 |
+
'Douglas Summers{-}Stay and Dhruv Batra and Devi Parikh},'
|
| 55 |
+
'title = {Making the {V} in {VQA} Matter: Elevating the Role of Image'
|
| 56 |
+
'Understanding in {V}isual {Q}uestion {A}nswering},'
|
| 57 |
+
'booktitle = {Computer Vision and Pattern Recognition (CVPR)},'
|
| 58 |
+
'year = {2017},}')
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class Vqa(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for VQAv2 dataset."""

  VERSION = tfds.core.Version('3.0.0')
  RELEASE_NOTES = {'3.0.0': 'Format as needed for PaliGemma'}

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description='The VQAv2 dataset.',
        features=tfds.features.FeaturesDict({
            'image/id': np.int32,
            'image/filename': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question_id': np.int32,
            'question_type': tfds.features.Text(),
            'question_text': tfds.features.Text(),
            'answer_type': tfds.features.Text(),
            'answers': tfds.features.Sequence(tfds.features.Text()),
            'answer_confidences': tfds.features.Sequence(
                tfds.features.ClassLabel(names=['no', 'maybe', 'yes'])),
            'top_answer': tfds.features.Text(),
        }),
        homepage='https://visualqa.org/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {
        'train': self._generate_examples('train2014'),
        'validation': self._generate_examples('val2014'),
        'test': self._generate_examples('test2015'),
        # test-dev questions reference images stored in the test2015 folder.
        'test-dev': self._generate_examples('test-dev2015', 'test2015'),
    }

  def _generate_examples(self, split, image_folder=None):
    """Yields (key, example) tuples for one split.

    Args:
      split: name used in the question/annotation json filenames,
        e.g. 'train2014' or 'test-dev2015'.
      image_folder: directory holding the images; defaults to `split`
        (test-dev reuses the test2015 images).

    Yields:
      (question_id, example) tuples matching the DatasetInfo features.
    """
    image_folder = image_folder or split

    # The questions file has fields image_id, question, question_id.
    with open(os.path.join(
        _VQAV2_PATH, f'v2_OpenEnded_mscoco_{split}_questions.json')) as f:
      examples = json.load(f)['questions']

    # The annotations file has fields: image_id, question_id, answers,
    # answer_type, question_type, multiple_choice_answer.
    # Test splits ship without annotations, so `annots` only exists for
    # train/val; the loop below only reads it on the non-test branch.
    if 'test' not in split:
      with open(os.path.join(
          _VQAV2_PATH, f'v2_mscoco_{split}_annotations.json')) as f:
        annots = {a['question_id']: a for a in json.load(f)['annotations']}

    for ex in examples:
      qid = ex['question_id']
      # Rebind `ex` to the output dict; the raw record is no longer needed.
      ex = {
          'image/id': ex['image_id'],
          'question_id': qid,
          'question_text': ex['question'],
      }
      if 'test' not in split:
        fname = f'COCO_{image_folder}_{ex["image/id"]:012d}.jpg'
        ex['image/filename'] = fname
        ex['image'] = os.path.join(_IMAGE_PATH, image_folder, fname)
        ann = annots[qid]
        ex['question_type'] = ann['question_type']
        ex['answer_type'] = ann['answer_type']
        ex['answers'] = [a['answer'] for a in ann['answers']]
        ex['answer_confidences'] = [a['answer_confidence']
                                    for a in ann['answers']]
        ex['top_answer'] = ann['multiple_choice_answer']
      else:
        # For test images, a few are from the wrong year...
        fname = f'COCO_{image_folder}_{ex["image/id"]:012d}.jpg'
        ex['image/filename'] = fname
        # ...so skip (and log) questions whose image file is absent.
        if os.path.isfile(path := os.path.join(_IMAGE_PATH, image_folder, fname)):
          ex['image'] = path
        else:
          print(ex['image/id'])
          continue
        # Test split carries no answer annotations; emit empty placeholders
        # so all splits share the same feature schema.
        ex['question_type'] = ''
        ex['answer_type'] = ''
        ex['answers'] = []
        ex['answer_confidences'] = []
        ex['top_answer'] = ''
      yield qid, ex
|
Tipsomaly/model/big_vision/datasets/widgetcap/widgetcap.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Import widgetcap into TFDS format.
|
| 17 |
+
|
| 18 |
+
Widget Captioning all requires images from the RICO dataset:
|
| 19 |
+
mkdir -p /tmp/data/rico_images ; cd /tmp/data/rico_images
|
| 20 |
+
wget
|
| 21 |
+
https://storage.googleapis.com/crowdstf-rico-uiuc-4540/rico_dataset_v0.1/unique_uis.tar.gz
|
| 22 |
+
tar xvfz unique_uis.tar.gz
|
| 23 |
+
rm unique_uis.tar.gz
|
| 24 |
+
|
| 25 |
+
Widget Captioning:
|
| 26 |
+
mkdir - /tmp/data/widget_captioning ; cd /tmp/data/widget_captioning
|
| 27 |
+
git clone https://github.com/google-research-datasets/widget-caption.git
|
| 28 |
+
cp widget-caption/widget_captions.csv ./
|
| 29 |
+
cp widget-caption/split/*.txt ./
|
| 30 |
+
rm -rf widget-caption
|
| 31 |
+
|
| 32 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the
|
| 33 |
+
`tfds` util):
|
| 34 |
+
|
| 35 |
+
cd big_vision/datasets
|
| 36 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=widgetcap
|
| 37 |
+
|
| 38 |
+
Example to load:
|
| 39 |
+
|
| 40 |
+
import tensorflow_datasets as tfds
|
| 41 |
+
dataset_augmented = tfds.load('widgetcap', split='train',
|
| 42 |
+
data_dir='/tmp/tfds')
|
| 43 |
+
"""
|
| 44 |
+
import csv
|
| 45 |
+
import json
|
| 46 |
+
import os
|
| 47 |
+
|
| 48 |
+
import numpy as np
|
| 49 |
+
from PIL import Image
|
| 50 |
+
import tensorflow_datasets as tfds
|
| 51 |
+
|
| 52 |
+
_DATASET_DIR = '/tmp/data/widget_captioning'
|
| 53 |
+
# Dataset property indicating the y-dim of the canvas
|
| 54 |
+
_RICO_CANVAS_Y = 2560
|
| 55 |
+
_IMAGE_DIR = '/tmp/data/rico_images/combined'
|
| 56 |
+
|
| 57 |
+
_CITATION = (
|
| 58 |
+
'@inproceedings{Li2020WidgetCG,title={Widget Captioning: Generating Natural'
|
| 59 |
+
' Language Description for MobileUser Interface Elements},author={Y. Li and'
|
| 60 |
+
' Gang Li and Luheng He and Jingjie Zheng and Hong Li and Zhiwei'
|
| 61 |
+
' Guan},booktitle={Conference on Empirical Methods in Natural Language'
|
| 62 |
+
' Processing},year={2020},}'
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class Widgetcap(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for widgetcap dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'Format as needed for PaliGemma'}

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the metadata."""

    return tfds.core.DatasetInfo(
        builder=self,
        description='The widgetcap dataset.',
        features=tfds.features.FeaturesDict({
            'image/id': tfds.features.Text(),
            'image/filename': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'texts': tfds.features.Sequence(tfds.features.Text()),
            'bbox': tfds.features.BBoxFeature(),
            'screen_id': tfds.features.Text(),
            'node_id': tfds.features.Text(),
            'height': np.int32,
            'width': np.int32,
        }),
        homepage='https://github.com/google-research-datasets/widget-caption',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    return {
        'train': self._generate_examples('train'),
        'dev': self._generate_examples('dev'),
        'test': self._generate_examples('test'),
    }

  def _generate_examples(self, split):
    """Yields (key, example) tuples from the dataset.

    Args:
      split: one of 'train', 'dev', 'test'; selects the screen-id list file.
    """
    # Screen ids belonging to this split, one per line in <split>.txt.
    split_screen_ids = set()
    with open(os.path.join(_DATASET_DIR, split + '.txt')) as f:
      for line in f:
        split_screen_ids.add(line.strip())

    with open(os.path.join(_DATASET_DIR, 'widget_captions.csv')) as f:
      for row in csv.DictReader(f):
        if row['screenId'] in split_screen_ids:
          id_, example = self._get_example(
              row['screenId'], row['nodeId'], row['captions']
          )
          yield id_, example

  def _get_node_box(self, screen_id, node_id, height):
    """Returns (left, top, right, bottom) bounds of a view node, in pixels.

    Args:
      screen_id: RICO screen id; selects the view-hierarchy json file.
      node_id: dotted child-index path into the hierarchy, e.g. '0.1.3'.
      height: screenshot height in pixels, used to rescale the bounds.
    """
    index_list = [int(i) for i in node_id.split('.')[1:]]
    with open(os.path.join(_IMAGE_DIR, screen_id + '.json')) as f:
      view = json.load(f)
    curr_node = view['activity']['root']
    for index in index_list:
      curr_node = curr_node['children'][index]
    # Bounds are expressed on the RICO canvas; rescale to screenshot pixels.
    normalized_bounds = map(
        lambda x: x * height / _RICO_CANVAS_Y, curr_node['bounds']
    )
    return normalized_bounds

  def _get_example(self, screen_id, node_id, captions):
    """Builds one (key, example) pair for a captioned widget."""
    # Fix: open the screenshot with a context manager so PIL releases the
    # file descriptor (lazy loading otherwise keeps it open indefinitely).
    with Image.open(os.path.join(_IMAGE_DIR, screen_id + '.jpg')) as image:
      width, height = image.size
    # Get bounding box coordinates (in pixels).
    xmin, ymin, xmax, ymax = self._get_node_box(screen_id, node_id, height)

    image_id = f'{screen_id}_{node_id}'
    example = {
        'image/id': image_id,
        'image/filename': screen_id + '.jpg',
        'image': os.path.join(_IMAGE_DIR, screen_id + '.jpg'),
        # Multiple human captions are '|'-separated in the csv.
        'texts': captions.split('|'),
        'bbox': tfds.features.BBox(
            ymin=ymin / height,
            xmin=xmin / width,
            ymax=ymax / height,
            xmax=xmax / width,
        ),
        'screen_id': screen_id,
        'node_id': node_id,
        'height': height,
        'width': width,
    }
    return image_id, example
|
Tipsomaly/model/big_vision/datasets/xgqa/xgqa.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
"""Generates xGQA in a TFDS-ready structure.
|
| 17 |
+
|
| 18 |
+
First, download the data:
|
| 19 |
+
mkdir -p /tmp/data/xgqa/annotations
|
| 20 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_bn.json -P /tmp/data/xgqa/annotations
|
| 21 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_de.json -P /tmp/data/xgqa/annotations
|
| 22 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_en.json -P /tmp/data/xgqa/annotations
|
| 23 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_id.json -P /tmp/data/xgqa/annotations
|
| 24 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_ko.json -P /tmp/data/xgqa/annotations
|
| 25 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_pt.json -P /tmp/data/xgqa/annotations
|
| 26 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_ru.json -P /tmp/data/xgqa/annotations
|
| 27 |
+
wget https://raw.githubusercontent.com/e-bug/iglue/main/datasets/xGQA/annotations/zero_shot/testdev_balanced_questions_zh.json -P /tmp/data/xgqa/annotations
|
| 28 |
+
wget https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip -P /tmp/data/xgqa/
|
| 29 |
+
unzip /tmp/data/xgqa/images.zip -d /tmp/data/xgqa/
|
| 30 |
+
|
| 31 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 32 |
+
|
| 33 |
+
cd big_vision/datasets
|
| 34 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=xgqa
|
| 35 |
+
|
| 36 |
+
Example to load:
|
| 37 |
+
|
| 38 |
+
import tensorflow_datasets as tfds
|
| 39 |
+
dataset = tfds.load(
|
| 40 |
+
'xgqa', split='test_zs_en',
|
| 41 |
+
data_dir='/tmp/tfds')
|
| 42 |
+
"""
|
| 43 |
+
import json
|
| 44 |
+
import os
|
| 45 |
+
|
| 46 |
+
import tensorflow_datasets as tfds
|
| 47 |
+
|
| 48 |
+
_DESCRIPTION = """xGQA (uses GQA images)."""
|
| 49 |
+
|
| 50 |
+
# pylint: disable=line-too-long
|
| 51 |
+
_CITATION = (
|
| 52 |
+
'@inproceedings{pfeiffer-etal-2022-xgqa,'
|
| 53 |
+
'title = "x{GQA}: Cross-Lingual Visual Question Answering",'
|
| 54 |
+
'author = "Pfeiffer, Jonas and'
|
| 55 |
+
' Geigle, Gregor and'
|
| 56 |
+
' Kamath, Aishwarya and'
|
| 57 |
+
' Steitz, Jan-Martin and'
|
| 58 |
+
' Roth, Stefan and'
|
| 59 |
+
' Vuli{\'c}, Ivan and'
|
| 60 |
+
' Gurevych, Iryna",'
|
| 61 |
+
'booktitle = "Findings of the Association for Computational Linguistics: '
|
| 62 |
+
'ACL 2022",'
|
| 63 |
+
'month = may,'
|
| 64 |
+
'year = "2022",'
|
| 65 |
+
'address = "Dublin, Ireland",'
|
| 66 |
+
'publisher = "Association for Computational Linguistics",'
|
| 67 |
+
'url = "https://aclanthology.org/2022.findings-acl.196",'
|
| 68 |
+
'doi = "10.18653/v1/2022.findings-acl.196",'
|
| 69 |
+
'pages = "2497--2511",'
|
| 70 |
+
'}'
|
| 71 |
+
)
|
| 72 |
+
# pylint: enable=line-too-long
|
| 73 |
+
|
| 74 |
+
# When running locally (recommended), copy files as above an use these:
|
| 75 |
+
_DATA_PATH = '/tmp/data/xgqa/'
|
| 76 |
+
_IMAGE_PATH = '/tmp/data/xgqa/images/'
|
| 77 |
+
|
| 78 |
+
LANGUAGES = frozenset(['bn', 'de', 'en', 'id', 'ko', 'pt', 'ru', 'zh'])
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class XGQA(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the xGQA dataset (GQA questions in 8 languages)."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {'1.0.0': 'First release.'}

  def _info(self):
    """Returns the dataset metadata (features, homepage, citation)."""

    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'example_id': tfds.features.Text(),
            'image/id': tfds.features.Text(),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'question': tfds.features.Text(),
            'answer': tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/adapter-hub/xGQA',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators: {test,dev,train_fs*} x 8 languages."""
    d = dict()
    for l in LANGUAGES:
      d.update({
          f'test_zs_{l}': self._generate_examples('test', 'zero_shot', l),
          f'test_fs_{l}': self._generate_examples('test', 'few_shot', l),
          # Bug fix: the dev split must read the 'dev' annotation file; it
          # previously read 'test', silently serving test data as dev.
          f'dev_fs_{l}': self._generate_examples('dev', 'few_shot', l),
          f'train_fs1_{l}': self._generate_examples('train_1', 'few_shot', l),
          f'train_fs5_{l}': self._generate_examples('train_5', 'few_shot', l),
          f'train_fs10_{l}': self._generate_examples('train_10', 'few_shot', l),
          f'train_fs20_{l}': self._generate_examples('train_20', 'few_shot', l),
          f'train_fs25_{l}': self._generate_examples('train_25', 'few_shot', l),
          f'train_fs48_{l}': self._generate_examples('train_48', 'few_shot', l),
      })
    return d

  def _generate_examples(self, split, num_shots, lang):
    """Yields (key, example) tuples.

    Args:
      split: Annotation file stem, e.g. 'test', 'dev' or 'train_1'. Ignored
        for 'zero_shot', which has a single fixed annotation file.
      num_shots: 'few_shot' or 'zero_shot' (selects the annotation layout).
      lang: Two-letter language code from `LANGUAGES`.
    """
    # Loads the questions for each image.
    if num_shots == 'few_shot':
      file_path = os.path.join(_DATA_PATH, 'annotations', 'few_shot', lang,
                               f'{split}.json')
    elif num_shots == 'zero_shot':
      file_path = os.path.join(_DATA_PATH, 'annotations', 'zero_shot',
                               f'testdev_balanced_questions_{lang}.json')
    else:
      raise ValueError(f'Unknown num_shots: {num_shots}')
    with open(file_path, 'r') as f:
      entries = json.load(f)

    # Make one entry per question-answer pair.
    for question_id, question_data in entries.items():
      example_id = f'{question_id}_{lang}'
      yield example_id, {
          'example_id': example_id,
          'image/id': question_data['imageId'],
          'image': os.path.join(_IMAGE_PATH, f'{question_data["imageId"]}.jpg'),
          'question': question_data['question'],
          'answer': question_data['answer'],
      }
|
Tipsomaly/model/big_vision/datasets/xm3600/xm3600.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# pylint: disable=line-too-long
|
| 16 |
+
r"""Generates XM3600 in a TFDS-ready structure.
|
| 17 |
+
|
| 18 |
+
First, download the captions from https://google.github.io/crossmodal-3600/ and the images from https://cocodataset.org/#download.
|
| 19 |
+
The coco Karpathy split is available at http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip:
|
| 20 |
+
mkdir -p /tmp/data/xm3600
|
| 21 |
+
wget https://google.github.io/crossmodal-3600/web-data/captions.zip -P /tmp/data/xm3600
|
| 22 |
+
unzip /tmp/data/xm3600/captions.zip -d /tmp/data/xm3600/
|
| 23 |
+
wget https://open-images-dataset.s3.amazonaws.com/crossmodal-3600/images.tgz -P /tmp/data/xm3600
|
| 24 |
+
mkdir /tmp/data/xm3600/images
|
| 25 |
+
tar -xzf /tmp/data/xm3600/images.tgz -C /tmp/data/xm3600/images
|
| 26 |
+
|
| 27 |
+
Then, run conversion locally (make sure to install tensorflow-datasets for the `tfds` util):
|
| 28 |
+
|
| 29 |
+
cd big_vision/datasets
|
| 30 |
+
env TFDS_DATA_DIR=/tmp/tfds tfds build --datasets=xm3600
|
| 31 |
+
|
| 32 |
+
Example to load:
|
| 33 |
+
|
| 34 |
+
import tensorflow_datasets as tfds
|
| 35 |
+
dataset = tfds.load(
|
| 36 |
+
'xm3600', split='en',
|
| 37 |
+
data_dir='/tmp/tfds')
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
import json
|
| 41 |
+
import os.path
|
| 42 |
+
|
| 43 |
+
import tensorflow_datasets as tfds
|
| 44 |
+
|
| 45 |
+
_DESCRIPTION = """
|
| 46 |
+
COCO image + captions, translated from English to 35 languages (English incl.).
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
# pylint: disable=line-too-long
|
| 50 |
+
_CITATION = """
|
| 51 |
+
@inproceedings{thapliyal-etal-2022-crossmodal,
|
| 52 |
+
title = "Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset",
|
| 53 |
+
author = "Thapliyal, Ashish V. and
|
| 54 |
+
Pont Tuset, Jordi and
|
| 55 |
+
Chen, Xi and
|
| 56 |
+
Soricut, Radu",
|
| 57 |
+
editor = "Goldberg, Yoav and
|
| 58 |
+
Kozareva, Zornitsa and
|
| 59 |
+
Zhang, Yue",
|
| 60 |
+
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
|
| 61 |
+
month = dec,
|
| 62 |
+
year = "2022",
|
| 63 |
+
address = "Abu Dhabi, United Arab Emirates",
|
| 64 |
+
publisher = "Association for Computational Linguistics",
|
| 65 |
+
url = "https://aclanthology.org/2022.emnlp-main.45",
|
| 66 |
+
doi = "10.18653/v1/2022.emnlp-main.45",
|
| 67 |
+
pages = "715--729",
|
| 68 |
+
}
|
| 69 |
+
"""
|
| 70 |
+
# pylint: enable=line-too-long
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
_CAPTIONS_PATH = '/tmp/data/xm3600'
|
| 74 |
+
_IMAGES_PATH = '/tmp/data/xm3600/images'
|
| 75 |
+
|
| 76 |
+
XM3600_LANGUAGES = [
|
| 77 |
+
'ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr',
|
| 78 |
+
'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl',
|
| 79 |
+
'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh'
|
| 80 |
+
]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class Xm3600(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the Crossmodal-3600 (XM3600) captioning dataset."""

  VERSION = tfds.core.Version('1.0.1')
  RELEASE_NOTES = {
      '1.0.0': 'First release.',
      '1.0.1': 'Add captions/tokenized feature to compute metrics (eg CIDEr).',
  }

  def _info(self):
    """Returns the dataset metadata (features, homepage, citation)."""

    feature_spec = {
        'image/id': tfds.features.Text(),
        'image': tfds.features.Image(encoding_format='jpeg'),
        'captions': tfds.features.Sequence(tfds.features.Text()),
        'captions/tokenized': tfds.features.Sequence(tfds.features.Text()),
        'language': tfds.features.Text(),
    }
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict(feature_spec),
        supervised_keys=None,
        homepage='https://google.github.io/crossmodal-3600/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators: one split per language code."""
    return {lang: self._generate_examples(lang) for lang in XM3600_LANGUAGES}

  def _generate_examples(self, split: str):
    """Yields (key, example) tuples; the split name is the language code."""
    lang = split

    annot_fname = os.path.join(_CAPTIONS_PATH, 'captions.jsonl')
    # First pass: collect (captions, tokenized captions) per image key.
    per_image = {}
    with open(annot_fname, 'r') as f:
      for raw_line in f:
        record = json.loads(raw_line)
        key = f'{record["image/key"]}_{lang}'
        per_image[key] = (record[lang]['caption'],
                          record[lang]['caption/tokenized'])

    for key, (caps, tok_caps) in per_image.items():
      # The image filename is the raw image key (the part before '_<lang>').
      image_key = key.split('_')[0]
      yield key, {
          'image/id': key,
          'image': os.path.join(_IMAGES_PATH, f'{image_key}.jpg'),
          'captions': caps,
          'captions/tokenized': tok_caps,
          'language': lang,
      }
|
Tipsomaly/model/big_vision/evaluators/proj/cappa/perplexity.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2023 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator for perplexity of a model."""
|
| 16 |
+
from big_vision.evaluators import mean
|
| 17 |
+
import big_vision.utils as u
|
| 18 |
+
import jax.numpy as jnp
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Temporary global flag to facilitate backwards compatibility. Will be removed
|
| 22 |
+
# by the end of year 2023.
|
| 23 |
+
API = 'jit'
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def perplexity(predict_fn, normalize_by_seqlen):
  """Returns a function computing per-example perplexity (softmax xent)."""

  def _perplexity_fn(train_state, batch, pad_token=0, **kw):
    logits, _ = predict_fn(train_state, batch, **kw)

    # Padding labels get zero weight so they don't contribute to the loss.
    weights = (batch['labels'] != pad_token).astype(jnp.float32)
    label_masks = batch.get('label_masks')
    if label_masks is not None:
      weights = weights * label_masks

    losses = u.weighted_softmax_xent(
        logits=logits, labels=batch['labels'],
        weights=weights, label_smoothing=0.0,
        reduction=False, normalize=normalize_by_seqlen)
    return {'perplexity': losses}

  return _perplexity_fn
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class Evaluator(mean.Evaluator):
  """Perplexity evaluator.

  Thin wrapper around `mean.Evaluator` that averages the per-example
  perplexity produced by `perplexity(...)` over the eval dataset.
  """

  def __init__(self, predict_fn, *a, normalize_by_seqlen=False, **kw):
    # All remaining positional/keyword args are forwarded to mean.Evaluator
    # (data spec, preprocessing, devices, ...).
    super().__init__(perplexity(predict_fn, normalize_by_seqlen), *a, **kw)
|
Tipsomaly/model/big_vision/evaluators/proj/cappa/scoring_classifier.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2023 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Scoring classifier.
|
| 16 |
+
|
| 17 |
+
This one is based on a generative perspective for image classification.
|
| 18 |
+
Here we input the image as well as all the tokenized labels to compute their
|
| 19 |
+
perplexity and select the one with minimum loss as the prediction.
|
| 20 |
+
"""
|
| 21 |
+
import functools
|
| 22 |
+
from big_vision.datasets.imagenet import class_names as imagenet_class_names
|
| 23 |
+
from big_vision.evaluators import mean
|
| 24 |
+
from big_vision.pp import builder as pp_builder
|
| 25 |
+
import jax.numpy as jnp
|
| 26 |
+
import numpy as np
|
| 27 |
+
|
| 28 |
+
# Temporary global flag to facilitate backwards compatability. Will be removed
|
| 29 |
+
# by the end of year 2023.
|
| 30 |
+
API = "jit"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
CLASS_NAMES = {
|
| 34 |
+
"imagenet2012": imagenet_class_names.CLIP_IMAGENET_CLASS_NAMES,
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# As a separate function to cache result across instances.
|
| 39 |
+
@functools.lru_cache(maxsize=None)
def get_classes(dataset_name, pp_txt):
  """Returns an array of tokenized class-name labels for `dataset_name`.

  Cached (module-level) so that multiple evaluator instances with the same
  dataset/tokenizer config share a single tokenization pass.
  """
  tokenize = pp_builder.get_preprocess_fn(pp_txt, log_data=False)
  tokenized = []
  for class_name in CLASS_NAMES[dataset_name]:
    tokenized.append(tokenize({"label": class_name})["labels"])
  return np.array(tokenized)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def scoring(predict_fn, tokenized_labels):
  """Returns an eval fn that scores every class and checks the argmax."""

  def _scoring_fn(train_state, batch, *a, **kw):
    # Inject the tokenized class names; existing batch entries win on clash.
    scored_batch = {"_label_tokens": tokenized_labels, **batch}
    class_scores = predict_fn(train_state, scored_batch, *a, **kw)
    top1 = jnp.argmax(class_scores, axis=-1)
    return {"prec@1": top1 == scored_batch["label"]}

  return _scoring_fn
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class Evaluator(mean.Evaluator):
  """Evaluator for classification accuracy based on scoring all classes.

  Tokenizes the class names once (cached via `get_classes`) and averages the
  resulting top-1 indicator over the eval dataset via `mean.Evaluator`.
  """

  def __init__(self, predict_fn, data, pp_fn, pp_txt, *a, **kw):
    # data["name"] selects the class-name list from CLASS_NAMES; pp_txt is
    # the text-preprocessing spec used to tokenize each class name.
    cls_tokens = get_classes(data["name"], pp_txt)
    super().__init__(scoring(predict_fn, cls_tokens), data, pp_fn, *a, **kw)
|
Tipsomaly/model/big_vision/evaluators/proj/distill/distance.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator for the classfication task."""
|
| 16 |
+
from functools import partial, lru_cache
|
| 17 |
+
|
| 18 |
+
from big_vision import input_pipeline
|
| 19 |
+
import big_vision.datasets.core as ds_core
|
| 20 |
+
import big_vision.pp.builder as pp_builder
|
| 21 |
+
import big_vision.utils as u
|
| 22 |
+
|
| 23 |
+
import einops
|
| 24 |
+
import jax
|
| 25 |
+
import jax.numpy as jnp
|
| 26 |
+
from jax.sharding import NamedSharding
|
| 27 |
+
from jax.sharding import PartitionSpec as P
|
| 28 |
+
import numpy as np
|
| 29 |
+
|
| 30 |
+
# Temporary global flag to facilitate backwards compatability. Will be removed
|
| 31 |
+
# by the end of year 2023.
|
| 32 |
+
API = 'jit'
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def dist(student, teacher, kind, feat_axis=-1,
         epsilon=1e-12, t=1, ls=0.0, k=1):
  """Distance function used for distillation.

  Args:
    student: Student outputs (e.g. logits).
    teacher: Teacher outputs, same shape as `student`.
    kind: Which distance to compute. One of 'euclidean', 'l2', 'hard', 'kl',
      'logsoftmax_euclidean' or 'agree'.
    feat_axis: Axis holding the features/classes; it is reduced away.
    epsilon: Stabilizer added under the sqrt for the euclidean variants.
    t: Softmax temperature (only used by 'kl').
    ls: Label-smoothing factor (only used by 'hard').
    k: Student top-k compared against teacher top-1 (only used by 'agree').

  Returns:
    Array of distances with `feat_axis` reduced away.

  Raises:
    ValueError: If `kind` is not one of the supported distances.
  """
  diff = student - teacher
  if kind == 'euclidean':
    return jnp.sqrt(jnp.sum(diff * diff, axis=feat_axis) + epsilon)
  elif kind == 'l2':
    return jnp.sum(diff * diff, axis=feat_axis)
  elif kind == 'hard':
    # Cross-entropy against the teacher's argmax pseudo-labels.
    pseudolabels = jnp.argmax(teacher, feat_axis)
    pl = u.onehot(pseudolabels, teacher.shape[feat_axis])
    if ls:
      pl = (1.0 - ls) * pl + (ls / (pl.shape[-1] - 1)) * (1.0 - pl)
    return u.softmax_xent(logits=student, labels=pl,
                          reduction=False, kl=True, axis=feat_axis)
  elif kind == 'kl':
    # Temperature-scaled KL; the t**2 factor compensates the 1/t scaling.
    return t**2 * u.softmax_xent(
        logits=student / t,
        labels=jax.nn.softmax(teacher / t),
        reduction=False, kl=True, axis=feat_axis)
  elif kind == 'logsoftmax_euclidean':
    logsoftmax_diff = (
        jax.nn.log_softmax(student, axis=feat_axis) -
        jax.nn.log_softmax(teacher, axis=feat_axis))
    return jnp.sqrt(
        jnp.sum(logsoftmax_diff * logsoftmax_diff, axis=feat_axis) + epsilon)
  elif kind == 'agree':
    def get_top_k(arr, k, ax):
      return jax.lax.top_k(arr.swapaxes(ax, -1), k)[1].swapaxes(ax, -1)
    return (get_top_k(student, k, feat_axis) ==
            get_top_k(teacher, 1, feat_axis)).sum(feat_axis)
  else:
    # Raise instead of `assert False`: asserts are stripped under `python -O`,
    # and a ValueError matches the validation style used elsewhere in the repo.
    raise ValueError(f'Unknown kind of distance {kind}.')
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Cached so that identical distance configs map to the *same* function object;
# jax uses function identity in its compilation cache, so this avoids
# re-tracing/re-compiling when several evaluators share a distance config.
@lru_cache(None)
def get_dist_fn(**kw):
  """Returns `dist` with the given config baked in (cached per config)."""
  return partial(dist, **kw)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# To avoid re-compiling the function for every new instance of the same
|
| 76 |
+
# evaluator on a different dataset!
|
| 77 |
+
# To avoid re-compiling the function for every new instance of the same
# evaluator on a different dataset!
@lru_cache(None)
def get_eval_fn(student_teacher_fwd, what, mesh, distances):
  """Builds the jit-compiled eval function (cached to avoid re-compiles)."""
  @partial(jax.jit, out_shardings=NamedSharding(mesh, P()))
  def _eval_fn(train_state, batch, mask):
    (_, student_out), (_, teacher_out) = student_teacher_fwd(train_state, batch)
    feats_s = u.tree_get(student_out, what[0])
    feats_t = u.tree_get(teacher_out, what[1])

    # Flatten anything that isn't already a per-example vector (e.g.
    # feature-maps) into shape (batch, -1).
    feats_s = einops.rearrange(feats_s, 'b ... -> b (...)')
    feats_t = einops.rearrange(feats_t, 'b ... -> b (...)')

    # NOTE: we compute and return every per-example distance; if this ever
    # becomes too slow, switch to returning summary stats instead.
    per_dist = [dist_fn(feats_s, feats_t) for dist_fn in distances]
    return per_dist, mask

  return _eval_fn
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class Evaluator:
  """Distillation distance evaluator.

  Runs the student and teacher forward passes over an eval dataset and
  reports, for each configured distance, the full per-example distance
  array plus its avg/min/max.
  """

  def __init__(
      self,
      student_teacher_fwd,  # fn(train_state, batch) -> ((..., out_s), (..., out_t))
      data,  # dict of dataset specs, passed to ds_core.get.
      pp_fn,  # preprocessing spec string.
      distances,  # sequence of dicts, each a kwargs-config for `dist`.
      what=('logits', 'logits'),  # (student, teacher) output keys to compare.
      *,
      devices,  # list of jax devices to shard the data over.
      **data_kw,  # extra input_pipeline args (e.g. batch_size, prefetch).
  ):
    data = ds_core.get(**data)
    pp_fn = pp_builder.get_preprocess_fn(pp_fn)
    # 'prefetch' is consumed here; the rest of data_kw goes to the pipeline.
    prefetch = data_kw.pop('prefetch', 1)
    self.ds, self.steps = input_pipeline.make_for_inference(
        data.get_tfdata(ordered=True),
        pp_fn,
        num_ex_per_process=data.num_examples_per_process(),
        **data_kw,
    )
    self.data_iter = input_pipeline.start_global(self.ds, devices, prefetch)
    # Must be hashable (tuple) because get_eval_fn is lru_cached.
    dist_fns = tuple(get_dist_fn(**dist) for dist in distances)
    # Human-readable metric prefix per distance, e.g. 'kind=kl_t=2'.
    self.dist_names = [
        '_'.join(f'{k}={v}' for k, v in dist.items()) for dist in distances
    ]
    mesh = jax.sharding.Mesh(devices, ('data',))
    self.eval_fn = get_eval_fn(student_teacher_fwd, what, mesh, dist_fns)

  def run(self, train_state):
    """Computes all metrics; yields (metric_name, value) pairs."""
    all_ds = [[] for _ in self.dist_names]
    for _, batch in zip(range(self.steps), self.data_iter):
      mask = batch.pop('_mask')
      batch_ds, batch_ms = self.eval_fn(train_state, batch, mask)
      # eval_fn's outputs are fully replicated (out_shardings=P()), so
      # device_get/np.array brings a single host copy over; the mask selects
      # real examples (padding examples from make_for_inference are dropped).
      batch_ms = np.array(batch_ms)
      for i, val in enumerate(batch_ds):
        all_ds[i].append(np.array(val)[batch_ms == 1])
    for name, ds in zip(self.dist_names, all_ds):
      ds = np.concatenate(ds)
      yield f'{name}/all', ds
      yield f'{name}/avg', np.mean(ds)
      yield f'{name}/min', np.min(ds)
      yield f'{name}/max', np.max(ds)
|
Tipsomaly/model/big_vision/evaluators/proj/givt/coco_panoptic.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""COCO17 panoptic evaluation.
|
| 16 |
+
|
| 17 |
+
jax.jit-compatible fork of the evaluator from evaluators/proj/uvim.
|
| 18 |
+
"""
|
| 19 |
+
import functools
|
| 20 |
+
import itertools
|
| 21 |
+
import json
|
| 22 |
+
import os
|
| 23 |
+
import tempfile
|
| 24 |
+
import time
|
| 25 |
+
from typing import Any
|
| 26 |
+
import zipfile
|
| 27 |
+
|
| 28 |
+
from absl import flags
|
| 29 |
+
from absl import logging
|
| 30 |
+
from big_vision import input_pipeline
|
| 31 |
+
from big_vision import utils
|
| 32 |
+
from big_vision.datasets import core as ds_core
|
| 33 |
+
import big_vision.pp.builder as pp_builder
|
| 34 |
+
import jax
|
| 35 |
+
import jax.numpy as jnp
|
| 36 |
+
import numpy as np
|
| 37 |
+
from pycocotools.panopticapi import evaluation
|
| 38 |
+
import panopticapi_converters.twochannels2panoptic_coco_format as converter
|
| 39 |
+
import tensorflow as tf
|
| 40 |
+
import tensorflow_datasets as tfds
|
| 41 |
+
|
| 42 |
+
from tensorflow.io import gfile
|
| 43 |
+
|
| 44 |
+
# Temporary global flag to facilitate backwards compatability.
|
| 45 |
+
API = 'jit'
|
| 46 |
+
|
| 47 |
+
ROOT = os.environ.get('COCO_DATA_DIR', '.')
|
| 48 |
+
|
| 49 |
+
PANOPTIC_COCO_CATS_FILE = f'{ROOT}/panoptic_coco_categories.json'
|
| 50 |
+
PANOPTIC_2017 = {
|
| 51 |
+
'train': f'{ROOT}/panoptic_train2017.json',
|
| 52 |
+
'validation': f'{ROOT}/panoptic_val2017.json',
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
PANOPTIC_GT_ZIP = {
|
| 56 |
+
'train': f'{ROOT}/panoptic_train2017.zip',
|
| 57 |
+
'validation': f'{ROOT}/panoptic_val2017.zip',
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Note: global to avoid jax re-compiling across different evaluator instances.
|
| 62 |
+
# Note: global to avoid jax re-compiling across different evaluator instances.
@functools.cache
def _get_predict_fn(predict_fn, mesh=None):
  """Wrapper for jit-compiled predict function.

  NOTE(review): the `mesh=None` default would fail inside NamedSharding if a
  caller omitted the argument — all call sites appear to pass a mesh; confirm
  before relying on the default.
  """

  # `out_shardings` annotation is needed because of the `all_gather` ops in the
  # pmap implementation. An empty PartitionSpec fully replicates the outputs.
  @functools.partial(jax.jit,
                     out_shardings=jax.sharding.NamedSharding(
                         mesh, jax.sharding.PartitionSpec()))
  def _run_predict_fn(train_state, batch):
    """Run predict_fn and gather all outputs on all devices."""
    y = predict_fn(train_state, batch)
    res = {
        'image/id': batch['image/id'],
        'mask': batch['_mask'],
        # Pack semantics/instances into one array: last axis is [sem, inst].
        'y': jnp.stack([y['semantics'], y['instances']], axis=-1),
    }
    return res
  return _run_predict_fn
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class Evaluator:
|
| 84 |
+
"""Panoptic segmentation evaluator: calls official COCO API."""
|
| 85 |
+
|
| 86 |
+
def __init__(
|
| 87 |
+
self,
|
| 88 |
+
predict_fn,
|
| 89 |
+
pp_fn,
|
| 90 |
+
batch_size,
|
| 91 |
+
data=None,
|
| 92 |
+
cache_final=True,
|
| 93 |
+
cache_raw=False,
|
| 94 |
+
prefetch=1,
|
| 95 |
+
save_dir=None,
|
| 96 |
+
*,
|
| 97 |
+
devices,
|
| 98 |
+
):
|
| 99 |
+
"""Panoptic segmentation evaluator: calls official COCO API.
|
| 100 |
+
|
| 101 |
+
Args:
|
| 102 |
+
predict_fn: jit-compilable function, which accepts arbitrary dictionaries
|
| 103 |
+
of parameters and data, where the data dictionary is produced by the
|
| 104 |
+
`pp_fn`. It is expected to output a 2-channel mask, where the first
|
| 105 |
+
channel encodes semantics, and the second channel encodes instance ids.
|
| 106 |
+
pp_fn: Preprocessing function, sepcified as string.
|
| 107 |
+
batch_size: Batch size.
|
| 108 |
+
data: Dict specifying name and split of the data set. Defaults to the
|
| 109 |
+
standard COCO (2017).
|
| 110 |
+
cache_final: Whether to cache the data after preprocessing - see
|
| 111 |
+
input_pipeline for details.
|
| 112 |
+
cache_raw: Whether to cache the raw data - see input_pipline for details.
|
| 113 |
+
prefetch: Number of batches to prefetch
|
| 114 |
+
save_dir: Directory to save the results in.
|
| 115 |
+
devices: List of jax devices.
|
| 116 |
+
"""
|
| 117 |
+
self.predict_fn = _get_predict_fn(
|
| 118 |
+
predict_fn, jax.sharding.Mesh(devices, ('devices',)))
|
| 119 |
+
|
| 120 |
+
data_specs = dict(name='coco/2017_panoptic',
|
| 121 |
+
data_dir=None, split='validation')
|
| 122 |
+
data_specs.update(data or {})
|
| 123 |
+
data = ds_core.get(**data_specs)
|
| 124 |
+
self.dataset, self.steps = input_pipeline.make_for_inference(
|
| 125 |
+
data.get_tfdata(ordered=True), batch_size=batch_size,
|
| 126 |
+
num_ex_per_process=data.num_examples_per_process(),
|
| 127 |
+
preprocess_fn=pp_builder.get_preprocess_fn(pp_fn),
|
| 128 |
+
cache_final=cache_final, cache_raw=cache_raw)
|
| 129 |
+
self.data_iter = input_pipeline.start_global(
|
| 130 |
+
self.dataset, devices, prefetch)
|
| 131 |
+
|
| 132 |
+
# Only process 0 runs conversion to png and calls into coco api.
|
| 133 |
+
if jax.process_index() == 0:
|
| 134 |
+
self.result_dir = tempfile.TemporaryDirectory()
|
| 135 |
+
(self.gt_folder, self.gt_json, self.categories_json,
|
| 136 |
+
self.remap, self.size_map) = _prepare_ground_truth(
|
| 137 |
+
data_specs['name'], data_specs['split'],
|
| 138 |
+
data_specs.get('data_dir'))
|
| 139 |
+
if save_dir:
|
| 140 |
+
self.save_dir = save_dir.format(workdir=flags.FLAGS.workdir)
|
| 141 |
+
gfile.makedirs(self.save_dir)
|
| 142 |
+
else:
|
| 143 |
+
self.save_dir = None
|
| 144 |
+
|
| 145 |
+
def _compute_png_predictions(
    self, train_state: Any) -> Any:
  """Runs inference and writes per-image panoptic PNGs to the temp dir.

  Converting each prediction to a PNG on disk immediately keeps peak host
  memory low. Returns the result directory on host 0, None elsewhere.
  """
  num_done = 0
  logging.info('Panoptic eval: running inference.')
  for batch in itertools.islice(self.data_iter, self.steps):
    out = self.predict_fn(train_state, batch)

    # Only process 0 converts predictions; other hosts just drive inference.
    if jax.process_index():
      continue

    out = jax.device_get(out)
    keep = out['mask']
    pan_recs = out['y'][keep]
    ids = out['image/id'][keep]

    for pan_rec, image_id in zip(pan_recs, ids):
      sem, ins = pan_rec[..., 0], pan_rec[..., 1]

      # Remap model semantic class ids to COCO category ids.
      sem_remapped = np.array(sem)
      for v in np.unique(sem):
        sem_remapped[sem == v] = self.remap[v]
      sem = sem_remapped

      pan_mask = np.stack([sem, ins, np.zeros_like(sem)], axis=-1)
      pan_mask = utils.put_cpu(pan_mask)
      pan_mask = _resize_nearest(pan_mask, self.size_map[image_id])
      pan_mask_png = tf.io.encode_png(pan_mask.astype('uint8')).numpy()

      fname = f'{self.result_dir.name}/{image_id:012d}.png'
      with open(fname, 'wb') as f:
        f.write(pan_mask_png)
      num_done += 1

      logging.log_every_n_seconds(
          logging.INFO, 'Panoptic eval: processed %i examples so far.', 30,
          num_done)

  if jax.process_index():
    return None

  logging.info('Panoptic eval: inference done. Processed %d examples.',
               num_done)
  return self.result_dir
|
| 189 |
+
|
| 190 |
+
def run(self, train_state):
  """Run panoptic segmentation evaluation.

  Args:
    train_state: pytree containing the model parameters.

  Yields:
    Tuples consisting of metric name and value.
  """
  # result_dir itself is constant; the PNG files inside it are rewritten on
  # every call.
  result_dir = self._compute_png_predictions(train_state)

  if jax.process_index():  # Only host 0 scores the predictions.
    return

  if self.save_dir:
    gfile.RecursivelyCopyDir(result_dir.name, self.save_dir, overwrite=True)

  with tempfile.TemporaryDirectory() as pred_folder, \
      tempfile.NamedTemporaryFile(mode='w') as pred_json:

    logging.info('Panoptic eval: running conversion.')
    converter.converter(
        source_folder=result_dir.name,
        images_json_file=self.gt_json,
        categories_json_file=self.categories_json,
        segmentations_folder=pred_folder,
        predictions_json_file=pred_json.name)
    logging.info('Panoptic eval: conversion done.')

    logging.info('Panoptic eval: running metrics computation.')
    res = evaluation.pq_compute(
        gt_json_file=self.gt_json,
        gt_folder=self.gt_folder,
        pred_json_file=pred_json.name,
        pred_folder=pred_folder)
    logging.info('Panoptic eval: metrics computation done.')

    # Report pq/rq/sq for the full set as well as stuff/things subsets.
    for k, m in itertools.product(['All', 'Stuff', 'Things'],
                                  ['pq', 'rq', 'sq']):
      yield f'{k}_{m}', res[k][m]
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _prepare_ground_truth(dataset, split, data_dir):
  """Dispatches ground-truth preparation to the zip-file or TFDS path."""
  # Canonical COCO panoptic with default data dir: reuse the original zip
  # files; any other dataset/location is rebuilt from the TFDS dataset.
  if dataset == 'coco/2017_panoptic' and data_dir is None:
    return _prepare_ground_truth_from_zipfiles(split)
  return _prepare_ground_truth_from_dataset(dataset, split, data_dir)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
@functools.lru_cache(maxsize=None)
def _prepare_ground_truth_from_dataset(dataset, split, data_dir):
  """Prepare ground truth from a tf.data.Dataset.

  Args:
    dataset: TFDS-compatible dataset specification.
    split: Data set split to use.
    data_dir: Folder containing the data.

  Returns:
    A tuple containing the folder containing the ground-truth data, the
    ground truth annotations loaded from json, the categories loaded from
    json, a map for remapping, and a map mapping image id to image size.
  """
  tfds_dataset = tfds.builder(
      dataset, data_dir=data_dir).as_dataset(split=split)

  categories_json = _make_local_copy(PANOPTIC_COCO_CATS_FILE)
  with gfile.GFile(categories_json, 'rb') as f:
    categories = json.loads(f.read())

  # Build map from tfds class ids to COCO class ids. Fix: the original
  # re-opened categories_json here with an unused file handle; the
  # categories were already loaded above.
  remap = {0: 0}
  remap.update({(i + 1): x['id'] for i, x in enumerate(categories)})

  gt_folder = tempfile.mkdtemp()
  gfile.makedirs(gt_folder)
  size_map = {}
  annotations = []
  images = []
  for example in tfds_dataset:
    image_id = int(example['image/id'])
    panoptic_image = example['panoptic_image']
    objs = example['panoptic_objects']

    fname = f'{image_id:012d}.png'
    with gfile.GFile(os.path.join(gt_folder, fname), 'wb') as f:
      f.write(tf.io.encode_png(panoptic_image).numpy())

    size_map[image_id] = (panoptic_image.shape[0], panoptic_image.shape[1])

    # Zip the parallel annotation fields instead of indexing by position.
    segments_info = []
    for ann_id, label, iscrowd, area in zip(
        objs['id'], objs['label'], objs['is_crowd'], objs['area']):
      segments_info.append({
          'id': int(ann_id),
          'category_id': remap[int(label + 1)],
          'iscrowd': int(iscrowd),
          'area': int(area),
      })

    annotations.append({
        'file_name': str(fname),
        'image_id': int(image_id),
        'segments_info': segments_info
    })
    images.append({
        'id': image_id,
        'file_name': f'{image_id:012d}.jpg',
    })

  # Write annotations.json needed for pq_compute.
  gt_json = os.path.join(gt_folder, 'annotations.json')
  with gfile.GFile(gt_json, 'wb') as f:
    f.write(json.dumps({
        'images': images,
        'annotations': annotations,
        'categories': categories,
    }))

  return gt_folder, gt_json, categories_json, remap, size_map
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def _prepare_ground_truth_from_zipfiles(split):
  """Prepare ground truth from coco zip files.

  Args:
    split: dataset split to prepare ground truth for.

  Returns:
    A tuple containing the folder containing the ground-truth data, the
    ground truth annotations loaded from json, the categories loaded from
    json, a map for remapping, and a map mapping image id to image size.

  Raises:
    ValueError: if `split` is neither a train nor a validation split.
  """
  split_prefix = split.split('[')[0]
  if split_prefix not in ('train', 'validation'):
    raise ValueError(f'Split {split} not supported')

  # The following 4 calls are cached. This allows to save significant time
  # in use cases like sweeping predict_fn hparams on the same run.
  gt_json = _make_local_copy(PANOPTIC_2017[split_prefix])
  gt_folder = _make_local_unzip_copy(PANOPTIC_GT_ZIP[split_prefix])
  categories_json = _make_local_copy(PANOPTIC_COCO_CATS_FILE)
  image_ids = _list_image_ids('coco/2017_panoptic', split)

  gt_folder = os.path.join(
      gt_folder, 'panoptic_val2017'
      if split_prefix == 'validation' else 'panoptic_train2017')

  # Build map from tfds class ids to COCO class ids.
  remap = {0: 0}
  with gfile.GFile(categories_json, 'r') as f:
    remap = {**remap, **{(i + 1): x['id'] for i, x in enumerate(json.load(f))}}

  # Filters gt_json to contain only annotations for images in dataset.
  with gfile.GFile(gt_json) as f:
    data = json.load(f)
  logging.info(
      'Panoptic eval: pre-filter %d annotations.',
      len(data['annotations'])
  )
  data['images'] = [x for x in data['images'] if x['id'] in image_ids]
  data['annotations'] = [
      x for x in data['annotations'] if x['image_id'] in image_ids
  ]
  logging.info(
      'Panoptic eval: post-filter %d annotations.',
      len(data['annotations'])
  )
  # Fix: the original `tempfile.NamedTemporaryFile(delete=False).name` left
  # the file descriptor open and unreferenced; mkstemp + fdopen closes it.
  fd, filtered_gt_json = tempfile.mkstemp()
  with os.fdopen(fd, 'w') as f:
    json.dump(data, f)

  # Precompute images sizes.
  size_map = {x['id']: (x['height'], x['width']) for x in data['images']}

  return gt_folder, filtered_gt_json, categories_json, remap, size_map
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
@functools.lru_cache(maxsize=None)
def _list_image_ids(dataset, split):
  """Returns the frozen set of `image/id` values present in the split."""
  ids_ds = tfds.load(dataset, split=split).map(lambda x: x['image/id'])
  return frozenset(ids_ds.as_numpy_iterator())
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
@functools.lru_cache(maxsize=None)
def _make_local_copy(fname) -> str:
  """Copies `fname` to a local temp file and returns its path (cached).

  Fix: the original created an unclosed `NamedTemporaryFile(delete=False)`,
  leaking the open file descriptor for the lifetime of the process.
  """
  start = time.monotonic()
  fd, local_path = tempfile.mkstemp()
  os.close(fd)  # Only the path is needed; gfile opens the file itself.
  gfile.copy(fname, local_path, overwrite=True)
  logging.info('Copy %s in %d seconds.', fname, time.monotonic() - start)
  return local_path
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
@functools.lru_cache(maxsize=None)
def _make_local_unzip_copy(fname) -> str:
  """Downloads zip `fname` and extracts it into a temp folder (cached)."""
  start = time.monotonic()
  out_dir = tempfile.mkdtemp()
  with tempfile.NamedTemporaryFile() as tmp_zip_file:
    gfile.copy(fname, tmp_zip_file.name, overwrite=True)
    with zipfile.ZipFile(tmp_zip_file.name, 'r') as zf:
      zf.extractall(out_dir)
  logging.info('Copy %s in %d seconds.', fname, time.monotonic() - start)
  return out_dir
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
@utils.jit_cpu(static_argnums=(1,))
def _resize_nearest(image, shape):
  """Nearest-neighbour resize to spatial `shape`, keeping the channel dim."""
  target_shape = shape + image.shape[-1:]
  return jax.image.resize(image, target_shape, 'nearest')
|
Tipsomaly/model/big_vision/evaluators/proj/givt/nyu_depth.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluation for NYU depth.
|
| 16 |
+
|
| 17 |
+
jax.jit-compatible fork of the evaluator from evaluators/proj/uvim.
|
| 18 |
+
|
| 19 |
+
At evaluation time the ground truth is cropped and clipped. Values outside of
|
| 20 |
+
the test crop or clipping range are not included in eval calculations.
|
| 21 |
+
|
| 22 |
+
In this evaluator, it is assumed that the ground truth is already cropped, so the
|
| 23 |
+
entire image is evaluated. However, the evaluator does perform the clipping.
|
| 24 |
+
|
| 25 |
+
Reference implementations:
|
| 26 |
+
https://github.com/zhyever/Monocular-Depth-Estimation-Toolbox/blo(internal link)a0f341244260ff61541191a613dd74bc/depth/datasets/nyu.py
|
| 27 |
+
https://github.com/vinvino02/GLPDepth/blob/7f3c78df4ecd6e7c79fd0c4b73c95d61f4aa2121/code/utils/metrics.py
|
| 28 |
+
https://github.com/shariqfarooq123/AdaBins/blob/2fb686a66a304f0a719bc53d77412460af97fd61/evaluate.py
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import functools
|
| 32 |
+
import itertools
|
| 33 |
+
|
| 34 |
+
from big_vision import input_pipeline
|
| 35 |
+
from big_vision import utils
|
| 36 |
+
from big_vision.datasets import core as ds_core
|
| 37 |
+
import big_vision.pp.builder as pp_builder
|
| 38 |
+
import jax
|
| 39 |
+
import jax.numpy as jnp
|
| 40 |
+
import numpy as np
|
| 41 |
+
|
| 42 |
+
# Temporary global flag to facilitate backwards compatability.
|
| 43 |
+
API = "jit"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Note: global to avoid jax re-compiling across different evaluator instances.
|
| 47 |
+
@functools.cache
def _get_predict_fn(predict_fn, mesh=None):
  """Wrapper for jit-compiled predict function."""

  # Fully-replicated output sharding is needed because of the `all_gather`
  # ops in the pmap implementation.
  replicated = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec())

  @functools.partial(jax.jit, out_shardings=replicated)
  def _run_predict_fn(train_state, batch):
    """Run predict_fn and gather all outputs on all devices."""
    pred = predict_fn(train_state, batch)
    return {"y": pred["depth"],
            "gt": jnp.squeeze(batch["ground_truth"], axis=-1),
            "mask": batch["_mask"]}

  return _run_predict_fn
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class Evaluator:
  """Evaluator for NYU depth."""

  def __init__(self,
               predict_fn,
               pp_fn,
               batch_size,
               data,
               cache_final=True,
               cache_raw=False,
               prefetch=1,
               min_depth=1e-3,
               max_depth=10,
               *,
               devices):
    """Evaluator for NYU depth.

    Args:
      predict_fn: jit-compilable function, accepts arbitrary dictionaries of
        parameters and data, where the data dictionary is produced by the
        `pp_fn` op. It is expected to output a dict with `depth` containing a
        2D array with the predicted depth. The prediction is resized to the
        ground_truth size with nearest neighbour.
      pp_fn: Preprocessing function, specified as string. `pp_fn` must also
        output a 'ground_truth' as a 2D array of ground truth. Further, it
        has to apply a crop, if one wants to compute metrics with the eval
        crop typically used for NYU Depth metrics.
      batch_size: Batch size.
      data: Dict specifying name and split of the data set.
      cache_final: Whether to cache the data after preprocessing - see
        input_pipeline for details.
      cache_raw: Whether to cache the raw data - see input_pipeline for
        details.
      prefetch: Number of batches to prefetch.
      min_depth: Minimum depth value; smaller ground truth is ignored.
      max_depth: Maximum depth value; larger ground truth is ignored.
      devices: List of jax devices.
    """
    self.min_depth = min_depth
    self.max_depth = max_depth
    self.predict_fn = _get_predict_fn(
        predict_fn, jax.sharding.Mesh(devices, ("devices",)))

    data = ds_core.get(**data)
    self.dataset, self.steps = input_pipeline.make_for_inference(
        data.get_tfdata(ordered=True), batch_size=batch_size,
        num_ex_per_process=data.num_examples_per_process(),
        preprocess_fn=pp_builder.get_preprocess_fn(pp_fn),
        cache_final=cache_final, cache_raw=cache_raw)
    self.data_iter = input_pipeline.start_global(
        self.dataset, devices, prefetch)

  def run(self, train_state):
    """Run NYU depth eval.

    Args:
      train_state: pytree containing the model parameters.

    Yields:
      Tuples consisting of metric name and value.
    """
    # Per-image metric values; insertion order fixes the yield order below.
    per_image = {"RMSE": [], "abs_RE": [], "log10": [],
                 "delta1": [], "delta2": [], "delta3": []}
    for batch in itertools.islice(self.data_iter, self.steps):
      # Outputs are replicated dict values shaped (devices, batch, ...).
      out = self.predict_fn(train_state, batch)

      if jax.process_index():  # Host 0 gathers all preds and does the eval.
        continue

      out = jax.device_get(out)
      # Bool-indexing with the mask flattens to (global_batch, ...).
      out = jax.tree_map(lambda x: x[out["mask"]], out)  # pylint:disable=cell-var-from-loop

      for gt, pred in zip(out["gt"], out["y"]):
        # put_cpu and the numpy conversion below avoid unwanted
        # host-to-device transfers.
        pred, gt = utils.put_cpu((pred, gt))
        pred = _resize_nearest(pred, (gt.shape[0], gt.shape[1]))
        pred, gt = np.array(pred), np.array(gt)

        # Only pixels with in-range ground truth enter the metrics.
        valid = np.logical_and(gt > self.min_depth, gt < self.max_depth)
        gt_v, pred_v = gt[valid], pred[valid]

        per_image["RMSE"].append(_compute_rmse(gt_v, pred_v))
        per_image["abs_RE"].append(_compute_abs_re(gt_v, pred_v))
        per_image["log10"].append(_compute_abs_log(gt_v, pred_v))
        per_image["delta1"].append(_compute_delta(gt_v, pred_v, order=1))
        per_image["delta2"].append(_compute_delta(gt_v, pred_v, order=2))
        per_image["delta3"].append(_compute_delta(gt_v, pred_v, order=3))

    if jax.process_index():  # Host 0 gets all preds and does eval.
      return

    for name, values in per_image.items():
      yield name, np.mean(values)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
@utils.jit_cpu(static_argnums=(1,))
def _resize_nearest(image, shape):
  """Nearest-neighbour resize of `image` to `shape`, jitted on CPU."""
  return jax.image.resize(image, shape, method="nearest")
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _compute_rmse(gt, pred):
|
| 175 |
+
diff = gt - pred
|
| 176 |
+
return np.sqrt(np.mean(np.power(diff, 2)))
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def _compute_abs_re(gt, pred):
|
| 180 |
+
diff = np.abs(gt - pred)
|
| 181 |
+
return np.mean(diff / gt)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _compute_abs_log(gt, pred):
|
| 185 |
+
diff = np.abs(np.log10(gt) - np.log10(pred))
|
| 186 |
+
return np.mean(diff)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _compute_delta(gt, pred, order):
|
| 190 |
+
rel_diff = np.maximum(gt / pred, pred / gt)
|
| 191 |
+
return np.sum(rel_diff < 1.25**order) / rel_diff.size
|
Tipsomaly/model/big_vision/evaluators/proj/givt/save_predictions.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator to save predictions."""
|
| 16 |
+
# pylint: disable=consider-using-from-import
|
| 17 |
+
import functools
|
| 18 |
+
import io # pylint: disable=unused-import
|
| 19 |
+
import itertools
|
| 20 |
+
import os
|
| 21 |
+
|
| 22 |
+
from absl import flags
|
| 23 |
+
from absl import logging
|
| 24 |
+
from big_vision import input_pipeline
|
| 25 |
+
from big_vision.datasets import core as ds_core
|
| 26 |
+
import big_vision.pp.builder as pp_builder
|
| 27 |
+
import big_vision.utils as u
|
| 28 |
+
import jax
|
| 29 |
+
import numpy as np
|
| 30 |
+
|
| 31 |
+
from tensorflow.io import gfile # pylint: disable=unused-import
|
| 32 |
+
|
| 33 |
+
# Temporary global flag to facilitate backwards compatability.
|
| 34 |
+
API = 'jit'
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Note: global to avoid jax re-compiling across different evaluator instances.
|
| 38 |
+
@functools.cache
def _get_predict_fn(predict_fn, mesh=None):
  """Wrapper for jit-compiled predict function."""

  # Fully-replicated output sharding is needed because of the `all_gather`
  # ops in the pmap implementation.
  out_sharding = jax.sharding.NamedSharding(
      mesh, jax.sharding.PartitionSpec())

  @functools.partial(jax.jit, out_shardings=out_sharding)
  def _run_predict_fn(train_state, batch):
    """Run predict_fn and gather all outputs on all devices."""
    outputs = predict_fn(train_state, batch)
    return {'mask': batch['_mask'], 'inputs': batch, 'outputs': outputs}

  return _run_predict_fn
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class Evaluator:
  """Save predictions in "{FLAGS.workdir}/{outfile}".

  Results can then be easily inspected in a notebook such as:

  ```
  results = utils.load_checkpoint("<full_path_to_outfile>")
  inputs, outputs = (results["inputs"], results["outputs"])
  ```
  """

  def __init__(self, predict_fn, pp_fn, batch_size, data, outfile,
               cache_final=True, cache_raw=False, prefetch=1, *, devices):
    self.predict_fn = _get_predict_fn(
        predict_fn, jax.sharding.Mesh(devices, ('devices',)))

    # Each process feeds its own shard, zero-padded so that all processes
    # run the same number of batches.
    data = ds_core.get(**data)
    self.dataset, self.steps = input_pipeline.make_for_inference(
        data.get_tfdata(ordered=True), batch_size=batch_size,
        num_ex_per_process=data.num_examples_per_process(),
        preprocess_fn=pp_builder.get_preprocess_fn(pp_fn),
        cache_final=cache_final, cache_raw=cache_raw)
    self.data_iter = input_pipeline.start_global(
        self.dataset, devices, prefetch)

    self.path = os.path.join(flags.FLAGS.workdir, outfile)

  def run(self, train_state):
    """Compute all predictions, gather in main host and save in outfile."""
    num_seen = 0
    collected = []
    for batch in itertools.islice(self.data_iter, self.steps):
      out = self.predict_fn(train_state, batch)
      if jax.process_index():  # Only host 0 collects and writes.
        continue

      out = jax.device_get(out)
      # `out['mask']` must be read here: `x` inside the tree map does not
      # carry that field.
      out = jax.tree_map(lambda x: x[out['mask']], out)  # pylint: disable=cell-var-from-loop
      num_seen += out['mask'].shape[0]
      out.pop('mask')
      collected.append(out)

      logging.log_every_n_seconds(
          logging.INFO, 'Save predictions: processed %i examples so far.', 30,
          num_seen)

    if jax.process_index():
      return

    logging.info('Save predictions: processed %d examples.', num_seen)

    # Serialize everything into an in-memory npz, then write it out once.
    collected = jax.tree_map(lambda *x: np.concatenate(x, axis=0), *collected)
    names_and_vals, _ = u.tree_flatten_with_names(collected)
    io_buffer = io.BytesIO()
    np.savez_compressed(io_buffer, **dict(names_and_vals))
    with gfile.GFile(self.path, 'wb') as f:
      f.write(io_buffer.getvalue())
    return

    # The unreachable yield keeps `run` a generator, matching the evaluator
    # protocol used by the training loop.
    yield None  # pylint: disable=unreachable
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/contrastive.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator for the contrastive task.
|
| 16 |
+
|
| 17 |
+
DON'T COMPARE ACROSS RUNS, use for training health monitoring only.
|
| 18 |
+
|
| 19 |
+
Note that this evaluator's `ncorrect_minibatch` is only a rough proxy for
|
| 20 |
+
training progress and does not report the actual `ncorrect`: when the same
|
| 21 |
+
labels found multiple times in a batch, then the reported value is biased
|
| 22 |
+
towards lower values.
|
| 23 |
+
|
| 24 |
+
Also note that the `ncorrect_minibatch` is a function of batch size (it's a lot
|
| 25 |
+
easier to find correct values in small batches).
|
| 26 |
+
"""
|
| 27 |
+
import functools
|
| 28 |
+
|
| 29 |
+
from big_vision import input_pipeline
|
| 30 |
+
import big_vision.datasets.core as ds_core
|
| 31 |
+
import big_vision.pp.builder as pp_builder
|
| 32 |
+
import big_vision.utils as u
|
| 33 |
+
import jax
|
| 34 |
+
import jax.numpy as jnp
|
| 35 |
+
import numpy as np
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _all_gather(z):
  """All gather and flatten first two dims."""
  def _gather_flat(x):
    return jnp.concatenate(jax.lax.all_gather(x, "batch"), 0)
  return jax.tree_map(_gather_flat, z)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# To avoid re-compiling the function for every new instance of the same
|
| 45 |
+
# evaluator on a different dataset!
|
| 46 |
+
@functools.lru_cache(None)
def get_eval_fn(predict_fn, use_global_batch):
  """Produces eval function, also applies pmap."""

  @functools.partial(jax.pmap, axis_name="batch")
  def _eval_fn(params, images, labels, mask):
    zimg, ztxt, extras = predict_fn(params, images, labels)

    if use_global_batch:
      zimg, ztxt, mask = _all_gather((zimg, ztxt, mask))

    # Temperature does not change the accuracy ranking, only the loss
    # magnitude.
    losses, measurements = u.bidirectional_contrastive_loss(
        zimg, ztxt, extras["t"], mask, reduction=False)
    loss_sum = jax.lax.psum(losses * mask, axis_name="batch")
    ncorrect_sum = jax.lax.psum(
        measurements["ncorrect"] * mask, axis_name="batch")
    n_valid = jax.lax.psum(mask, axis_name="batch")
    return ncorrect_sum, loss_sum, n_valid

  return _eval_fn
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class Evaluator:
  """Contrastive evaluator."""

  def __init__(self, predict_fn, data, pp_fn, batch_size,
               use_global_batch, cache_final=True,
               cache_raw=False, prefetch=1, label_key="labels"):
    data = ds_core.get(**data)
    pp_fn = pp_builder.get_preprocess_fn(pp_fn)
    self.ds, self.steps = input_pipeline.make_for_inference(
        data.get_tfdata(ordered=True), pp_fn, batch_size,
        num_ex_per_process=data.num_examples_per_process(),
        cache_final=cache_final, cache_raw=cache_raw)
    self.data_iter = input_pipeline.start_input_pipeline(self.ds, prefetch)
    self.eval_fn = get_eval_fn(predict_fn, use_global_batch)
    self.label_key = label_key

  def run(self, params):
    """Computes all metrics."""
    total_loss = total_correct = total_seen = 0
    for _, batch in zip(range(self.steps), self.data_iter):
      labels, mask = batch.pop(self.label_key), batch.pop("_mask")
      batch_ncorrect, batch_losses, batch_n = self.eval_fn(
          params, batch["image"], labels, mask)
      # Every result is a replicated array shaped
      # (local_devices, per_device_batch_size, elem_shape...), and each
      # device's entry is identical after the psum — so device 0 suffices.
      total_correct += np.sum(np.array(batch_ncorrect[0]))
      total_loss += np.sum(np.array(batch_losses[0]))
      total_seen += np.sum(np.array(batch_n[0]))
    yield ("ncorrect_minibatch", total_correct / total_seen)
    yield ("loss", total_loss / total_seen)
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/discriminative_classifier.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Discriminative zero-shot classification evaluator.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import functools
|
| 19 |
+
import time
|
| 20 |
+
|
| 21 |
+
from absl import logging
|
| 22 |
+
from big_vision import input_pipeline
|
| 23 |
+
from big_vision import utils
|
| 24 |
+
from big_vision.evaluators.proj.image_text import prompt_engineering
|
| 25 |
+
from big_vision.pp import ops_general # pylint: disable=unused-import
|
| 26 |
+
from big_vision.pp import ops_image # pylint: disable=unused-import
|
| 27 |
+
import big_vision.pp.builder as pp_builder
|
| 28 |
+
import jax
|
| 29 |
+
import jax.numpy as jnp
|
| 30 |
+
from jax.sharding import NamedSharding
|
| 31 |
+
from jax.sharding import PartitionSpec as P
|
| 32 |
+
import numpy as np
|
| 33 |
+
import tensorflow as tf
|
| 34 |
+
import tensorflow_datasets as tfds
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Temporary global flag to facilitate backwards compatability. Will be removed
|
| 38 |
+
# by the end of year 2023.
|
| 39 |
+
API = "jit"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
DATASET_NAMES = ("imagenet2012", "cifar100", "oxford_iiit_pet")
|
| 43 |
+
DEFAULT_OVERRIDES = (
|
| 44 |
+
("imagenet2012", (
|
| 45 |
+
("class_names", "clip"),
|
| 46 |
+
("split", "validation"),
|
| 47 |
+
)),
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _with_infinite_padding(dataset):
  """Adds "infinite padding" to the dataset.

  Every real example is tagged with `mask=True`; after the real data is
  exhausted, the dataset keeps yielding all-zero filler elements tagged with
  `mask=False`, forever. This lets multi-host evaluation loops run a fixed
  number of steps without ever raising `StopIteration`.
  """
  def _tag_real(features):
    # Real examples carry mask=True alongside their original features.
    return dict(mask=True, **features)

  # Build a single zero-valued element matching the dataset's structure
  # (with a leading batch dim of 1, sliced away by from_tensor_slices).
  zeros_like_spec = lambda spec: tf.zeros(spec.shape, spec.dtype)[None]
  filler_element = tf.nest.map_structure(zeros_like_spec, dataset.element_spec)
  filler_element["mask"] = [False]
  padding = tf.data.Dataset.from_tensor_slices(filler_element).repeat(None)

  tagged = dataset.map(
      _tag_real, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return tagged.concatenate(padding)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# This is needed so retrieval_test can replace dataset info.
|
| 64 |
+
def _get_dataset_info(builder):
|
| 65 |
+
return builder.info
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def prepare_datasets(img_dataset,
                     class_names,
                     *,
                     prompt_templates,
                     pp_img,
                     pp_txt,
                     cache_final=False,
                     pre_filter_fn=None,
                     class_name_offset=0):
  """Returns unbatched `ds_images, ds_texts` datasets.

  Args:
    img_dataset: tf.data dataset of examples with "image" and "label".
    class_names: Sequence of class-name strings; each entry may contain
      several comma-separated aliases.
    prompt_templates: Prompt strings, each containing exactly one "{}"
      placeholder for the class name.
    pp_img: Preprocessing spec applied to image examples (must keep/produce
      "label" and "image").
    pp_txt: Preprocessing spec applied to text examples (reads "texts",
      must keep/produce "label" and "labels").
    cache_final: Whether to cache the fully-preprocessed datasets.
    pre_filter_fn: Optional predicate filtering `img_dataset` records.
    class_name_offset: Added to the enumerated class index; used when class
      names are sharded across processes so labels stay globally consistent.

  Returns:
    `(ds_images, ds_texts)` where `ds_texts` yields one element per
    (class alias x prompt template) combination.
  """

  assert prompt_templates, "Must specify prompt templates (e.g. simply ['{}'])"

  def expand_aliases(idx, class_name):
    # One class-name string may contain several comma-separated aliases;
    # each alias becomes its own element with the same (offset) label.
    class_names = tf.strings.split(class_name, ",")
    return tf.data.Dataset.from_tensor_slices((
        tf.repeat([idx + class_name_offset], len(class_names), axis=0),
        class_names,
    ))

  def add_prompts(idx, class_name):
    # Cross every (label, alias) pair with every prompt template.
    return tf.data.Dataset.from_tensor_slices({
        "label": tf.repeat([idx], len(prompt_templates), axis=0),
        "class_name": tf.repeat([class_name], len(prompt_templates), axis=0),
        "prompt_template": prompt_templates,
    })

  def substitute_prompt(features):
    # Replace the single "{}" placeholder with the class name.
    parts = tf.strings.split(features["prompt_template"], "{}")
    tf.debugging.assert_equal(len(parts), 2, features["prompt_template"])
    return {
        "label": features["label"],
        "texts": tf.strings.join([parts[0], features["class_name"], parts[1]])
    }

  if pre_filter_fn:
    img_dataset = img_dataset.filter(pre_filter_fn)
  ds_images = img_dataset.map(
      pp_builder.get_preprocess_fn(f"{pp_img}|keep('label', 'image')"))
  ds_texts = tf.data.Dataset.from_tensor_slices(list(class_names)).enumerate(
  ).flat_map(expand_aliases).flat_map(add_prompts).map(substitute_prompt).map(
      pp_builder.get_preprocess_fn(f"{pp_txt}|keep('label', 'labels')"))

  if cache_final:
    ds_images, ds_texts = ds_images.cache(), ds_texts.cache()

  return ds_images, ds_texts
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _split_and_batch(dataset_name, data_dir, class_names, batch_size, split,
                     get_ds):
  """Splits dataset, calls `get_ds` and returns padded + batched datasets.

  Image examples are sharded across processes via
  `tfds.split_for_jax_process`; class names are sharded manually below, with
  `class_name_offset` passed to `get_ds` so that labels remain globally
  consistent across processes.
  """
  assert not batch_size % jax.device_count(), (
      f"batch_size={batch_size} % jax.device_count()={jax.device_count()}")
  builder = tfds.builder(dataset_name, data_dir=data_dir)

  # Split class names (last process gets remainder).
  if len(class_names) < jax.process_count():
    # Pad with empty names so every process gets at least one entry.
    # See (internal link) for more details.
    class_names += [""] * (jax.process_count() - len(class_names))
  per_process = len(class_names) // jax.process_count()
  class_name_offset = per_process * jax.process_index()
  if jax.process_index() == jax.process_count() - 1:
    # Last process also takes the division remainder.
    class_names = class_names[class_name_offset:]
  else:
    class_names = class_names[class_name_offset:class_name_offset + per_process]

  ds_images, ds_texts = get_ds(
      builder.as_dataset(split=tfds.split_for_jax_process(split)),
      class_names,
      class_name_offset=class_name_offset)
  # Infinite padding lets every process run the same number of batched steps
  # even when the per-process example counts differ.
  return (
      _with_infinite_padding(ds_images).batch(batch_size),
      _with_infinite_padding(ds_texts).batch(batch_size),
  )
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _average_embeddings(embeddings, *, labels, num_classes, normalize):
|
| 146 |
+
"""Computes per-class averages of `embeddings`."""
|
| 147 |
+
assert embeddings.ndim == 2, f"Expected {embeddings.ndim}==2"
|
| 148 |
+
assert labels.ndim == 1, f"Expected {labels.ndim}==1"
|
| 149 |
+
assert len(labels) == len(embeddings), (
|
| 150 |
+
f"Expected {len(labels)}=={len(embeddings)}")
|
| 151 |
+
|
| 152 |
+
byidx = [[] for _ in range(num_classes)]
|
| 153 |
+
for label, embedding in zip(labels, embeddings):
|
| 154 |
+
byidx[label].append(embedding)
|
| 155 |
+
missing = set(range(num_classes)) - set(
|
| 156 |
+
idx for idx, embs in enumerate(byidx) if len(embs))
|
| 157 |
+
assert not missing, f"Classes without embeddings: {missing}"
|
| 158 |
+
embeddings = [np.array(embedding).mean(axis=0) for embedding in byidx]
|
| 159 |
+
embeddings = np.stack(embeddings)
|
| 160 |
+
|
| 161 |
+
assert len(embeddings) == num_classes
|
| 162 |
+
if normalize:
|
| 163 |
+
embeddings /= 1e-8 + np.linalg.norm(embeddings, axis=1, keepdims=True)
|
| 164 |
+
return embeddings
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class Evaluator:
  """Zero-shot classification evaluator."""

  def __init__(self,
               predict_fn,
               *,
               batch_size,
               devices,
               dataset_names=DATASET_NAMES,
               data_dir=None,
               class_names="dataset_info:label",
               split="test",
               prompt_templates="clip_paper",
               canonicalize=True,
               pp_img="resize(224)|value_range(-1,1)",
               pp_txt="tokenize(max_len=16, eos='sticky', "
               "pad_value=1, inkey='texts', outkey='labels')",
               cache_final=False,
               pre_filter_fn=None,
               first_class_name_only=True,
               dataset_overrides=DEFAULT_OVERRIDES,
               async_delay=1):
    """Initializes a new zero-shot classification evaluator.

    See `prepare_datasets()` for details on how the dataset is pre-processed.

    Args:
      predict_fn: Prediction function with signature
        `zimg, ztxt, out = predict_fn(params, images, texts)`
      batch_size: Global batch size.
      devices: list of devices.
      dataset_names: Names of TFDS datasets to evaluate on.
      data_dir: Optional argument to `tfds.builder()`.
      class_names: Usually specified as a string that is interpreted by
        `prompt_engineering.get_class_names()` to look up class names.
        Alternatively, this attribute can be a list of class names (using ","
        to separate multiple aliases).
      split: Which dataset split to use for evaluation.
      prompt_templates: Specifies which prompt templates to use. See module
        big_vision.evaluators.proj.image_text.prompt_engineering
        for valid values.
      canonicalize: Whether class names and prompt templates should be
        canonicalized. See `prompt_engineering.py` for details.
      pp_img: Preprocessing string for images. Preprocessed features should
        contain key "image" with value that can be batched and is suitable for
        the `images` argument of `predict_fn` input.
      pp_txt: Preprocessing string for texts. Can expect "texts" key as an input
        (shape=[], dtype=string), and is expected to produce "labels" key that
        is suitable for the `text` argument of `predict_fn` input.
      cache_final: Whether the preprocessed dataset should be cached.
      pre_filter_fn: Predicate applied to the dataset for filtering records.
      first_class_name_only: Whether only the first class name should be
        considered (i.e. not using any aliases).
      dataset_overrides: Mapping `dataset_name` to an optional dictionary that
        can override parameters `dataset_name`, `data_dir`, `pp_img`, `pp_txt`,
        `class_names`, `split`, `pre_filter_fn`, and the extra
        `class_names_dataset_name`.
        Works with tuple/dict of tuples/dicts.
      async_delay: How many steps to wait before checking if all hosts have
        finished their batch. A value > 1 allows for more parallelized
        processing, but will result in more unnecessary steps with padded data.
    """
    t0 = time.monotonic()
    self.datasets = {}
    self.prompt_templates = prompt_engineering.get_prompt_templates(
        prompt_templates, canonicalize=canonicalize)
    self._axis_name = "batch"
    # Deep-copy the overrides so the per-dataset `pop()` calls below don't
    # mutate the caller's (possibly shared, possibly tuple-based) config.
    dataset_overrides = {k: dict(v) for k, v in dict(dataset_overrides).items()}

    for dataset_name in dataset_names:
      # Resolve per-dataset overrides; anything left in `overrides` after all
      # pops is an unknown key and triggers the assert below.
      overrides = dataset_overrides.pop(dataset_name, {})
      dataset_name_ = overrides.pop("dataset_name", dataset_name)
      data_dir_ = overrides.pop("data_dir", data_dir)
      class_names_dataset_name = overrides.pop("class_names_dataset_name",
                                               dataset_name_)
      class_names_ = overrides.pop("class_names", class_names)
      class_names_ = prompt_engineering.get_class_names(
          dataset_name=class_names_dataset_name,
          source=class_names_,
          canonicalize=canonicalize)
      pp_img_ = overrides.pop("pp_img", pp_img)
      pp_txt_ = overrides.pop("pp_txt", pp_txt)
      cache_final_ = overrides.pop("cache_final", cache_final)
      split_ = overrides.pop("split", split)
      pre_filter_fn_ = overrides.pop("pre_filter_fn", pre_filter_fn)
      prompt_templates_ = overrides.pop("prompt_templates", prompt_templates)
      canonicalize_ = overrides.pop("canonicalize", canonicalize)
      prompt_templates_ = prompt_engineering.get_prompt_templates(
          prompt_templates_, canonicalize=canonicalize_)
      assert not overrides, f"Unknown overrides {dataset_name}: {overrides}"

      if first_class_name_only:
        # Drop comma-separated aliases, keeping only the primary name.
        class_names_ = [name.split(",")[0] for name in class_names_]
      ds_images, ds_texts = _split_and_batch(
          dataset_name=dataset_name_,
          data_dir=data_dir_,
          class_names=class_names_,
          batch_size=batch_size,
          split=split_,
          get_ds=functools.partial(
              prepare_datasets,
              pp_img=pp_img_,
              pp_txt=pp_txt_,
              cache_final=cache_final_,
              pre_filter_fn=pre_filter_fn_,
              prompt_templates=prompt_templates_))
      self.datasets[dataset_name] = dict(
          images=ds_images, texts=ds_texts, class_names=class_names_,
          dataset_name=dataset_name_, split=split_)

    # Any remaining keys name datasets that were never requested.
    assert not dataset_overrides, f"Extra overrides: {dataset_overrides}"

    def embed_texts(train_state, texts):
      """Returns text embeddings."""
      _, ztxt, _ = predict_fn(train_state, {"labels": texts})
      return ztxt

    def count_correct(train_state, return_embeddings, *, mask, labels, image,
                      ztxt):
      """Returns count of correct predictions (and optionally embeddings)."""
      zimg, _, _ = predict_fn(train_state, {"image": image})
      # Nearest text embedding (by dot product) is the predicted class.
      best_txt = (zimg @ ztxt.T).argmax(axis=1)
      # labels has format [[1, -1, -1], [5, -1, -1], [7, 2, -1], ...]
      # so here we count "any" correct, such that the counting matches the
      # multilabel scenario described in "are we done with imagenet"
      # (http://arxiv.org/abs/2006.07159) section 3.1
      if labels.ndim == 1:
        labels = labels[..., None]
      assert labels.ndim == 2, labels.shape
      matching = (best_txt[:, None] == labels).sum(axis=1)
      # `mask` zeroes out padded examples so they don't count as correct.
      correct = jnp.where(mask, (matching > 0).astype(jnp.int32), 0).sum()
      correct = jnp.sum(correct)
      if return_embeddings:
        return correct, zimg
      else:
        return correct, None

    self.devices = devices
    self.mesh = jax.sharding.Mesh(devices, ("devices",))

    # All jitted functions output fully-replicated (unsharded) values.
    self._embed_texts_p = jax.jit(
        embed_texts, out_shardings=NamedSharding(self.mesh, P()))
    self._count_correct_p = jax.jit(count_correct, static_argnums=(1,),
                                    out_shardings=NamedSharding(self.mesh, P()))
    self._count_p = jax.jit(jnp.sum,
                            out_shardings=NamedSharding(self.mesh, P()))
    self._all_gather_p = jax.jit(
        lambda x: x, out_shardings=NamedSharding(self.mesh, P()))

    # Tracks which jitted functions have already been compiled, only for
    # logging compilation time once.
    self._compiled = set()
    assert async_delay > 0, f"async_delay must be >0, not {async_delay}"
    self._async_delay = async_delay
    logging.info("Initialized evaluator in %.1f seconds", time.monotonic() - t0)

  def _embed_texts(self, train_state, dataset_name):
    """Returns per-class averaged text embeddings."""
    t0 = time.monotonic()
    logging.info("Starting text embedding...")
    ns = []  # per-step count of real (non-padded) examples
    embeddings = []
    data = {"label": [], "mask": []}

    ds_b = input_pipeline.start_global(
        self.datasets[dataset_name]["texts"], self.devices)
    for batch in ds_b:
      ns.append(jax.device_get(self._count_p(batch["mask"])))
      # The dataset is infinitely padded; stop once a (possibly delayed)
      # step contained no real examples on any host.
      if len(ns) >= self._async_delay and ns[-self._async_delay] == 0:
        break

      embeddings.append(jax.device_get(self._embed_texts_p(
          train_state, batch["labels"])))
      for name in data:
        data[name].append(jax.device_get(self._all_gather_p(batch[name])))

      if self._embed_texts_p not in self._compiled:
        logging.info("Compiled text embeddings in %.1fs", time.monotonic() - t0)
        t0 = time.monotonic()
        self._compiled.add(self._embed_texts_p)

    ns = np.array(ns)
    n = ns.sum()
    data["embedding"] = embeddings
    data = {k: np.concatenate(v, axis=0) for k, v in data.items()}
    # Drop padded elements before averaging per class.
    mask = data.pop("mask").astype(bool)
    data = {k: v[mask] for k, v in data.items()}
    data["average_embedding"] = _average_embeddings(
        data["embedding"],
        labels=data["label"],
        num_classes=len(self.datasets[dataset_name]["class_names"]),
        normalize=True)

    logging.info("Embedded %s text in %d steps - ...%s", dataset_name, len(ns),
                 ns[-10:])
    logging.info("Totalling %d text in %.1fs", n, time.monotonic() - t0)
    logging.info("Total texts embeddings size %.1fM",
                 data["embedding"].nbytes / 1e6)
    return data

  def evaluate(self,
               train_state,
               dataset_name,
               *,
               return_embeddings=False):
    """Returns evaluation results."""
    texts = self._embed_texts(train_state, dataset_name)
    ztxt_p = texts["average_embedding"]
    # Replicate the class embeddings so every device can match against them.
    ztxt_p = utils.reshard(ztxt_p, NamedSharding(self.mesh, P()))

    t0 = time.monotonic()
    logging.info("Starting image embedding...")

    ns = []  # per-step count of real (non-padded) examples
    embeddings = []
    corrects = []
    data = {"mask": [], "label": []} if return_embeddings else {}

    ds_b = input_pipeline.start_global(
        self.datasets[dataset_name]["images"], self.devices)
    for batch in ds_b:
      ns.append(jax.device_get(self._count_p(batch["mask"])))
      # Same infinite-padding stop condition as in `_embed_texts`.
      if len(ns) >= self._async_delay and ns[-self._async_delay] == 0:
        break

      labels = batch["label"]
      correct_p, embs_p = self._count_correct_p(
          train_state,
          return_embeddings,
          mask=batch["mask"],
          labels=labels,
          image=batch["image"],
          ztxt=ztxt_p,
      )
      corrects.append(jax.device_get(correct_p))
      if self._count_correct_p not in self._compiled:
        logging.info("Compiled image embeddings in %.1fs",
                     time.monotonic() - t0)
        t0 = time.monotonic()
        self._compiled.add(self._count_correct_p)

      if return_embeddings:
        embeddings.append(jax.device_get(self._all_gather_p(embs_p)))
        for name in data:
          data[name].append(jax.device_get(self._all_gather_p(batch[name])))

    ns = np.array(ns)
    n = ns.sum()
    correct = np.array(corrects).sum()

    logging.info("Embedded %s image in %d steps - ...%s", dataset_name, len(ns),
                 ns[-10:])
    logging.info("Totalling %d image in %.1fs", n, time.monotonic() - t0)
    ret = {
        "accuracy": correct / n,
        "correct": correct,
        "count": n,
    }
    logging.info("Dataset %s, results %s", dataset_name, ret)

    if return_embeddings:
      data["embedding"] = embeddings
      data = {k: np.concatenate(v, axis=0) for k, v in data.items()}
      logging.info("Total images embeddings size %.1fM",
                   data["embedding"].nbytes / 1e6)
      # Drop padded elements before returning.
      mask = data.pop("mask").astype(bool)
      ret["images"] = {k: v[mask] for k, v in data.items()}
      ret["texts"] = texts

    return ret

  def run(self, train_state):
    """Returns metrics."""
    return [(f"{dataset_name}_accuracy",
             self.evaluate(train_state, dataset_name)["accuracy"])
            for dataset_name in self.datasets]
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/discriminative_classifier_test.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Tests for discriminative_classifier."""
|
| 16 |
+
|
| 17 |
+
from unittest import mock
|
| 18 |
+
|
| 19 |
+
from big_vision.evaluators.proj.image_text import discriminative_classifier
|
| 20 |
+
from big_vision.pp import ops_general # pylint: disable=unused-import
|
| 21 |
+
from big_vision.pp import ops_image # pylint: disable=unused-import
|
| 22 |
+
from big_vision.pp.registry import Registry
|
| 23 |
+
import flax.linen as nn
|
| 24 |
+
import jax
|
| 25 |
+
import jax.numpy as jnp
|
| 26 |
+
import numpy as np
|
| 27 |
+
import tensorflow as tf
|
| 28 |
+
import tensorflow_datasets as tfds
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@Registry.register("preprocess_ops.test_texts2labels")
def _get_test_texts2labels():
  """Registers a test pp op that parses the "texts" string into "labels"."""

  def _parse(features):
    features["labels"] = tf.strings.to_number(features["texts"])
    return features

  return _parse
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@Registry.register("preprocess_ops.copy_from")
def _get_copy_from(**key_map):
  """Registers a test pp op copying feature `key_map` values: dst <- src."""

  def copy_from(d):
    out = dict(d)  # shallow copy; the input dict is left untouched
    for dst, src in key_map.items():
      out[dst] = out[src]
    return out

  return copy_from
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class _Model(nn.Module):
  """Tiny deterministic two-tower stand-in for a contrastive model.

  Maps scalar inputs onto the unit circle so that an image and a text derived
  from the same underlying value get identical (maximally similar) embeddings.
  """

  @nn.compact
  def __call__(self, image, texts):
    # Dummy parameter so the module has trainable state to initialize.
    self.param("x", lambda _: 0.)

    def z(x):
      # Returns None when x is None (tower not queried in this call).
      if x is not None:
        # Note that the returned vector is most similar with other vectors
        # generated from the same underlying `x[:]`.
        return jnp.stack([jnp.cos(x / 10.), jnp.sin(x / 10.)]).T

    if texts is not None:
      texts %= 5  # For testing `pre_filter_fn` below.
    return z(image), z(texts), None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class DiscriminativeClassifierTest(tf.test.TestCase):
  """Tests for the discriminative zero-shot classification evaluator."""

  def test_prepare_datasets(self):
    """Checks image preprocessing and class-alias x prompt expansion order."""

    def generator():
      yield {
          "image": tf.ones([5, 5, 3], tf.float32),
          "label": 1,
      }
      yield {
          "image": tf.ones([4, 4, 3], tf.float32),
          "label": 2,
      }

    ds = tf.data.Dataset.from_generator(
        generator,
        output_signature={
            "image": tf.TensorSpec(shape=[None, None, 3], dtype=tf.float32),
            "label": tf.TensorSpec(shape=[], dtype=tf.int64),
        })
    class_names = [
        "class1,class1a",
        "class2",
    ]
    prompt_templates = [
        "test {}",
        "test {} test",
    ]
    ds_img, ds_txt = discriminative_classifier.prepare_datasets(
        ds,
        class_names,
        prompt_templates=prompt_templates,
        pp_img="resize(2)",
        pp_txt="copy_from(labels='texts')",
    )

    # Images keep their labels and are resized to 2x2.
    it_img = iter(ds_img)
    batch = next(it_img)
    self.assertAllEqual(1, batch["label"])
    self.assertAllEqual(tf.ones([2, 2, 3]), batch["image"])
    batch = next(it_img)
    self.assertAllEqual(2, batch["label"])
    self.assertAllEqual(tf.ones([2, 2, 3]), batch["image"])

    # Texts iterate aliases in order, each crossed with every prompt template.
    it_txt = iter(ds_txt)
    batch = next(it_txt)
    self.assertAllEqual(0, batch["label"])
    self.assertAllEqual("test class1", batch["labels"])
    batch = next(it_txt)
    self.assertAllEqual(0, batch["label"])
    self.assertAllEqual("test class1 test", batch["labels"])
    batch = next(it_txt)
    self.assertAllEqual(0, batch["label"])
    self.assertAllEqual("test class1a", batch["labels"])
    batch = next(it_txt)
    self.assertAllEqual(0, batch["label"])
    self.assertAllEqual("test class1a test", batch["labels"])
    batch = next(it_txt)
    self.assertAllEqual(1, batch["label"])
    self.assertAllEqual("test class2", batch["labels"])
    batch = next(it_txt)
    self.assertAllEqual(1, batch["label"])
    self.assertAllEqual("test class2 test", batch["labels"])

  def test_average_embeddings(self):
    """Checks per-class averaging and optional L2 normalization."""
    self.assertAllEqual(jnp.array([
        [2.], [4.], [8.],
    ]), discriminative_classifier._average_embeddings(
        embeddings=jnp.array([
            1., 3., 3., 1.,  # label1
            8., 0.,  # label2
            32., 0., 0., 0.,  # label3
        ])[..., None],
        labels=jnp.array([
            0, 0,  # label1
            0, 0,  # label1 (alias)
            1, 1,  # label2
            2, 2,  # label3
            2, 2,  # label3 (alias)
        ], jnp.int32),
        num_classes=3, normalize=False))
    self.assertAllEqual(
        jnp.array([
            [2**-.5, 2**-.5],
        ]),
        discriminative_classifier._average_embeddings(
            embeddings=jnp.array([[2., 2.]]),
            labels=jnp.array([0], jnp.int32),
            num_classes=1,
            normalize=True))

  @mock.patch("big_vision.evaluators.proj."
              "image_text.prompt_engineering.get_class_names")
  @mock.patch("big_vision.evaluators.proj."
              "image_text.prompt_engineering.get_prompt_templates")
  @mock.patch("big_vision.evaluators.proj."
              "image_text.discriminative_classifier._get_dataset_info")
  def test_evaluate(self, get_dataset_info_mock, get_prompt_templates_mock,
                    get_class_names_mock):
    """End-to-end run against mocked TFDS data and a perfect toy model."""
    per_device_batch_size = 10  # Make sure we have some unfiltered examples.
    global_batch_size = per_device_batch_size * jax.device_count()
    per_host_num_examples = int(
        np.ceil(global_batch_size / jax.process_count()))
    splits = {
        "test":
            tfds.core.SplitInfo(
                name="test", shard_lengths=[per_host_num_examples], num_bytes=0)
    }

    model = _Model()
    params = model.init(jax.random.PRNGKey(0), None, None)["params"]

    prompt_templates = [
        "test prompt 1 {}",
        "test prompt 2 {}",
    ]
    class_names = [
        f"test_class_{i}" for i in range(10)
    ]

    get_prompt_templates_mock.return_value = prompt_templates
    get_class_names_mock.return_value = class_names
    get_dataset_info_mock.return_value.splits = splits

    def pre_filter_fn(features):
      return features["label"] < 5  # matches `texts %= 5` above

    dataset_name = "cifar10_test"
    with tfds.testing.mock_data(num_examples=per_host_num_examples):
      evaluator = discriminative_classifier.Evaluator(
          lambda p, b: model.apply({"params": p},
                                   b.get("image", None),
                                   b.get("labels", None)),
          dataset_names=[dataset_name],
          prompt_templates="test_prompts",
          batch_size=global_batch_size,
          devices=jax.devices(),
          pp_img="copy_from(image='label')",
          pp_txt="copy_from(labels='label')",
          dataset_overrides={
              dataset_name: {
                  "dataset_name": "cifar10",
                  "class_names": "test_classes",
                  "pre_filter_fn": pre_filter_fn,
              }
          },
          first_class_name_only=True,
      )
      results = evaluator.evaluate(
          params,
          dataset_name,
          return_embeddings=True)
      metrics = dict(evaluator.run(params))

    # Assert all examples were processed.
    self.assertLen(results["texts"]["embedding"],
                   len(class_names) * len(prompt_templates))
    self.assertLen(results["texts"]["average_embedding"], len(class_names))
    self.assertAllEqual(
        sorted(results["texts"]["label"]),
        [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9])
    # Note that above model makes perfect predictions by design.
    self.assertEqual(1.0, results["accuracy"])
    self.assertEqual(1.0, metrics[f"{dataset_name}_accuracy"])
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
if __name__ == "__main__":
|
| 237 |
+
tf.test.main()
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/image_text_retrieval.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluates image-text retrieval results."""
|
| 16 |
+
from typing import List, Mapping
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
|
| 20 |
+
# Recall@k is reported for each of these cutoffs.
RECALL_THRESHOLDS = (1, 5, 10)


def text_to_image_retrieval_eval(
    dist_matrix: np.ndarray,
    text_image_correspondence: List[int]) -> Mapping[str, float]:
  """Runs the text-to-image retrieval eval from the distance matrix.

  Args:
    dist_matrix: Distance matrix between text and image embeddings (shape
      N_IMAGES x N_TEXTS).
    text_image_correspondence: Mapping between rows and columns of
      `dist_matrix`, that is, a list of N_TEXTS integers n_i that represent
      that the text embedding in column i corresponds to the image embedding
      in row n_i. Please note that many texts can be assigned to the same
      image. For instance, if we have 2 images and 4 texts (i.e. dist_matrix
      is 2x4), then `text_image_correspondence = [0, 0, 1, 1]` means that the
      two first texts correspond to the first image and the two last texts to
      the second image.

  Returns:
    A dictionary with the Recall@k scores for k in RECALL_THRESHOLDS.
  """
  # For every text (column), rank all images by increasing distance.
  per_text_ranks = dist_matrix.argsort(axis=0)
  text_image_correspondence = np.array(text_image_correspondence)

  def recall_at(k):
    # A text scores if its ground-truth image is among its k nearest images.
    wins = per_text_ranks[:k, :] == text_image_correspondence[None]
    return wins.any(axis=0).mean()

  return {f'Recall@{k}': recall_at(k) for k in RECALL_THRESHOLDS}


def image_to_text_retrieval_eval(
    dist_matrix: np.ndarray,
    text_image_correspondence: List[int]) -> Mapping[str, float]:
  """Runs the image-to-text retrieval eval from the distance matrix.

  Args:
    dist_matrix: Distance matrix between text and image embeddings (shape
      N_IMAGES x N_TEXTS).
    text_image_correspondence: Mapping between rows and columns of
      `dist_matrix`, that is, a list of N_TEXTS integers n_i that represent
      that the text embedding in column i corresponds to the image embedding
      in row n_i. Please note that many texts can be assigned to the same
      image. For instance, if we have 2 images and 4 texts (i.e. dist_matrix
      is 2x4), then `text_image_correspondence = [0, 0, 1, 1]` means that the
      two first texts correspond to the first image and the two last texts to
      the second image.

  Returns:
    A dictionary with the Recall@k scores for k in RECALL_THRESHOLDS.
  """
  # For every image (row), rank all texts by increasing distance.
  per_image_ranks = dist_matrix.argsort(axis=1)
  text_image_correspondence = np.array(text_image_correspondence)

  def recall_at(k):
    # Map each image's k nearest texts to the image those texts describe.
    top_k_images = text_image_correspondence[per_image_ranks[:, :k]]
    # An image scores if any of its k nearest texts belongs to it.
    wins = top_k_images == np.arange(len(per_image_ranks))[:, None]
    return wins.any(axis=1).mean()

  return {f'Recall@{k}': recall_at(k) for k in RECALL_THRESHOLDS}
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/image_text_retrieval_test.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Unit tests for image_text_retrieval."""
|
| 16 |
+
from typing import Mapping
|
| 17 |
+
|
| 18 |
+
from absl.testing import absltest
|
| 19 |
+
from absl.testing import parameterized
|
| 20 |
+
from big_vision.evaluators.proj.image_text import image_text_retrieval
|
| 21 |
+
import numpy as np
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class ImTextRetrievalTest(parameterized.TestCase):
  """Tests both retrieval directions on small hand-crafted 4x8 matrices."""

  @parameterized.parameters(
      (np.array([[0.0, 0.0, 0.1, 0.5, 0.1, 0.2, 0.5, 0.1],
                 [0.5, 0.4, 0.0, 0.0, 0.4, 0.2, 0.6, 0.4],
                 [0.5, 0.4, 0.1, 0.5, 0.0, 0.0, 0.8, 0.3],
                 [0.5, 0.4, 0.1, 0.5, 0.3, 0.2, 0.0, 0.0]]), {
                     'Recall@1': 1.0,
                     'Recall@5': 1.0,
                     'Recall@10': 1.0
                 }),  #
      (np.array([[0.8, 0.8, 0.1, 0.5, 0.1, 0.2, 0.5, 0.1],
                 [0.5, 0.4, 0.0, 0.0, 0.4, 0.2, 0.6, 0.4],
                 [0.5, 0.4, 0.1, 0.5, 0.0, 0.8, 0.8, 0.3],
                 [0.5, 0.4, 0.1, 0.5, 0.4, 0.2, 0.3, 0.3]]), {
                     'Recall@1': 0.5,
                     'Recall@5': 0.75,
                     'Recall@10': 1.0
                 }))
  def test_image_to_text_retrieval_eval(self, dist_matrix: np.ndarray,
                                        expected: Mapping[str, float]):
    """Checks `image_to_text_retrieval_eval`.

    Args:
      dist_matrix: Distance matrix between image (rows) and text (columns).
      expected: Expected eval results.
    """
    self.assertEqual(
        image_text_retrieval.image_to_text_retrieval_eval(
            dist_matrix, [0, 0, 1, 1, 2, 2, 3, 3]), expected)

  @parameterized.parameters(
      (np.array([[0.0, 0.0, 0.1, 0.5, 0.1, 0.2, 0.5, 0.1],
                 [0.5, 0.4, 0.0, 0.0, 0.4, 0.2, 0.6, 0.4],
                 [0.5, 0.4, 0.1, 0.5, 0.0, 0.0, 0.8, 0.3],
                 [0.5, 0.4, 0.1, 0.5, 0.3, 0.2, 0.0, 0.0]]), {
                     'Recall@1': 1.0,
                     'Recall@5': 1.0,
                     'Recall@10': 1.0
                 }),  #
      (np.array([[0.8, 0.8, 0.1, 0.5, 0.1, 0.2, 0.1, 0.1],
                 [0.5, 0.4, 0.0, 0.0, 0.4, 0.2, 0.6, 0.4],
                 [0.5, 0.4, 0.1, 0.5, 0.0, 0.8, 0.8, 0.3],
                 [0.5, 0.4, 0.1, 0.5, 0.4, 0.2, 0.3, 0.3]]), {
                     'Recall@1': 0.375,
                     'Recall@5': 1.0,
                     'Recall@10': 1.0
                 }))
  def test_image_text_retrieval(self, dist_matrix: np.ndarray,
                                expected: Mapping[str, float]):
    """Checks `text_to_image_retrieval_eval`.

    Args:
      dist_matrix: Distance matrix between image (rows) and text (columns).
      expected: Expected eval results.
    """
    self.assertEqual(
        image_text_retrieval.text_to_image_retrieval_eval(
            dist_matrix, [0, 0, 1, 1, 2, 2, 3, 3]), expected)


if __name__ == '__main__':
  absltest.main()
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/prompt_engineering.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Utilities for generating zero-shot prompts."""
|
| 16 |
+
|
| 17 |
+
import re
|
| 18 |
+
import string
|
| 19 |
+
from typing import Sequence
|
| 20 |
+
|
| 21 |
+
from absl import logging
|
| 22 |
+
from big_vision.datasets.imagenet import class_names as imagenet_class_names
|
| 23 |
+
from big_vision.evaluators.proj.image_text import prompt_engineering_constants
|
| 24 |
+
import tensorflow_datasets as tfds
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
_CLASS_NAMES = {  # For each dataset, maps from a source to its class names.
    "imagenet2012": {
        "clip": imagenet_class_names.CLIP_IMAGENET_CLASS_NAMES,
    },
    "grand-vision:imagenet2012": {
        "clip": imagenet_class_names.CLIP_IMAGENET_CLASS_NAMES,
    },
    # ImageNet-A/R use the CLIP names restricted to each variant's label set.
    "imagenet_a": {
        "clip": [
            imagenet_class_names.CLIP_IMAGENET_CLASS_NAMES[i]
            for i in imagenet_class_names.IMAGENET_A_LABELSET
        ]
    },
    "imagenet_r": {
        "clip": [
            imagenet_class_names.CLIP_IMAGENET_CLASS_NAMES[i]
            for i in imagenet_class_names.IMAGENET_R_LABELSET
        ]
    },
    "imagenet_v2": {
        "clip": imagenet_class_names.CLIP_IMAGENET_CLASS_NAMES,
    },
}

# Named prompt-template sets selectable by `get_prompt_templates()`.
_PROMPT_TEMPLATES = {
    "class_name_only": ["{}"],
    "clip_paper": prompt_engineering_constants.CLIP_PAPER_PROMPT_TEMPLATES,
    "clip_best": prompt_engineering_constants.CLIP_BEST_PROMPT_TEMPLATES,
}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def get_class_names(*, dataset_name, source="dataset_info", canonicalize=True):
  """Returns class names for `dataset_name` from `source`.

  Args:
    dataset_name: TFDS dataset name whose class names are requested.
    source: Either a string or a sequence of class-name strings. As a string
      it is "dataset_info:<feature>" (read names from the TFDS feature) or a
      key into the module-level `_CLASS_NAMES` table (e.g. "clip").
      NOTE(review): the default "dataset_info" (without ":<feature>") falls
      through to the `_CLASS_NAMES` lookup and would raise KeyError — callers
      appear to always pass an explicit source; confirm before relying on the
      default.
    canonicalize: Whether to canonicalize each name (lowercase, punctuation
      stripped except ",").

  Returns:
    A list of class-name strings.
  """
  if isinstance(source, str):
    if source.startswith("dataset_info:"):
      name = source[len("dataset_info:"):]
      class_names = tfds.builder(dataset_name).info.features[name].names
    else:
      class_names = _CLASS_NAMES[dataset_name][source]
  else:
    # An explicit sequence of class names was provided.
    assert isinstance(source, Sequence) and all(
        map(lambda s: isinstance(s, str), source)), source
    class_names = source
  if canonicalize:
    # "," is preserved because some class names list synonyms (e.g. CLIP's).
    class_names = [
        canonicalize_text(name, keep_punctuation_exact_string=",")
        for name in class_names
    ]
  logging.info("Using %d class_names: %s", len(class_names), class_names)
  return class_names
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_prompt_templates(prompt_templates_name,
                         *,
                         canonicalize=True):
  """Returns the named prompt-template set from `_PROMPT_TEMPLATES`.

  Args:
    prompt_templates_name: Key into the module-level `_PROMPT_TEMPLATES`
      table (e.g. "clip_paper", "clip_best", "class_name_only").
    canonicalize: Whether to canonicalize each template (lowercase,
      punctuation stripped except the "{}" placeholder).

  Returns:
    A list of prompt-template strings, each containing a "{}" placeholder.
  """
  prompts_templates = _PROMPT_TEMPLATES[prompt_templates_name]
  if canonicalize:
    # "{}" must survive canonicalization: it is the class-name placeholder.
    prompts_templates = [
        canonicalize_text(name, keep_punctuation_exact_string="{}")
        for name in prompts_templates
    ]
  logging.info("Using %d prompts_templates: %s", len(prompts_templates),
               prompts_templates)
  return prompts_templates
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def canonicalize_text(text, *, keep_punctuation_exact_string=None):
  """Returns canonicalized `text` (lowercase and punctuation removed).

  Underscores become spaces, punctuation is stripped, the result is
  lowercased, and runs of whitespace collapse to single spaces.

  Args:
    text: string to be canonicalized.
    keep_punctuation_exact_string: If provided, then this exact string is
      kept. For example providing '{}' will keep any occurrences of '{}'
      (but will still remove '{' and '}' that appear separately).

  Returns:
    The canonicalized string.
  """
  text = text.replace("_", " ")
  if keep_punctuation_exact_string:
    # Split on the protected string, strip punctuation from each part, then
    # re-join with the protected string so only isolated punctuation is lost.
    text = keep_punctuation_exact_string.join(
        part.translate(str.maketrans("", "", string.punctuation))
        for part in text.split(keep_punctuation_exact_string))
  else:
    text = text.translate(str.maketrans("", "", string.punctuation))
  text = text.lower()
  text = re.sub(r"\s+", " ", text)
  return text.strip()
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/prompt_engineering_constants.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Constants used by the module `prompt_engineering` in the same directory."""
|
| 16 |
+
|
| 17 |
+
# The 80 prompt templates from the CLIP paper's ImageNet prompt-ensembling
# recipe, plus the bare '{}' placeholder.
CLIP_PAPER_PROMPT_TEMPLATES = [
    'a bad photo of a {}.',
    'a photo of many {}.',
    'a sculpture of a {}.',
    'a photo of the hard to see {}.',
    'a low resolution photo of the {}.',
    'a rendering of a {}.',
    'graffiti of a {}.',
    'a bad photo of the {}.',
    'a cropped photo of the {}.',
    'a tattoo of a {}.',
    'the embroidered {}.',
    'a photo of a hard to see {}.',
    'a bright photo of a {}.',
    'a photo of a clean {}.',
    'a photo of a dirty {}.',
    'a dark photo of the {}.',
    'a drawing of a {}.',
    'a photo of my {}.',
    'the plastic {}.',
    'a photo of the cool {}.',
    'a close-up photo of a {}.',
    'a black and white photo of the {}.',
    'a painting of the {}.',
    'a painting of a {}.',
    'a pixelated photo of the {}.',
    'a sculpture of the {}.',
    'a bright photo of the {}.',
    'a cropped photo of a {}.',
    'a plastic {}.',
    'a photo of the dirty {}.',
    'a jpeg corrupted photo of a {}.',
    'a blurry photo of the {}.',
    'a photo of the {}.',
    'a good photo of the {}.',
    'a rendering of the {}.',
    'a {} in a video game.',
    'a photo of one {}.',
    'a doodle of a {}.',
    'a close-up photo of the {}.',
    'a photo of a {}.',
    'the origami {}.',
    'the {} in a video game.',
    'a sketch of a {}.',
    'a doodle of the {}.',
    'a origami {}.',
    'a low resolution photo of a {}.',
    'the toy {}.',
    'a rendition of the {}.',
    'a photo of the clean {}.',
    'a photo of a large {}.',
    'a rendition of a {}.',
    'a photo of a nice {}.',
    'a photo of a weird {}.',
    'a blurry photo of a {}.',
    'a cartoon {}.',
    'art of a {}.',
    'a sketch of the {}.',
    'a embroidered {}.',
    'a pixelated photo of a {}.',
    'itap of the {}.',
    'a jpeg corrupted photo of the {}.',
    'a good photo of a {}.',
    'a plushie {}.',
    'a photo of the nice {}.',
    'a photo of the small {}.',
    'a photo of the weird {}.',
    'the cartoon {}.',
    'art of the {}.',
    'a drawing of the {}.',
    'a photo of the large {}.',
    'a black and white photo of a {}.',
    'the plushie {}.',
    'a dark photo of a {}.',
    'itap of a {}.',
    'graffiti of the {}.',
    'a toy {}.',
    'itap of my {}.',
    'a photo of a cool {}.',
    'a photo of a small {}.',
    'a tattoo of the {}.',
    '{}',
]

# The reduced 7-template subset reported by CLIP as performing best, plus the
# bare '{}' placeholder.
CLIP_BEST_PROMPT_TEMPLATES = [
    'itap of a {}.',
    'a bad photo of the {}.',
    'a origami {}.',
    'a photo of the large {}.',
    'a {} in a video game.',
    'art of the {}.',
    'a photo of the small {}.',
    '{}',
]
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/prompt_engineering_test.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Tests for prompt_engineering."""
|
| 16 |
+
|
| 17 |
+
from absl.testing import absltest
|
| 18 |
+
from big_vision.evaluators.proj.image_text import prompt_engineering
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class PromptEngineeringTest(absltest.TestCase):
  """Unit tests for `prompt_engineering.canonicalize_text`."""

  def test_canonicalize_text(self):
    self.assertEqual(
        prompt_engineering.canonicalize_text("test_test"), "test test")
    self.assertEqual(
        prompt_engineering.canonicalize_text("test___test"), "test test")
    self.assertEqual(prompt_engineering.canonicalize_text("test"), "test")
    self.assertEqual(prompt_engineering.canonicalize_text("test."), "test")
    self.assertEqual(prompt_engineering.canonicalize_text(" test "), "test")
    self.assertEqual(
        prompt_engineering.canonicalize_text("test\ntest"), "test test")
    self.assertEqual(
        prompt_engineering.canonicalize_text("test test"), "test test")
    self.assertEqual(prompt_engineering.canonicalize_text("test {}"), "test")
    self.assertEqual(
        prompt_engineering.canonicalize_text(
            "test {}", keep_punctuation_exact_string="{}"), "test {}")
    self.assertEqual(
        prompt_engineering.canonicalize_text(
            " test {}...", keep_punctuation_exact_string="{}"), "test {}")
    self.assertEqual(
        prompt_engineering.canonicalize_text(
            "test {} {} {}", keep_punctuation_exact_string="{}"),
        "test {} {} {}")


if __name__ == "__main__":
  absltest.main()
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/retrieval.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Multi-host image->text and text->image retrieval evaluation.
|
| 16 |
+
|
| 17 |
+
Example how to add to config:
|
| 18 |
+
|
| 19 |
+
config.evals {}
|
| 20 |
+
config.evals.retrieval = dict(log_steps=1200, type='proj.image_text.retrieval')
|
| 21 |
+
config.evals.retrieval.dataset = 'coco_captions'
|
| 22 |
+
config.evals.retrieval.txt_name = ('captions', 'text')
|
| 23 |
+
# Note that initial "decode|" is not needed.
|
| 24 |
+
config.evals.retrieval.pp_img = 'resize(224)|value_range(-1,1)'
|
| 25 |
+
# Raw text strings use key "texts" in feature dict. The evaluator expects
|
| 26 |
+
# tokenized text with key "labels".
|
| 27 |
+
config.evals.retrieval.pp_txt = (
|
| 28 |
+
'tokenize(max_len=16, eos="sticky", pad_value=1, inkey="texts", '
|
| 29 |
+
' outkey="labels")')
|
| 30 |
+
|
| 31 |
+
Example to support precomputed data:
|
| 32 |
+
See `big_vision/configs/proj/image_text/lit.py`.
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
import functools
|
| 36 |
+
import operator
|
| 37 |
+
import time
|
| 38 |
+
|
| 39 |
+
from absl import logging
|
| 40 |
+
from big_vision import input_pipeline
|
| 41 |
+
from big_vision.evaluators.proj.image_text import image_text_retrieval
|
| 42 |
+
import big_vision.pp.builder as pp_builder
|
| 43 |
+
import jax
|
| 44 |
+
import jax.numpy as jnp
|
| 45 |
+
from jax.sharding import NamedSharding
|
| 46 |
+
from jax.sharding import PartitionSpec as P
|
| 47 |
+
import numpy as np
|
| 48 |
+
import tensorflow as tf
|
| 49 |
+
import tensorflow_datasets as tfds
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Temporary global flag to facilitate backwards compatibility. Will be removed
|
| 53 |
+
# by the end of year 2023.
|
| 54 |
+
API = "jit"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _with_infinite_padding(dataset):
  """Adds "infinite padding" to the dataset.

  Every real example gets `mask=True`; the dataset is then extended with an
  endless stream of all-zero filler elements carrying `mask=False`, so that
  every host can keep stepping in lockstep past the end of its shard.

  Args:
    dataset: A `tf.data.Dataset` of feature dicts.

  Returns:
    The masked dataset concatenated with infinitely repeated filler elements.
  """
  # One zero-valued element with the same structure/dtypes as real examples.
  filler_element = tf.nest.map_structure(
      lambda spec: tf.zeros(spec.shape, spec.dtype)[None], dataset.element_spec)
  filler_element["mask"] = [False]
  filler_dataset = tf.data.Dataset.from_tensor_slices(filler_element)
  dataset = dataset.map(
      lambda features: dict(mask=True, **features),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return dataset.concatenate(filler_dataset.repeat(None))
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# This is needed so retrieval_test can replace dataset info.
|
| 70 |
+
def _get_dataset_info(builder):
|
| 71 |
+
return builder.info
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def prepare_datasets(
    dataset, *, pp_img, pp_txt, txt_name, offset=0, cache_final=False
):
  """Returns unbatched `ds_images, ds_texts` datasets.

  Args:
    dataset: An image-text `tf.data.Dataset` that is expected to contain the
      following features: "image" (dtype=uint8, shape=[None, None, 3]),
      `txt_name` (dtype=string, shape=[None]).
    pp_img: String defining pre-processing for images. The pre-processing can
      expect the following features to be prepared: "image", "id". The
      pre-processing should convert the "image" (dtype=uint8,
      shape=[None, None, 3]) to "image" (dtype=float32, shape=[sz, sz, 3]).
    pp_txt: String defining pre-processing for text. The pre-processing can
      expect the following features to be prepared: "texts", "id",
      "caption_id". The pre-processing should convert the "texts"
      (dtype=string, shape=[]) into a tokenized "labels" (dtype=int32,
      shape=[max_len]).
    txt_name: Name of the text feature to unroll in the original `dataset`.
      Can be a simple string feature name, or an iterable of strings to
      specify a nested feature (e.g. for "coco_captions", this would be
      `('captions', 'text')`).
    offset: Offset that should be added to enumerated examples to generate
      IDs. In a multi-host setup, this is typically set to a value large
      enough to make all IDs distinct.
    cache_final: Whether the dataset should be cached.

  Returns:
    Image and text datasets.
  """

  def get_feature_value(data, feature_name):
    # Walk a (possibly nested) feature path like ('captions', 'text').
    if isinstance(feature_name, str):
      feature_name = [feature_name]
    return functools.reduce(operator.getitem, feature_name, data)

  def get_captions(idx, features):
    """Returns a dataset with unrolled "caption" for every example."""
    texts = get_feature_value(features, txt_name)
    texts = tf.experimental.numpy.atleast_1d(texts)  # For single-text GT.
    texts_n = tf.shape(texts)[0]
    # One element per caption; "id" ties each caption back to its image.
    return tf.data.Dataset.from_tensor_slices({
        "id": tf.tile([idx + offset], [texts_n]),
        "caption_i": tf.stack(tf.range(texts_n)),
        "texts": tf.stack(texts),
    })

  def add_id(idx, features):
    return {**features, "id": idx + offset}

  ds_images = dataset.enumerate().map(add_id).map(
      pp_builder.get_preprocess_fn(f"{pp_img}|keep('id', 'image')"))
  ds_texts = dataset.enumerate().flat_map(get_captions).map(
      pp_builder.get_preprocess_fn(
          f"{pp_txt}|keep('id', 'caption_i', 'labels')"))
  if cache_final:
    ds_images, ds_texts = ds_images.cache(), ds_texts.cache()
  return ds_images, ds_texts
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _split_and_batch(dataset_name, batch_size, split, get_ds, data_dir=None):
  """Splits dataset, calls `get_ds` and returns padded + batched datasets."""
  num_devices = jax.device_count()
  assert not batch_size % num_devices, (
      f"batch_size={batch_size} % jax.device_count()={num_devices}")
  builder = tfds.builder(dataset_name, data_dir=data_dir)
  split_info = _get_dataset_info(builder).splits[split]
  # Each process reads its own shard; the per-process offset keeps example
  # IDs globally unique across hosts.
  per_process_ds = builder.as_dataset(split=tfds.split_for_jax_process(split))
  ds_images, ds_texts = get_ds(
      per_process_ds, offset=jax.process_index() * split_info.num_examples)

  def pad_and_batch(ds):
    # Infinite padding lets every host iterate the same number of steps.
    return _with_infinite_padding(ds).batch(batch_size)

  return pad_and_batch(ds_images), pad_and_batch(ds_texts)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class Evaluator:
  """Image/text retrieval evaluator."""

  def __init__(self,
               predict_fn,
               *,
               dataset,
               pp_img,
               pp_txt,
               txt_name,
               batch_size,
               devices,
               data_dir=None,
               split="test",
               cache_final=True):
    """Initializes a new zero-shot image/text retrieval evaluator.

    See `prepare_datasets()` for details on how the dataset is pre-processed.

    Args:
      predict_fn: Prediction function with signature
        `zimg, ztxt, out = predict_fn(params, images, texts)`
      dataset: The TFDS dataset name of the eval data.
      pp_img: Preprocessing string for images. Preprocessed features should
        contain key "image" with value that can be batched and is suitable for
        `predict_fn(images)` input.
      pp_txt: Preprocessing string for texts. Can expect "texts" key as an
        input (shape=[], dtype=string), and is expected to produce "labels"
        key that is suitable for `predict_fn(texts)` input.
      txt_name: The name of the feature of captions (can be a tuple, in which
        case the items are used as a lookup path in a nested feature
        dictionary). Expected shape=[None], dtype=string.
      batch_size: Global batch size.
      devices: List of devices to run inference on.
      data_dir: Optional dir to load the TFDS dataset from.
      split: The split of the eval data.
      cache_final: Whether the preprocessed dataset should be cached.
    """
    self.ds_images, self.ds_texts = _split_and_batch(
        dataset,
        batch_size,
        split,
        functools.partial(
            prepare_datasets,
            pp_img=pp_img,
            pp_txt=pp_txt,
            txt_name=txt_name,
            cache_final=cache_final,
        ),
        data_dir=data_dir,
    )
    self._axis_name = "batch"

    self.devices = devices
    mesh = jax.sharding.Mesh(devices, ("devices",))

    # Wrappers that pull out just the image / text tower of `predict_fn`.
    def embed_images(train_state, images):
      zimg, _, _ = predict_fn(train_state, {"image": images})
      return zimg

    def embed_texts(train_state, texts):
      _, ztxt, _ = predict_fn(train_state, {"labels": texts})
      return ztxt

    # All jitted functions below return fully-replicated (P()) outputs so the
    # host can `device_get` complete arrays.
    self._embed_images_p = jax.jit(embed_images,
                                   out_shardings=NamedSharding(mesh, P()))
    self._embed_texts_p = jax.jit(embed_texts,
                                  out_shardings=NamedSharding(mesh, P()))
    # Identity with replicated output: effectively an all-gather of a
    # device-sharded batch feature.
    self._all_gather_p = jax.jit(
        lambda x: x, out_shardings=NamedSharding(mesh, P()))
    self._count_p = jax.jit(jnp.sum, out_shardings=NamedSharding(mesh, P()))
    # Tracks which embed functions already ran once (i.e. were compiled),
    # only used for log timing messages.
    self._compiled = set()

  def _embed(self, name, train_state, ds, embed_fn, id_names):
    """Embeds features name `name` using `embed_fn`.

    Args:
      name: Feature name to be embedded.
      train_state: train_state for the predict_fn.
      embed_fn: A jitted function that returns the embeddings.
      ds: The dataset.
      id_names: An iterable of feature names that should be collected.

    Returns:
      A dictionary with "embeddings" and `id_names` as keys, each filtered
      down to real (non-padding) examples.
    """
    ns = []  # Per-step count of real (mask==1) examples.
    embeddings = []
    ids = {id_name: [] for id_name in list(id_names) + ["mask"]}

    t0 = time.time()

    ds_b = input_pipeline.start_global(ds, self.devices)
    for batch in ds_b:
      ns.append(jax.device_get(self._count_p(batch["mask"])))

      # Due to infinite padding, this loop will never end. We will stop once
      # all processes only process padded data. We don't check the latest
      # DeviceArray `ns[-1]` because we want to keep our computation async for
      # efficiency reasons.
      if len(ns) >= 2 and ns[-2] == 0:
        break

      embs = embed_fn(train_state, batch[name])
      if embed_fn not in self._compiled:
        logging.info("Compiled %s embeddings in %.3fs", name, time.time() - t0)
        t0 = time.time()
        self._compiled.add(embed_fn)

      embeddings.append(jax.device_get(embs))
      for id_name in ids:
        ids[id_name].append(jax.device_get(self._all_gather_p(batch[id_name])))

    # Only access DeviceArray at end of loop for better efficiency.
    ns = np.array(ns)
    embeddings = np.concatenate(embeddings)
    ids = {k: np.concatenate(v) for k, v in ids.items()}
    # Boolean mask selecting real examples; drops the padding tail.
    masks = ids.pop("mask").astype(bool)
    logging.info("Processed %s in %d steps - ...%s", name, len(ns), ns[-10:])
    n = ns.sum()
    logging.info("Totalling %d %s in %.3fs", n, name, time.time() - t0)
    return {
        "embeddings": embeddings[masks],
        **{k: v[masks] for k, v in ids.items()},
    }

  def evaluate(self, train_state):
    """Returns evaluation results (embeddings + retrieval metrics)."""
    images = self._embed("image", train_state, self.ds_images,
                         self._embed_images_p, ("id",))
    texts = self._embed("labels", train_state, self.ds_texts,
                        self._embed_texts_p, ("id", "caption_i"))
    # Shapes: (nimg, emb) * (emb, ntxt) -> (nimg, ntxt)
    similarities = np.dot(images["embeddings"], texts["embeddings"].T)

    t0 = time.time()
    # Map every text back to the row index of its source image.
    id2img = {id_: i for i, id_ in enumerate(images["id"])}
    text_image_correspondence = [id2img[id_] for id_ in texts["id"]]
    # The retrieval helpers expect distances, hence the negated similarities.
    img2txt = image_text_retrieval.image_to_text_retrieval_eval(
        -similarities, text_image_correspondence)
    txt2img = image_text_retrieval.text_to_image_retrieval_eval(
        -similarities, text_image_correspondence)
    logging.info("Computed retrieval metrics in %.3fs", time.time() - t0)

    return dict(
        images=images,
        texts=texts,
        img2txt=img2txt,
        txt2img=txt2img,
    )

  def run(self, train_state):
    """Returns metrics as (name, value) pairs."""
    results = self.evaluate(train_state)
    return [(f"{direction}_{k.lower()}", v)
            for direction in ("img2txt", "txt2img")
            for k, v in results[direction].items()]
|
Tipsomaly/model/big_vision/evaluators/proj/image_text/retrieval_test.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Tests for retrieval."""
|
| 16 |
+
|
| 17 |
+
from unittest import mock
|
| 18 |
+
|
| 19 |
+
from big_vision.evaluators.proj.image_text import retrieval
|
| 20 |
+
from big_vision.pp import ops_general # pylint: disable=unused-import
|
| 21 |
+
from big_vision.pp import ops_image # pylint: disable=unused-import
|
| 22 |
+
from big_vision.pp import registry
|
| 23 |
+
import chex
|
| 24 |
+
import flax.linen as nn
|
| 25 |
+
import jax
|
| 26 |
+
import jax.numpy as jnp
|
| 27 |
+
import tensorflow as tf
|
| 28 |
+
import tensorflow_datasets as tfds
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _get_test_texts2labels():
  """Returns a pp op that parses the string "texts" feature into "labels"."""

  def _texts_to_labels(features):
    # e.g. "11" -> 11.0; lets the test use the caption text as a numeric id.
    features["labels"] = tf.strings.to_number(features["texts"])
    return features

  return _texts_to_labels
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _get_copy_from(**key_map):
|
| 41 |
+
|
| 42 |
+
def copy_from(d):
|
| 43 |
+
d = dict(d)
|
| 44 |
+
for k1, k2 in key_map.items():
|
| 45 |
+
d[k1] = d[k2]
|
| 46 |
+
return d
|
| 47 |
+
|
| 48 |
+
return copy_from
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class _Model(nn.Module):
  """Tiny two-tower stand-in model for retrieval tests.

  Embeds a scalar per example such that two embeddings are most similar when
  they come from the same underlying scalar value.
  """

  @nn.compact
  def __call__(self, image, texts):
    # Dummy parameter so the module has something to initialize.
    self.param("x", lambda _: 0.)

    def z(x):
      # Returns None for a None tower input (e.g. image-only batches).
      if x is not None:
        batch_size = len(x)
        # Note that the returned vector is most similar with other vectors
        # generated from the same underlying `x[:]`.
        x = jnp.concatenate([100 * jnp.ones([batch_size, 1]), x[:, None]],
                            axis=1)
        return x / jnp.linalg.norm(x, axis=1)[:, None]

    return z(image), z(texts), None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def setUpModule():
  # Emulate a multi-device setup (8 virtual CPU devices) so the jit/sharding
  # code paths in the evaluator are exercised.
  chex.set_n_cpu_devices(8)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class RetrievalTest(tf.test.TestCase):
  """Tests for `retrieval.prepare_datasets` and `retrieval.Evaluator`."""

  def test_prepare_datasets(self):
    """Checks unrolling of multi-caption examples into (id, caption_i) rows."""

    def generator():
      # Two examples with 2 and 3 captions respectively.
      yield {
          "image": tf.ones([5, 5, 3], tf.float32),
          "captions": {
              "text": tf.constant(["11", "12"])
          }
      }
      yield {
          "image": tf.ones([4, 4, 3], tf.float32),
          "captions": {
              "text": tf.constant(["21", "22", "23"])
          }
      }

    ds = tf.data.Dataset.from_generator(
        generator,
        output_signature={
            "image": tf.TensorSpec(shape=[None, None, 3], dtype=tf.float32),
            "captions": {
                "text": tf.TensorSpec(shape=[None], dtype=tf.string),
            },
        })
    with registry.temporary_ops(test_texts2labels=_get_test_texts2labels):
      ds_img, ds_txt = retrieval.prepare_datasets(
          ds,
          pp_img="resize(2)",
          pp_txt="test_texts2labels()",
          txt_name=("captions", "text"),
      )
    it_img = iter(ds_img)
    it_txt = iter(ds_txt)
    # Image dataset: one row per example, resized to 2x2.
    batch = next(it_img)
    self.assertAllEqual(batch["id"], 0)
    self.assertAllEqual(batch["image"], tf.ones([2, 2, 3]))
    batch = next(it_img)
    self.assertAllEqual(batch["id"], 1)
    self.assertAllEqual(batch["image"], tf.ones([2, 2, 3]))
    # Text dataset: one row per caption, with (id, caption_i) bookkeeping.
    batch = next(it_txt)
    self.assertAllEqual(batch["id"], 0)
    self.assertAllEqual(batch["caption_i"], 0)
    self.assertAllEqual(batch["labels"], 11.0)
    batch = next(it_txt)
    self.assertAllEqual(batch["id"], 0)
    self.assertAllEqual(batch["caption_i"], 1)
    self.assertAllEqual(batch["labels"], 12.0)
    batch = next(it_txt)
    self.assertAllEqual(batch["id"], 1)
    self.assertAllEqual(batch["caption_i"], 0)
    self.assertAllEqual(batch["labels"], 21.0)
    batch = next(it_txt)
    self.assertAllEqual(batch["id"], 1)
    self.assertAllEqual(batch["caption_i"], 1)
    self.assertAllEqual(batch["labels"], 22.0)
    batch = next(it_txt)
    self.assertAllEqual(batch["id"], 1)
    self.assertAllEqual(batch["caption_i"], 2)
    self.assertAllEqual(batch["labels"], 23.0)

  def test_evaluate(self):
    """End-to-end run on mocked TFDS data, including last-batch padding."""
    per_device_batch_size = 2
    batch_size = per_device_batch_size * jax.device_count()
    # +1 forces a padded final batch, exercising the padding/mask logic.
    num_examples = 1 * batch_size + 1
    splits = {
        "test":
            tfds.core.SplitInfo(
                name="test", shard_lengths=[num_examples], num_bytes=0)
    }

    model = _Model()
    params = model.init(jax.random.PRNGKey(0), None, None)["params"]

    with tfds.testing.mock_data(num_examples=num_examples):
      info_mock = mock.Mock()
      info_mock.splits = splits
      with mock.patch.object(retrieval, "_get_dataset_info",
                             lambda _: info_mock):
        with registry.temporary_ops(copy_from=_get_copy_from):
          # `copy_from` makes both towers embed the example id, so matching
          # image/text pairs are guaranteed to be most similar.
          evaluator = retrieval.Evaluator(
              lambda p, b: model.apply({"params": p},
                                       b.get("image", None),
                                       b.get("labels", None)),
              dataset="coco_captions",
              batch_size=batch_size,
              devices=jax.devices(),
              txt_name=("captions", "text"),
              pp_img="copy_from(image='id')",
              pp_txt="copy_from(labels='id')",
          )
          results = evaluator.evaluate(params)

    # Assert all examples were processed.
    self.assertLen(results["images"]["embeddings"], num_examples)
    self.assertLen(results["images"]["id"], num_examples)
    # Assert no padding was processed (expects exactly one (=first) image.id=0
    self.assertEqual((results["images"]["id"] == 0).sum(), 1)
    # Expect perfect ITR with above _Model()...
    self.assertEqual(results["img2txt"]["Recall@1"], 1.0)
    self.assertEqual(results["txt2img"]["Recall@5"], 1.0)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# Run the test suite when executed directly.
if __name__ == "__main__":
  tf.test.main()
|
Tipsomaly/model/big_vision/evaluators/proj/paligemma/perplexity.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator for perplexity of a model."""
|
| 16 |
+
import functools
|
| 17 |
+
|
| 18 |
+
from big_vision.evaluators import mean
|
| 19 |
+
import big_vision.utils as u
|
| 20 |
+
import jax.numpy as jnp
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Temporary global flag to facilitate backwards compatability. Will be removed
|
| 24 |
+
# by the end of year 2023.
|
| 25 |
+
API = 'jit'
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Cache the function such that it won't always recompile (in mean evaluator).
|
| 29 |
+
# Cache the function such that it won't always recompile (in mean evaluator).
@functools.cache
def perplexity(
    predict_fn, key='labels', shift_labels=True, pad_token=None):
  """Returns a function computing per-example xent sums and averages."""

  def _perplexity_fn(train_state, batch, **kw):
    # Forward pass; only the logits are used here.
    logits, _ = predict_fn(train_state, batch, **kw)

    labels = batch[key]
    weights = batch.get('mask_loss', jnp.ones_like(labels))
    if pad_token is not None:
      # Zero out padding positions so they contribute nothing to the loss.
      weights = weights * (labels != pad_token).astype(jnp.float32)
    if shift_labels:
      # Next-token prediction: drop the first label/weight column.
      # NOTE(review): logits are not sliced here; assumes `predict_fn`
      # returns logits already aligned with the shifted labels — confirm.
      labels, weights = labels[:, 1:], weights[:, 1:]

    losses = u.weighted_softmax_xent(
        logits=logits, labels=labels, weights=weights,
        reduction=False, normalize=False)
    # Tiny floor avoids division by zero for fully-masked rows.
    denom = jnp.clip(weights.sum(axis=1), 2e-38)
    return {'sum': losses, 'avg': losses / denom}

  return _perplexity_fn
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class Evaluator(mean.Evaluator):
  """Perplexity evaluator: averages the per-example xent metrics."""

  def __init__(self, predict_fn, *a,
               key='labels', shift_labels=False, pad_token=None, **kw):
    # Default to no prefetching — a more memory-saving choice for eval.
    if 'prefetch' not in kw:
      kw['prefetch'] = 0
    metric_fn = perplexity(predict_fn, key, shift_labels, pad_token)
    super().__init__(metric_fn, *a, **kw)
|
Tipsomaly/model/big_vision/evaluators/proj/paligemma/transfers/chartqa.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator for ChartQA variants."""
|
| 16 |
+
|
| 17 |
+
import functools
|
| 18 |
+
|
| 19 |
+
import big_vision.evaluators.common as c
|
| 20 |
+
import big_vision.pp.tokenizer
|
| 21 |
+
import big_vision.utils as u
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Temporary global flag to facilitate backwards compatability. Will be removed
|
| 25 |
+
# by the end of year 2023.
|
| 26 |
+
API = "jit"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class Evaluator:
  """Evaluator for simple VQA tasks (exact-match + ChartQA relaxed accuracy).

  Decodes a free-form answer per example, compares it to the single ground
  truth in `batch["answer"]`, and writes all predictions to a json file.
  """

  def __init__(
      self, predict_fn, tokenizer, to_lower=False,
      outfile="{workdir}/{split}.json",
      out_question_key="question_id", out_answer_key="answer",
      *, data, devices, **kw):
    # `keep_on_cpu` holds the string-typed ground truth/metadata on the host
    # instead of shipping it to the accelerators.
    self.get_data_iter, self.steps = c.eval_input_pipeline(
        keep_on_cpu={"answer", "question_id"}, data=data, devices=devices, **kw)

    self.outfile = c.resolve_outfile(outfile, split=data.get("split"))
    self.out_question_key = out_question_key
    self.out_answer_key = out_answer_key

    # We'll need the tokenizer to detokenize the model outputs later.
    self.tok = big_vision.pp.tokenizer.get_tokenizer(tokenizer)
    # Applied to both prediction and ground truth before comparison.
    self.postproc = (lambda s: s.lower()) if to_lower else lambda s: s
    self.decode = functools.partial(
        predict_fn, devices=devices, eos_token=self.tok.eos_token)

  def run(self, train_state):
    """Does one evaluation run, yields metrics."""

    accuracies = []
    relaxed_accuracies = []
    json_out = []
    for _, batch in zip(range(self.steps), self.get_data_iter()):
      # (batch, seqlen) array of decoded generated tokens.
      tokens = self.decode(train_state, batch)

      # (local_batch,) that indicates padding examples (0) vs real examples (1).
      tokens = u.get_local_slice_from_fsarray(tokens)
      ex_masks = u.get_local_slice_from_fsarray(batch["_mask"])

      # Turn predictions into texts and then scores, one by one.
      for i in range(len(tokens)):
        if ex_masks[i] == 0:  # Skip last-batch padding examples
          continue

        answer = self.postproc(self.tok.to_str(tokens[i], stop_at_eos=True))

        gt = self.postproc(batch["answer"][i])
        accuracies.append(float(answer == gt))
        relaxed_accuracies.append(_relaxed_match(gt, answer))
        json_out.append({
            self.out_question_key: batch["question_id"][i].item(),
            self.out_answer_key: answer,
            "gt": gt,
            "relaxed_match": relaxed_accuracies[-1],
        })

    # At this point `accuracies` is a list of per-example scores. However,
    # remember that each host holds a different subset of the examples! So if
    # we were to just return the mean accuracy here, we would effectively only
    # have evaluated on the main host's (who writes metrics) subset!
    # So now, we need to compute global means.
    # There is one more caveat: `process_sum` needs the summands on each host
    # to have the same size. So we either need to include dummy values for
    # the padding examples (last batch, annoying), or we only sum scalars as in
    # sufficient statistics, which we do here.
    sum_accs, sum_relaxed_accs, num = c.process_sum(
        [sum(accuracies), sum(relaxed_accuracies), len(accuracies)])

    # Yielding metric_name, value means logging the metric.
    yield "acc", sum_accs / num
    yield "relaxed_acc", sum_relaxed_accs / num
    yield "num", num  # Just for sanity checks.
    c.multiprocess_write_json(self.outfile, json_out)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def _to_float(text: str) -> float | None:
|
| 101 |
+
try:
|
| 102 |
+
if text.endswith("%"):
|
| 103 |
+
# Convert percentages to floats.
|
| 104 |
+
return float(text.rstrip("%")) / 100.0
|
| 105 |
+
else:
|
| 106 |
+
return float(text)
|
| 107 |
+
except ValueError:
|
| 108 |
+
return None
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def _relaxed_match(target: str,
|
| 112 |
+
prediction: str,
|
| 113 |
+
max_relative_error: float = 0.05) -> bool:
|
| 114 |
+
"""Calculates relaxed correctness.
|
| 115 |
+
|
| 116 |
+
The correctness tolerates certain error ratio defined by max_relative_error.
|
| 117 |
+
See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
|
| 118 |
+
“Following Methani et al. (2020), we use a relaxed accuracy measure for the
|
| 119 |
+
numeric answers to allow a minor inaccuracy that may result from the automatic
|
| 120 |
+
data extraction process. We consider an answer to be correct if it is within
|
| 121 |
+
5% of the gold answer. For non-numeric answers, we still need an exact match
|
| 122 |
+
to consider an answer to be correct.”
|
| 123 |
+
|
| 124 |
+
Args:
|
| 125 |
+
target: Target string.
|
| 126 |
+
prediction: Predicted string.
|
| 127 |
+
max_relative_error: Maximum relative error.
|
| 128 |
+
|
| 129 |
+
Returns:
|
| 130 |
+
Whether the prediction was correct given the specified tolerance.
|
| 131 |
+
"""
|
| 132 |
+
prediction_float = _to_float(prediction)
|
| 133 |
+
target_float = _to_float(target)
|
| 134 |
+
# When the target is 0 is always required an exact match.
|
| 135 |
+
if prediction_float is not None and target_float:
|
| 136 |
+
relative_error = abs(prediction_float - target_float) / abs(target_float)
|
| 137 |
+
return relative_error <= max_relative_error
|
| 138 |
+
else:
|
| 139 |
+
return prediction == target
|
Tipsomaly/model/big_vision/evaluators/proj/paligemma/transfers/pope.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator for the POPE dataset (https://github.com/RUCAIBox/POPE).
|
| 16 |
+
|
| 17 |
+
POPE is a binary classification dataset with ground-truth answers being either
|
| 18 |
+
'yes' or 'no'.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import functools
|
| 22 |
+
|
| 23 |
+
import big_vision.datasets.core
|
| 24 |
+
import big_vision.evaluators.common as c
|
| 25 |
+
import big_vision.input_pipeline
|
| 26 |
+
import big_vision.pp.builder
|
| 27 |
+
import big_vision.pp.tokenizer
|
| 28 |
+
import big_vision.utils as u
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Temporary global flag to facilitate backwards compatability. Will be removed
|
| 32 |
+
# by the end of year 2023.
|
| 33 |
+
API = "jit"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Evaluator:
  """Evaluator for the POPE task.

  This evaluator expects the batch to contain a field `question_id` and a field
  `answer` for single ground truth or `answers` for multiple ground truths.

  The field names used when writing the json result can be controlled with
  `out_question_key` and `out_answer_key`.
  """

  def __init__(
      self,
      predict_fn,
      data,
      pp_fn,
      tokenizer,
      batch_size,
      *,
      devices,
      outfile="{workdir}/{split}.json",
      out_question_key="question_id",
      out_answer_key="answer"
  ):

    self.outfile = c.resolve_outfile(outfile, split=data.get("split"))
    self.out_question_key = out_question_key
    self.out_answer_key = out_answer_key
    # This will mostly look the same across all evaluators, preparing data:
    data = big_vision.datasets.core.get(**data)
    pp_fn = big_vision.pp.builder.get_preprocess_fn(pp_fn)
    self.ds, self.steps = big_vision.input_pipeline.make_for_inference(
        data.get_tfdata(ordered=True),
        pp_fn,
        batch_size,
        num_ex_per_process=data.num_examples_per_process(),
    )
    # The `keep_on_cpu=` argument lists the data keys that, if they exist, we
    # do NOT want to ship to the TPUs and instead just keep in host memory.
    # Typically ground-truth and metadata, that is often of string type.
    self.data_iter = big_vision.input_pipeline.start_global(
        self.ds, devices, keep_on_cpu={"answer", "question_id"}
    )
    # We'll need the tokenizer to detokenize the model outputs later.
    self.tok = big_vision.pp.tokenizer.get_tokenizer(tokenizer)
    self.decode = functools.partial(
        predict_fn, devices=devices, eos_token=self.tok.eos_token
    )

  def run(self, train_state):
    """Does one evaluation run, yields metrics."""

    accuracies = []
    # Fraction of answers that are a literal "yes"/"no" (well-formedness).
    valid = []
    json_out = []
    for _, batch in zip(range(self.steps), self.data_iter):
      # (batch, seqlen) array of decoded generated tokens.
      tokens = self.decode(train_state, batch)

      # (local_batch,) that indicates padding examples (0) vs real examples (1).
      tokens = u.get_local_slice_from_fsarray(tokens)
      ex_masks = u.get_local_slice_from_fsarray(batch["_mask"])

      # Turn predictions into texts and then scores, one by one.
      for i in range(len(tokens)):
        if ex_masks[i] == 0:  # Skip last-batch padding examples
          continue

        answer = self.tok.to_str(tokens[i], stop_at_eos=True).lower()
        gt = batch["answer"][i]
        accuracies.append(float(answer == gt))
        valid.append(float(answer in ("yes", "no")))

        json_out.append(
            {
                self.out_question_key: batch["question_id"][i].item(),
                self.out_answer_key: answer,
            }
        )

    # At this point `accuracies` is a list of per-example scores. However,
    # remember that each host holds a different subset of the examples! So if
    # we were to just return the mean accuracy here, we would effectively only
    # have evaluated on the main host's (who writes metrics) subset!
    # So now, we need to compute global means.
    # There is one more caveat: `process_sum` needs the summands on each host
    # to have the same size. So we either need to include dummy values for
    # the padding examples (last batch, annoying), or we only sum scalars as in
    # sufficient statistics, which we do here.
    sum_accs, sum_valid, num = c.process_sum([
        sum(accuracies),
        sum(valid),
        len(accuracies),
    ])

    if num:
      yield "acc", sum_accs / num
      yield "valid_percent", sum_valid / num
      yield "num", num

    c.multiprocess_write_json(self.outfile, json_out)
|
Tipsomaly/model/big_vision/evaluators/proj/uvim/coco_panoptic.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2022 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""COCO17 panoptic evaluation."""
|
| 16 |
+
import functools
|
| 17 |
+
from functools import partial
|
| 18 |
+
import json
|
| 19 |
+
import os
|
| 20 |
+
import tempfile
|
| 21 |
+
import time
|
| 22 |
+
import zipfile
|
| 23 |
+
|
| 24 |
+
from absl import logging
|
| 25 |
+
from big_vision.evaluators.proj.uvim import common
|
| 26 |
+
import big_vision.pp.builder as pp_builder
|
| 27 |
+
import jax
|
| 28 |
+
import numpy as np
|
| 29 |
+
import panopticapi_converters.twochannels2panoptic_coco_format as converter
|
| 30 |
+
from panopticapi.evaluation import pq_compute
|
| 31 |
+
import tensorflow as tf
|
| 32 |
+
import tensorflow_datasets as tfds
|
| 33 |
+
|
| 34 |
+
from tensorflow.io import gfile
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
ROOT = os.environ.get('COCO_DATA_DIR', '.')
|
| 38 |
+
|
| 39 |
+
PANOPTIC_COCO_CATS_FILE = f'{ROOT}/panoptic_coco_categories.json'
|
| 40 |
+
PANOPTIC_2017 = {
|
| 41 |
+
'train': f'{ROOT}/panoptic_train2017.json',
|
| 42 |
+
'validation': f'{ROOT}/panoptic_val2017.json',
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
PANOPTIC_GT_ZIP = {
|
| 46 |
+
'train': f'{ROOT}/panoptic_train2017.zip',
|
| 47 |
+
'validation': f'{ROOT}/panoptic_val2017.zip',
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Evaluator:
  """Panoptic segmentation evaluator: calls official COCO API.

  `predict_fn` accepts arbitrary dictionaries of parameters and data, where
  the data dictionary is produced by the `pp` op. It is expected to output a
  2-channel mask, where the first channel encodes semantics, and the second
  channel encodes instance ids.
  """

  def __init__(self,
               predict_fn,
               pp_fn,
               batch_size,
               dataset='coco/2017_panoptic',
               dataset_dir=None,
               split='validation',
               predict_kwargs=None):
    # Prepare to run predict on all processes and gather predictions on all
    # devices. Note: if needed consider only gather across processes.
    def predict(params, batch):
      res = {
          'image/id': batch['image/id'],
          'mask': batch['mask'],
          'y': predict_fn(params, batch['input'], **(predict_kwargs or {})),
      }
      # Collective gather: every device ends up with the full global batch.
      return jax.lax.all_gather(res, axis_name='data', axis=0)

    self.predict_fn = jax.pmap(predict, axis_name='data')

    # Prepare data for each process and pad with zeros so all processes have the
    # same number of batches.
    def preprocess(example):
      return {
          'image/id': example['image/id'],
          # 'mask' is 1 for real examples; the zero-padding appended by
          # common.get_jax_process_dataset yields mask == 0.
          'mask': tf.constant(1),
          'input': pp_builder.get_preprocess_fn(pp_fn)(example),
      }

    self.data = common.get_jax_process_dataset(
        dataset, split, dataset_dir=dataset_dir,
        global_batch_size=batch_size,
        pp_fn=preprocess)

    # Only process 0 runs conversion to png and calls into coco api.
    if jax.process_index() == 0:
      self.result_dir = tempfile.TemporaryDirectory()
      (self.gt_folder, self.gt_json, self.categories_json,
       self.remap, self.size_map) = _prepare_ground_truth(
           dataset, split, dataset_dir)

  def _compute_png_predictions(self, params):
    """Computes predictions and converts them to png to optimize memory use.

    All processes participate in inference (pmap is collective), but only
    process 0 decodes outputs and writes PNGs; other processes return None.
    """
    count = 0
    logging.info('Panoptic eval: running inference.')
    for batch in self.data.as_numpy_iterator():
      out = self.predict_fn(params, batch)

      if jax.process_index():
        # Non-zero hosts only feed the collective computation.
        continue

      # After all_gather every device holds identical data; take device 0.
      out = jax.device_get(jax.tree_map(lambda x: x[0], out))
      mask = out['mask']
      # Drop the zero-padded examples (mask == 0).
      pan_recs = out['y'][mask != 0]
      ids = out['image/id'][mask != 0]

      for pan_rec, image_id in zip(pan_recs, ids):
        # Channel 0: semantic class; channel 1: instance id.
        sem = pan_rec[..., 0]
        ins = pan_rec[..., 1]

        # Remap model/tfds class ids to official COCO category ids.
        sem_remapped = np.array(sem)
        for v in np.unique(sem):
          sem_remapped[sem == v] = self.remap[v]
        sem = sem_remapped

        # Third (blue) channel is unused by the 2-channel COCO format.
        pan_mask = np.stack([sem, ins, np.zeros_like(sem)], axis=-1)
        # Resize back to the original image resolution expected by the API.
        pan_mask = _resize_nearest(pan_mask, self.size_map[image_id])
        pan_mask_png = tf.io.encode_png(pan_mask.astype('uint8')).numpy()

        fname = f'{self.result_dir.name}/{image_id:012d}.png'
        with open(fname, 'wb') as f:
          f.write(pan_mask_png)
        count += 1

      logging.log_every_n_seconds(
          logging.INFO, 'Panoptic eval: processed %i examples so far.', 30,
          count)

    if jax.process_index():
      return None

    logging.info('Panoptic eval: inference done. Processed %d examples.', count)
    return self.result_dir

  def run(self, params):
    """Run eval.

    Generator yielding (metric_name, value) pairs; empty on non-zero hosts.
    """
    # Note result_dir is constant, but files inside are mutated.
    result_dir = self._compute_png_predictions(params)

    if not result_dir:
      return

    with tempfile.TemporaryDirectory() as pred_folder, \
        tempfile.NamedTemporaryFile(mode='w') as pred_json:

      # Convert 2-channel PNGs into the official COCO panoptic format.
      logging.info('Panoptic eval: running conversion.')
      converter.converter(
          source_folder=result_dir.name,
          images_json_file=self.gt_json,
          categories_json_file=self.categories_json,
          segmentations_folder=pred_folder,
          predictions_json_file=pred_json.name)
      logging.info('Panoptic eval: conversion done.')

      logging.info('Panoptic eval: running metrics computation.')
      res = pq_compute(gt_json_file=self.gt_json,
                       gt_folder=self.gt_folder,
                       pred_json_file=pred_json.name,
                       pred_folder=pred_folder)
      logging.info('Panoptic eval: metrics computation done.')

      # Panoptic/recognition/segmentation quality over All/Stuff/Things.
      for k in ['All', 'Stuff', 'Things']:
        for m in ['pq', 'rq', 'sq']:
          yield f'{k}_{m}', res[k][m]
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def _prepare_ground_truth(dataset, split, data_dir):
  """Prepare ground truth from tf.data.Dataset.

  Dispatches between the fast path (official COCO-2017 zip archives) and the
  generic path that materializes ground truth from the TFDS dataset itself.
  """
  use_official_zips = dataset == 'coco/2017_panoptic' and data_dir is None
  if use_official_zips:
    return _prepare_ground_truth_from_zipfiles(split)
  return _prepare_ground_truth_from_dataset(dataset, split, data_dir)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@functools.lru_cache(maxsize=None)
def _prepare_ground_truth_from_dataset(dataset, split, data_dir):
  """Prepare ground truth from a tf.data.Dataset.

  Materializes per-image panoptic ground-truth PNGs plus the COCO-style
  `annotations.json` that `pq_compute` expects.

  Args:
    dataset: TFDS dataset name.
    split: TFDS split name.
    data_dir: directory for TFDS to find the prepared data (may be None).

  Returns:
    Tuple `(gt_folder, gt_json, categories_json, remap, size_map)` where
    `remap` maps tfds class ids to COCO category ids and `size_map` maps
    image id to (height, width).
  """
  dataset = tfds.builder(dataset, data_dir=data_dir).as_dataset(split=split)

  categories_json = _make_local_copy(PANOPTIC_COCO_CATS_FILE)
  with gfile.GFile(categories_json, 'rb') as f:
    categories = json.loads(f.read())

  # Build map from tfds class ids to COCO class ids. Id 0 (void) maps to
  # itself; tfds label i corresponds to entry i of the categories file.
  # Fix: the original re-opened `categories_json` here with an unused file
  # handle even though `categories` was already parsed above.
  remap = {0: 0}
  remap.update({(i + 1): x['id'] for i, x in enumerate(categories)})

  gt_folder = tempfile.mkdtemp()
  gfile.makedirs(gt_folder)
  size_map = {}
  annotations = []
  images = []
  for example in dataset:
    image_id = int(example['image/id'])
    panoptic_image = example['panoptic_image']
    ann_ids = example['panoptic_objects']['id']
    ann_labels = example['panoptic_objects']['label']
    ann_iscrowd = example['panoptic_objects']['is_crowd']
    ann_area = example['panoptic_objects']['area']

    # One ground-truth PNG per image, named by zero-padded image id.
    fname = f'{image_id:012d}.png'
    with gfile.GFile(os.path.join(gt_folder, fname), 'wb') as f:
      f.write(tf.io.encode_png(panoptic_image).numpy())

    size_map[image_id] = (panoptic_image.shape[0], panoptic_image.shape[1])

    segments_info = []
    for i in range(len(ann_ids)):
      segments_info.append({
          'id': int(ann_ids[i]),
          # +1 shift: tfds labels are 0-based relative to the categories file.
          'category_id': remap[int(ann_labels[i] + 1)],
          'iscrowd': int(ann_iscrowd[i]),
          'area': int(ann_area[i]),
      })

    annotations.append({
        'file_name': str(fname),
        'image_id': int(image_id),
        'segments_info': segments_info
    })
    images.append({
        'id': image_id,
        'file_name': f'{image_id:012d}.jpg',
    })

  # Write annotations.json needed for pq_compute.
  gt_json = os.path.join(gt_folder, 'annotations.json')
  with gfile.GFile(gt_json, 'wb') as f:
    f.write(json.dumps({
        'images': images,
        'annotations': annotations,
        'categories': categories,
    }))

  return gt_folder, gt_json, categories_json, remap, size_map
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _prepare_ground_truth_from_zipfiles(split):
  """Prepare ground truth from coco zip files.

  Downloads the official COCO-2017 panoptic archives, restricts the
  annotation json to the images present in the requested (sub)split, and
  returns everything `pq_compute` needs.

  Returns:
    Tuple (gt_folder, filtered_gt_json, categories_json, remap, size_map).
  """
  # 'validation[:10]'-style subsplits share the prefix's archives.
  split_prefix = split.split('[')[0]
  if split_prefix not in ('train', 'validation'):
    raise ValueError(f'Split {split} not supported')

  # The following 4 calls are cached. This allows to save significant time
  # in use cases like sweeping predict_fn hparams on the same run.
  gt_json = _make_local_copy(PANOPTIC_2017[split_prefix])
  gt_folder = _make_local_unzip_copy(PANOPTIC_GT_ZIP[split_prefix])
  categories_json = _make_local_copy(PANOPTIC_COCO_CATS_FILE)
  image_ids = _list_image_ids('coco/2017_panoptic', split)

  # The zip extracts into a split-named subfolder.
  gt_folder = os.path.join(
      gt_folder, 'panoptic_val2017'
      if split_prefix == 'validation' else 'panoptic_train2017')

  # Build map from tfds class ids to COCO class ids.
  remap = {0: 0}
  with gfile.GFile(categories_json, 'r') as f:
    remap = {**remap, **{(i + 1): x['id'] for i, x in enumerate(json.load(f))}}

  # Filters gt_json to contain only annotations for images in dataset.
  with gfile.GFile(gt_json) as f:
    data = json.load(f)
  logging.info(
      'Panoptic eval: pre-filter %d annotations.',
      len(data['annotations'])
  )
  data['images'] = [x for x in data['images'] if x['id'] in image_ids]
  data['annotations'] = [
      x for x in data['annotations'] if x['image_id'] in image_ids
  ]
  logging.info(
      'Panoptic eval: post-filter %d annotations.',
      len(data['annotations'])
  )
  # delete=False: the file must outlive this function (path is returned and
  # cached by the caller); it is never cleaned up explicitly.
  filtered_gt_json = tempfile.NamedTemporaryFile(delete=False).name
  with open(filtered_gt_json, 'w') as f:
    json.dump(data, f)

  # Precompute images sizes.
  size_map = {x['id']: (x['height'], x['width']) for x in data['images']}

  return gt_folder, filtered_gt_json, categories_json, remap, size_map
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
@functools.lru_cache(maxsize=None)
def _list_image_ids(dataset, split):
  """Return the frozen set of `image/id` values in `dataset`'s `split`."""
  ids_ds = tfds.load(dataset, split=split)
  ids_ds = ids_ds.map(lambda ex: ex['image/id'])
  return frozenset(ids_ds.as_numpy_iterator())
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
@functools.lru_cache(maxsize=None)
def _make_local_copy(fname) -> str:
  """Copy a (possibly remote) file to local disk; return the local path.

  Cached so repeated eval runs reuse the same local copy.
  """
  t0 = time.monotonic()
  tmp = tempfile.NamedTemporaryFile(delete=False)
  gfile.copy(fname, tmp.name, overwrite=True)
  elapsed = time.monotonic() - t0
  logging.info('Copy %s in %d seconds.', fname, elapsed)
  return tmp.name
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
@functools.lru_cache(maxsize=None)
def _make_local_unzip_copy(fname) -> str:
  """Download zip archive `fname` and extract it into a fresh local folder."""
  t0 = time.monotonic()
  out_dir = tempfile.mkdtemp()
  with tempfile.NamedTemporaryFile() as local_zip:
    gfile.copy(fname, local_zip.name, overwrite=True)
    with zipfile.ZipFile(local_zip.name, 'r') as archive:
      archive.extractall(out_dir)
  logging.info('Copy %s in %d seconds.', fname, time.monotonic() - t0)
  return out_dir
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
@partial(jax.jit, static_argnums=(1,), backend='cpu')
|
| 323 |
+
def _resize_nearest(image, shape):
|
| 324 |
+
return jax.image.resize(image, shape + image.shape[-1:], 'nearest')
|
Tipsomaly/model/big_vision/evaluators/proj/uvim/coltran_fid.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2022 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluation producing ColTran FID-5K metric."""
|
| 16 |
+
|
| 17 |
+
import functools
|
| 18 |
+
import os
|
| 19 |
+
|
| 20 |
+
from absl import logging
|
| 21 |
+
import einops
|
| 22 |
+
import jax
|
| 23 |
+
import numpy as np
|
| 24 |
+
import tensorflow as tf
|
| 25 |
+
import tensorflow_datasets as tfds
|
| 26 |
+
import tensorflow_gan as tfgan
|
| 27 |
+
import tensorflow_hub as tfhub
|
| 28 |
+
|
| 29 |
+
from tensorflow.io import gfile
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
ROOT = os.environ.get("FID_DATA_DIR", ".")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _preprocess(image, resolution=512):
  """ColTran dataset preprocessing.

  Center-crops to a square of the shorter side, area-resizes to
  resolution x resolution, then rounds back to integers.

  See,
  github.com/google-research/google-research/blob/master/coltran/datasets.py#L44

  Args:
    image: ImageNet example from TFDS.
    resolution: Integer representing output size.

  Returns:
    An int32 image of size (resolution, resolution, 3).
  """
  dims = tf.shape(image)
  short_side = tf.minimum(dims[0], dims[1])
  square = tf.image.resize_with_crop_or_pad(
      image, target_height=short_side, target_width=short_side)
  resized = tf.image.resize(square, size=(resolution, resolution),
                            method="area", antialias=True)
  return tf.cast(tf.round(resized), dtype=tf.int32)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _normalize(x):
  """Coltran normalization to expected range for Inception module.

  Args:
    x: Image with values in [0,255].

  Returns:
    Image with values in [-1,1].
  """
  # note: 128.0 is the divisor used in ColTran.
  return tf.cast(x, tf.float32) / 128.0 - 1.0
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class Evaluator:
  """ColTran FID-5K Evaluator.

  This Evaluator aims to mirror the evaluation pipeline used by Kumar et.al.
  in Colorization Transformer (https://arxiv.org/abs/2102.04432).

  To be clear: much of this code is direct snippets from ColTran code.

  See,
  github.com/google-research/google-research/blob/master/coltran/datasets.py#L44

  The ColTran pipeline has numerous stages, where serialized data is passed
  between binaries via file, etc... While we don't physically write the same
  files, we simulate the effects of the serialization (e.g., quantization).
  """

  def __init__(self,
               predict_fn,
               batch_size,  # ignored
               device_batch_size=5,
               coltran_seed=1,
               predict_kwargs=None):
    """Create Evaluator.

    Args:
      predict_fn: Colorization prediction function. Expects grayscale images
        of size (512, 512, 3) in keys `image` and `image_ctx` with values in
        the range [-1,1]. Outputs `color` image in range [-1,1].
      batch_size: ignored.
      device_batch_size: number of images per batch, per device.
      coltran_seed: used to specify the block of 5_000 images used to generate
        the reference pool. Value of `1` matches default ColTran code.
      predict_kwargs: arguments passed to `predict_fn`.
    """
    del batch_size

    self.num_devices = jax.local_device_count()
    self.device_batch_size = device_batch_size
    logging.log(logging.INFO, "Colorizing with batch size %i on %i devices.",
                self.device_batch_size, self.num_devices)
    # The 5_000 eval images must be evenly divisible across global batches.
    assert 5_000 % (self.device_batch_size * self.num_devices) == 0

    predict = functools.partial(predict_fn, **(predict_kwargs or {}))
    self.predict_fn = jax.pmap(predict)

    # Inception pooling features are the inputs to the FID computation.
    module = tfhub.load(tfgan.eval.INCEPTION_TFHUB)
    def _pools(x):
      return np.squeeze(module(x)[tfgan.eval.INCEPTION_FINAL_POOL].numpy())

    self.inception_pool = _pools

    # Setup the colorization dataset.
    # TRICKY: ColTran FID-5k uses the first 5_000 images returned as read by
    # default from tensorflow_datasets (that is: with shard interleaving).
    # In particular note that it is different than the set of images returned
    # by "validation[:5000]".
    def _eval_data_preprocess(example):
      # Colorization happens at 512x512 resolution.
      image = _preprocess(example["image"], resolution=512)
      image = _normalize(image)
      # 3-channel grayscale so the colorizer input has RGB shape.
      grayscale = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
      return {
          "image": image,
          "grayscale": grayscale,
          "file_name": example["file_name"]
      }

    ds = tfds.load("imagenet2012", split="validation")
    ds = ds.map(_eval_data_preprocess)
    ds = ds.take(5_000)
    # Double-batch: inner per-device batch, outer device axis for pmap.
    ds = ds.batch(self.device_batch_size)
    ds = ds.batch(self.num_devices)
    self.eval_data = ds.cache().prefetch(tf.data.AUTOTUNE)

    # Setup the reference dataset.
    def _reference_data_preprocess(example):
      # ColTran eval operates on 256x256.
      image = _preprocess(example["image"], resolution=256)
      image = _normalize(image)
      return {"image": image, "file_name": example["file_name"]}

    ds = tfds.load("imagenet2012", split="validation")
    ds = ds.map(_reference_data_preprocess)
    # Skip the images used in colorization.
    ds = ds.skip(5_000)
    # ColTran eval w/ seed=1 effectively uses 10_000:15_000 to
    # calculate reference.
    ds = ds.skip(coltran_seed * 5_000)
    ds = ds.take(5_000)
    ds = ds.batch(device_batch_size)
    self.reference_data = ds.cache().prefetch(tf.data.AUTOTUNE)

    def _get_file(name):
      return os.path.join(ROOT, name)

    # Expected file-name sets, used below to verify the pipeline saw exactly
    # the intended 5k + 5k images (guards against tfds ordering changes).
    with gfile.GFile(_get_file("eval_file_names.txt")) as f:
      self.eval_file_names = frozenset(f.read().splitlines())

    with gfile.GFile(_get_file("reference_file_names.txt")) as f:
      self.reference_file_names = frozenset(f.read().splitlines())

  def run(self, params):
    """Run eval. Generator yielding a single ("FID_5k", value) pair."""

    if jax.process_index():  # Host0 does all work.
      return

    color_pools = []
    color_file_names = set()
    for i, batch in enumerate(self.eval_data.as_numpy_iterator()):
      predict_batch = {
          "labels": batch["image"],
          "image": batch["grayscale"],
          "image_ctx": batch["grayscale"],
      }
      y = self.predict_fn(params, predict_batch)
      y = y["color"]
      # Flatten the (device, batch) axes back into one batch axis.
      y = einops.rearrange(y, "d b h w c -> (d b) h w c")

      # Return to the ColTran eval size of 256x256.
      y = tf.image.resize(y, (256, 256), "area")

      # Mimic effect of serializing image as integers and map back to [-1, 1].
      y = np.clip(np.round((y + 1.) * 128.), 0, 255)
      y = _normalize(y)

      color_pools.append(self.inception_pool(y))

      file_names = einops.rearrange(batch["file_name"], "d b -> (d b)")
      color_file_names.update([f.decode() for f in file_names])

      logging.log_every_n_seconds(
          logging.INFO,
          "ColTran FID eval: processed %i colorized examples so far.", 30,
          (i + 1) * self.device_batch_size * self.num_devices)

    reference_pools = []
    reference_file_names = set()
    for i, batch in enumerate(self.reference_data.as_numpy_iterator()):
      image = batch["image"]
      assert np.array_equal(image.shape, (self.device_batch_size, 256, 256, 3))
      reference_pools.append(self.inception_pool(image))
      reference_file_names.update([f.decode() for f in batch["file_name"]])

      logging.log_every_n_seconds(
          logging.INFO,
          "ColTran FID eval: processed %i reference examples so far.", 30,
          (i + 1) * self.device_batch_size)

    # Hard-fail if the processed sets differ from the expected file lists.
    if color_file_names != self.eval_file_names:
      raise ValueError("unknown: {}\nmissing: {}".format(
          color_file_names - self.eval_file_names,
          self.eval_file_names - color_file_names))

    if reference_file_names != self.reference_file_names:
      raise ValueError("unknown: {}\nmissing: {}".format(
          reference_file_names - self.reference_file_names,
          self.reference_file_names - reference_file_names))

    color = np.concatenate(color_pools, axis=0)
    reference = np.concatenate(reference_pools, axis=0)

    if color.shape[0] != 5_000:
      raise ValueError(color.shape)

    if reference.shape[0] != 5_000:
      raise ValueError(reference.shape)

    yield "FID_5k", tfgan.eval.frechet_classifier_distance_from_activations(
        color, reference)
|
Tipsomaly/model/big_vision/evaluators/proj/uvim/coltran_fid_data/eval_file_names.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Tipsomaly/model/big_vision/evaluators/proj/uvim/coltran_fid_data/reference_file_names.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Tipsomaly/model/big_vision/evaluators/proj/uvim/common.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2022 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Common utilities used in evaluators."""
|
| 16 |
+
import math
|
| 17 |
+
import jax
|
| 18 |
+
import tensorflow as tf
|
| 19 |
+
import tensorflow_datasets as tfds
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def get_jax_process_dataset(dataset, split, global_batch_size, pp_fn,
                            dataset_dir=None, cache=True, add_tfds_id=False):
  """Returns dataset to be processed by current jax host.

  The dataset is sharded and padded with zeros such that all processes
  have equal number of batches. The first 2 dimensions of the dataset
  elements are: [local_device_count, device_batch_size].

  Padded examples are all-zero; callers can detect them via a mask field
  produced by `pp_fn` (real examples nonzero, padding zero).

  Args:
    dataset: dataset name.
    split: dataset split.
    global_batch_size: batch size to be process per iteration on the dataset.
    pp_fn: preprocessing function to apply per example.
    dataset_dir: path for tfds to find the prepared data.
    cache: whether to cache the dataset after batching.
    add_tfds_id: whether to add the unique `tfds_id` string to each example.
  """
  assert global_batch_size % jax.device_count() == 0
  # Global number of steps is derived from the full-split cardinality so that
  # every process iterates the same number of batches.
  total_examples = tfds.load(
      dataset, split=split, data_dir=dataset_dir).cardinality()
  num_batches = math.ceil(total_examples / global_batch_size)

  # Each process reads only its own shard of the split.
  process_split = tfds.even_splits(
      split, n=jax.process_count(), drop_remainder=False)[jax.process_index()]
  data = tfds.load(
      dataset,
      split=process_split,
      data_dir=dataset_dir,
      read_config=tfds.ReadConfig(add_tfds_id=add_tfds_id)).map(pp_fn)
  # Infinite stream of all-zero examples with the same structure/dtypes,
  # used to pad the tail so every process yields exactly num_batches batches.
  pad_data = tf.data.Dataset.from_tensors(
      jax.tree_map(lambda x: tf.zeros(x.shape, x.dtype), data.element_spec)
  ).repeat()

  data = data.concatenate(pad_data)
  data = data.batch(global_batch_size // jax.device_count())
  data = data.batch(jax.local_device_count())
  # take() truncates the otherwise-infinite padded stream.
  data = data.take(num_batches)
  if cache:
    # Eval datasets are often used many times and caching the dataset after
    # batching allows one to have the buffers ready to be used and not have
    # to wait for preprocessing to be done over and over.
    data = data.cache()
  return data
|
Tipsomaly/model/big_vision/evaluators/proj/uvim/compute_mean.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2022 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluator for computing mean of per-example metrics."""
|
| 16 |
+
import functools
|
| 17 |
+
from typing import Mapping
|
| 18 |
+
|
| 19 |
+
from big_vision import input_pipeline
|
| 20 |
+
from big_vision.datasets import core as ds_core
|
| 21 |
+
from big_vision.pp import builder as pp_builder
|
| 22 |
+
|
| 23 |
+
import jax
|
| 24 |
+
import jax.numpy as jnp
|
| 25 |
+
import numpy as np
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Note: global to avoid jax re-compiling across different evaluator instances.
|
| 29 |
+
@functools.partial(jax.pmap, static_broadcasted_argnums=0, axis_name='batch')
|
| 30 |
+
def _run_predict_fn(predict_fn, params, batch):
|
| 31 |
+
"""Sum per-example metrics weighted by `_mask`."""
|
| 32 |
+
mask = batch['_mask']
|
| 33 |
+
metrics = predict_fn(params, batch)
|
| 34 |
+
# Sanity check output format of predict_fn.
|
| 35 |
+
assert isinstance(metrics, Mapping), 'predict_fn must return a dict'
|
| 36 |
+
for y in jax.tree_leaves(metrics):
|
| 37 |
+
if y.shape != mask.shape:
|
| 38 |
+
raise ValueError(
|
| 39 |
+
f'Expected per-example metrics of shape {mask.shape} found '
|
| 40 |
+
f'{jax.tree_map(lambda x: x.shape, metrics)}.')
|
| 41 |
+
metrics = {**metrics, '_mask': mask}
|
| 42 |
+
metrics = jax.tree_map(lambda x: jnp.inner(x, mask), metrics)
|
| 43 |
+
return jax.lax.psum(metrics, axis_name='batch')
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class Evaluator:
  """Report the mean of per-example metrics computed by predict_fn.

  `predict_fn(params, batch)` must return a dict from metric name to
  per-example metrics of shape [batch_size].
  """

  def __init__(self, predict_fn, data, pp_fn, batch_size,
               cache_final=True, cache_raw=False, prefetch=1):
    """Builds the (cached) inference input pipeline and stores predict_fn."""
    source = ds_core.get(**data)
    self.dataset, self.steps = input_pipeline.make_for_inference(
        source.get_tfdata(ordered=True), batch_size=batch_size,
        num_ex_per_process=source.num_examples_per_process(),
        preprocess_fn=pp_builder.get_preprocess_fn(pp_fn),
        cache_final=cache_final, cache_raw=cache_raw)
    self.data_iter = input_pipeline.start_input_pipeline(self.dataset, prefetch)
    self.predict_fn = predict_fn

  def run(self, params):
    """Computes all metrics; yields (metric_name, mean_value) pairs."""
    # Dispatch every batch first without blocking — pmap is asynchronous, so
    # this keeps the devices busy while we keep feeding them.
    per_batch = [
        _run_predict_fn(self.predict_fn, params, batch)
        for _, batch in zip(range(self.steps), self.data_iter)
    ]

    # Transfer the (replicated) sums from device 0 to the host; this blocks.
    per_batch = jax.device_get(
        jax.tree_util.tree_map(lambda x: x[0], per_batch))

    # Accumulate over batches, then normalize by the number of real examples
    # (the summed '_mask'), excluding any padding.
    totals = jax.tree_util.tree_map(lambda *x: np.sum(x), *per_batch)
    num_examples = totals.pop('_mask')
    for name, total in totals.items():
      yield (name, total / num_examples)
|
Tipsomaly/model/big_vision/evaluators/proj/uvim/nyu_depth.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2022 Big Vision Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Evaluation for NYU depth.
|
| 16 |
+
|
| 17 |
+
At evaluation time the ground truth is cropped and clipped. Values outside of
|
| 18 |
+
the test crop or clipping range are not included in eval calculations.
|
| 19 |
+
|
| 20 |
+
In this evaluator, it is assumed that the ground truth is already cropped, so the
|
| 21 |
+
entire image is evaluated. However, the evaluator does perform the clipping.
|
| 22 |
+
|
| 23 |
+
Reference implementations:
|
| 24 |
+
https://github.com/zhyever/Monocular-Depth-Estimation-Toolbox/blo(internal link)a0f341244260ff61541191a613dd74bc/depth/datasets/nyu.py
|
| 25 |
+
https://github.com/vinvino02/GLPDepth/blob/7f3c78df4ecd6e7c79fd0c4b73c95d61f4aa2121/code/utils/metrics.py
|
| 26 |
+
https://github.com/shariqfarooq123/AdaBins/blob/2fb686a66a304f0a719bc53d77412460af97fd61/evaluate.py
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import functools
|
| 30 |
+
|
| 31 |
+
import big_vision.evaluators.proj.uvim.common as common
|
| 32 |
+
import big_vision.pp.builder as pp_builder
|
| 33 |
+
import jax
|
| 34 |
+
import jax.numpy as jnp
|
| 35 |
+
import numpy as np
|
| 36 |
+
import tensorflow as tf
|
| 37 |
+
|
| 38 |
+
EVAL_CROP_H = 426
|
| 39 |
+
EVAL_CROP_W = 560
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class Evaluator:
  """Evaluator for NYU depth."""

  def __init__(self,
               predict_fn,
               pp_fn,
               batch_size,
               dataset,
               split,
               min_depth=1e-3,
               max_depth=10,
               dataset_dir=None,
               predict_kwargs=None):
    """Initializes the evaluator.

    Args:
      predict_fn: Called as `predict_fn(params, batch, **predict_kwargs)`;
        must return a dict with a "depth" entry (per-pixel depth prediction).
      pp_fn: Preprocessing spec passed to `pp_builder.get_preprocess_fn`.
      batch_size: Global batch size across all processes.
      dataset: Dataset name forwarded to `common.get_jax_process_dataset`.
      split: Dataset split to evaluate.
      min_depth: Ground-truth values <= this are excluded from the metrics.
      max_depth: Ground-truth values >= this are excluded from the metrics.
      dataset_dir: Optional dataset directory override.
      predict_kwargs: Optional extra keyword args for `predict_fn`.
    """
    self.min_depth = min_depth
    self.max_depth = max_depth

    def predict(params, batch):
      pred = predict_fn(params, batch, **(predict_kwargs or {}))

      # all_gather replicates the full global batch on every device, so
      # process 0 can later read everything from a single device.
      return jax.lax.all_gather({
          "mask": batch["mask"],
          "gt": jnp.squeeze(batch["ground_truth"], axis=-1),
          "y": pred["depth"],
      }, axis_name="data", axis=0)

    self.predict_fn = jax.pmap(predict, axis_name="data")

    # Prepare data for each process and pad with zeros so all processes have the
    # same number of batches.
    def preprocess(example):
      # "mask" == 1 marks real examples; padded examples end up with mask 0.
      return {
          "mask": tf.constant(1),
          **pp_builder.get_preprocess_fn(pp_fn)(example),
      }

    self.process_batch_size = batch_size // jax.process_count()

    self.data = common.get_jax_process_dataset(
        dataset=dataset,
        dataset_dir=dataset_dir,
        split=split,
        global_batch_size=batch_size,
        pp_fn=preprocess)

  def run(self, params):
    """Run eval; yields (metric_name, value) pairs on process 0 only."""
    # Assumes that the ground truth is processed by the eval crop.
    # NOTE(review): presumably pp_fn already crops ground truth to
    # (EVAL_CROP_H, EVAL_CROP_W) — confirm against the preprocessing spec.
    eval_mask = np.ones((EVAL_CROP_H, EVAL_CROP_W), dtype=np.bool_)
    rmses = []
    abs_res = []
    abs_logs = []
    d1s = []
    d2s = []
    d3s = []
    for batch in self.data.as_numpy_iterator():
      # Outputs is a dict with values shaped (gather/same, devices, batch, ...)
      out = self.predict_fn(params, batch)

      if jax.process_index():  # Host0 gets all preds and does eval.
        continue

      # First, we remove the "gather" dim and transfer the result to host,
      # leading to numpy arrays of (devices, device_batch, ...)
      out = jax.tree_map(lambda x: jax.device_get(x[0]), out)
      # Then the bool-indexing with mask resulting in flat (global_batch, ...)
      out = jax.tree_map(lambda x: x[out["mask"] == 1], out)  # pylint:disable=cell-var-from-loop

      for gt, pred in zip(out["gt"], out["y"]):
        # Score at the fixed eval resolution; only pixels inside the clipping
        # range [min_depth, max_depth] contribute to the metrics.
        pred = _resize_nearest(pred, (EVAL_CROP_H, EVAL_CROP_W))
        valid_mask = np.logical_and(gt > self.min_depth, gt < self.max_depth)
        valid_mask = np.logical_and(valid_mask, eval_mask)

        rmses.append(_compute_rmse(gt[valid_mask], pred[valid_mask]))
        abs_res.append(_compute_abs_re(gt[valid_mask], pred[valid_mask]))
        abs_logs.append(_compute_abs_log(gt[valid_mask], pred[valid_mask]))
        d1s.append(_compute_delta(gt[valid_mask], pred[valid_mask], order=1))
        d2s.append(_compute_delta(gt[valid_mask], pred[valid_mask], order=2))
        d3s.append(_compute_delta(gt[valid_mask], pred[valid_mask], order=3))

    if jax.process_index():  # Host0 gets all preds and does eval.
      return

    yield "RMSE", np.mean(rmses)
    yield "abs_RE", np.mean(abs_res)
    yield "log10", np.mean(abs_logs)
    yield "delta1", np.mean(d1s)
    yield "delta2", np.mean(d2s)
    yield "delta3", np.mean(d3s)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@functools.partial(jax.jit, static_argnums=(1,), backend="cpu")
|
| 133 |
+
def _resize_nearest(image, shape):
|
| 134 |
+
return jax.image.resize(image, shape, "nearest")
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _compute_rmse(gt, pred):
|
| 138 |
+
diff = gt - pred
|
| 139 |
+
return np.sqrt(np.mean(np.power(diff, 2)))
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _compute_abs_re(gt, pred):
|
| 143 |
+
diff = np.abs(gt - pred)
|
| 144 |
+
return np.mean(diff / gt)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _compute_abs_log(gt, pred):
|
| 148 |
+
diff = np.abs(np.log10(gt) - np.log10(pred))
|
| 149 |
+
return np.mean(diff)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _compute_delta(gt, pred, order):
|
| 153 |
+
rel_diff = np.maximum(gt / pred, pred / gt)
|
| 154 |
+
return np.sum(rel_diff < 1.25**order) / rel_diff.size
|