# lots of images moved to directory sample_fbi_s1e1
# list them:
find sample_fbi_s1e1 | grep -i "jpg$\|jpeg$" > sample_fbi_s1e1.txt
# copy them to public:
scp -q -r -P 2820 sample_fbi_s1e1 hexagon.renyi.hu:./ai-shared/daniel/sameenergy/
# example URL:
# https://static.renyi.hu/ai-shared/daniel/sameenergy/sample_fbi_s1e1/x_BRIDGE_ADRIATIC/Dobogoko_Esztergom/Videk_ut_Dobogoko_Esztergom_014.jpg

# run CLIP:
time cat sample_fbi_s1e1.txt | python create_embeddings.py sample_fbi_s1e1.pkl no-thumbs
# -> sample_fbi_s1e1.pkl contains embeddings and filenames.
# some 12 images/sec on CPU.

# gradio app:
python app.py --pkl sample_fbi_s1e1.pkl --url https://static.renyi.hu/ai-shared/daniel/sameenergy/
# or
python app.py
# ...and then it takes these settings from app.ini

python convert.py sample_fbi_s1e1.pkl
# -> creates sample_fbi_s1e1.f16.pkl, downcast from float64 to float16.

ssh -p 2820 hexagon.renyi.hu
cd ai-shared/daniel/sameenergy
lftp -p 2167 -u d.varga gw.pioneer.hu
# manually provide password
cd store/05_Photos
# promising directories:
ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
mirror 02_LOCATION\ PHOTOS
ctrl-z # -> puts the mirroring into the background.
ctrl-d # -> exits lftp without terminating the background job, effectively nohup-ing it.

# scp'd the files to buda
cd /data/daniel/sameenergy/
# how many bytes, as a check?
find 02_LOCATION_PHOTOS -type f -exec stat --format="%s" {} \; | awk '{total += $1} END {print total}'
# -> 141,133,402,112 bytes, that's 141 GB, in 197,108 files, not including directories.
# on the Pioneer server this was 141,131,778,304 bytes in 196,916 files and 6,446 directories, good enough.

find 02_LOCATION_PHOTOS -type f > raw_files
cat raw_files | grep -i "jpg$\|jpeg$" > jpg_files

# TODO chmod files on ai-shared

nohup bash create_embeddings.sh &
# ...but it's really just this:
# cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
# -> after some 8 hours it has processed ca. 200k images, resulting in 02_LOCATION_PHOTOS.pkl

# hashes for deduplication:
bash hashes.sh
# takes jpg_files and outputs md5sums

# ad hoc sample from the duplicates:
cat md5sums | awk 'BEGIN{FS=" "} { if ($1 in m) { print $1 "\t" $2 "\t" m[$1] } ; m[$1] = $2 }' | awk '(NR%4000==0)' | cut -f2-

python convert.py 02_LOCATION_PHOTOS.pkl
# -> creates the float16 02_LOCATION_PHOTOS.f16.pkl
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl

# started downloading PhotoLibrary, but it's super big, 6 days of downloading and counting.
ssh buda
cd /data/daniel/sameenergy
nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
# 30 MB/sec, that's some 10 hours? don't forget that the source is still growing.
nohup bash hashes.sh > md5.cout 2> md5.cerr &
# -> creates PhotoLibrary.854G.md5sums ; md5.cout and md5.cerr are just logs.
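#####
# for reference, sketches (not the actual code) of two of the scripts used above.

# a minimal sketch of what create_embeddings.py plausibly does: read image paths
# from stdin, embed each with CLIP, pickle the result. the ViT-B/32 model choice,
# the pickle layout {"filenames": [...], "embeddings": [...]}, and the
# skip-on-broken-image behavior are assumptions, not the actual script.
import pickle
import sys

import torch
import clip  # pip install git+https://github.com/openai/CLIP.git
from PIL import Image

def main():
    out_pkl = sys.argv[1]         # e.g. sample_fbi_s1e1.pkl
    # sys.argv[2] == "no-thumbs" presumably suppresses thumbnail output; ignored here.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    filenames, embeddings = [], []
    for line in sys.stdin:        # one image path per line, e.g. from jpg_files
        path = line.strip()
        try:
            image = preprocess(Image.open(path)).unsqueeze(0).to(device)
        except Exception as exc:  # truncated or non-image files do happen among 200k files
            print("skipping", path, exc, file=sys.stderr)
            continue
        with torch.no_grad():
            emb = model.encode_image(image).squeeze(0).cpu().numpy()
        filenames.append(path)
        embeddings.append(emb)
    with open(out_pkl, "wb") as f:
        pickle.dump({"filenames": filenames, "embeddings": embeddings}, f)

if __name__ == "__main__":
    main()

# and a hypothetical reconstruction of dedupe.py: keep the first occurrence of
# each md5, drop the embeddings of all later copies. assumes md5sum's
# "hash  path" line format and the pickle layout from the sketch above.
import pickle
import sys

def main():
    pkl_in, md5sums_file, pkl_out = sys.argv[1:4]
    keep, seen = set(), set()
    with open(md5sums_file) as f:
        for line in f:
            md5, path = line.rstrip("\n").split(None, 1)
            if md5 not in seen:   # first copy wins, later duplicates are dropped
                seen.add(md5)
                keep.add(path)
    with open(pkl_in, "rb") as f:
        data = pickle.load(f)
    pairs = [(fn, emb) for fn, emb in zip(data["filenames"], data["embeddings"]) if fn in keep]
    data["filenames"] = [fn for fn, _ in pairs]
    data["embeddings"] = [emb for _, emb in pairs]
    with open(pkl_out, "wb") as f:
        pickle.dump(data, f)

if __name__ == "__main__":
    main()

# back to deduplicating the PhotoLibrary md5sums: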
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums
cat 02_LOCATION_PHOTOS.f16.md5sums PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums
wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
591500 02_and_PhotoLibrary.854G.deduped_md5sums
514706 PhotoLibrary.854G.deduped_md5sums
# -> merging would add only ~77k unique files, not worth the hassle. let's just do PhotoLibrary.
# rsync has finished. turns out i collected PhotoLibrary.854G.raw_files right before that,
# so covering the files that arrived since would need a re-hash, not worth the hassle either.
# staying with PhotoLibrary.854G.deduped_md5sums
# TODO I don't think lftp finished successfully, because the Tünde folder never arrived.

#####
# thumbnailing

# on hexagon
cd ~/ai-shared/daniel/sameenergy
nohup cp -r 02_LOCATION_PHOTOS 02_LOCATION_PHOTOS.thumbs &
nohup cp -r PhotoLibrary PhotoLibrary.thumbs &
# -> this is slooooow, a day or so.

# the following code, located at hexagon:~/ai-shared/daniel/sameenergy/downscale.sh ,
# downscales in place so that each image fits into a 1024x1024 box
# (ImageMagick's "1024x1024>" geometry only shrinks, never enlarges):
find "$root" -type f | grep -i "jpg$\|jpeg$" | while IFS= read -r f ; do echo "$f" ; convert "$f" -resize "1024x1024>" "$f" ; done

# it was run like this, setting root=02_LOCATION_PHOTOS.thumbs :
nohup bash downscale.sh > 02_LOCATION_PHOTOS.downscale.cout 2> 02_LOCATION_PHOTOS.downscale.cerr &
# -> took a night or so.
nohup bash downscale.sh > PhotoLibrary.downscale.cout 2> PhotoLibrary.downscale.cerr &
# -> took 2 days or so.

# added code to app.py that patches the filenames in the pickle, changing PhotoLibrary to PhotoLibrary.thumbs.
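# for reference, the patch added to app.py might look something like this;
# the loader function name and the "filenames" pickle key are assumptions:
import pickle

def load_pkl(pkl_path):
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)
    # serve the 1024x1024-bounded thumbnail copies instead of the full-size originals
    data["filenames"] = [
        fn.replace("PhotoLibrary/", "PhotoLibrary.thumbs/", 1)
        for fn in data["filenames"]
    ]
    return data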