File size: 4,930 Bytes
1ce3798
 
 
 
 
 
 
 
 
 
 
 
9cdc9a1
1ce3798
9cdc9a1
1ce3798
 
9cdc9a1
 
 
 
 
 
 
8424a77
 
 
 
ae2c23b
 
8424a77
 
 
 
 
 
 
 
 
 
 
ae2c23b
 
 
 
 
 
8424a77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae2c23b
 
 
8424a77
 
 
 
e6e7ab0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c91769
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9de5f50
2c91769
9de5f50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

# lots of images moved to directory sample_fbi_s1e1

# list them (files only; case-insensitive .jpg/.jpeg extension match).
# NOTE: the previous pattern "jpg\|JPG\|jpeg$" anchored only "jpeg" at the
# end of the line, so "jpg" matched anywhere in a path, and without -type f
# directories could slip into the list as well.
find sample_fbi_s1e1 -type f | grep -iE '\.jpe?g$' > sample_fbi_s1e1.txt

# copy them to public (ssh on non-standard port 2820; the target directory
# is served over HTTP at static.renyi.hu, see the example URL below):
scp -q -r -P 2820 sample_fbi_s1e1 hexagon.renyi.hu:./ai-shared/daniel/sameenergy/
# example URL:
# https://static.renyi.hu/ai-shared/daniel/sameenergy/sample_fbi_s1e1/x_BRIDGE_ADRIATIC/Dobogoko_Esztergom/Videk_ut_Dobogoko_Esztergom_014.jpg

# run CLIP on the listed images (input redirection instead of a useless cat;
# same stdin, one fewer process inside the timed pipeline):
time python create_embeddings.py sample_fbi_s1e1.pkl no-thumbs < sample_fbi_s1e1.txt
# -> sample_fbi_s1e1.pkl contains embeddings and filenames.
# some 12 images/sec on CPU.

# gradio app (pkl = embeddings, url = public prefix for displaying images):
python app.py --pkl sample_fbi_s1e1.pkl --url https://static.renyi.hu/ai-shared/daniel/sameenergy/
# or
python app.py
# ...and then it takes these from app.ini

# shrink the pickle (embeddings float64 -> float16):
python convert.py sample_fbi_s1e1.pkl
# -> creates sample_fbi_s1e1.f16.pkl dumbed from float64 to float16.


# --- interactive transcript, not a runnable script ---
ssh -p 2820 hexagon.renyi.hu
cd ai-shared/daniel/sameenergy
# mirror the photo store from the Pioneer server via lftp:
lftp -p 2167 -u d.varga gw.pioneer.hu
# manually provide password
cd store/05_Photos
# promising directories:
ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
mirror 02_LOCATION\ PHOTOS
ctrl-z
# -> puts mirroring to background (lftp job control, typed interactively).
ctrl-d
# -> exits lftp without terminating the background job, making it nohup.

# scp'd files to buda
cd /data/daniel/sameenergy/

# how many bytes, as a check?
# (stat --format="%s" prints size in bytes per file; awk sums them up.
# one stat process per file is slow, but this is a one-off sanity check.)
find 02_LOCATION_PHOTOS -type f -exec stat --format="%s" {} \; | awk '{total += $1} END {print total}'
# -> 141,133,402,112 that's 141GB. in 197108 files, not including directories.
# on the Pioneer server this was 141,131,778,304 bytes in 196916 files, 6446 directories, good enough.

find 02_LOCATION_PHOTOS -type f > raw_files
# keep only .jpg/.jpeg files, case-insensitive.
# NOTE: the previous 'cat raw_files | grep -i "jpg\|jpeg$"' anchored only
# "jpeg" at end-of-line, so any path merely *containing* "jpg" matched
# (e.g. a directory named jpg_backup holding .png files); cat was useless.
grep -iE '\.jpe?g$' raw_files > jpg_files

# TODO
# chmod files on ai-shared

# embed everything listed in jpg_files (runs for hours, hence nohup + &):
nohup bash create_embeddings.sh &
# ...but it's really just this:
# cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
# -> after some 8 hours or so processes cca 200k images, resulting in
# 02_LOCATION_PHOTOS.pkl

# hashes for deduplication:
bash hashes.sh
# takes jpg_files and outputs md5sums

# ad hoc sample from duplicates:
# first awk emits (hash, path, first-seen-path) for every repeated hash
# (FS="  " because md5sum separates hash and filename with two spaces);
# second awk keeps every 4000th such line; cut -f2- drops the hash column.
cat md5sums | awk 'BEGIN{FS="  "} { if ($1 in m) { print $1 "\t" $2 "\t" m[$1] } ; m[$1] = $2 }' | awk '(NR%4000==0)' | cut -f2-

python convert.py 02_LOCATION_PHOTOS.pkl
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
# rename the hash list to match the pickle's naming scheme, then dedupe:
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl


# started downloading PhotoLibrary, but it's super big, 6 days of downloading and counting.
ssh buda
cd /data/daniel/sameenergy
nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
# 30MB/sec, that's some 10 hours? don't forget that the source is still increasing.


# hash the downloaded tree (long-running, so backgrounded with log files):
nohup bash hashes.sh > md5.cout 2> md5.cerr &
# -> creates PhotoLibrary.854G.md5sums , md5.cout and md5.cerr are just logs.
# keep only the first file seen for each md5 ($1 = hash); just count survivors:
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc

# same dedup filter again, this time saving the survivor list:
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums


# merge both collections' hash lists and dedupe across them.
# BUG FIX: this originally read "cat 02_LOCATION_PHOTOS ..." — a directory,
# which cat cannot read ("Is a directory"); the hash file (renamed earlier to
# 02_LOCATION_PHOTOS.f16.md5sums) is what was meant, and the recorded merged
# count below (591500 > 514706) confirms a real file was actually used.
cat 02_LOCATION_PHOTOS.f16.md5sums PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums

wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
# recorded output (raw output lines would be syntax noise in a script):
#    591500 02_and_PhotoLibrary.854G.deduped_md5sums
#    514706 PhotoLibrary.854G.deduped_md5sums
# -> is not worth the hassle merging them. let's just do PhotoLibrary.

# rsync has finished, turns out i've collected PhotoLibrary.854G.raw_files right before that,
# doing the complete would need a re-hash, is not worth the hassle either. staying with PhotoLibrary.854G.deduped_md5sums

# TODO I don't think lftp has finished successfully, because the Tünde folder has never arrived.



#####
# thumbnailing

# on hexagon: duplicate the full-size trees first, then downscale in place.
cd ~/ai-shared/daniel/sameenergy
nohup cp -r 02_LOCATION_PHOTOS 02_LOCATION_PHOTOS.thumbs &
nohup cp -r PhotoLibrary PhotoLibrary.thumbs &
# -> this is slooooow, a day or so.

# the following code, located at hexagon:~/ai-shared/daniel/sameenergy/downscale.sh ,
# downscales in place so that each image fits into 1024x1024 (the ">" in the
# geometry makes convert shrink-only, never enlarge). $root is set by the caller.
# Fixes vs. the original one-liner: "$root" quoted (unquoted would word-split),
# the extension match anchored (old "jpeg\|jpg$" let "jpg" match anywhere in
# the path), and 'IFS= read -r' so paths with spaces/backslashes survive.
find "$root" -type f | grep -iE '\.jpe?g$' | while IFS= read -r f ; do echo "$f" ; convert "$f" -resize "1024x1024>" "$f" ; done

# it was run like this, setting root=02_LOCATION_PHOTOS.thumbs
# NOTE(review): root is presumably exported (or edited inside downscale.sh);
# a plain, unexported shell assignment would not reach the nohup'd bash — confirm.
nohup bash downscale.sh > 02_LOCATION_PHOTOS.downscale.cout 2> 02_LOCATION_PHOTOS.downscale.cerr &
# -> took a night or so.
# second run, with root=PhotoLibrary.thumbs:
nohup bash downscale.sh > PhotoLibrary.downscale.cout 2> PhotoLibrary.downscale.cerr &
# -> took 2 days or so.

# added to app.py to patch the filenames in the pickle to change PhotoLibrary to PhotoLibrary.thumbs