Spaces:
Sleeping
Sleeping
Daniel Varga
committed on
Commit
·
e6e7ab0
1
Parent(s):
ae27165
PhotoLibrary. create_embeddings.py refactor, intermediate save.
Browse files- app.ini +2 -1
- create_embeddings.py +21 -12
- create_embeddings.sh +1 -1
- readme.sh +27 -0
app.ini
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
[DEFAULT]
|
| 2 |
-
pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
|
|
|
|
| 3 |
url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
|
|
|
|
| 1 |
[DEFAULT]
|
| 2 |
+
# pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
|
| 3 |
+
pkl = PhotoLibrary.854G.deduped.f16.pkl
|
| 4 |
url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
|
create_embeddings.py
CHANGED
|
@@ -31,11 +31,27 @@ assert output_filename.endswith("pkl"), "first argument is the output pickle"
|
|
| 31 |
assert sys.argv[2] in ("thumbs", "no-thumbs"), "second argument either thumbs or no-thumbs"
|
| 32 |
do_thumbs = sys.argv[2] == "thumbs"
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
embeddings = []
|
| 35 |
filenames = []
|
| 36 |
thumbs = []
|
| 37 |
print("starting processing")
|
| 38 |
batch = []
|
|
|
|
| 39 |
for filename in sys.stdin:
|
| 40 |
filename = filename.rstrip()
|
| 41 |
if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
|
|
@@ -43,10 +59,13 @@ for filename in sys.stdin:
|
|
| 43 |
rgb = Image.open(filename).convert("RGB")
|
| 44 |
img = preprocess(rgb)
|
| 45 |
batch.append(img)
|
|
|
|
| 46 |
if len(batch) >= batch_size:
|
| 47 |
do_batch(batch, embeddings)
|
| 48 |
batch = []
|
| 49 |
-
|
|
|
|
|
|
|
| 50 |
if do_thumbs:
|
| 51 |
rgb.thumbnail((128, 128))
|
| 52 |
thumb = np.array(rgb)
|
|
@@ -63,14 +82,4 @@ for filename in sys.stdin:
|
|
| 63 |
if len(batch) > 0:
|
| 64 |
do_batch(batch, embeddings)
|
| 65 |
|
| 66 |
-
embeddings
|
| 67 |
-
assert len(embeddings) == len(filenames)
|
| 68 |
-
print(f"processed {len(embeddings)} images")
|
| 69 |
-
|
| 70 |
-
data = {"embeddings": embeddings, "filenames": filenames}
|
| 71 |
-
if do_thumbs:
|
| 72 |
-
assert len(embeddings) == len(thumbs)
|
| 73 |
-
data["thumbs"] = thumbs
|
| 74 |
-
|
| 75 |
-
with open(output_filename, "wb") as f:
|
| 76 |
-
pickle.dump(data, f)
|
|
|
|
| 31 |
assert sys.argv[2] in ("thumbs", "no-thumbs"), "second argument either thumbs or no-thumbs"
|
| 32 |
do_thumbs = sys.argv[2] == "thumbs"
|
| 33 |
|
| 34 |
+
|
| 35 |
+
def save(output_filename, embeddings, filenames):
|
| 36 |
+
embeddings = np.array(embeddings)
|
| 37 |
+
assert len(embeddings) == len(filenames)
|
| 38 |
+
print(f"processed {len(embeddings)} images")
|
| 39 |
+
|
| 40 |
+
data = {"embeddings": embeddings, "filenames": filenames}
|
| 41 |
+
if do_thumbs:
|
| 42 |
+
assert len(embeddings) == len(thumbs)
|
| 43 |
+
data["thumbs"] = thumbs
|
| 44 |
+
|
| 45 |
+
with open(output_filename, "wb") as f:
|
| 46 |
+
pickle.dump(data, f)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
embeddings = []
|
| 50 |
filenames = []
|
| 51 |
thumbs = []
|
| 52 |
print("starting processing")
|
| 53 |
batch = []
|
| 54 |
+
batch_count = 0
|
| 55 |
for filename in sys.stdin:
|
| 56 |
filename = filename.rstrip()
|
| 57 |
if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
|
|
|
|
| 59 |
rgb = Image.open(filename).convert("RGB")
|
| 60 |
img = preprocess(rgb)
|
| 61 |
batch.append(img)
|
| 62 |
+
filenames.append(filename)
|
| 63 |
if len(batch) >= batch_size:
|
| 64 |
do_batch(batch, embeddings)
|
| 65 |
batch = []
|
| 66 |
+
batch_count += 1
|
| 67 |
+
if batch_count % 200 == 0:
|
| 68 |
+
save(output_filename, embeddings, filenames)
|
| 69 |
if do_thumbs:
|
| 70 |
rgb.thumbnail((128, 128))
|
| 71 |
thumb = np.array(rgb)
|
|
|
|
| 82 |
if len(batch) > 0:
|
| 83 |
do_batch(batch, embeddings)
|
| 84 |
|
| 85 |
+
save(output_filename, embeddings, filenames)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
create_embeddings.sh
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
cat
|
|
|
|
| 1 |
+
cat PhotoLibrary.854G.deduped_md5sums | cut -f3- -d' ' | python ~/experiments/kalman/se/create_embeddings.py PhotoLibrary.854G.deduped.pkl no-thumbs
|
readme.sh
CHANGED
|
@@ -68,3 +68,30 @@ python convert.py 02_LOCATION_PHOTOS.pkl
|
|
| 68 |
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
|
| 69 |
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
|
| 70 |
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
|
| 69 |
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
|
| 70 |
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# started downloading PhotoLibrary, but it's super big, 6 days of downloading and counting.
|
| 74 |
+
ssh buda
|
| 75 |
+
cd /data/daniel/sameenergy
|
| 76 |
+
nohup rsync -r hexagon.renyi.hu:./ai-shared/daniel/sameenergy/PhotoLibrary . &
|
| 77 |
+
# 30MB/sec, that's some 10 hours? don't forget that the source is still increasing.
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
nohup bash hashes.sh > md5.cout 2> md5.cerr &
|
| 81 |
+
# -> creates PhotoLibrary.854G.md5sums , md5.cout and md5.cerr are just logs.
|
| 82 |
+
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" | wc
|
| 83 |
+
|
| 84 |
+
cat PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > PhotoLibrary.854G.deduped_md5sums
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
cat 02_LOCATION_PHOTOS PhotoLibrary.854G.md5sums | awk '{ if ($1 in m) { print "skipping\t" $0 "\tbecause\t" m[$1] } else { print $0 } ; m[$1] = $0 }' | grep -v "^skipping" > 02_and_PhotoLibrary.854G.deduped_md5sums
|
| 88 |
+
|
| 89 |
+
wc -l 02_and_PhotoLibrary.854G.deduped_md5sums PhotoLibrary.854G.deduped_md5sums
|
| 90 |
+
591500 02_and_PhotoLibrary.854G.deduped_md5sums
|
| 91 |
+
514706 PhotoLibrary.854G.deduped_md5sums
|
| 92 |
+
# -> it's not worth the hassle to merge them. let's just do PhotoLibrary.
|
| 93 |
+
|
| 94 |
+
# rsync has finished, turns out i've collected PhotoLibrary.854G.raw_files right before that,
|
| 95 |
+
# doing the complete set would need a re-hash; it's not worth the hassle either. staying with PhotoLibrary.854G.deduped_md5sums
|
| 96 |
+
|
| 97 |
+
# TODO I don't think lftp has finished successfully, because the Tünde folder has never arrived.
|