Spaces:
Sleeping
Sleeping
Daniel Varga
committed on
Commit
·
8424a77
1
Parent(s):
67d87f5
create embedding, deduplication, docs
Browse files
- app.ini +1 -1
- app.py +1 -1
- create_embeddings.py +20 -13
- create_embeddings.sh +1 -0
- dedupe.py +41 -0
- hashes.sh +1 -0
- readme.sh +37 -0
app.ini
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
[DEFAULT]
|
| 2 |
-
pkl =
|
| 3 |
url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
|
|
|
|
| 1 |
[DEFAULT]
|
| 2 |
+
pkl = 02_LOCATION_PHOTOS.deduped.f16.pkl
|
| 3 |
url = https://static.renyi.hu/ai-shared/daniel/sameenergy/
|
app.py
CHANGED
|
@@ -174,4 +174,4 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
|
|
| 174 |
|
| 175 |
|
| 176 |
if __name__ == "__main__":
|
| 177 |
-
demo.launch()
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
if __name__ == "__main__":
|
| 177 |
+
demo.launch(share=False)
|
create_embeddings.py
CHANGED
|
@@ -13,6 +13,7 @@ def do_batch(batch, embeddings):
|
|
| 13 |
image_features = model.encode_image(image_batch).float()
|
| 14 |
embeddings += image_features.cpu().numpy().tolist()
|
| 15 |
print(f"{len(embeddings)} done")
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
# even though it's not worth bothering with cuda,
|
|
@@ -38,19 +39,25 @@ batch = []
|
|
| 38 |
for filename in sys.stdin:
|
| 39 |
filename = filename.rstrip()
|
| 40 |
if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
# remaining
|
| 56 |
if len(batch) > 0:
|
|
|
|
| 13 |
image_features = model.encode_image(image_batch).float()
|
| 14 |
embeddings += image_features.cpu().numpy().tolist()
|
| 15 |
print(f"{len(embeddings)} done")
|
| 16 |
+
sys.stdout.flush()
|
| 17 |
|
| 18 |
|
| 19 |
# even though it's not worth bothering with cuda,
|
|
|
|
| 39 |
for filename in sys.stdin:
|
| 40 |
filename = filename.rstrip()
|
| 41 |
if filename.lower().endswith("jpg") or filename.lower().endswith("jpeg"):
|
| 42 |
+
try:
|
| 43 |
+
rgb = Image.open(filename).convert("RGB")
|
| 44 |
+
img = preprocess(rgb)
|
| 45 |
+
batch.append(img)
|
| 46 |
+
if len(batch) >= batch_size:
|
| 47 |
+
do_batch(batch, embeddings)
|
| 48 |
+
batch = []
|
| 49 |
+
filenames.append(filename)
|
| 50 |
+
if do_thumbs:
|
| 51 |
+
rgb.thumbnail((128, 128))
|
| 52 |
+
thumb = np.array(rgb)
|
| 53 |
+
thumbs.append(thumb)
|
| 54 |
+
if len(filenames) >= limit:
|
| 55 |
+
break
|
| 56 |
+
except KeyboardInterrupt:
|
| 57 |
+
raise
|
| 58 |
+
except:
|
| 59 |
+
print(f"ERROR, skipping {filename}")
|
| 60 |
+
sys.stdout.flush()
|
| 61 |
|
| 62 |
# remaining
|
| 63 |
if len(batch) > 0:
|
create_embeddings.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
|
dedupe.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# takes a pickle, takes a file with md5sums, outputs the pickle without hash dupes.
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
import pickle
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
pickle_filename, md5_filename, output_pickle_filename = sys.argv[1:]
|
| 9 |
+
|
| 10 |
+
hashes = {}
|
| 11 |
+
for line in open(md5_filename, "r"):
|
| 12 |
+
line = line.strip("\n")
|
| 13 |
+
md5 = line.split()[0]
|
| 14 |
+
assert len(md5) == 32
|
| 15 |
+
assert line[32:34] == " "
|
| 16 |
+
filename = line[34:]
|
| 17 |
+
hashes[filename] = md5
|
| 18 |
+
print(len(hashes), "hashes read")
|
| 19 |
+
|
| 20 |
+
data = pickle.load(open(pickle_filename, "rb"))
|
| 21 |
+
print(len(data["embeddings"]), "embeddings read")
|
| 22 |
+
|
| 23 |
+
filenames = data["filenames"]
|
| 24 |
+
collected_indices = []
|
| 25 |
+
collected_md5s = set()
|
| 26 |
+
for i in range(len(filenames)):
|
| 27 |
+
filename = filenames[i]
|
| 28 |
+
md5 = hashes[filename] # not the hash of the filename, the hash of the file content.
|
| 29 |
+
if md5 not in collected_md5s:
|
| 30 |
+
collected_indices.append(i)
|
| 31 |
+
collected_md5s.add(md5)
|
| 32 |
+
|
| 33 |
+
print(len(collected_indices), "unique hashes")
|
| 34 |
+
filenames = np.array(filenames)
|
| 35 |
+
|
| 36 |
+
data["filenames"] = filenames[collected_indices].tolist()
|
| 37 |
+
data["embeddings"] = data["embeddings"][collected_indices]
|
| 38 |
+
assert "thumbs" not in data
|
| 39 |
+
|
| 40 |
+
with open(output_pickle_filename, "wb") as f:
|
| 41 |
+
pickle.dump(data, f)
|
hashes.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
cat jpg_files | while read f ; do md5sum "$f" ; done > md5sums
|
readme.sh
CHANGED
|
@@ -22,3 +22,40 @@ python app.py
|
|
| 22 |
|
| 23 |
python convert.py sample_fbi_s1e1.pkl
|
| 24 |
# -> creates sample_fbi_s1e1.f16.pkl dumbed down from float64 to float16.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
python convert.py sample_fbi_s1e1.pkl
|
| 24 |
# -> creates sample_fbi_s1e1.f16.pkl dumbed down from float64 to float16.
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
ssh -p 2820 hexagon.renyi.hu
|
| 28 |
+
cd ai-shared/daniel/sameenergy
|
| 29 |
+
lftp -p 2167 gw.pioneer.hu
|
| 30 |
+
# manually provide username, password
|
| 31 |
+
cd store/05_Photos
|
| 32 |
+
# promising directories:
|
| 33 |
+
ls 02_LOCATION\ PHOTOS 05_TO_LOCATION_PHOTOS PhotoLibrary Tünde
|
| 34 |
+
mirror 02_LOCATION\ PHOTOS
|
| 35 |
+
ctrl-z
|
| 36 |
+
# -> puts mirroring to background.
|
| 37 |
+
ctrl-d
|
| 38 |
+
# -> exits lftp without terminating the background job, making it nohup.
|
| 39 |
+
|
| 40 |
+
# scp'd files to buda
|
| 41 |
+
cd /data/daniel/sameenergy/
|
| 42 |
+
find 02_LOCATION_PHOTOS -type f > raw_files
|
| 43 |
+
cat raw_files | grep -i "jpg\|jpeg$" > jpg_files
|
| 44 |
+
|
| 45 |
+
# TODO
|
| 46 |
+
# chmod files on ai-shared
|
| 47 |
+
|
| 48 |
+
nohup bash create_embeddings.sh &
|
| 49 |
+
# ...but it's really just this:
|
| 50 |
+
# cat jpg_files | python ~/experiments/kalman/se/create_embeddings.py 02_LOCATION_PHOTOS.pkl no-thumbs
|
| 51 |
+
# -> after some 8 hours or so it has processed circa 200k images, resulting in
|
| 52 |
+
# 02_LOCATION_PHOTOS.pkl
|
| 53 |
+
|
| 54 |
+
# hashes for deduplication:
|
| 55 |
+
bash hashes.sh
|
| 56 |
+
# takes jpg_files and outputs md5sums
|
| 57 |
+
|
| 58 |
+
python convert.py 02_LOCATION_PHOTOS.pkl
|
| 59 |
+
# -> creates float16 02_LOCATION_PHOTOS.f16.pkl
|
| 60 |
+
mv md5sums 02_LOCATION_PHOTOS.f16.md5sums
|
| 61 |
+
python dedupe.py 02_LOCATION_PHOTOS.f16.pkl 02_LOCATION_PHOTOS.f16.md5sums 02_LOCATION_PHOTOS.deduped.f16.pkl
|