q6 committed on
Commit
92ca3c6
·
1 Parent(s): defc92b

MAJOR: Client-side downloading

Browse files
Files changed (1) hide show
  1. Client/hunt.py +41 -31
Client/hunt.py CHANGED
@@ -2,10 +2,9 @@ import os
2
  import lmdb
3
  import requests
4
  import io
5
- import tarfile
6
- import zstandard
7
 
8
- local = 1
9
  if local:
10
  endpoint = "http://127.0.0.1:7860"
11
  else:
@@ -37,6 +36,17 @@ for inp in inputs:
37
 
38
  blacklist = ['\0', None]
39
 
 
 
 
 
 
 
 
 
 
 
 
40
  for index in indexs:
41
  group_name = valid[index].rsplit(".", 1)[0]
42
  os.makedirs(f"images/{group_name}", exist_ok=True)
@@ -44,47 +54,47 @@ for index in indexs:
44
  post_ids = [x for x in f.read().split("\n") if x]
45
 
46
  with db.begin(write=True) as txn:
47
- post_ids = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
48
- post_ids = {post_id: url.decode() if url else None for post_id, url in post_ids.items()}
49
- filtered = [post_id for post_id, url in post_ids.items() if url == None and f"{post_id}.png" not in images_cache]
 
50
 
51
- print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids)}")
52
  if filtered:
53
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
54
 
55
  for post_id, url in data.items():
56
  txn.put(post_id.encode(), url.encode())
57
- post_ids[post_id] = url
58
 
59
  no_exif = set(filtered) - set(data.keys())
60
-
61
  for post_id in no_exif:
62
  txn.put(post_id.encode(), b'\0')
63
 
64
- to_download = {post_id: url for post_id, url in post_ids.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
65
- post_ids.update(to_download)
66
- print(f"Downloading {len(to_download)} images...")
67
- if to_download:
68
- images_zstd = requests.post(f'{endpoint}/download', json={'posts': to_download})
69
-
70
- print("Decompressing zstd...")
71
- dctx = zstandard.ZstdDecompressor()
72
- decompressed = dctx.decompress(images_zstd.content)
73
-
74
- print("Extracting images from tar...")
75
- with io.BytesIO(decompressed) as f:
76
- with tarfile.open(fileobj=f, mode='r:') as tarf:
77
- tarf.extractall("images/Stash")
78
-
79
- images_cache = os.listdir("images/Stash")
80
-
81
- print("Linking images...")
82
- for i, post_id in enumerate(post_ids.keys()):
83
  if f"{post_id}.png" in images_cache:
84
- if not os.path.exists(f"images/{group_name}/{i}_{post_id}.png"):
85
- os.link(f"images/Stash/{post_id}.png", f"images/{group_name}/{i}_{post_id}.png")
 
 
86
 
87
  if len(os.listdir(f'images/{group_name}')) == 0:
88
  os.rmdir(f"images/{group_name}")
89
 
90
- db.close()
 
2
  import lmdb
3
  import requests
4
  import io
5
+ from concurrent.futures import ThreadPoolExecutor
 
6
 
7
+ local = 0
8
  if local:
9
  endpoint = "http://127.0.0.1:7860"
10
  else:
 
36
 
37
  blacklist = ['\0', None]
38
 
39
def download_image(args):
    """Fetch one Pixiv image and save it into images/Stash.

    Designed to be mapped over a ThreadPoolExecutor, so it takes a single
    tuple argument instead of two parameters.

    Parameters
    ----------
    args : tuple[str, str]
        ``(post_id, url)`` — the url is a path fragment appended to the
        module-level ``img_base`` prefix.

    Returns
    -------
    tuple[str, bool]
        ``(post_id, success)`` — ``False`` when the HTTP request failed,
        so one bad download no longer aborts the whole executor.map batch.
    """
    post_id, url = args
    full_url = img_base + url
    try:
        # Pixiv's CDN rejects requests without a pixiv.net Referer; the UA
        # string makes the request look like a regular browser.
        response = requests.get(
            full_url,
            headers={
                "Referer": "https://www.pixiv.net/",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0",
            },
            timeout=30,  # without a timeout a stalled socket hangs a worker thread forever
        )
        # Don't write an HTML error page (404/403 body) to disk as a .png.
        response.raise_for_status()
    except requests.RequestException:
        return post_id, False
    with open(f"images/Stash/{post_id}.png", "wb") as img_file:
        img_file.write(response.content)
    return post_id, True
49
+
50
  for index in indexs:
51
  group_name = valid[index].rsplit(".", 1)[0]
52
  os.makedirs(f"images/{group_name}", exist_ok=True)
 
54
  post_ids = [x for x in f.read().split("\n") if x]
55
 
56
  with db.begin(write=True) as txn:
57
+ post_ids_dict = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
58
+ post_ids_dict = {post_id: url.decode() if url else None for post_id, url in post_ids_dict.items()}
59
+
60
+ filtered = [post_id for post_id, url in post_ids_dict.items() if url is None and f"{post_id}.png" not in images_cache]
61
 
62
+ print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids_dict)}")
63
  if filtered:
64
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
65
 
66
  for post_id, url in data.items():
67
  txn.put(post_id.encode(), url.encode())
68
+ post_ids_dict[post_id] = url
69
 
70
  no_exif = set(filtered) - set(data.keys())
 
71
  for post_id in no_exif:
72
  txn.put(post_id.encode(), b'\0')
73
 
74
+ to_download = {post_id: url for post_id, url in post_ids_dict.items()
75
+ if url not in blacklist and f"{post_id}.png" not in images_cache}
76
+
77
+ print(f"Total images to download: {len(to_download)}")
78
+
79
+ max_workers = 30
80
+ to_download_items = list(to_download.items())
81
+
82
+ if to_download_items:
83
+ print("Starting download...")
84
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
85
+ results = list(executor.map(download_image, to_download_items))
86
+
87
+ images_cache = os.listdir("images/Stash")
88
+
89
+ print("Linking images to the group directory...")
90
+ for i, post_id in enumerate(post_ids_dict.keys()):
 
 
91
  if f"{post_id}.png" in images_cache:
92
+ stash_path = f"images/Stash/{post_id}.png"
93
+ dest_path = f"images/{group_name}/{i}_{post_id}.png"
94
+ if not os.path.exists(dest_path):
95
+ os.link(stash_path, dest_path)
96
 
97
  if len(os.listdir(f'images/{group_name}')) == 0:
98
  os.rmdir(f"images/{group_name}")
99
 
100
+ db.close()