q6 committed on
Commit
92ca3c6
·
1 Parent(s): defc92b

MAJOR: Client-side downloading

Browse files
Files changed (1) hide show
  1. Client/hunt.py +41 -31
Client/hunt.py CHANGED
@@ -2,10 +2,9 @@ import os
2
  import lmdb
3
  import requests
4
  import io
5
- import tarfile
6
- import zstandard
7
 
8
- local = 1
9
  if local:
10
  endpoint = "http://127.0.0.1:7860"
11
  else:
@@ -37,6 +36,17 @@ for inp in inputs:
37
 
38
  blacklist = ['\0', None]
39
 
 
 
 
 
 
 
 
 
 
 
 
40
  for index in indexs:
41
  group_name = valid[index].rsplit(".", 1)[0]
42
  os.makedirs(f"images/{group_name}", exist_ok=True)
@@ -44,47 +54,47 @@ for index in indexs:
44
  post_ids = [x for x in f.read().split("\n") if x]
45
 
46
  with db.begin(write=True) as txn:
47
- post_ids = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
48
- post_ids = {post_id: url.decode() if url else None for post_id, url in post_ids.items()}
49
- filtered = [post_id for post_id, url in post_ids.items() if url == None and f"{post_id}.png" not in images_cache]
 
50
 
51
- print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids)}")
52
  if filtered:
53
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
54
 
55
  for post_id, url in data.items():
56
  txn.put(post_id.encode(), url.encode())
57
- post_ids[post_id] = url
58
 
59
  no_exif = set(filtered) - set(data.keys())
60
-
61
  for post_id in no_exif:
62
  txn.put(post_id.encode(), b'\0')
63
 
64
- to_download = {post_id: url for post_id, url in post_ids.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
65
- post_ids.update(to_download)
66
- print(f"Downloading {len(to_download)} images...")
67
- if to_download:
68
- images_zstd = requests.post(f'{endpoint}/download', json={'posts': to_download})
69
-
70
- print("Decompressing zstd...")
71
- dctx = zstandard.ZstdDecompressor()
72
- decompressed = dctx.decompress(images_zstd.content)
73
-
74
- print("Extracting images from tar...")
75
- with io.BytesIO(decompressed) as f:
76
- with tarfile.open(fileobj=f, mode='r:') as tarf:
77
- tarf.extractall("images/Stash")
78
-
79
- images_cache = os.listdir("images/Stash")
80
-
81
- print("Linking images...")
82
- for i, post_id in enumerate(post_ids.keys()):
83
  if f"{post_id}.png" in images_cache:
84
- if not os.path.exists(f"images/{group_name}/{i}_{post_id}.png"):
85
- os.link(f"images/Stash/{post_id}.png", f"images/{group_name}/{i}_{post_id}.png")
 
 
86
 
87
  if len(os.listdir(f'images/{group_name}')) == 0:
88
  os.rmdir(f"images/{group_name}")
89
 
90
- db.close()
 
2
  import lmdb
3
  import requests
4
  import io
5
+ from concurrent.futures import ThreadPoolExecutor
 
6
 
7
+ local = 0
8
  if local:
9
  endpoint = "http://127.0.0.1:7860"
10
  else:
 
36
 
37
  blacklist = ['\0', None]
38
 
39
def download_image(args):
    """Fetch one Pixiv image and save it into images/Stash.

    Designed to be mapped over a ThreadPoolExecutor, so it takes a single
    tuple argument instead of two parameters.

    Parameters
    ----------
    args : tuple[str, str]
        ``(post_id, url)`` — the url is a path fragment appended to the
        module-level ``img_base`` prefix.

    Returns
    -------
    tuple[str, bool]
        ``(post_id, success)`` — ``False`` when the HTTP request failed,
        so one bad download no longer aborts the whole executor.map batch.
    """
    post_id, url = args
    full_url = img_base + url
    try:
        # Pixiv's CDN rejects requests without a pixiv.net Referer; the UA
        # string makes the request look like a regular browser.
        response = requests.get(
            full_url,
            headers={
                "Referer": "https://www.pixiv.net/",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0",
            },
            timeout=30,  # without a timeout a stalled socket hangs a worker thread forever
        )
        # Don't write an HTML error page (404/403 body) to disk as a .png.
        response.raise_for_status()
    except requests.RequestException:
        return post_id, False
    with open(f"images/Stash/{post_id}.png", "wb") as img_file:
        img_file.write(response.content)
    return post_id, True
49
+
50
  for index in indexs:
51
  group_name = valid[index].rsplit(".", 1)[0]
52
  os.makedirs(f"images/{group_name}", exist_ok=True)
 
54
  post_ids = [x for x in f.read().split("\n") if x]
55
 
56
  with db.begin(write=True) as txn:
57
+ post_ids_dict = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
58
+ post_ids_dict = {post_id: url.decode() if url else None for post_id, url in post_ids_dict.items()}
59
+
60
+ filtered = [post_id for post_id, url in post_ids_dict.items() if url is None and f"{post_id}.png" not in images_cache]
61
 
62
+ print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids_dict)}")
63
  if filtered:
64
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
65
 
66
  for post_id, url in data.items():
67
  txn.put(post_id.encode(), url.encode())
68
+ post_ids_dict[post_id] = url
69
 
70
  no_exif = set(filtered) - set(data.keys())
 
71
  for post_id in no_exif:
72
  txn.put(post_id.encode(), b'\0')
73
 
74
+ to_download = {post_id: url for post_id, url in post_ids_dict.items()
75
+ if url not in blacklist and f"{post_id}.png" not in images_cache}
76
+
77
+ print(f"Total images to download: {len(to_download)}")
78
+
79
+ max_workers = 30
80
+ to_download_items = list(to_download.items())
81
+
82
+ if to_download_items:
83
+ print("Starting download...")
84
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
85
+ results = list(executor.map(download_image, to_download_items))
86
+
87
+ images_cache = os.listdir("images/Stash")
88
+
89
+ print("Linking images to the group directory...")
90
+ for i, post_id in enumerate(post_ids_dict.keys()):
 
 
91
  if f"{post_id}.png" in images_cache:
92
+ stash_path = f"images/Stash/{post_id}.png"
93
+ dest_path = f"images/{group_name}/{i}_{post_id}.png"
94
+ if not os.path.exists(dest_path):
95
+ os.link(stash_path, dest_path)
96
 
97
  if len(os.listdir(f'images/{group_name}')) == 0:
98
  os.rmdir(f"images/{group_name}")
99
 
100
+ db.close()