q6 committed on
Commit
ea81c1a
·
1 Parent(s): 6579923

Revert "Optimised storing"

Browse files

This reverts commit e733149ccf89c86b7bc96ba9ec9d24c3f6a0d85c.

Files changed (2) hide show
  1. Client/Extract Pixiv/ai_search.py +1 -1
  2. Client/hunt.py +25 -26
Client/Extract Pixiv/ai_search.py CHANGED
@@ -11,7 +11,7 @@ os.chdir(os.path.dirname(os.path.abspath(__file__)))
11
 
12
  input_url = input("Enter the URL: ")
13
 
14
- pages = 300 // 60
15
 
16
  params = {
17
  'raw': input_url,
 
11
 
12
  input_url = input("Enter the URL: ")
13
 
14
+ pages = 1200 // 60
15
 
16
  params = {
17
  'raw': input_url,
Client/hunt.py CHANGED
@@ -5,13 +5,17 @@ from concurrent.futures import ThreadPoolExecutor
5
  from tqdm import tqdm
6
 
7
  local = 0
8
- endpoint = "http://127.0.0.1:7860" if local else "https://q6-p.hf.space"
 
 
 
 
9
  img_base = 'https://i.pximg.net/img-original/img/'
10
 
11
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
12
  os.makedirs("images/Stash", exist_ok=True)
13
 
14
- images_cache = set(os.listdir("images/Stash"))
15
 
16
  db = lmdb.open("db", subdir=True, map_size=1048576)
17
  valid = [f for f in os.listdir() if f.endswith(".txt")]
@@ -21,14 +25,14 @@ for idx, file in enumerate(valid):
21
 
22
  inputs = input("Enter the index of the file: ")
23
  inputs = inputs.split()
24
- indexes = []
25
 
26
  for inp in inputs:
27
  if inp.isdigit():
28
- indexes.append(int(inp) - 1)
29
  elif "-" in inp:
30
  start, end = map(int, inp.split("-"))
31
- indexes.extend(range(start - 1, end))
32
 
33
  def download_image(args):
34
  post_id, url = args
@@ -41,52 +45,47 @@ def download_image(args):
41
  img_file.write(response.content)
42
  return post_id, True
43
 
44
- for index in indexes:
45
  group_name = valid[index].rsplit(".", 1)[0]
46
  os.makedirs(f"images/{group_name}", exist_ok=True)
47
-
48
  with open(valid[index], "r") as f:
49
  post_ids = [x for x in f.read().split("\n") if x]
50
 
51
  with db.begin(write=True) as txn:
52
- urls = {}
53
- for post_id in post_ids:
54
- url = txn.get(post_id.encode())
55
- urls[post_id] = url.decode() if url is not None and url != b'' else None
56
-
57
- filtered = [post_id for post_id, url in urls.items()
58
- if url is None and f"{post_id}.png" not in images_cache]
59
-
60
- print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(urls)}")
61
 
 
 
62
  if filtered:
63
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
64
 
65
  for post_id, url in data.items():
66
  txn.put(post_id.encode(), url.encode())
67
- urls[post_id] = url
68
 
69
  no_exif = set(filtered) - set(data.keys())
70
  for post_id in no_exif:
71
  txn.put(post_id.encode(), b'')
72
- urls[post_id] = ''
73
 
74
- to_download = {post_id: url for post_id, url in urls.items()
75
  if url and f"{post_id}.png" not in images_cache}
76
 
77
  print(f"Total images to download: {len(to_download)}")
78
 
79
- if to_download:
 
 
 
80
  print("Starting download...")
81
- with ThreadPoolExecutor(max_workers=30) as executor:
82
- list(tqdm(executor.map(download_image, to_download.items()),
83
- total=len(to_download),
84
- desc="Downloading"))
85
 
86
- images_cache = set(os.listdir("images/Stash"))
87
 
88
  print("Linking images to the group directory...")
89
- for i, post_id in enumerate(urls.keys()):
90
  if f"{post_id}.png" in images_cache:
91
  stash_path = f"images/Stash/{post_id}.png"
92
  dest_path = f"images/{group_name}/{i}_{post_id}.png"
 
5
  from tqdm import tqdm
6
 
7
  local = 0
8
+ if local:
9
+ endpoint = "http://127.0.0.1:7860"
10
+ else:
11
+ endpoint = "https://q6-p.hf.space"
12
+
13
  img_base = 'https://i.pximg.net/img-original/img/'
14
 
15
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
16
  os.makedirs("images/Stash", exist_ok=True)
17
 
18
+ images_cache = os.listdir("images/Stash")
19
 
20
  db = lmdb.open("db", subdir=True, map_size=1048576)
21
  valid = [f for f in os.listdir() if f.endswith(".txt")]
 
25
 
26
  inputs = input("Enter the index of the file: ")
27
  inputs = inputs.split()
28
+ indexs = []
29
 
30
  for inp in inputs:
31
  if inp.isdigit():
32
+ indexs.append(int(inp) - 1)
33
  elif "-" in inp:
34
  start, end = map(int, inp.split("-"))
35
+ indexs.extend(range(start - 1, end))
36
 
37
  def download_image(args):
38
  post_id, url = args
 
45
  img_file.write(response.content)
46
  return post_id, True
47
 
48
+ for index in indexs:
49
  group_name = valid[index].rsplit(".", 1)[0]
50
  os.makedirs(f"images/{group_name}", exist_ok=True)
 
51
  with open(valid[index], "r") as f:
52
  post_ids = [x for x in f.read().split("\n") if x]
53
 
54
  with db.begin(write=True) as txn:
55
+ post_ids_dict = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
56
+ post_ids_dict = {post_id: url.decode() if url != None else None for post_id, url in post_ids_dict.items()}
 
 
 
 
 
 
 
57
 
58
+ filtered = [post_id for post_id, url in post_ids_dict.items() if url == None and f"{post_id}.png" not in images_cache]
59
+ print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids_dict)}")
60
  if filtered:
61
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
62
 
63
  for post_id, url in data.items():
64
  txn.put(post_id.encode(), url.encode())
65
+ post_ids_dict[post_id] = url
66
 
67
  no_exif = set(filtered) - set(data.keys())
68
  for post_id in no_exif:
69
  txn.put(post_id.encode(), b'')
 
70
 
71
+ to_download = {post_id: url for post_id, url in post_ids_dict.items()
72
  if url and f"{post_id}.png" not in images_cache}
73
 
74
  print(f"Total images to download: {len(to_download)}")
75
 
76
+ max_workers = 30
77
+ to_download_items = list(to_download.items())
78
+
79
+ if to_download_items:
80
  print("Starting download...")
81
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
82
+ for result in tqdm(executor.map(download_image, to_download_items), total=len(to_download_items), desc="Downloading"):
83
+ pass
 
84
 
85
+ images_cache = os.listdir("images/Stash")
86
 
87
  print("Linking images to the group directory...")
88
+ for i, post_id in enumerate(post_ids_dict.keys()):
89
  if f"{post_id}.png" in images_cache:
90
  stash_path = f"images/Stash/{post_id}.png"
91
  dest_path = f"images/{group_name}/{i}_{post_id}.png"