q6 committed on
Commit
82901f0
·
1 Parent(s): e5661ed

Refactor code to be shorter

Browse files
Client/Extract Pixiv/user.py CHANGED
@@ -11,10 +11,6 @@ else:
11
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
12
 
13
  inp = input("Enter User IDs (separated by spaces or commas): ")
14
- inps='1'
15
- while inps != "":
16
- inps = input('')
17
- inp += inps
18
  user_ids = re.findall(r"\d+", inp)
19
 
20
  user_ids = [int(uid) for uid in user_ids]
 
11
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
12
 
13
  inp = input("Enter User IDs (separated by spaces or commas): ")
 
 
 
 
14
  user_ids = re.findall(r"\d+", inp)
15
 
16
  user_ids = [int(uid) for uid in user_ids]
Client/actual_size.py DELETED
@@ -1,8 +0,0 @@
1
- import lmdb
2
- import os
3
-
4
- os.chdir(os.path.dirname(os.path.abspath(__file__)))
5
- db = lmdb.open("db", subdir=True, map_size=524288)
6
-
7
- print(db.stat())
8
- print(db.info())
 
 
 
 
 
 
 
 
 
Client/hunt.py CHANGED
@@ -5,39 +5,32 @@ from concurrent.futures import ThreadPoolExecutor
5
  from tqdm import tqdm
6
 
7
  local = 0
8
- if local:
9
- endpoint = "http://127.0.0.1:7860"
10
- else:
11
- endpoint = "https://q6-p.hf.space"
12
-
13
  img_base = 'https://i.pximg.net/img-original/img/'
14
 
15
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
16
  os.makedirs("images/Stash", exist_ok=True)
17
 
18
- images_cache = os.listdir("images/Stash")
19
 
20
- db = lmdb.open("db", subdir=True, map_size=1048576)
21
- valid = [f for f in os.listdir() if f.endswith(".txt")]
22
 
 
23
  for idx, file in enumerate(valid):
24
  print(f"{idx + 1}: {file}")
25
 
26
- inputs = input("Enter the index of the file: ")
27
- inputs = inputs.split()
28
  indexs = []
29
-
30
  for inp in inputs:
31
- if inp.isdigit():
32
- indexs.append(int(inp) - 1)
33
- elif "-" in inp:
34
  start, end = map(int, inp.split("-"))
35
  indexs.extend(range(start - 1, end))
 
 
36
 
37
  def download_image(args):
38
  post_id, url = args
39
- full_url = img_base + url
40
- response = requests.get(full_url, headers={
41
  "Referer": "https://www.pixiv.net/",
42
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0"
43
  })
@@ -45,54 +38,48 @@ def download_image(args):
45
  img_file.write(response.content)
46
  return post_id, True
47
 
 
 
 
 
 
48
  for index in indexs:
49
  group_name = valid[index].rsplit(".", 1)[0]
50
  os.makedirs(f"images/{group_name}", exist_ok=True)
51
  with open(valid[index], "r") as f:
52
- post_ids = [x for x in f.read().split("\n") if x]
53
 
54
  with db.begin(write=True) as txn:
55
  post_ids_dict = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
56
- post_ids_dict = {post_id: url.decode() if url != None else None for post_id, url in post_ids_dict.items()}
 
57
 
58
- filtered = [post_id for post_id, url in post_ids_dict.items() if url == None and f"{post_id}.png" not in images_cache]
59
- print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids_dict)}")
60
  if filtered:
61
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
62
-
63
  for post_id, url in data.items():
64
  txn.put(post_id.encode(), url.encode())
65
  post_ids_dict[post_id] = url
66
-
67
  no_exif = set(filtered) - set(data.keys())
68
  for post_id in no_exif:
69
  txn.put(post_id.encode(), b'')
70
 
71
- to_download = {post_id: url for post_id, url in post_ids_dict.items()
72
- if url and f"{post_id}.png" not in images_cache}
73
-
74
- print(f"Total images to download: {len(to_download)}")
 
 
75
 
76
- max_workers = 30
77
- to_download_items = list(to_download.items())
78
-
79
- if to_download_items:
80
- print("Starting download...")
81
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
82
- for result in tqdm(executor.map(download_image, to_download_items), total=len(to_download_items), desc="Downloading"):
83
- pass
84
-
85
- images_cache = os.listdir("images/Stash")
86
 
87
  print("Linking images to the group directory...")
88
- for i, post_id in enumerate(post_ids_dict.keys()):
89
- if f"{post_id}.png" in images_cache:
90
- stash_path = f"images/Stash/{post_id}.png"
91
- dest_path = f"images/{group_name}/{i}_{post_id}.png"
92
- if not os.path.exists(dest_path):
93
- os.link(stash_path, dest_path)
94
-
95
- if len(os.listdir(f'images/{group_name}')) == 0:
96
  os.rmdir(f"images/{group_name}")
97
 
98
  db.close()
 
5
  from tqdm import tqdm
6
 
7
  local = 0
8
+ endpoint = "http://127.0.0.1:7860" if local else "https://q6-p.hf.space"
 
 
 
 
9
  img_base = 'https://i.pximg.net/img-original/img/'
10
 
11
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
12
  os.makedirs("images/Stash", exist_ok=True)
13
 
14
+ images_cache = set(os.listdir("images/Stash"))
15
 
16
+ db = lmdb.open("db", subdir=True, map_size=1048576 * 2)
 
17
 
18
+ valid = [f for f in os.listdir() if f.endswith(".txt")]
19
  for idx, file in enumerate(valid):
20
  print(f"{idx + 1}: {file}")
21
 
22
+ inputs = input("Enter the index of the file: ").split()
 
23
  indexs = []
 
24
  for inp in inputs:
25
+ if "-" in inp:
 
 
26
  start, end = map(int, inp.split("-"))
27
  indexs.extend(range(start - 1, end))
28
+ elif inp.isdigit():
29
+ indexs.append(int(inp) - 1)
30
 
31
  def download_image(args):
32
  post_id, url = args
33
+ response = requests.get(img_base + url, headers={
 
34
  "Referer": "https://www.pixiv.net/",
35
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0"
36
  })
 
38
  img_file.write(response.content)
39
  return post_id, True
40
 
41
+ def decode_if_binary(val):
42
+ if type(val) == bytes:
43
+ return val.decode()
44
+ return val
45
+
46
  for index in indexs:
47
  group_name = valid[index].rsplit(".", 1)[0]
48
  os.makedirs(f"images/{group_name}", exist_ok=True)
49
  with open(valid[index], "r") as f:
50
+ post_ids = f.read().split()
51
 
52
  with db.begin(write=True) as txn:
53
  post_ids_dict = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
54
+ filtered = [post_id for post_id in post_ids if post_ids_dict[post_id] == None and f"{post_id}.png" not in images_cache]
55
+ print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids)}")
56
 
 
 
57
  if filtered:
58
  data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered}).json()
 
59
  for post_id, url in data.items():
60
  txn.put(post_id.encode(), url.encode())
61
  post_ids_dict[post_id] = url
 
62
  no_exif = set(filtered) - set(data.keys())
63
  for post_id in no_exif:
64
  txn.put(post_id.encode(), b'')
65
 
66
+ to_download = {post_id: decode_if_binary(url) for post_id, url in post_ids_dict.items() if url and f"{post_id}.png" not in images_cache}
67
+
68
+ if to_download:
69
+ print(f"Total images to download: {len(to_download)}")
70
+ with ThreadPoolExecutor(max_workers=20) as executor:
71
+ list(tqdm(executor.map(download_image, to_download.items()), total=len(to_download), desc="Downloading"))
72
 
73
+ images_cache.update(os.listdir("images/Stash"))
 
 
 
 
 
 
 
 
 
74
 
75
  print("Linking images to the group directory...")
76
+ for i, post_id in enumerate(post_ids):
77
+ stash_path = f"images/Stash/{post_id}.png"
78
+ dest_path = f"images/{group_name}/{i}_{post_id}.png"
79
+ if os.path.exists(stash_path) and not os.path.exists(dest_path):
80
+ os.link(stash_path, dest_path)
81
+
82
+ if not os.listdir(f'images/{group_name}'):
 
83
  os.rmdir(f"images/{group_name}")
84
 
85
  db.close()
Client/show_all.py CHANGED
@@ -3,8 +3,7 @@ import lmdb
3
 
4
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
5
 
6
- db = lmdb.open("db", subdir=True, map_size=1048576)
7
-
8
 
9
  items = []
10
 
@@ -12,18 +11,14 @@ with db.begin() as txn:
12
  cursor = txn.cursor()
13
 
14
  for key, value in cursor:
15
- if value != b'\x00':
16
- try:
17
- key_str = key.decode('utf-8')
18
- value_str = value.decode('utf-8')
19
-
20
- int_part_str = value_str.split('p')[1].split('.')[0]
21
- int_part = int(int_part_str)
22
- items.append((key_str, int_part))
23
 
24
- except (IndexError, ValueError) as e:
25
- print(f"Skipping key {key} due to parsing error: {e}")
26
- continue
27
 
28
  sorted_items = sorted(items, key=lambda item: item[1])
29
 
 
3
 
4
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
5
 
6
+ db = lmdb.open("db", subdir=True, map_size=1048576 * 2)
 
7
 
8
  items = []
9
 
 
11
  cursor = txn.cursor()
12
 
13
  for key, value in cursor:
14
+ if value != b'':
15
+ key_str = key.decode('utf-8')
16
+ value_str = value.decode('utf-8')
17
+
18
+ int_part_str = value_str.split('p')[1].split('.')[0]
19
+ int_part = int(int_part_str)
20
+ items.append((key_str, int_part))
 
21
 
 
 
 
22
 
23
  sorted_items = sorted(items, key=lambda item: item[1])
24