q6 committed on
Commit
e4435bb
·
1 Parent(s): 5bd3b77
Files changed (3) hide show
  1. API/app.py +4 -6
  2. Client/clear_db.py +3 -4
  3. Client/hunt.py +44 -57
API/app.py CHANGED
@@ -160,7 +160,7 @@ def determine_exif_type(metadata):
160
 
161
  async def get_exif(url, session):
162
  start_range = 0
163
- end_range = 512
164
 
165
  headers = {
166
  "Referer": "https://www.pixiv.net/",
@@ -200,7 +200,7 @@ async def process_post(post_id, session, semaphore):
200
  metadata = await get_exif(image_url, session)
201
  exif_type = determine_exif_type(metadata)
202
  if exif_type not in ['photoshop', 'celsys', None]:
203
- return post_id, image_url.replace(img_base, '')
204
  return post_id, None
205
  except Exception as e:
206
  return post_id, None
@@ -216,7 +216,7 @@ async def pixif(
216
  tasks = [process_post(post_id, session, semaphore) for post_id in post_ids]
217
  results = await asyncio.gather(*tasks)
218
 
219
- image_exifs = {post_id: image_url for post_id, image_url in results if image_url}
220
  return image_exifs
221
 
222
  async def download_image(session, post_id, post_url):
@@ -231,7 +231,6 @@ async def generate_tar(posts, session):
231
  tarf = tarfile.open(mode="w", fileobj=tar_buffer)
232
 
233
  semaphore = asyncio.Semaphore(100) # Adjust based on your needs
234
- lock = asyncio.Lock() # Create a lock for synchronization
235
 
236
  async def add_to_tar(post_id, image_url):
237
  async with semaphore:
@@ -241,8 +240,7 @@ async def generate_tar(posts, session):
241
  image_name = f"{post_id}.png"
242
  info = tarfile.TarInfo(name=image_name)
243
  info.size = len(image_data)
244
- async with lock: # Ensure only one coroutine writes to the tarfile at a time
245
- tarf.addfile(tarinfo=info, fileobj=io.BytesIO(image_data))
246
 
247
  tasks = [add_to_tar(post_id, image_url) for post_id, image_url in posts.items()]
248
  await asyncio.gather(*tasks)
 
160
 
161
  async def get_exif(url, session):
162
  start_range = 0
163
+ end_range = 1024
164
 
165
  headers = {
166
  "Referer": "https://www.pixiv.net/",
 
200
  metadata = await get_exif(image_url, session)
201
  exif_type = determine_exif_type(metadata)
202
  if exif_type not in ['photoshop', 'celsys', None]:
203
+ return post_id, image_url
204
  return post_id, None
205
  except Exception as e:
206
  return post_id, None
 
216
  tasks = [process_post(post_id, session, semaphore) for post_id in post_ids]
217
  results = await asyncio.gather(*tasks)
218
 
219
+ image_exifs = {post_id: image_url.replace('https://i.pximg.net/img-original/', '', 1) for post_id, image_url in results if image_url}
220
  return image_exifs
221
 
222
  async def download_image(session, post_id, post_url):
 
231
  tarf = tarfile.open(mode="w", fileobj=tar_buffer)
232
 
233
  semaphore = asyncio.Semaphore(100) # Adjust based on your needs
 
234
 
235
  async def add_to_tar(post_id, image_url):
236
  async with semaphore:
 
240
  image_name = f"{post_id}.png"
241
  info = tarfile.TarInfo(name=image_name)
242
  info.size = len(image_data)
243
+ tarf.addfile(tarinfo=info, fileobj=io.BytesIO(image_data))
 
244
 
245
  tasks = [add_to_tar(post_id, image_url) for post_id, image_url in posts.items()]
246
  await asyncio.gather(*tasks)
Client/clear_db.py CHANGED
@@ -4,7 +4,6 @@ import shutil
4
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
5
 
6
  shutil.rmtree("db", ignore_errors=True)
7
- shutil.rmtree("images", ignore_errors=True)
8
- # for file in os.listdir():
9
- # if file.endswith(".txt"):
10
- # os.rename(file, f"txt logs/{file}")
 
4
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
5
 
6
  shutil.rmtree("db", ignore_errors=True)
7
+ for file in os.listdir():
8
+ if file.endswith(".txt"):
9
+ os.rename(file, f"txt logs/{file}")
 
Client/hunt.py CHANGED
@@ -17,72 +17,59 @@ for idx, file in enumerate(valid):
17
 
18
  inputs = input("Enter the index of the file: ")
19
  inputs = inputs.split()
20
- indexes = []
21
 
22
  for inp in inputs:
23
  if inp.isdigit():
24
- indexes.append(int(inp) - 1)
25
  elif "-" in inp:
26
  start, end = map(int, inp.split("-"))
27
- indexes.extend(range(start - 1, end))
28
 
29
  blacklist = ['\0', None]
30
 
31
- group_data = {}
32
- group_post_ids = {}
33
-
34
- images_cache = os.listdir("images/Stash")
35
-
36
- for index in indexes:
37
  group_name = valid[index].rsplit(".", 1)[0]
38
  os.makedirs(f"images/{group_name}", exist_ok=True)
39
  with open(valid[index], "r") as f:
40
  post_ids = [x for x in f.read().split("\n") if x]
41
- group_post_ids[group_name] = post_ids
42
- for position, post_id in enumerate(post_ids):
43
- group_data[post_id] = {'group_name': group_name, 'position': position}
44
-
45
- all_post_ids = list(group_data.keys())
46
-
47
- with db.begin(write=True) as txn:
48
- post_id_urls = {post_id: txn.get(post_id.encode()) for post_id in all_post_ids}
49
- post_id_urls = {post_id: url.decode() if url else None for post_id, url in post_id_urls.items()}
50
-
51
- filtered = [post_id for post_id, url in post_id_urls.items() if url == None and f"{post_id}.png" not in images_cache]
52
-
53
- print(f"Filtered: {len(filtered)}/{len(post_id_urls)}")
54
- if filtered:
55
- data = requests.post('https://q6-p.hf.space/pixif', json={"post_ids": filtered}).json()
56
- for post_id, url in data.items():
57
- txn.put(post_id.encode(), url.encode())
58
- post_id_urls[post_id] = url
59
- no_exif = set(filtered) - set(data.keys())
60
- for post_id in no_exif:
61
- txn.put(post_id.encode(), b'\0')
62
-
63
- to_download = {post_id: url for post_id, url in post_id_urls.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
64
-
65
- print(f"Downloading {len(to_download)} images...")
66
- if to_download:
67
- images_zip = requests.post('https://q6-p.hf.space/download', json={'posts': to_download}, stream=True)
68
-
69
- print("Extracting images...")
70
- with io.BytesIO(images_zip.content) as f:
71
- with tarfile.open(fileobj=f, mode='r') as tarf:
72
- tarf.extractall("images/Stash")
73
-
74
- images_cache = os.listdir("images/Stash")
75
- print("Linking images...")
76
-
77
- for post_id in all_post_ids:
78
- if f"{post_id}.png" in images_cache:
79
- group_name = group_data[post_id]['group_name']
80
- position = group_data[post_id]['position']
81
- target_path = f"images/{group_name}/{position}_{post_id}.png"
82
- if not os.path.exists(target_path):
83
- os.link(f"images/Stash/{post_id}.png", target_path)
84
-
85
- for group_name in group_post_ids.keys():
86
- group_folder = f"images/{group_name}"
87
- if len(os.listdir(group_folder)) == 0:
88
- os.rmdir(group_folder)
 
17
 
18
  inputs = input("Enter the index of the file: ")
19
  inputs = inputs.split()
20
+ indexs = []
21
 
22
  for inp in inputs:
23
  if inp.isdigit():
24
+ indexs.append(int(inp) - 1)
25
  elif "-" in inp:
26
  start, end = map(int, inp.split("-"))
27
+ indexs.extend(range(start - 1, end))
28
 
29
  blacklist = ['\0', None]
30
 
31
+ for index in indexs:
32
+ images_cache = os.listdir("images/Stash")
 
 
 
 
33
  group_name = valid[index].rsplit(".", 1)[0]
34
  os.makedirs(f"images/{group_name}", exist_ok=True)
35
  with open(valid[index], "r") as f:
36
  post_ids = [x for x in f.read().split("\n") if x]
37
+
38
+ with db.begin(write=True) as txn:
39
+ post_ids = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
40
+ post_ids = {post_id: url.decode() if url else None for post_id, url in post_ids.items()}
41
+ filtered = [post_id for post_id, url in post_ids.items() if url == None and f"{post_id}.png" not in images_cache]
42
+
43
+ print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids)}")
44
+ if filtered:
45
+ data = requests.post('https://q6-p.hf.space/pixif', json={"post_ids": filtered}).json()
46
+
47
+ for post_id, url in data.items():
48
+ txn.put(post_id.encode(), url.encode())
49
+ post_ids[post_id] = url
50
+
51
+ no_exif = set(filtered) - set(data.keys())
52
+
53
+ for post_id in no_exif:
54
+ txn.put(post_id.encode(), b'\0')
55
+
56
+ to_download = {post_id: url for post_id, url in post_ids.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
57
+ print(f"Downloading {len(to_download)} images...")
58
+ if to_download:
59
+ images_zip = requests.post('https://q6-p.hf.space/download', json={'posts': to_download}, stream=True)
60
+
61
+ print("Extracting images...")
62
+ with io.BytesIO(images_zip.content) as f:
63
+ with tarfile.open(fileobj=f, mode='r') as tarf:
64
+ tarf.extractall("images/Stash")
65
+
66
+
67
+ images_cache = os.listdir("images/Stash")
68
+ print("Linking images...")
69
+ for i, post_id in enumerate(post_ids.keys()):
70
+ if f"{post_id}.png" in images_cache:
71
+ if not os.path.exists(f"images/{group_name}/{i}_{post_id}.png"):
72
+ os.link(f"images/Stash/{post_id}.png", f"images/{group_name}/{i}_{post_id}.png")
73
+
74
+ if len(os.listdir(f'images/{group_name}')) == 0:
75
+ os.rmdir(f"images/{group_name}")