q6 committed on
Commit
bd3116c
·
1 Parent(s): a082713

Remove EXIF-type detection from hunt.py

Browse files
Files changed (1) hide show
  1. Client/Scripts/hunt.py +9 -214
Client/Scripts/hunt.py CHANGED
@@ -7,10 +7,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
7
  import shutil
8
  from typing import Dict, Iterator, List, Optional, Sequence, Tuple, Union
9
 
10
- import numpy as np
11
  import requests
12
  from tqdm import tqdm
13
- from PIL import Image
14
 
15
  LOCAL = 0
16
  DRY_RUN = 0
@@ -32,11 +30,6 @@ STREAM_IDLE_TIMEOUT_SECONDS = 45
32
  STREAM_MAX_READ_TIMEOUTS = 3
33
  STREAM_MAX_RETRIES = 3
34
  STREAM_RETRY_DELAY_SECONDS = 2
35
- EXIF_BATCH_SIZE = 200
36
- EXIF_TYPE_ORDER = ("novelai", "sd", "comfy", "mj", "celsys", "photoshop", "stealth")
37
- EXIF_TYPE_TO_CODE = {name: idx + 1 for idx, name in enumerate(EXIF_TYPE_ORDER)}
38
- EXIF_METADATA_MAX_BYTES = 512
39
- PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"
40
  stop_event = threading.Event()
41
 
42
  def read_dotenv_value(path: str, key: str) -> Optional[str]:
@@ -75,153 +68,13 @@ def open_db(path: str) -> sqlite3.Connection:
75
  """
76
  CREATE TABLE IF NOT EXISTS pixif_cache (
77
  post_id TEXT PRIMARY KEY,
78
- url TEXT,
79
- exif_type INTEGER
80
  )
81
  """
82
  )
83
  conn.commit()
84
- ensure_db_schema(conn)
85
  return conn
86
 
87
- def ensure_db_schema(conn: sqlite3.Connection) -> None:
88
- columns = [row[1] for row in conn.execute("PRAGMA table_info(pixif_cache)")]
89
- if "exif_type" not in columns:
90
- conn.execute("ALTER TABLE pixif_cache ADD COLUMN exif_type INTEGER")
91
- conn.commit()
92
-
93
- def determine_exif_type(metadata: Optional[bytes]) -> Optional[str]:
94
- if metadata is None:
95
- return None
96
- if metadata == b"TitleAI generated image":
97
- return "novelai"
98
- if metadata.startswith(b"parameter"):
99
- return "sd"
100
- if b'{"' in metadata:
101
- return "comfy"
102
- if b"Dig" in metadata:
103
- return "mj"
104
- if metadata.startswith(b"SoftwareCelsys"):
105
- return "celsys"
106
- return "photoshop"
107
-
108
- def exif_type_to_code(exif_type: Optional[str]) -> Optional[int]:
109
- if not exif_type:
110
- return None
111
- return EXIF_TYPE_TO_CODE.get(exif_type)
112
-
113
- def parse_png_metadata(data: bytes) -> Optional[bytes]:
114
- index = 8
115
- while index < len(data):
116
- if index + 8 > len(data):
117
- break
118
- chunk_len = int.from_bytes(data[index:index + 4], "big")
119
- chunk_type = data[index + 4:index + 8]
120
- index += 8
121
- if chunk_type == b"tEXt":
122
- content = data[index:index + chunk_len]
123
- return content.replace(b"\0", b"")
124
- if chunk_type == b"iTXt":
125
- content = data[index:index + chunk_len]
126
- return content.strip()
127
- index += chunk_len + 4
128
- return None
129
-
130
- def parse_png_metadata_file(path: str) -> Optional[bytes]:
131
- try:
132
- with open(path, "rb") as handle:
133
- head = handle.read(EXIF_METADATA_MAX_BYTES)
134
- if not head.startswith(PNG_SIGNATURE):
135
- return None
136
- return parse_png_metadata(head)
137
- except Exception:
138
- return None
139
-
140
- def byteize(alpha: np.ndarray) -> np.ndarray:
141
- alpha = alpha.T.reshape((-1,))
142
- alpha = alpha[:(alpha.shape[0] // 8) * 8]
143
- alpha = np.bitwise_and(alpha, 1)
144
- alpha = alpha.reshape((-1, 8))
145
- alpha = np.packbits(alpha, axis=1)
146
- return alpha
147
-
148
- class LSBExtractor:
149
- def __init__(self, alpha: np.ndarray) -> None:
150
- self.data = byteize(alpha)
151
- self.pos = 0
152
-
153
- def get_next_n_bytes(self, n: int) -> bytearray:
154
- n_bytes = self.data[self.pos:self.pos + n]
155
- self.pos += n
156
- return bytearray(n_bytes)
157
-
158
- def read_32bit_integer(self) -> Optional[int]:
159
- bytes_list = self.get_next_n_bytes(4)
160
- if len(bytes_list) == 4:
161
- return int.from_bytes(bytes_list, byteorder="big")
162
- return None
163
-
164
- def extract_stealth_metadata(image: Image.Image) -> bool:
165
- if "A" not in image.getbands():
166
- raise AssertionError("image format")
167
- alpha = np.array(image.getchannel("A"))
168
- reader = LSBExtractor(alpha)
169
- magic = "stealth_pngcomp"
170
- read_magic = reader.get_next_n_bytes(len(magic)).decode("utf-8")
171
- if magic != read_magic:
172
- raise AssertionError("magic number")
173
- read_len = reader.read_32bit_integer()
174
- if read_len is None:
175
- raise AssertionError("length missing")
176
- return True
177
-
178
- def has_stealth_png_path(path: str) -> bool:
179
- try:
180
- with Image.open(path) as image:
181
- return extract_stealth_metadata(image)
182
- except Exception:
183
- return False
184
-
185
- def detect_exif_code_from_path(path: str) -> Optional[int]:
186
- metadata = parse_png_metadata_file(path)
187
- exif_type = determine_exif_type(metadata)
188
- code = exif_type_to_code(exif_type)
189
- if code is not None:
190
- return code
191
- if has_stealth_png_path(path):
192
- return EXIF_TYPE_TO_CODE.get("stealth")
193
- return None
194
-
195
- def detect_exif_codes_from_files(
196
- post_ids: Sequence[str],
197
- stash_dir: str,
198
- max_workers: int = MAX_WORKERS,
199
- stop_event: threading.Event = stop_event,
200
- ) -> Dict[str, Optional[int]]:
201
- if not post_ids:
202
- return {}
203
- results: Dict[str, Optional[int]] = {}
204
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
205
- futures = {
206
- executor.submit(
207
- detect_exif_code_from_path,
208
- os.path.join(stash_dir, f"{post_id}.png"),
209
- ): post_id
210
- for post_id in post_ids
211
- }
212
- with tqdm(total=len(futures), unit="image", desc="Scanning exif") as pbar:
213
- for future in as_completed(futures):
214
- if stop_event.is_set():
215
- break
216
- post_id = futures[future]
217
- try:
218
- code = future.result()
219
- except Exception:
220
- code = None
221
- results[post_id] = code
222
- pbar.update(1)
223
- return results
224
-
225
  def chunked(seq: Sequence[str], size: int) -> Iterator[List[str]]:
226
  for i in range(0, len(seq), size):
227
  yield seq[i:i + size]
@@ -229,20 +82,18 @@ def chunked(seq: Sequence[str], size: int) -> Iterator[List[str]]:
229
  def fetch_cached_state(
230
  conn: sqlite3.Connection,
231
  post_ids: Sequence[str],
232
- ) -> Tuple[Dict[str, Optional[str]], Dict[str, Optional[int]]]:
233
  post_ids_dict = {post_id: None for post_id in post_ids}
234
- exif_types = {post_id: None for post_id in post_ids}
235
  if not post_ids:
236
- return post_ids_dict, exif_types
237
 
238
  for chunk in chunked(post_ids, 900):
239
  placeholders = ",".join("?" for _ in chunk)
240
- query = f"SELECT post_id, COALESCE(url, ''), exif_type FROM pixif_cache WHERE post_id IN ({placeholders})"
241
- for post_id, url, exif_type in conn.execute(query, chunk):
242
  post_ids_dict[post_id] = url
243
- exif_types[post_id] = exif_type
244
 
245
- return post_ids_dict, exif_types
246
 
247
  def upsert_urls(conn: sqlite3.Connection, rows: Sequence[Tuple[str, str]]) -> None:
248
  if not rows:
@@ -256,30 +107,11 @@ def upsert_urls(conn: sqlite3.Connection, rows: Sequence[Tuple[str, str]]) -> No
256
  rows,
257
  )
258
 
259
- def update_exif_types(conn: sqlite3.Connection, rows: Sequence[Tuple[int, str]]) -> None:
260
- if not rows:
261
- return
262
- conn.executemany(
263
- """
264
- UPDATE pixif_cache SET exif_type = ?
265
- WHERE post_id = ?
266
- """,
267
- rows,
268
- )
269
-
270
- def fetch_exif_types(
271
- post_urls: Dict[str, str],
272
- phpsessid: str,
273
- stop_event: threading.Event = stop_event,
274
- ) -> Dict[str, Optional[int]]:
275
- return {}
276
-
277
  def stream_pixif_updates(
278
  post_ids: Sequence[str],
279
  phpsessid: str,
280
  conn: sqlite3.Connection,
281
  post_ids_dict: Dict[str, Optional[str]],
282
- exif_types: Dict[str, Optional[int]],
283
  desc: str,
284
  stop_event: threading.Event = stop_event,
285
  ) -> int:
@@ -330,7 +162,6 @@ def stream_pixif_updates(
330
  with conn:
331
  upsert_urls(conn, [(post_id, url)])
332
  post_ids_dict[post_id] = url
333
- exif_types[post_id] = exif_types.get(post_id)
334
  pbar.update(1)
335
  processed += 1
336
  last_progress = now
@@ -464,7 +295,6 @@ def scan_with_retries(
464
  phpsessid: str,
465
  conn: sqlite3.Connection,
466
  post_ids_dict: Dict[str, Optional[str]],
467
- exif_types: Dict[str, Optional[int]],
468
  desc: str,
469
  stop_event: threading.Event = stop_event,
470
  ) -> None:
@@ -473,7 +303,7 @@ def scan_with_retries(
473
  remaining = list(post_ids)
474
  attempts = 0
475
  while remaining and attempts < STREAM_MAX_RETRIES and not stop_event.is_set():
476
- stream_pixif_updates(remaining, phpsessid, conn, post_ids_dict, exif_types, desc, stop_event)
477
  remaining = [post_id for post_id in remaining if post_ids_dict.get(post_id) is None]
478
  if not remaining:
479
  break
@@ -489,7 +319,7 @@ try:
489
  with open(os.path.join(ROOT_DIR, valid[index]), "r") as f:
490
  post_ids = f.read().split()
491
 
492
- post_ids_dict, exif_types = fetch_cached_state(conn, post_ids)
493
  if DRY_RUN:
494
  filtered = list(post_ids)
495
  else:
@@ -503,26 +333,9 @@ try:
503
  if filtered:
504
  if DRY_RUN:
505
  print("Dry run outputs (post_id -> page):")
506
- scan_with_retries(filtered, phpsessid, conn, post_ids_dict, exif_types, "Scanning exif", stop_event)
507
  if DRY_RUN:
508
  continue
509
- exif_pending = {
510
- post_id: decode_if_binary(url)
511
- for post_id, url in post_ids_dict.items()
512
- if url and exif_types.get(post_id) is None
513
- }
514
- if exif_pending:
515
- exif_results = fetch_exif_types(exif_pending, phpsessid, stop_event=stop_event)
516
- rows = [
517
- (exif_type, post_id)
518
- for post_id, exif_type in exif_results.items()
519
- if exif_type is not None
520
- ]
521
- if rows:
522
- with conn:
523
- update_exif_types(conn, rows)
524
- for post_id, exif_type in exif_results.items():
525
- exif_types[post_id] = exif_type
526
  to_download = {post_id: decode_if_binary(url) for post_id, url in post_ids_dict.items() if url and f"{post_id}.png" not in images_cache}
527
 
528
  if to_download:
@@ -532,24 +345,6 @@ try:
532
 
533
  images_cache.update(os.listdir(STASH_DIR))
534
 
535
- local_pending = [
536
- post_id
537
- for post_id in post_ids
538
- if exif_types.get(post_id) is None and f"{post_id}.png" in images_cache
539
- ]
540
- if local_pending:
541
- local_results = detect_exif_codes_from_files(local_pending, STASH_DIR, stop_event=stop_event)
542
- rows = [
543
- (exif_type, post_id)
544
- for post_id, exif_type in local_results.items()
545
- if exif_type is not None
546
- ]
547
- if rows:
548
- with conn:
549
- update_exif_types(conn, rows)
550
- for post_id, exif_type in local_results.items():
551
- exif_types[post_id] = exif_type
552
-
553
  print("Linking images to the group directory...")
554
  for i, post_id in enumerate(post_ids):
555
  stash_path = os.path.join(STASH_DIR, f"{post_id}.png")
 
7
  import shutil
8
  from typing import Dict, Iterator, List, Optional, Sequence, Tuple, Union
9
 
 
10
  import requests
11
  from tqdm import tqdm
 
12
 
13
  LOCAL = 0
14
  DRY_RUN = 0
 
30
  STREAM_MAX_READ_TIMEOUTS = 3
31
  STREAM_MAX_RETRIES = 3
32
  STREAM_RETRY_DELAY_SECONDS = 2
 
 
 
 
 
33
  stop_event = threading.Event()
34
 
35
  def read_dotenv_value(path: str, key: str) -> Optional[str]:
 
68
  """
69
  CREATE TABLE IF NOT EXISTS pixif_cache (
70
  post_id TEXT PRIMARY KEY,
71
+ url TEXT
 
72
  )
73
  """
74
  )
75
  conn.commit()
 
76
  return conn
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def chunked(seq: Sequence[str], size: int) -> Iterator[List[str]]:
79
  for i in range(0, len(seq), size):
80
  yield seq[i:i + size]
 
82
  def fetch_cached_state(
83
  conn: sqlite3.Connection,
84
  post_ids: Sequence[str],
85
+ ) -> Dict[str, Optional[str]]:
86
  post_ids_dict = {post_id: None for post_id in post_ids}
 
87
  if not post_ids:
88
+ return post_ids_dict
89
 
90
  for chunk in chunked(post_ids, 900):
91
  placeholders = ",".join("?" for _ in chunk)
92
+ query = f"SELECT post_id, COALESCE(url, '') FROM pixif_cache WHERE post_id IN ({placeholders})"
93
+ for post_id, url in conn.execute(query, chunk):
94
  post_ids_dict[post_id] = url
 
95
 
96
+ return post_ids_dict
97
 
98
  def upsert_urls(conn: sqlite3.Connection, rows: Sequence[Tuple[str, str]]) -> None:
99
  if not rows:
 
107
  rows,
108
  )
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def stream_pixif_updates(
111
  post_ids: Sequence[str],
112
  phpsessid: str,
113
  conn: sqlite3.Connection,
114
  post_ids_dict: Dict[str, Optional[str]],
 
115
  desc: str,
116
  stop_event: threading.Event = stop_event,
117
  ) -> int:
 
162
  with conn:
163
  upsert_urls(conn, [(post_id, url)])
164
  post_ids_dict[post_id] = url
 
165
  pbar.update(1)
166
  processed += 1
167
  last_progress = now
 
295
  phpsessid: str,
296
  conn: sqlite3.Connection,
297
  post_ids_dict: Dict[str, Optional[str]],
 
298
  desc: str,
299
  stop_event: threading.Event = stop_event,
300
  ) -> None:
 
303
  remaining = list(post_ids)
304
  attempts = 0
305
  while remaining and attempts < STREAM_MAX_RETRIES and not stop_event.is_set():
306
+ stream_pixif_updates(remaining, phpsessid, conn, post_ids_dict, desc, stop_event)
307
  remaining = [post_id for post_id in remaining if post_ids_dict.get(post_id) is None]
308
  if not remaining:
309
  break
 
319
  with open(os.path.join(ROOT_DIR, valid[index]), "r") as f:
320
  post_ids = f.read().split()
321
 
322
+ post_ids_dict = fetch_cached_state(conn, post_ids)
323
  if DRY_RUN:
324
  filtered = list(post_ids)
325
  else:
 
333
  if filtered:
334
  if DRY_RUN:
335
  print("Dry run outputs (post_id -> page):")
336
+ scan_with_retries(filtered, phpsessid, conn, post_ids_dict, "API hunt", stop_event)
337
  if DRY_RUN:
338
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  to_download = {post_id: decode_if_binary(url) for post_id, url in post_ids_dict.items() if url and f"{post_id}.png" not in images_cache}
340
 
341
  if to_download:
 
345
 
346
  images_cache.update(os.listdir(STASH_DIR))
347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  print("Linking images to the group directory...")
349
  for i, post_id in enumerate(post_ids):
350
  stash_path = os.path.join(STASH_DIR, f"{post_id}.png")