q6 committed on
Commit
597ad06
·
1 Parent(s): 76ebf3f
Files changed (2) hide show
  1. API/app.py +82 -5
  2. Dockerfile +2 -2
API/app.py CHANGED
@@ -2,6 +2,9 @@ from fastapi import FastAPI, Query, BackgroundTasks
2
  from fastapi.responses import FileResponse
3
  import aiohttp
4
  import asyncio
 
 
 
5
  import time
6
  import tempfile
7
  import zipfile
@@ -9,6 +12,9 @@ import os
9
  from pydantic import BaseModel
10
  from typing import List, Dict, Optional
11
 
 
 
 
12
 
13
  img_base = 'https://i.pximg.net/img-original/img/'
14
  COMMENTS_ROOTS_URL = "https://www.pixiv.net/ajax/illusts/comments/roots"
@@ -108,6 +114,57 @@ def parse_png_metadata(data):
108
  index += chunk_len + 4
109
  return None
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  def format_comment_text(comment):
112
  text = comment.get("comment") or ""
113
  if not text and comment.get("stampId"):
@@ -193,6 +250,10 @@ async def fetch_post_comments(post_id, session, limit, semaphore):
193
 
194
  return str(post_id), results
195
 
 
 
 
 
196
  async def process_post(post_id, session, semaphore):
197
  async with semaphore:
198
  try:
@@ -217,21 +278,37 @@ async def process_post(post_id, session, semaphore):
217
 
218
  for s, e in chunks:
219
  chunk_tasks = [get_exif(image_urls[i], session) for i in range(s, e)]
220
- results = await asyncio.gather(*chunk_tasks)
221
 
222
  for image_url, metadata in zip(image_urls[s:e], results):
223
- exif_type = determine_exif_type(metadata)
 
 
 
224
  if exif_type not in ['photoshop', 'celsys', None]:
225
  return post_id, image_url
 
 
 
 
 
 
 
 
 
 
226
 
227
  return post_id, None
228
  except Exception:
229
  return post_id, None
230
 
231
  async def fetch_image_bytes(session, url, post_id, semaphore):
232
- async with semaphore:
233
- async with session.get(url) as response:
234
- return post_id, await response.read()
 
 
 
235
 
236
  @app.get("/allimages")
237
  async def all_images(
 
2
  from fastapi.responses import FileResponse
3
  import aiohttp
4
  import asyncio
5
+ import gzip
6
+ import io
7
+ import json
8
  import time
9
  import tempfile
10
  import zipfile
 
12
  from pydantic import BaseModel
13
  from typing import List, Dict, Optional
14
 
15
+ import numpy as np
16
+ from PIL import Image
17
+
18
 
19
  img_base = 'https://i.pximg.net/img-original/img/'
20
  COMMENTS_ROOTS_URL = "https://www.pixiv.net/ajax/illusts/comments/roots"
 
114
  index += chunk_len + 4
115
  return None
116
 
117
def byteize(alpha):
    """Pack the least-significant bit of every element of *alpha* into bytes.

    The array is transposed and flattened first, so elements are visited in
    the transposed order. Trailing elements that do not fill a complete
    group of eight are discarded.

    Returns a uint8 array of shape ``(n, 1)``: one packed byte per row,
    with the first bit of each group in the most significant position.
    """
    flat = alpha.T.ravel()
    # Keep only as many elements as make up whole bytes.
    usable = flat.shape[0] - flat.shape[0] % 8
    lsb_rows = (flat[:usable] & 1).reshape((-1, 8))
    return np.packbits(lsb_rows, axis=1)
124
+
125
class LSBExtractor:
    """Sequential reader over the bytes that ``byteize`` packs from *alpha*.

    ``data`` holds the packed bytes and ``pos`` is the index of the next
    unread byte.
    """

    def __init__(self, alpha):
        self.data = byteize(alpha)
        self.pos = 0

    def get_next_n_bytes(self, n):
        """Return up to *n* bytes from the cursor as a ``bytearray``.

        The cursor always advances by *n*, even when fewer bytes remain,
        so a short read yields a shorter (possibly empty) result.
        """
        start = self.pos
        self.pos = start + n
        return bytearray(self.data[start:start + n])

    def read_32bit_integer(self):
        """Read four bytes as a big-endian integer, or ``None`` if short."""
        raw = self.get_next_n_bytes(4)
        if len(raw) != 4:
            return None
        return int.from_bytes(raw, byteorder="big")
140
+
141
def extract_stealth_metadata(image: Image.Image) -> dict:
    """Decode the metadata hidden in the alpha-channel LSBs of *image*.

    Layout read from the packed bytes, in order: the ASCII magic
    ``stealth_pngcomp``, a 32-bit big-endian length, then a gzip-compressed
    UTF-8 JSON payload. The length is divided by 8 before use, so it is
    presumably stored as a bit count (confirm against the writer).

    If the decoded JSON has a string ``"Comment"`` entry, that string is
    itself parsed as JSON and replaced in place.

    Raises:
        AssertionError: the image has no alpha band, the magic does not
            match, or the length field could not be read.
        Other exceptions from decoding, gzip or JSON parsing propagate
        unchanged.
    """
    if "A" not in image.getbands():
        raise AssertionError("image format")

    reader = LSBExtractor(np.array(image.getchannel("A")))

    expected_magic = "stealth_pngcomp"
    found_magic = reader.get_next_n_bytes(len(expected_magic)).decode("utf-8")
    if found_magic != expected_magic:
        raise AssertionError("magic number")

    payload_len = reader.read_32bit_integer()
    if payload_len is None:
        raise AssertionError("length missing")

    compressed = reader.get_next_n_bytes(payload_len // 8)
    metadata = json.loads(gzip.decompress(compressed).decode("utf-8"))

    # The "Comment" field may carry a second, nested JSON document.
    if "Comment" in metadata:
        nested = metadata["Comment"]
        if isinstance(nested, str):
            metadata["Comment"] = json.loads(nested)
    return metadata
159
+
160
def has_stealth_png_bytes(data: bytes) -> bool:
    """Report whether *data* is an image carrying stealth metadata.

    Returns ``True`` only when the bytes open as an image and
    ``extract_stealth_metadata`` completes without raising. Any exception
    along the way is treated as "no metadata" and yields ``False``.
    """
    try:
        candidate = Image.open(io.BytesIO(data))
        extract_stealth_metadata(candidate)
    except Exception:
        # Best-effort probe: any failure simply means "not present".
        return False
    return True
167
+
168
  def format_comment_text(comment):
169
  text = comment.get("comment") or ""
170
  if not text and comment.get("stampId"):
 
250
 
251
  return str(post_id), results
252
 
253
async def fetch_image_data(session, url):
    """GET *url* through *session* and return the full response body."""
    async with session.get(url) as resp:
        body = await resp.read()
        return body
256
+
257
  async def process_post(post_id, session, semaphore):
258
  async with semaphore:
259
  try:
 
278
 
279
  for s, e in chunks:
280
  chunk_tasks = [get_exif(image_urls[i], session) for i in range(s, e)]
281
+ results = await asyncio.gather(*chunk_tasks, return_exceptions=True)
282
 
283
  for image_url, metadata in zip(image_urls[s:e], results):
284
+ if isinstance(metadata, Exception):
285
+ exif_type = None
286
+ else:
287
+ exif_type = determine_exif_type(metadata)
288
  if exif_type not in ['photoshop', 'celsys', None]:
289
  return post_id, image_url
290
+
291
+ for s, e in chunks:
292
+ stealth_tasks = [fetch_image_data(session, image_urls[i]) for i in range(s, e)]
293
+ stealth_results = await asyncio.gather(*stealth_tasks, return_exceptions=True)
294
+
295
+ for image_url, data in zip(image_urls[s:e], stealth_results):
296
+ if isinstance(data, Exception) or data is None:
297
+ continue
298
+ if has_stealth_png_bytes(data):
299
+ return post_id, image_url
300
 
301
  return post_id, None
302
  except Exception:
303
  return post_id, None
304
 
305
  async def fetch_image_bytes(session, url, post_id, semaphore):
306
+ if semaphore:
307
+ async with semaphore:
308
+ async with session.get(url) as response:
309
+ return post_id, await response.read()
310
+ async with session.get(url) as response:
311
+ return post_id, await response.read()
312
 
313
  @app.get("/allimages")
314
  async def all_images(
Dockerfile CHANGED
@@ -8,7 +8,7 @@ ENV PYTHONDONTWRITEBYTECODE=1
8
  ENV PYTHONUNBUFFERED=1
9
  WORKDIR /app
10
 
11
- RUN pip install --no-cache-dir fastapi aiohttp uvicorn pydantic requests
12
 
13
  COPY --chown=user ./API /app
14
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
8
  ENV PYTHONUNBUFFERED=1
9
  WORKDIR /app
10
 
11
+ RUN pip install --no-cache-dir fastapi aiohttp uvicorn pydantic requests numpy pillow
12
 
13
  COPY --chown=user ./API /app
14
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]