q6 committed on
Commit 11da7cf · 1 Parent(s): 6f9287d

Moved search to client

.gitignore CHANGED
@@ -2,4 +2,5 @@
 *.txt
 db
 images
-merge_dev.bat
+merge_dev.bat
+__pycache__
API/app.py CHANGED
@@ -34,43 +34,6 @@ async def fetch_page(session, url):
         data = await response.json()
         return data
 
-async def search(raw, pages, ai_only=True, real_only=True, cookies=None, headers=None):
-    keywords = raw.split('tags/')[-1].split('/')[0]
-    url = f"https://www.pixiv.net/ajax/search/artworks/{keywords}?word={keywords}"
-    if "?" in raw:
-        params = raw.split('?')[1]
-        url += f"&{params}"
-    if "s_mode" not in url:
-        url += "&s_mode=s_tag_full"
-
-    post_ids = []
-    tasks = []
-
-    async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
-        for page in range(1, pages + 1):
-            page_url = f"{url.strip()}&p={page}"
-            task = fetch_page(session, page_url)
-            tasks.append(task)
-
-        responses = await asyncio.gather(*tasks)
-        posts = []
-        for data in responses:
-            if ai_only:
-                for post in data['body']['illustManga']['data']:
-                    if post['aiType'] == 2:
-                        posts.append(post)
-            if real_only:
-                for post in data['body']['illustManga']['data']:
-                    if post['aiType'] != 2:
-                        posts.append(post)
-            else:
-                posts = data['body']['illustManga']['data']
-            if not posts:
-                break
-            post_ids.extend([post['id'] for post in posts])
-
-    return post_ids, requests.utils.unquote(keywords, encoding='utf-8')
-
 def base26(n):
     if n == 0:
         return "A"
@@ -85,58 +48,6 @@ def base26(n):
 def base26_time():
     return base26(int(time.time()))
 
-@app.get("/search")
-async def search_endpoint(
-    raw: str = Query(..., description="The raw URL to search."),
-    pages: int = Query(1, description="Number of pages to fetch."),
-    ai_only: bool = Query(True, description="Filter for AI-generated content."),
-    real_only: bool = Query(True, description="Filter for real content."),
-):
-    try:
-        post_ids, keywords = await search(raw, pages, ai_only, real_only, cookies=cookies, headers=headers)
-        return {"post_ids": post_ids, "filename": base26_time() + "_" + keywords}
-    except Exception as e:
-        return {"error": str(e)}
-
-@app.get("/user")
-async def user(
-    user_id: int = Query(..., description="The user ID to fetch.")
-):
-    async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
-        data = await fetch_page(session, f'https://www.pixiv.net/ajax/user/{user_id}/profile/all')
-        posts = data["body"]["illusts"].keys()
-        try:
-            username = data['body']['pickup'][0]['userName']
-        except (KeyError, IndexError):
-            user_data = await fetch_page(session, f"https://www.pixiv.net/ajax/user/{user_id}")
-            username = user_data['body']['name']
-
-        return {"post_ids": list(posts), "filename": base26_time() + "_" + username.replace("|", "")}
-
-@app.get("/users")
-async def users(
-    user_ids: List[int] = Query(..., description="List of user IDs to fetch.", alias="user_ids")
-):
-    async def fetch_user_data(session, uid):
-        try:
-            data = await fetch_page(session, f'https://www.pixiv.net/ajax/user/{uid}/profile/all')
-            posts = list(data["body"]["illusts"].keys())
-            try:
-                username = data['body']['pickup'][0]['userName']
-            except (KeyError, IndexError):
-                user_data = await fetch_page(session, f"https://www.pixiv.net/ajax/user/{uid}")
-                username = user_data['body']['name']
-            filename = base26_time() + "_" + username.replace("|", "")
-            return {"post_ids": posts, "filename": filename}
-        except Exception as e:
-            return {"user_id": uid, "error": str(e)}
-
-    async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
-        tasks = [fetch_user_data(session, uid) for uid in user_ids]
-        results = await asyncio.gather(*tasks)
-
-    return results
-
 def determine_exif_type(metadata):
     if metadata is None:
         return None
@@ -275,4 +186,4 @@ async def read_root():
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="127.0.0.1", port=7860)
+    uvicorn.run(app, host="127.0.0.1", port=7860)
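
The removed /search, /user, and /users routes are not relocated on the server; their logic reappears as plain async helpers in the new Client/Extract Pixiv/pixiv_api.py below, so the client scripts call them in-process instead of over HTTP. A minimal sketch of the new call pattern, assuming pixiv_api.py is on the import path and using a placeholder tag URL:

import asyncio
from pixiv_api import search, cookies, headers

# Placeholder tag URL; any Pixiv tag-search URL is handled the same way.
post_ids, keywords = asyncio.run(
    search("https://www.pixiv.net/tags/example/artworks", pages=1,
           ai_only=True, real_only=False, cookies=cookies, headers=headers)
)
print(len(post_ids), keywords)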
Client/Extract Pixiv/ai_search.py CHANGED
@@ -1,30 +1,20 @@
-import requests
 import os
-
-local = 1
-if local:
-    endpoint = "http://127.0.0.1:7860"
-else:
-    endpoint = "https://q6-p.hf.space"
+import asyncio
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from pixiv_api import search, cookies, headers
 
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 
 input_url = input("Enter the URL: ")
 
-pages = 3
-
-params = {
-    'raw': input_url,
-    'pages': pages,
-    'ai_only': True,
-    'real_only': False,
-}
-
-response = requests.get(f'{endpoint}/search', params=params)
-
-data = response.json()
-print(data)
-post_ids = data['post_ids']
-post_ids = list(dict.fromkeys(post_ids))
-with open(f"../{data['filename']}.txt", "w") as f:
-    f.write("\n".join(post_ids))
+pages = 5
+
+async def main():
+    post_ids, filename = await search(input_url, pages, ai_only=True, real_only=False, cookies=cookies, headers=headers)
+    post_ids = list(dict.fromkeys(post_ids))
+    with open(f"../{filename}.txt", "w") as f:
+        f.write("\n".join(map(str, post_ids)))
+
+if __name__ == "__main__":
+    asyncio.run(main())
Client/Extract Pixiv/pixiv_api.py ADDED
@@ -0,0 +1,192 @@
+import aiohttp
+import asyncio
+import requests.utils
+import time
+import os
+from dotenv import load_dotenv
+
+img_base = 'https://i.pximg.net/img-original/img/'
+
+load_dotenv()
+
+PHPSESSID = os.getenv("PHPSESSID")
+
+cookies = {"PHPSESSID": PHPSESSID}
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0",
+    'referer': 'https://www.pixiv.net/',
+}
+
+async def fetch_page(session, url):
+    async with session.get(url) as response:
+        data = await response.json()
+        return data
+
+async def search(raw, pages, ai_only=True, real_only=True, cookies=None, headers=None):
+    keywords = raw.split('tags/')[-1].split('/')[0]
+    url = f"https://www.pixiv.net/ajax/search/artworks/{keywords}?word={keywords}"
+    if "?" in raw:
+        params = raw.split('?')[1]
+        url += f"&{params}"
+    if "s_mode" not in url:
+        url += "&s_mode=s_tag_full"
+
+    post_ids = []
+    tasks = []
+
+    async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
+        for page in range(1, pages + 1):
+            page_url = f"{url.strip()}&p={page}"
+            task = fetch_page(session, page_url)
+            tasks.append(task)
+
+        responses = await asyncio.gather(*tasks)
+        posts = []
+        for data in responses:
+            if ai_only:
+                for post in data['body']['illustManga']['data']:
+                    if post['aiType'] == 2:
+                        posts.append(post)
+            if real_only:
+                for post in data['body']['illustManga']['data']:
+                    if post['aiType'] != 2:
+                        posts.append(post)
+            else:
+                posts = data['body']['illustManga']['data']
+            if not posts:
+                break
+            post_ids.extend([post['id'] for post in posts])
+
+    return post_ids, requests.utils.unquote(keywords, encoding='utf-8')
+
+def base26(n):
+    if n == 0:
+        return "A"
+
+    b26 = ""
+    while n > 0:
+        n, remainder = divmod(n, 26)
+        b26 = chr(97 + remainder) + b26
+
+    return b26
+
+def base26_time():
+    return base26(int(time.time()))
+
+async def get_user(user_id, session):
+    data = await fetch_page(session, f'https://www.pixiv.net/ajax/user/{user_id}/profile/all')
+    posts = data["body"]["illusts"].keys()
+    try:
+        username = data['body']['pickup'][0]['userName']
+    except (KeyError, IndexError):
+        user_data = await fetch_page(session, f"https://www.pixiv.net/ajax/user/{user_id}")
+        username = user_data['body']['name']
+
+    return {"post_ids": list(posts), "filename": base26_time() + "_" + username.replace("|", "")}
+
+async def get_users(user_ids):
+    async def fetch_user_data(session, uid):
+        try:
+            return await get_user(uid, session)
+        except Exception as e:
+            return {"user_id": uid, "error": str(e)}
+
+    async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
+        tasks = [fetch_user_data(session, uid) for uid in user_ids]
+        results = await asyncio.gather(*tasks)
+
+    return results
+
+def determine_exif_type(metadata):
+    if metadata is None:
+        return None
+    elif metadata == b'TitleAI generated image':
+        return "novelai"
+    elif metadata.startswith(b"parameter"):
+        return "sd"
+    elif b'{"' in metadata:
+        return "comfy"
+    elif b"Dig" in metadata:
+        return "mj"
+    elif metadata.startswith(b"SoftwareCelsys"):
+        return "celsys"
+    else:
+        return "photoshop"
+
+async def get_exif(url, session):
+    start_range = 0
+    end_range = 512
+
+    headers = {
+        "Referer": "https://www.pixiv.net/",
+        "Range": f"bytes={start_range}-{end_range}"
+    }
+
+    async with session.get(url, headers=headers) as response:
+        data = await response.read()
+        return parse_png_metadata(data)
+
+def parse_png_metadata(data):
+    index = 8
+
+    while index < len(data):
+        if index + 8 > len(data):
+            break
+        chunk_len = int.from_bytes(data[index:index+4], 'big')
+        chunk_type = data[index+4:index+8].decode('ascii')
+        index += 8
+
+        if chunk_type in ['tEXt', 'iTXt']:
+            content = data[index:index+chunk_len]
+            if chunk_type == 'tEXt':
+                return content.replace(b'\0', b'')
+            elif chunk_type == 'iTXt':
+                return content.strip()
+
+        index += chunk_len + 4
+    return None
+
+async def process_post(post_id, session, semaphore):
+    async with semaphore:
+        try:
+            data = await fetch_page(session, f"https://www.pixiv.net/ajax/illust/{post_id}/pages")
+            image_urls = [page['urls']['original'] for page in data['body'] if 'png' in page['urls']['original']]
+
+            initial_offsets = [1, 5, 5, 10, 10, 10]
+
+            chunks = []
+            start = 0
+            for offset in initial_offsets:
+                end = start + offset
+                if end > len(image_urls):
+                    end = len(image_urls)
+                chunks.append((start, end))
+                start = end
+
+            while start < len(image_urls):
+                end = min(start + 10, len(image_urls))
+                chunks.append((start, end))
+                start = end
+
+            for s, e in chunks:
+                chunk_tasks = [get_exif(image_urls[i], session) for i in range(s, e)]
+                results = await asyncio.gather(*chunk_tasks)
+
+                for image_url, metadata in zip(image_urls[s:e], results):
+                    exif_type = determine_exif_type(metadata)
+                    if exif_type not in ['photoshop', 'celsys', None]:
+                        return post_id, image_url
+
+            return post_id, None
+        except:
+            return post_id, None
+
+async def get_pixif_data(post_ids):
+    semaphore = asyncio.Semaphore(100)
+    async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
+        tasks = [process_post(post_id, session, semaphore) for post_id in post_ids]
+        results = await asyncio.gather(*tasks)
+
+    image_exifs = {post_id: image_url.replace(img_base, '', 1) for post_id, image_url in results if image_url}
+    return image_exifs
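
None of the updated client scripts in this commit exercise the PNG-metadata helpers (get_exif, parse_png_metadata, process_post, get_pixif_data) that this module bundles alongside search. A minimal sketch of how they could be driven, assuming a valid PHPSESSID in the environment and using placeholder post IDs:

import asyncio
from pixiv_api import get_pixif_data

# Placeholder post IDs; get_pixif_data returns {post_id: image path relative to
# https://i.pximg.net/img-original/img/} for posts whose PNGs carry AI-tool metadata.
post_ids = ["123456789", "987654321"]
image_paths = asyncio.run(get_pixif_data(post_ids))
for post_id, path in image_paths.items():
    print(post_id, path)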
Client/Extract Pixiv/real_search.py CHANGED
@@ -1,11 +1,8 @@
-import requests
 import os
-
-local = 0
-if local:
-    endpoint = "http://127.0.0.1:7860"
-else:
-    endpoint = "https://q6-p.hf.space"
+import asyncio
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from pixiv_api import search, cookies, headers
 
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 
@@ -13,15 +10,10 @@ input_url = input("Enter the URL: ")
 
 pages = 300 // 60
 
-params = {
-    'raw': input_url,
-    'pages': pages,
-    'ai_only': False,
-    'real_only': True,
-}
-
-response = requests.get(f'{endpoint}/search', params=params)
-
-data = response.json()
-with open(f"../{data['filename']}.txt", "w") as f:
-    f.write("\n".join(data['post_ids']))
+async def main():
+    post_ids, filename = await search(input_url, pages, ai_only=False, real_only=True, cookies=cookies, headers=headers)
+    with open(f"../{filename}.txt", "w") as f:
+        f.write("\n".join(map(str, post_ids)))
+
+if __name__ == "__main__":
+    asyncio.run(main())
Client/Extract Pixiv/user.py CHANGED
@@ -1,13 +1,9 @@
-import requests
 import re
 import os
-
-local = 0
-
-if local:
-    endpoint = "http://127.0.0.1:7860"
-else:
-    endpoint = "https://q6-p.hf.space"
+import asyncio
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from pixiv_api import get_users
 
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 
@@ -20,17 +16,18 @@ if len(user_ids) == 0:
     import sys
 
     sys.exit()
-
-response = requests.get(f'{endpoint}/users', params={'user_ids': user_ids})
-response.raise_for_status()
-data = response.json()
 
-for user_data in data:
-    if 'error' in user_data:
-        print(f"User ID {user_data.get('user_id')} Error: {user_data.get('error')}")
-        continue
-    filename = user_data['filename']
-    post_ids = user_data['post_ids']
-
-    with open(os.path.join("..", filename + '.txt'), "w", encoding='utf-8') as f:
-        f.write("\n".join(post_ids))
+async def main():
+    data = await get_users(user_ids)
+    for user_data in data:
+        if 'error' in user_data:
+            print(f"User ID {user_data.get('user_id')} Error: {user_data.get('error')}")
+            continue
+        filename = user_data['filename']
+        post_ids = user_data['post_ids']
+
+        with open(os.path.join("..", filename + '.txt'), "w", encoding='utf-8') as f:
+            f.write("\n".join(map(str, post_ids)))
+
+if __name__ == "__main__":
+    asyncio.run(main())
Client/hunt.py CHANGED
@@ -76,10 +76,11 @@ for index in indexs:
     for i, post_id in enumerate(post_ids):
        stash_path = f"images/Stash/{post_id}.png"
        dest_path = f"images/{group_name}/{i}_{post_id}.png"
+
        if os.path.exists(stash_path) and not os.path.exists(dest_path):
            os.link(stash_path, dest_path)
 
    if not os.listdir(f'images/{group_name}'):
        os.rmdir(f"images/{group_name}")
 
-db.close()
+db.close()