q6 committed on
Commit
a1a9773
·
1 Parent(s): 4f2aff0

change client to sqlite

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. Client/hunt.py +69 -17
  3. Client/migrate_to_sqlite.py +88 -0
.gitignore CHANGED
@@ -4,4 +4,5 @@ images
4
  merge_dev.bat
5
  __pycache__
6
  *.png
7
- *.txt
 
 
4
  merge_dev.bat
5
  __pycache__
6
  *.png
7
+ *.txt
8
+ db.sqlite
Client/hunt.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- import lmdb
3
  import requests
4
  import tempfile
5
  import zipfile
@@ -39,7 +39,51 @@ os.makedirs("images/Stash", exist_ok=True)
39
 
40
  images_cache = set(os.listdir("images/Stash"))
41
 
42
- db = lmdb.open("db", subdir=True, map_size=1048576 * 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  valid = [f for f in os.listdir() if f.endswith(".txt")]
45
  for idx, file in enumerate(valid):
@@ -88,20 +132,28 @@ for index in indexs:
88
  with open(valid[index], "r") as f:
89
  post_ids = f.read().split()
90
 
91
- with db.begin(write=True) as txn:
92
- post_ids_dict = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
93
- filtered = [post_id for post_id in post_ids if post_ids_dict[post_id] is None and f"{post_id}.png" not in images_cache]
94
- print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids)}")
95
-
96
- if filtered:
97
- data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered, "phpsessid": phpsessid})
98
- data = data.json()
99
- for post_id, url in data.items():
100
- txn.put(post_id.encode(), url.encode())
101
- post_ids_dict[post_id] = url
102
- no_exif = set(filtered) - set(data.keys())
103
- for post_id in no_exif:
104
- txn.put(post_id.encode(), b'')
 
 
 
 
 
 
 
 
105
 
106
  to_download = {post_id: decode_if_binary(url) for post_id, url in post_ids_dict.items() if url and f"{post_id}.png" not in images_cache}
107
 
@@ -122,4 +174,4 @@ for index in indexs:
122
  if not os.listdir(f'images/{group_name}'):
123
  os.rmdir(f"images/{group_name}")
124
 
125
- db.close()
 
1
  import os
2
+ import sqlite3
3
  import requests
4
  import tempfile
5
  import zipfile
 
39
 
40
  images_cache = set(os.listdir("images/Stash"))
41
 
42
# Path of the on-disk SQLite cache file (replaces the old LMDB "db" directory).
DB_PATH = "db.sqlite"
44
def open_db(path):
    """Open the SQLite cache database at *path*, creating it if necessary.

    Guarantees the ``pixif_cache`` table (post_id primary key -> url)
    exists before the connection is handed back.
    """
    connection = sqlite3.connect(path)
    ddl = (
        "CREATE TABLE IF NOT EXISTS pixif_cache ("
        " post_id TEXT PRIMARY KEY,"
        " url TEXT"
        ")"
    )
    connection.execute(ddl)
    connection.commit()
    return connection
56
+
57
def chunked(seq, size):
    """Yield consecutive slices of *seq*, each at most *size* items long."""
    start = 0
    while start < len(seq):
        yield seq[start:start + size]
        start += size
60
+
61
def fetch_cached_urls(conn, post_ids):
    """Look up cached URLs for *post_ids* in the ``pixif_cache`` table.

    Returns a dict mapping every requested id to its cached URL string
    (a stored NULL comes back as ''); ids with no cache row map to None.
    Queries run in batches of 900 ids to stay under SQLite's default
    bound-parameter limit.
    """
    cached = dict.fromkeys(post_ids)  # every id starts as None (= cache miss)
    if not post_ids:
        return cached

    batch_size = 900
    for start in range(0, len(post_ids), batch_size):
        batch = post_ids[start:start + batch_size]
        marks = ",".join("?" for _ in batch)
        sql = f"SELECT post_id, COALESCE(url, '') FROM pixif_cache WHERE post_id IN ({marks})"
        for pid, cached_url in conn.execute(sql, batch):
            cached[pid] = cached_url

    return cached
73
+
74
def upsert_urls(conn, rows):
    """Insert or overwrite (post_id, url) pairs in ``pixif_cache``.

    A no-op when *rows* is empty. The caller owns the transaction
    (e.g. wraps the call in ``with conn:``); nothing is committed here.
    """
    if not rows:
        return
    sql = (
        "INSERT INTO pixif_cache (post_id, url) VALUES (?, ?)"
        " ON CONFLICT(post_id) DO UPDATE SET url = excluded.url"
    )
    conn.executemany(sql, rows)
85
+
86
+ conn = open_db(DB_PATH)
87
 
88
  valid = [f for f in os.listdir() if f.endswith(".txt")]
89
  for idx, file in enumerate(valid):
 
132
  with open(valid[index], "r") as f:
133
  post_ids = f.read().split()
134
 
135
+ post_ids_dict = fetch_cached_urls(conn, post_ids)
136
+ filtered = [
137
+ post_id
138
+ for post_id in post_ids
139
+ if post_ids_dict[post_id] is None and f"{post_id}.png" not in images_cache
140
+ ]
141
+ print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids)}")
142
+
143
+ if filtered:
144
+ data = requests.post(f'{endpoint}/pixif', json={"post_ids": filtered, "phpsessid": phpsessid})
145
+ data = data.json()
146
+ rows = [(post_id, url) for post_id, url in data.items()]
147
+ no_exif = set(filtered) - set(data.keys())
148
+ rows.extend((post_id, "") for post_id in no_exif)
149
+
150
+ with conn:
151
+ upsert_urls(conn, rows)
152
+
153
+ for post_id, url in data.items():
154
+ post_ids_dict[post_id] = url
155
+ for post_id in no_exif:
156
+ post_ids_dict[post_id] = ""
157
 
158
  to_download = {post_id: decode_if_binary(url) for post_id, url in post_ids_dict.items() if url and f"{post_id}.png" not in images_cache}
159
 
 
174
  if not os.listdir(f'images/{group_name}'):
175
  os.rmdir(f"images/{group_name}")
176
 
177
+ conn.close()
Client/migrate_to_sqlite.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sqlite3
4
+
5
+ import lmdb
6
+
7
+
8
def get_map_size(lmdb_path):
    """Pick an LMDB map size: twice the current data file, floor 2 MiB.

    Falls back to the 2 MiB floor when ``data.mdb`` cannot be stat'ed
    (e.g. the directory does not exist yet).
    """
    floor = 1048576 * 2
    data_file = os.path.join(lmdb_path, "data.mdb")
    try:
        current = os.path.getsize(data_file)
    except OSError:
        return floor
    return max(current * 2, floor)
15
+
16
+
17
def open_sqlite(path):
    """Connect to the SQLite DB at *path*, ensuring the cache table exists."""
    db = sqlite3.connect(path)
    # ``with db:`` commits the DDL on success, matching the schema used by hunt.py.
    with db:
        db.execute(
            "CREATE TABLE IF NOT EXISTS pixif_cache"
            " (post_id TEXT PRIMARY KEY, url TEXT)"
        )
    return db
29
+
30
+
31
def upsert_rows(conn, rows):
    """Bulk-write (post_id, url) pairs, replacing the url of any existing id.

    Does nothing for an empty *rows*; committing is left to the caller.
    """
    if not rows:
        return
    statement = (
        "INSERT INTO pixif_cache (post_id, url) VALUES (?, ?)"
        " ON CONFLICT(post_id) DO UPDATE SET url = excluded.url"
    )
    conn.executemany(statement, rows)
42
+
43
+
44
def migrate(lmdb_path, sqlite_path, batch_size):
    """Copy every (post_id, url) entry from the LMDB cache into SQLite.

    Keys/values are decoded as UTF-8; empty or falsy LMDB values become ''.
    Rows are written in batches of *batch_size*, each batch in its own
    SQLite transaction, so a crash leaves complete batches committed.

    Fix over the original: both handles are now closed via try/finally,
    so a decode or write error mid-migration no longer leaks the LMDB
    environment or the SQLite connection.
    """
    map_size = get_map_size(lmdb_path)
    lmdb_env = lmdb.open(
        lmdb_path,
        subdir=True,
        readonly=True,
        lock=False,
        map_size=map_size,
    )
    try:
        conn = open_sqlite(sqlite_path)
        try:
            rows = []
            with lmdb_env.begin() as txn:
                for key, value in txn.cursor():
                    post_id = key.decode("utf-8")
                    url = value.decode("utf-8") if value else ""
                    rows.append((post_id, url))

                    if len(rows) >= batch_size:
                        # ``with conn:`` commits this batch as one transaction.
                        with conn:
                            upsert_rows(conn, rows)
                        rows.clear()

            # Flush the final partial batch, if any.
            if rows:
                with conn:
                    upsert_rows(conn, rows)
        finally:
            conn.close()
    finally:
        lmdb_env.close()
74
+
75
+
76
def main():
    """CLI entry point: parse arguments and run the LMDB-to-SQLite migration."""
    parser = argparse.ArgumentParser(description="Migrate LMDB cache to SQLite.")
    parser.add_argument("--lmdb", default="db", help="Path to LMDB directory.")
    parser.add_argument("--sqlite", default="db.sqlite", help="Path to SQLite file.")
    parser.add_argument("--batch-size", type=int, default=1000, help="Rows per batch insert.")
    args = parser.parse_args()

    # Run from the script's directory so relative --lmdb/--sqlite paths
    # resolve next to this file, not the caller's cwd.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)
    migrate(args.lmdb, args.sqlite, args.batch_size)


if __name__ == "__main__":
    main()