q6 committed on
Commit
e4435bb
·
1 Parent(s): 5bd3b77
Files changed (3) hide show
  1. API/app.py +4 -6
  2. Client/clear_db.py +3 -4
  3. Client/hunt.py +44 -57
API/app.py CHANGED
@@ -160,7 +160,7 @@ def determine_exif_type(metadata):
160
 
161
  async def get_exif(url, session):
162
  start_range = 0
163
- end_range = 512
164
 
165
  headers = {
166
  "Referer": "https://www.pixiv.net/",
@@ -200,7 +200,7 @@ async def process_post(post_id, session, semaphore):
200
  metadata = await get_exif(image_url, session)
201
  exif_type = determine_exif_type(metadata)
202
  if exif_type not in ['photoshop', 'celsys', None]:
203
- return post_id, image_url.replace(img_base, '')
204
  return post_id, None
205
  except Exception as e:
206
  return post_id, None
@@ -216,7 +216,7 @@ async def pixif(
216
  tasks = [process_post(post_id, session, semaphore) for post_id in post_ids]
217
  results = await asyncio.gather(*tasks)
218
 
219
- image_exifs = {post_id: image_url for post_id, image_url in results if image_url}
220
  return image_exifs
221
 
222
  async def download_image(session, post_id, post_url):
@@ -231,7 +231,6 @@ async def generate_tar(posts, session):
231
  tarf = tarfile.open(mode="w", fileobj=tar_buffer)
232
 
233
  semaphore = asyncio.Semaphore(100) # Adjust based on your needs
234
- lock = asyncio.Lock() # Create a lock for synchronization
235
 
236
  async def add_to_tar(post_id, image_url):
237
  async with semaphore:
@@ -241,8 +240,7 @@ async def generate_tar(posts, session):
241
  image_name = f"{post_id}.png"
242
  info = tarfile.TarInfo(name=image_name)
243
  info.size = len(image_data)
244
- async with lock: # Ensure only one coroutine writes to the tarfile at a time
245
- tarf.addfile(tarinfo=info, fileobj=io.BytesIO(image_data))
246
 
247
  tasks = [add_to_tar(post_id, image_url) for post_id, image_url in posts.items()]
248
  await asyncio.gather(*tasks)
 
160
 
161
  async def get_exif(url, session):
162
  start_range = 0
163
+ end_range = 1024
164
 
165
  headers = {
166
  "Referer": "https://www.pixiv.net/",
 
200
  metadata = await get_exif(image_url, session)
201
  exif_type = determine_exif_type(metadata)
202
  if exif_type not in ['photoshop', 'celsys', None]:
203
+ return post_id, image_url
204
  return post_id, None
205
  except Exception as e:
206
  return post_id, None
 
216
  tasks = [process_post(post_id, session, semaphore) for post_id in post_ids]
217
  results = await asyncio.gather(*tasks)
218
 
219
+ image_exifs = {post_id: image_url.replace('https://i.pximg.net/img-original/', '', 1) for post_id, image_url in results if image_url}
220
  return image_exifs
221
 
222
  async def download_image(session, post_id, post_url):
 
231
  tarf = tarfile.open(mode="w", fileobj=tar_buffer)
232
 
233
  semaphore = asyncio.Semaphore(100) # Adjust based on your needs
 
234
 
235
  async def add_to_tar(post_id, image_url):
236
  async with semaphore:
 
240
  image_name = f"{post_id}.png"
241
  info = tarfile.TarInfo(name=image_name)
242
  info.size = len(image_data)
243
+ tarf.addfile(tarinfo=info, fileobj=io.BytesIO(image_data))
 
244
 
245
  tasks = [add_to_tar(post_id, image_url) for post_id, image_url in posts.items()]
246
  await asyncio.gather(*tasks)
Client/clear_db.py CHANGED
@@ -4,7 +4,6 @@ import shutil
4
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
5
 
6
  shutil.rmtree("db", ignore_errors=True)
7
- shutil.rmtree("images", ignore_errors=True)
8
- # for file in os.listdir():
9
- # if file.endswith(".txt"):
10
- # os.rename(file, f"txt logs/{file}")
 
4
  os.chdir(os.path.dirname(os.path.abspath(__file__)))
5
 
6
  shutil.rmtree("db", ignore_errors=True)
7
+ for file in os.listdir():
8
+ if file.endswith(".txt"):
9
+ os.rename(file, f"txt logs/{file}")
 
Client/hunt.py CHANGED
@@ -17,72 +17,59 @@ for idx, file in enumerate(valid):
17
 
18
  inputs = input("Enter the index of the file: ")
19
  inputs = inputs.split()
20
- indexes = []
21
 
22
  for inp in inputs:
23
  if inp.isdigit():
24
- indexes.append(int(inp) - 1)
25
  elif "-" in inp:
26
  start, end = map(int, inp.split("-"))
27
- indexes.extend(range(start - 1, end))
28
 
29
  blacklist = ['\0', None]
30
 
31
- group_data = {}
32
- group_post_ids = {}
33
-
34
- images_cache = os.listdir("images/Stash")
35
-
36
- for index in indexes:
37
  group_name = valid[index].rsplit(".", 1)[0]
38
  os.makedirs(f"images/{group_name}", exist_ok=True)
39
  with open(valid[index], "r") as f:
40
  post_ids = [x for x in f.read().split("\n") if x]
41
- group_post_ids[group_name] = post_ids
42
- for position, post_id in enumerate(post_ids):
43
- group_data[post_id] = {'group_name': group_name, 'position': position}
44
-
45
- all_post_ids = list(group_data.keys())
46
-
47
- with db.begin(write=True) as txn:
48
- post_id_urls = {post_id: txn.get(post_id.encode()) for post_id in all_post_ids}
49
- post_id_urls = {post_id: url.decode() if url else None for post_id, url in post_id_urls.items()}
50
-
51
- filtered = [post_id for post_id, url in post_id_urls.items() if url == None and f"{post_id}.png" not in images_cache]
52
-
53
- print(f"Filtered: {len(filtered)}/{len(post_id_urls)}")
54
- if filtered:
55
- data = requests.post('https://q6-p.hf.space/pixif', json={"post_ids": filtered}).json()
56
- for post_id, url in data.items():
57
- txn.put(post_id.encode(), url.encode())
58
- post_id_urls[post_id] = url
59
- no_exif = set(filtered) - set(data.keys())
60
- for post_id in no_exif:
61
- txn.put(post_id.encode(), b'\0')
62
-
63
- to_download = {post_id: url for post_id, url in post_id_urls.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
64
-
65
- print(f"Downloading {len(to_download)} images...")
66
- if to_download:
67
- images_zip = requests.post('https://q6-p.hf.space/download', json={'posts': to_download}, stream=True)
68
-
69
- print("Extracting images...")
70
- with io.BytesIO(images_zip.content) as f:
71
- with tarfile.open(fileobj=f, mode='r') as tarf:
72
- tarf.extractall("images/Stash")
73
-
74
- images_cache = os.listdir("images/Stash")
75
- print("Linking images...")
76
-
77
- for post_id in all_post_ids:
78
- if f"{post_id}.png" in images_cache:
79
- group_name = group_data[post_id]['group_name']
80
- position = group_data[post_id]['position']
81
- target_path = f"images/{group_name}/{position}_{post_id}.png"
82
- if not os.path.exists(target_path):
83
- os.link(f"images/Stash/{post_id}.png", target_path)
84
-
85
- for group_name in group_post_ids.keys():
86
- group_folder = f"images/{group_name}"
87
- if len(os.listdir(group_folder)) == 0:
88
- os.rmdir(group_folder)
 
17
 
18
  inputs = input("Enter the index of the file: ")
19
  inputs = inputs.split()
20
+ indexs = []
21
 
22
  for inp in inputs:
23
  if inp.isdigit():
24
+ indexs.append(int(inp) - 1)
25
  elif "-" in inp:
26
  start, end = map(int, inp.split("-"))
27
+ indexs.extend(range(start - 1, end))
28
 
29
  blacklist = ['\0', None]
30
 
31
+ for index in indexs:
32
+ images_cache = os.listdir("images/Stash")
 
 
 
 
33
  group_name = valid[index].rsplit(".", 1)[0]
34
  os.makedirs(f"images/{group_name}", exist_ok=True)
35
  with open(valid[index], "r") as f:
36
  post_ids = [x for x in f.read().split("\n") if x]
37
+
38
+ with db.begin(write=True) as txn:
39
+ post_ids = {post_id: txn.get(post_id.encode()) for post_id in post_ids}
40
+ post_ids = {post_id: url.decode() if url else None for post_id, url in post_ids.items()}
41
+ filtered = [post_id for post_id, url in post_ids.items() if url == None and f"{post_id}.png" not in images_cache]
42
+
43
+ print(f"Group: {group_name}\nFiltered: {len(filtered)}/{len(post_ids)}")
44
+ if filtered:
45
+ data = requests.post('https://q6-p.hf.space/pixif', json={"post_ids": filtered}).json()
46
+
47
+ for post_id, url in data.items():
48
+ txn.put(post_id.encode(), url.encode())
49
+ post_ids[post_id] = url
50
+
51
+ no_exif = set(filtered) - set(data.keys())
52
+
53
+ for post_id in no_exif:
54
+ txn.put(post_id.encode(), b'\0')
55
+
56
+ to_download = {post_id: url for post_id, url in post_ids.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
57
+ print(f"Downloading {len(to_download)} images...")
58
+ if to_download:
59
+ images_zip = requests.post('https://q6-p.hf.space/download', json={'posts': to_download}, stream=True)
60
+
61
+ print("Extracting images...")
62
+ with io.BytesIO(images_zip.content) as f:
63
+ with tarfile.open(fileobj=f, mode='r') as tarf:
64
+ tarf.extractall("images/Stash")
65
+
66
+
67
+ images_cache = os.listdir("images/Stash")
68
+ print("Linking images...")
69
+ for i, post_id in enumerate(post_ids.keys()):
70
+ if f"{post_id}.png" in images_cache:
71
+ if not os.path.exists(f"images/{group_name}/{i}_{post_id}.png"):
72
+ os.link(f"images/Stash/{post_id}.png", f"images/{group_name}/{i}_{post_id}.png")
73
+
74
+ if len(os.listdir(f'images/{group_name}')) == 0:
75
+ os.rmdir(f"images/{group_name}")