Spaces:

q6
/

p

Paused

App Files Files Community

q6 committed on Dec 6, 2024

Commit

134abb3

1 Parent(s): c2e82c1

zstd

Browse files

Files changed (5) hide show

API/app.py +19 -23
Client/Extract Pixiv/ai_search.py +1 -1
Client/hunt.py +10 -5
Client/show_all.py +27 -4
Dockerfile +1 -1

API/app.py CHANGED Viewed

@@ -10,6 +10,8 @@ from pydantic import BaseModel
 from typing import List, Dict
 import tarfile
 from dotenv import load_dotenv
 img_base = 'https://i.pximg.net/img-original/img/'
@@ -195,20 +197,17 @@ async def process_post(post_id, session, semaphore):
             data = await fetch_page(session, f"https://www.pixiv.net/ajax/illust/{post_id}/pages")
             image_urls = [page['urls']['original'] for page in data['body'] if 'png' in page['urls']['original']][:20]
-            # Create tasks to fetch EXIF data for all image URLs
             tasks = [get_exif(image_url, session) for image_url in image_urls]
             exif_data_list = await asyncio.gather(*tasks)
-            # Process EXIF data to find the earliest matching image per post ID
             for image_url, metadata in zip(image_urls, exif_data_list):
                 exif_type = determine_exif_type(metadata)
                 if exif_type not in ['photoshop', 'celsys', None]:
                     return post_id, image_url
             return post_id, None
-        except Exception as e:
             return post_id, None
 @app.post("/pixif")
 async def pixif(
     items: pixifModel
@@ -223,11 +222,7 @@ async def pixif(
     image_exifs = {post_id: image_url.replace(img_base, '', 1) for post_id, image_url in results if image_url}
     return image_exifs
-import tempfile
-from fastapi import BackgroundTasks
-async def generate_tar_gz(posts, session):
     semaphore = asyncio.Semaphore(1000)
     images = {}
@@ -240,41 +235,42 @@ async def generate_tar_gz(posts, session):
     tasks = [fetch_image(post_id, image_url) for post_id, image_url in posts.items()]
     results = await asyncio.gather(*tasks)
     images = {post_id: image_data for post_id, image_data in results}
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".tar.gz")
-    with tarfile.open(fileobj=temp_file, mode="w:gz") as tar:
         for post_id, image_data in images.items():
             image_name = f"{post_id}.png"
             file_info = tarfile.TarInfo(name=image_name)
             file_info.size = len(image_data)
             tar.addfile(tarinfo=file_info, fileobj=io.BytesIO(image_data))
     temp_file.seek(0)
     return temp_file
 @app.post("/download")
 async def download(
     items: PixifDownloadModel,
-    background_tasks: BackgroundTasks
 ):
     posts = items.posts
     async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
-        temp_file = await generate_tar_gz(posts, session)
-    filename = f"{base26_time()}.tar.gz"
-    def cleanup_temp_file(file_path):
-        os.unlink(file_path)
-    background_tasks.add_task(cleanup_temp_file, temp_file.name)
     return FileResponse(
         path=temp_file.name,
-        media_type="application/gzip",
-        filename=filename,
-        background=background_tasks
     )
 @app.get("/")

 from typing import List, Dict
 import tarfile
 from dotenv import load_dotenv
+import tempfile
+import zstandard as zstd
 img_base = 'https://i.pximg.net/img-original/img/'
             data = await fetch_page(session, f"https://www.pixiv.net/ajax/illust/{post_id}/pages")
             image_urls = [page['urls']['original'] for page in data['body'] if 'png' in page['urls']['original']][:20]
             tasks = [get_exif(image_url, session) for image_url in image_urls]
             exif_data_list = await asyncio.gather(*tasks)
             for image_url, metadata in zip(image_urls, exif_data_list):
                 exif_type = determine_exif_type(metadata)
                 if exif_type not in ['photoshop', 'celsys', None]:
                     return post_id, image_url
             return post_id, None
+        except:
             return post_id, None
 @app.post("/pixif")
 async def pixif(
     items: pixifModel
     image_exifs = {post_id: image_url.replace(img_base, '', 1) for post_id, image_url in results if image_url}
     return image_exifs
+async def generate_zstd_archive(posts, session):
     semaphore = asyncio.Semaphore(1000)
     images = {}
     tasks = [fetch_image(post_id, image_url) for post_id, image_url in posts.items()]
     results = await asyncio.gather(*tasks)
     images = {post_id: image_data for post_id, image_data in results}
+    # Create a tar in memory
+    tar_buffer = io.BytesIO()
+    with tarfile.open(fileobj=tar_buffer, mode="w") as tar:
         for post_id, image_data in images.items():
             image_name = f"{post_id}.png"
             file_info = tarfile.TarInfo(name=image_name)
             file_info.size = len(image_data)
             tar.addfile(tarinfo=file_info, fileobj=io.BytesIO(image_data))
+    tar_buffer.seek(0)
+    # Compress with zstd at level 3
+    cctx = zstd.ZstdCompressor(level=3)
+    compressed = cctx.compress(tar_buffer.read())
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".zstd")
+    temp_file.write(compressed)
+    temp_file.flush()
     temp_file.seek(0)
     return temp_file
 @app.post("/download")
 async def download(
     items: PixifDownloadModel,
 ):
     posts = items.posts
     async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
+        temp_file = await generate_zstd_archive(posts, session)
+    filename = f"{base26_time()}.zstd"
     return FileResponse(
         path=temp_file.name,
+        media_type="application/zstd",
+        filename=filename
     )
 @app.get("/")

Client/Extract Pixiv/ai_search.py CHANGED Viewed

@@ -11,7 +11,7 @@ os.chdir(os.path.dirname(os.path.abspath(__file__)))
 input_url = input("Enter the URL: ")
-pages = 600 // 60
 params = {
     'raw': input_url,

 input_url = input("Enter the URL: ")
+pages = 1200 // 60
 params = {
     'raw': input_url,

Client/hunt.py CHANGED Viewed

@@ -3,6 +3,7 @@ import lmdb
 import requests
 import io
 import tarfile
 local = 0
 if local:
@@ -62,14 +63,18 @@ for index in indexs:
     to_download = {post_id: url for post_id, url in post_ids.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
     print(f"Downloading {len(to_download)} images...")
     if to_download:
-        images_tar_gz = requests.post(f'{endpoint}/download', json={'posts': to_download}, stream=True)
-        print("Extracting images...")
-        with io.BytesIO(images_tar_gz.content) as f:
-            with tarfile.open(fileobj=f, mode='r:gz') as tarf:
                 tarf.extractall("images/Stash")
     images_cache = os.listdir("images/Stash")
     print("Linking images...")
     for i, post_id in enumerate(post_ids.keys()):

 import requests
 import io
 import tarfile
+import zstandard
 local = 0
 if local:
     to_download = {post_id: url for post_id, url in post_ids.items() if url not in blacklist and f"{post_id}.png" not in images_cache}
     print(f"Downloading {len(to_download)} images...")
     if to_download:
+        images_zstd = requests.post(f'{endpoint}/download', json={'posts': to_download}, stream=True)
+        # Decompress zstd
+        print("Decompressing zstd...")
+        dctx = zstandard.ZstdDecompressor()
+        decompressed = dctx.decompress(images_zstd.content)
+        print("Extracting images from tar...")
+        with io.BytesIO(decompressed) as f:
+            with tarfile.open(fileobj=f, mode='r:') as tarf:
                 tarf.extractall("images/Stash")
     images_cache = os.listdir("images/Stash")
     print("Linking images...")
     for i, post_id in enumerate(post_ids.keys()):

Client/show_all.py CHANGED Viewed

@@ -1,13 +1,36 @@
 import os
 import lmdb
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 db = lmdb.open("db", subdir=True, map_size=524288)
-# view all in db
 with db.begin() as txn:
-    for key, value in txn.cursor():
         if value != b'\x00':
-            print(key.decode(), value.decode().split('p')[1].split('.')[0], end=' ')

 import os
 import lmdb
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 db = lmdb.open("db", subdir=True, map_size=524288)
+items = []
 with db.begin() as txn:
+    cursor = txn.cursor()
+    for key, value in cursor:
         if value != b'\x00':
+            try:
+                key_str = key.decode('utf-8')
+                value_str = value.decode('utf-8')
+                int_part_str = value_str.split('p')[1].split('.')[0]
+                int_part = int(int_part_str)
+                items.append((key_str, int_part))
+            except (IndexError, ValueError) as e:
+                print(f"Skipping key {key} due to parsing error: {e}")
+                continue
+sorted_items = sorted(items, key=lambda item: item[1])
+for key, int_value in sorted_items:
+    print(f"{key} {int_value}", end=' ')
+print()

Dockerfile CHANGED Viewed

@@ -8,7 +8,7 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 WORKDIR /app
-RUN pip install --no-cache-dir fastapi aiohttp uvicorn python-dotenv pydantic requests aiofiles
 COPY --chown=user ./API /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 ENV PYTHONUNBUFFERED=1
 WORKDIR /app
+RUN pip install --no-cache-dir fastapi aiohttp uvicorn python-dotenv pydantic requests zstandard
 COPY --chown=user ./API /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]