Spaces:
Running
Running
Commit
·
abbf29d
0
Parent(s):
Duplicate from keturn/INED-datasette
Browse files

Co-authored-by: Kevin Turner <keturn@users.noreply.huggingface.co>
- .gitattributes +34 -0
- Dockerfile +31 -0
- README.md +11 -0
- metadata.json +14 -0
- settings.json +3 -0
- src/import-git.sh +16 -0
- src/textdir2sql/loading.py +90 -0
.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Datasette space: serves the INE caption database built at image-build time.
FROM datasetteproject/datasette:0.64.1

# huggingface spaces run as user 1000
RUN adduser hf-space --uid 1000 --disabled-password --gecos '' && \
    mkdir /home/hf-space/app && \
    chown hf-space: /home/hf-space/app
WORKDIR /home/hf-space/app

# Plugins: configurable full-text search + inline rendering of image URLs.
RUN datasette install datasette-configure-fts && \
    datasette install datasette-render-image-tags

# git is needed by src/import-git.sh below; trim apt metadata to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    apt-get clean && \
    rm -rf /var/lib/apt && \
    rm -rf /var/lib/dpkg/info/*

USER hf-space

# spaces default port
EXPOSE 7860
ENTRYPOINT ["datasette", "--host=0.0.0.0", "--port=7860"]
# Default argument: serve every *.db found in the working directory.
CMD ["."]

# Makes the textdir2sql package importable by the import script below.
ENV PYTHONPATH=/home/hf-space/app/src/

# NOTE(review): ENV/COPY placed after CMD is unusual but valid — instruction
# order only affects layer caching, not runtime behavior.
COPY src src
COPY metadata.json settings.json ./

# Build INE.db during the image build; inspect-data.json pre-computes table
# counts so datasette can skip inspection at startup.
RUN src/import-git.sh && \
    datasette inspect *.db --inspect-file=inspect-data.json
|
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: INED Datasette
|
| 3 |
+
emoji: 🐢
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
duplicated_from: keturn/INED-datasette
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
metadata.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "Imaginary Network Expanded Dataset",
|
| 3 |
+
"description": "Curated by Sygil",
|
| 4 |
+
"source_url": "https://github.com/Sygil-Dev/INE-dataset",
|
| 5 |
+
"databases": {
|
| 6 |
+
"INE": {
|
| 7 |
+
"tables": {
|
| 8 |
+
"images": {
|
| 9 |
+
"fts_table": "captions_fts"
|
| 10 |
+
}
|
| 11 |
+
}
|
| 12 |
+
}
|
| 13 |
+
}
|
| 14 |
+
}
|
settings.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"default_page_size": 20
|
| 3 |
+
}
|
src/import-git.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Fetch only the caption .txt files from the INE dataset repo and load them
# into INE.db via textdir2sql.loading.
# -x: trace commands; -e: abort on first failure; pipefail: fail on any pipe stage.
set -x -e -o pipefail

REPO="https://github.com/Sygil-Dev/INE-dataset.git"
# Base URL the `images` view uses to turn an image_key into a fetchable URL.
IMAGE_HOST="https://raw.githubusercontent.com/Sygil-Dev/INE-dataset/main/data/"

# avoid cloning all the image files
git clone --no-checkout --filter=blob:none --depth 1 "${REPO}" dataset

# Beware `--no-cone` is deprecated, so this may stop working someday
# https://git-scm.com/docs/git-sparse-checkout#_internalsnon_cone_problems
git -C dataset sparse-checkout set --no-cone '/data/*.txt'
git -C dataset checkout main

python3 -m textdir2sql.loading dataset/data INE.db \
    --image-host="${IMAGE_HOST}"
|
src/textdir2sql/loading.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
from functools import partial
|
| 3 |
+
from itertools import islice
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import click
|
| 7 |
+
|
| 8 |
+
BATCH_SIZE=1024
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@click.command()
@click.argument('input_dir', type=click.Path(exists=True, file_okay=False, path_type=Path))
@click.argument('output', type=click.Path(dir_okay=False, writable=True, path_type=Path))
@click.option('--image-host', help="base URL of images")
@click.option('--explicit/--no-explicit', default=False)
def main(input_dir: Path, output: Path, image_host: str, explicit: bool):
    """CLI entry point: open (or create) the output database, delegate the
    actual import work, and guarantee the connection is closed afterwards."""
    db = sqlite3.connect(output)
    try:
        _main_with_connection(input_dir, db, image_host, explicit)
    finally:
        db.close()
|
| 22 |
+
|
| 23 |
+
def _main_with_connection(input_dir: Path, connection: sqlite3.Connection, image_host: str = None, explicit=False):
    """Import ``<image_key>.txt`` caption files from *input_dir* into a
    ``captions`` table, then build an FTS5 index over the captions.

    :param input_dir: directory holding one ``.txt`` caption file per image
    :param connection: open SQLite connection to write into
    :param image_host: base URL; when set, an ``images`` view is created that
        joins it with ``image_key || '.jpg'`` for rendering
    :param explicit: when False, rows whose caption contains any known
        NSFW rating tag are deleted after import
    """
    connection.execute("CREATE TABLE IF NOT EXISTS "
                       " captions(image_key text PRIMARY KEY, caption text NOT NULL);")

    if image_host:
        # Parameters can't be bound inside CREATE VIEW DDL, so the host URL
        # is escaped through SQLite's own quote() (see sql_quote).
        connection.execute(f"""
            CREATE VIEW IF NOT EXISTS images AS
            SELECT {sql_quote(connection, image_host)} || image_key || '.jpg' AS image,
                   caption,
                   rowid
            FROM captions
        """)

    text_files = input_dir.glob("*.txt")

    # Insert in batches so each transaction commits BATCH_SIZE rows at once.
    with click.progressbar(chunked(text_files, BATCH_SIZE)) as progress:
        for batch in progress:
            text_file: Path
            # Explicit encoding: read_text() otherwise uses the locale's
            # preferred encoding, which in a container (no LANG set) can be
            # ASCII and crash on non-ASCII captions.
            pairs = ((text_file.stem, text_file.read_text(encoding="utf-8"))
                     for text_file in batch)
            with connection:
                connection.executemany("INSERT INTO captions(image_key, caption) "
                                       "VALUES(?, ?) ", pairs)

    if not explicit:
        # Tag substrings that mark a caption as NSFW; matched rows are purged.
        ratings = ["rating:unsafe", "rating:explicit", "rating:mature", "meta:nsfw",
                   "subreddit:%nsfw"]
        for rating in ratings:
            with connection:
                c = connection.execute("DELETE FROM captions WHERE caption LIKE ?",
                                       (f"%{rating}%",))
                print(f"Removed {c.rowcount} {rating} rows")

    with connection:
        # Add full-text search index (external-content table backed by captions).
        # NOTE(review): no IF NOT EXISTS here — re-running against an existing
        # database will fail; this script is expected to run once per build.
        connection.execute("""CREATE VIRTUAL TABLE
            captions_fts USING
              fts5(caption, image_key UNINDEXED, content=captions)
        """)
        connection.execute("""
            INSERT INTO "captions_fts" (rowid, image_key, caption)
            SELECT rowid, image_key, caption
            FROM captions
        """)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def chunked(iterable, n):
    """Return an iterator yielding lists of at most *n* items from *iterable*."""
    source = iter(iterable)
    # Two-argument iter(): keep calling take() until it yields the
    # empty-list sentinel, i.e. the source is exhausted.
    return iter(lambda: take(n, source), [])
|
| 71 |
+
|
| 72 |
+
def take(n, iterable):
    """Return a list of at most the next *n* items of *iterable*,
    consuming them from the underlying iterator."""
    prefix = islice(iterable, n)
    return list(prefix)
|
| 74 |
+
|
| 75 |
+
def sql_quote(connection, value: str) -> str:
    """
    Return *value* as a SQLite string literal, wrapped in single quotes.

    Parameter binding via ``.execute(sql, params)`` is the normal escaping
    route, but it isn't available everywhere — most notably when a literal
    must appear inside DDL text such as a column's ``... DEFAULT 'value'``.

    :param value: String to quote
    """
    # Delegate the escaping to SQLite's own quote() function so the result
    # is exactly what SQLite itself considers a correctly escaped literal.
    cursor = connection.execute("SELECT quote(:value)", {"value": value})
    (quoted,) = cursor.fetchone()
    return quoted
|
| 88 |
+
|
| 89 |
+
# Script entry point: `python -m textdir2sql.loading <input_dir> <output> ...`
if __name__ == "__main__":
    main()
|