Synced repo using 'sync_with_huggingface' Github Action
Browse files
- CHANGELOG.md +2 -0
- iscc_sct/__init__.py +1 -1
- iscc_sct/demo.py +23 -10
- poetry.lock +3 -3
- pyproject.toml +1 -1
- tests/test_iscc_sct.py +1 -1
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
# Changelog
|
| 2 |
|
|
|
|
|
|
|
| 3 |
## [0.1.2] - 2024-08-19
|
| 4 |
- Encode granular features with base64
|
| 5 |
- Refactor result format to generic ISCC data model
|
|
|
|
| 1 |
# Changelog
|
| 2 |
|
| 3 |
+
## [0.1.3] - Unreleased
|
| 4 |
+
|
| 5 |
## [0.1.2] - 2024-08-19
|
| 6 |
- Encode granular features with base64
|
| 7 |
- Refactor result format to generic ISCC data model
|
iscc_sct/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
__version__ = "0.1.2"
|
| 2 |
from iscc_sct.options import *
|
| 3 |
from iscc_sct.utils import *
|
| 4 |
from iscc_sct.code_semantic_text import *
|
|
|
|
| 1 |
+
__version__ = "0.1.3"
|
| 2 |
from iscc_sct.options import *
|
| 3 |
from iscc_sct.utils import *
|
| 4 |
from iscc_sct.code_semantic_text import *
|
iscc_sct/demo.py
CHANGED
|
@@ -443,9 +443,8 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
| 443 |
)
|
| 444 |
|
| 445 |
with gr.Row(variant="panel"):
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
"""
|
| 449 |
## Understanding ISCC Semantic Text-Codes
|
| 450 |
|
| 451 |
### What is an ISCC Semantic Text-Code?
|
|
@@ -476,7 +475,11 @@ document-embedding.
|
|
| 476 |
The similarity shown is calculated by comparing the ISCC codes, not the original texts. This
|
| 477 |
allows for efficient and privacy-preserving comparisons, as only the codes need to be shared
|
| 478 |
or stored.
|
|
|
|
|
|
|
| 479 |
|
|
|
|
|
|
|
| 480 |
### Why is this useful?
|
| 481 |
- **Content creators**: Find similar content across languages.
|
| 482 |
- **Researchers**: Quickly compare documents or find related texts in different languages.
|
|
@@ -490,20 +493,30 @@ language barriers!
|
|
| 490 |
The "Explore Details & Advanced Options" section provides additional tools and information:
|
| 491 |
|
| 492 |
1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
|
| 493 |
-
|
| 494 |
|
| 495 |
2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
|
| 496 |
-
|
| 497 |
|
| 498 |
3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
|
| 499 |
-
|
| 500 |
|
| 501 |
4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
|
| 502 |
-
|
| 503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
"""
|
| 505 |
-
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
|
| 508 |
if __name__ == "__main__": # pragma: no cover
|
| 509 |
demo.launch()
|
|
|
|
| 443 |
)
|
| 444 |
|
| 445 |
with gr.Row(variant="panel"):
|
| 446 |
+
gr.Markdown(
|
| 447 |
+
"""
|
|
|
|
| 448 |
## Understanding ISCC Semantic Text-Codes
|
| 449 |
|
| 450 |
### What is an ISCC Semantic Text-Code?
|
|
|
|
| 475 |
The similarity shown is calculated by comparing the ISCC codes, not the original texts. This
|
| 476 |
allows for efficient and privacy-preserving comparisons, as only the codes need to be shared
|
| 477 |
or stored.
|
| 478 |
+
"""
|
| 479 |
+
)
|
| 480 |
|
| 481 |
+
gr.Markdown(
|
| 482 |
+
"""
|
| 483 |
### Why is this useful?
|
| 484 |
- **Content creators**: Find similar content across languages.
|
| 485 |
- **Researchers**: Quickly compare documents or find related texts in different languages.
|
|
|
|
| 493 |
The "Explore Details & Advanced Options" section provides additional tools and information:
|
| 494 |
|
| 495 |
1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
|
| 496 |
+
comparisons but may be more sensitive to minor differences.
|
| 497 |
|
| 498 |
2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
|
| 499 |
+
for processing.
|
| 500 |
|
| 501 |
3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
|
| 502 |
+
color-coded and labeled with its size and simprint (a similarity preserving fingerprint).
|
| 503 |
|
| 504 |
4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
|
| 505 |
+
This table shows which specific parts of the texts are most similar (above 80%), along with their
|
| 506 |
+
approximate cosine similarity (scaled -100% to +100%).
|
| 507 |
+
|
| 508 |
+
For more information about the **ISCC** see:
|
| 509 |
+
- https://github.com/iscc
|
| 510 |
+
- https://iscc.codes
|
| 511 |
+
- https://iscc.io
|
| 512 |
+
- [ISO 24138:2024](https://www.iso.org/standard/77899.html)
|
| 513 |
"""
|
| 514 |
+
)
|
| 515 |
+
with gr.Row():
|
| 516 |
+
gr.Markdown(
|
| 517 |
+
f"iscc-sct v{sct.__version__} | Source Code: https://github.com/iscc/iscc-sct",
|
| 518 |
+
elem_classes="footer",
|
| 519 |
+
)
|
| 520 |
|
| 521 |
if __name__ == "__main__": # pragma: no cover
|
| 522 |
demo.launch()
|
poetry.lock
CHANGED
|
@@ -755,13 +755,13 @@ files = [
|
|
| 755 |
|
| 756 |
[[package]]
|
| 757 |
name = "importlib-metadata"
|
| 758 |
-
version = "8.
|
| 759 |
description = "Read metadata from Python packages"
|
| 760 |
optional = false
|
| 761 |
python-versions = ">=3.8"
|
| 762 |
files = [
|
| 763 |
-
{file = "importlib_metadata-8.
|
| 764 |
-
{file = "importlib_metadata-8.
|
| 765 |
]
|
| 766 |
|
| 767 |
[package.dependencies]
|
|
|
|
| 755 |
|
| 756 |
[[package]]
|
| 757 |
name = "importlib-metadata"
|
| 758 |
+
version = "8.3.0"
|
| 759 |
description = "Read metadata from Python packages"
|
| 760 |
optional = false
|
| 761 |
python-versions = ">=3.8"
|
| 762 |
files = [
|
| 763 |
+
{file = "importlib_metadata-8.3.0-py3-none-any.whl", hash = "sha256:42817a4a0be5845d22c6e212db66a94ad261e2318d80b3e0d363894a79df2b67"},
|
| 764 |
+
{file = "importlib_metadata-8.3.0.tar.gz", hash = "sha256:9c8fa6e8ea0f9516ad5c8db9246a731c948193c7754d3babb0114a05b27dd364"},
|
| 765 |
]
|
| 766 |
|
| 767 |
[package.dependencies]
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "iscc-sct"
|
| 3 |
-
version = "0.1.2"
|
| 4 |
description = "ISCC - Semantic Code Text"
|
| 5 |
authors = ["Titusz <tp@py7.de>"]
|
| 6 |
license = "CC-BY-NC-SA-4.0"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "iscc-sct"
|
| 3 |
+
version = "0.1.3"
|
| 4 |
description = "ISCC - Semantic Code Text"
|
| 5 |
authors = ["Titusz <tp@py7.de>"]
|
| 6 |
license = "CC-BY-NC-SA-4.0"
|
tests/test_iscc_sct.py
CHANGED
|
@@ -31,7 +31,7 @@ be matched based on lexical similarity.
|
|
| 31 |
|
| 32 |
|
| 33 |
def test_version():
|
| 34 |
-
assert sct.__version__ == "0.1.2"
|
| 35 |
|
| 36 |
|
| 37 |
def test_code_text_semantic_default():
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def test_version():
|
| 34 |
+
assert sct.__version__ == "0.1.3"
|
| 35 |
|
| 36 |
|
| 37 |
def test_code_text_semantic_default():
|