Synced repo using 'sync_with_huggingface' GitHub Action
Browse files- .editorconfig +1 -1
- iscc_sct/cli.py +9 -3
- iscc_sct/code_semantic_text.py +6 -2
- iscc_sct/demo.py +69 -96
- iscc_sct/dev.py +20 -0
- iscc_sct/models.py +18 -6
- iscc_sct/options.py +21 -7
- iscc_sct/samples.yml +105 -0
- poetry.lock +1 -1
- pyproject.toml +5 -3
- tests/benchmark.py +6 -2
- tests/test_cli.py +3 -1
- tests/test_iscc_sct.py +3 -1
- tests/test_main.py +3 -1
- tests/test_models.py +17 -3
- tests/visualize.py +3 -1
.editorconfig
CHANGED
|
@@ -12,7 +12,7 @@ indent_size = 4
|
|
| 12 |
end_of_line = lf
|
| 13 |
insert_final_newline = true
|
| 14 |
trim_trailing_whitespace = true
|
| 15 |
-
max_line_length =
|
| 16 |
|
| 17 |
|
| 18 |
# YAML files
|
|
|
|
| 12 |
end_of_line = lf
|
| 13 |
insert_final_newline = true
|
| 14 |
trim_trailing_whitespace = true
|
| 15 |
+
max_line_length = 100
|
| 16 |
|
| 17 |
|
| 18 |
# YAML files
|
iscc_sct/cli.py
CHANGED
|
@@ -8,9 +8,15 @@ from charset_normalizer import from_bytes
|
|
| 8 |
|
| 9 |
def main():
|
| 10 |
parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
|
| 11 |
-
parser.add_argument(
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
|
| 15 |
args = parser.parse_args()
|
| 16 |
|
|
|
|
| 8 |
|
| 9 |
def main():
|
| 10 |
parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
|
| 11 |
+
parser.add_argument(
|
| 12 |
+
"path", type=str, help="Path to text files (supports glob patterns).", nargs="?"
|
| 13 |
+
)
|
| 14 |
+
parser.add_argument(
|
| 15 |
+
"-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
|
| 16 |
+
)
|
| 17 |
+
parser.add_argument(
|
| 18 |
+
"-g", "--granular", action="store_true", help="Activate granular processing."
|
| 19 |
+
)
|
| 20 |
parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
|
| 21 |
args = parser.parse_args()
|
| 22 |
|
iscc_sct/code_semantic_text.py
CHANGED
|
@@ -233,11 +233,15 @@ def model():
|
|
| 233 |
so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
|
| 234 |
try:
|
| 235 |
with sct.timer("ONNXMODEL load time"):
|
| 236 |
-
return rt.InferenceSession(
|
|
|
|
|
|
|
| 237 |
except NoSuchFile: # pragma: no cover
|
| 238 |
with sct.timer("ONNXMODEL aquisition/load time"):
|
| 239 |
model_path = sct.get_model()
|
| 240 |
-
return rt.InferenceSession(
|
|
|
|
|
|
|
| 241 |
|
| 242 |
|
| 243 |
def tokenize_chunks(chunks):
|
|
|
|
| 233 |
so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
|
| 234 |
try:
|
| 235 |
with sct.timer("ONNXMODEL load time"):
|
| 236 |
+
return rt.InferenceSession(
|
| 237 |
+
sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers
|
| 238 |
+
)
|
| 239 |
except NoSuchFile: # pragma: no cover
|
| 240 |
with sct.timer("ONNXMODEL aquisition/load time"):
|
| 241 |
model_path = sct.get_model()
|
| 242 |
+
return rt.InferenceSession(
|
| 243 |
+
model_path, sess_options=so, providers=selected_onnx_providers
|
| 244 |
+
)
|
| 245 |
|
| 246 |
|
| 247 |
def tokenize_chunks(chunks):
|
iscc_sct/demo.py
CHANGED
|
@@ -6,6 +6,7 @@ from loguru import logger as log
|
|
| 6 |
import gradio as gr
|
| 7 |
import iscc_sct as sct
|
| 8 |
import textwrap
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
newline_symbols = {
|
|
@@ -84,7 +85,9 @@ def generate_similarity_bar(similarity):
|
|
| 84 |
|
| 85 |
# Adjust the text position to be centered within the colored bar
|
| 86 |
text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
|
| 87 |
-
text_alignment =
|
|
|
|
|
|
|
| 88 |
|
| 89 |
bar_html = f"""
|
| 90 |
<h3>Semantic Similarity</h3>
|
|
@@ -97,66 +100,12 @@ def generate_similarity_bar(similarity):
|
|
| 97 |
return bar_html
|
| 98 |
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
" ".join(paragraph.split())
|
| 104 |
-
for paragraph in """
|
| 105 |
-
This document specifies the syntax and structure of the International Standard Content Code (ISCC),
|
| 106 |
-
as an identification system for digital assets (including encodings of text, images, audio, video or other content
|
| 107 |
-
across all media sectors). It also describes ISCC metadata and the use of ISCC in conjunction with other schemes, such
|
| 108 |
-
as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
|
| 109 |
-
|
| 110 |
-
An ISCC applies to a specific digital asset and is a data-descriptor deterministically constructed from multiple hash
|
| 111 |
-
digests using the algorithms and rules in this document. This document does not provide information on registration of
|
| 112 |
-
ISCCs.
|
| 113 |
-
""".strip().split("\n\n")
|
| 114 |
-
]
|
| 115 |
-
)
|
| 116 |
-
|
| 117 |
-
sample_text_de = "\n\n".join(
|
| 118 |
-
[
|
| 119 |
-
" ".join(paragraph.split())
|
| 120 |
-
for paragraph in """
|
| 121 |
-
Dieses Dokument spezifiziert die Syntax und Struktur des International Standard Content Code (ISCC) als
|
| 122 |
-
Identifizierungssystem für digitale Inhalte (einschließlich Kodierungen von Text, Bildern, Audio, Video oder anderen
|
| 123 |
-
Inhalten in allen Medienbereichen). Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung mit
|
| 124 |
-
anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
|
| 125 |
-
|
| 126 |
-
Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor, der deterministisch aus mehreren
|
| 127 |
-
Hash-Digests unter Verwendung der Algorithmen und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält
|
| 128 |
-
keine Informationen über die Registrierung von ISCCs.
|
| 129 |
-
""".strip().split("\n\n")
|
| 130 |
-
]
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
sample_text_bg = "\n\n".join(
|
| 134 |
-
[
|
| 135 |
-
" ".join(paragraph.split())
|
| 136 |
-
for paragraph in """
|
| 137 |
-
Този документ определя синтаксиса и структурата на Международния стандартен код на съдържанието (ISCC) като система за
|
| 138 |
-
идентификация на цифрови активи (включително кодиране на текст, изображения, аудио, видео или друго съдържание във
|
| 139 |
-
всички медийни сектори). Той описва също метаданните на ISCC и използването на ISCC във връзка с други схеми, като
|
| 140 |
-
DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
|
| 141 |
-
|
| 142 |
-
ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор на данни, конструиран от множество
|
| 143 |
-
хеш-разходи, като се използват алгоритмите и правилата в настоящия документ. Настоящият документ не предоставя
|
| 144 |
-
информация за регистрацията на ISCC.
|
| 145 |
-
""".strip().split("\n\n")
|
| 146 |
-
]
|
| 147 |
-
)
|
| 148 |
|
| 149 |
-
sample_text_zh = "\n\n".join(
|
| 150 |
-
[
|
| 151 |
-
" ".join(paragraph.split())
|
| 152 |
-
for paragraph in """
|
| 153 |
-
本文件规定了国际标准内容代码(ISCC)的语法和结构,作为数字资产(包括所有媒 体领域的文本、图像、音频、视频或其他内容的编码)的标识系统。它还介绍了
|
| 154 |
-
ISCC 元数据以及 ISCC 与其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的结合使用。
|
| 155 |
|
| 156 |
-
|
| 157 |
-
""".strip().split("\n\n")
|
| 158 |
-
]
|
| 159 |
-
)
|
| 160 |
|
| 161 |
custom_css = """
|
| 162 |
"""
|
|
@@ -176,67 +125,81 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
| 176 |
""",
|
| 177 |
)
|
| 178 |
with gr.Row(variant="panel"):
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
with gr.Row(variant="panel"):
|
| 188 |
with gr.Column(variant="panel"):
|
| 189 |
in_text_a = gr.TextArea(
|
| 190 |
label="Text A",
|
| 191 |
-
placeholder="Choose sample text from the dropdown or type or paste your text.",
|
| 192 |
lines=12,
|
| 193 |
max_lines=12,
|
| 194 |
)
|
| 195 |
-
sample_dropdown_a = gr.Dropdown(
|
| 196 |
-
choices=["None", "English", "Bulgarian"], label="Select sample for Text A", value="None"
|
| 197 |
-
)
|
| 198 |
out_code_a = gr.Textbox(label="ISCC Code for Text A")
|
| 199 |
-
out_chunks_a = gr.HighlightedText(
|
| 200 |
-
label="Chunked Text A",
|
| 201 |
-
interactive=False,
|
| 202 |
-
elem_id="chunked-text-a",
|
| 203 |
-
)
|
| 204 |
with gr.Column(variant="panel"):
|
| 205 |
in_text_b = gr.TextArea(
|
| 206 |
label="Text B",
|
| 207 |
-
placeholder="Choose sample text from the dropdown or type or paste your text.",
|
| 208 |
lines=12,
|
| 209 |
max_lines=12,
|
| 210 |
)
|
| 211 |
-
sample_dropdown_b = gr.Dropdown(
|
| 212 |
-
choices=["None", "German", "Chinese"], label="Select sample for Text B", value="None"
|
| 213 |
-
)
|
| 214 |
out_code_b = gr.Textbox(label="ISCC Code for Text B")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
out_chunks_b = gr.HighlightedText(
|
| 216 |
label="Chunked Text B",
|
| 217 |
interactive=False,
|
| 218 |
elem_id="chunked-text-b",
|
| 219 |
)
|
| 220 |
|
| 221 |
-
def update_sample_text(choice,
|
| 222 |
if choice == "None":
|
| 223 |
return ""
|
| 224 |
-
|
| 225 |
-
return sample_text_en if choice == "English" else sample_text_bg
|
| 226 |
-
else:
|
| 227 |
-
return sample_text_de if choice == "German" else sample_text_zh
|
| 228 |
|
| 229 |
sample_dropdown_a.change(
|
| 230 |
-
lambda choice: update_sample_text(choice, "
|
|
|
|
|
|
|
| 231 |
)
|
| 232 |
sample_dropdown_b.change(
|
| 233 |
-
lambda choice: update_sample_text(choice, "
|
|
|
|
|
|
|
| 234 |
)
|
| 235 |
|
| 236 |
-
with gr.Row(variant="panel"):
|
| 237 |
-
with gr.Column(variant="panel"):
|
| 238 |
-
out_similarity = gr.HTML(label="Similarity")
|
| 239 |
-
|
| 240 |
def process_text(text, nbits, suffix):
|
| 241 |
log.debug(f"{text[:20]}")
|
| 242 |
out_code_func = globals().get(f"out_code_{suffix}")
|
|
@@ -248,7 +211,9 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
| 248 |
out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
|
| 249 |
}
|
| 250 |
|
| 251 |
-
result = sct.gen_text_code_semantic(
|
|
|
|
|
|
|
| 252 |
iscc = sct.Metadata(**result).to_object_format()
|
| 253 |
|
| 254 |
# Generate chunked text with simprints and overlaps
|
|
@@ -318,14 +283,22 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
| 318 |
show_progress="full",
|
| 319 |
)
|
| 320 |
|
| 321 |
-
out_code_a.change(
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
def reset_all():
|
| 325 |
return (
|
| 326 |
gr.Slider(value=128), # Reset ISCC Bit-Length
|
| 327 |
-
gr.Dropdown(
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
gr.TextArea(value=""), # Reset Text A
|
| 330 |
gr.TextArea(value=""), # Reset Text B
|
| 331 |
gr.Textbox(value=""), # Reset ISCC Code for Text A
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
import iscc_sct as sct
|
| 8 |
import textwrap
|
| 9 |
+
import yaml
|
| 10 |
|
| 11 |
|
| 12 |
newline_symbols = {
|
|
|
|
| 85 |
|
| 86 |
# Adjust the text position to be centered within the colored bar
|
| 87 |
text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
|
| 88 |
+
text_alignment = (
|
| 89 |
+
"transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
|
| 90 |
+
)
|
| 91 |
|
| 92 |
bar_html = f"""
|
| 93 |
<h3>Semantic Similarity</h3>
|
|
|
|
| 100 |
return bar_html
|
| 101 |
|
| 102 |
|
| 103 |
+
def load_samples():
    """Load the demo sample texts from the ``samples.yml`` file next to this module.

    The file is located relative to this module instead of the current working
    directory, so the demo also starts when launched from outside the repository root.

    :return: The value stored under the top-level ``samples`` key of the YAML file
        (sample group -> label -> text).
    """
    from pathlib import Path

    samples_path = Path(__file__).resolve().parent / "samples.yml"
    with open(samples_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)["samples"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
+
samples = load_samples()
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
custom_css = """
|
| 111 |
"""
|
|
|
|
| 125 |
""",
|
| 126 |
)
|
| 127 |
with gr.Row(variant="panel"):
|
| 128 |
+
with gr.Column(variant="panel"):
|
| 129 |
+
sample_dropdown_a = gr.Dropdown(
|
| 130 |
+
choices=["None"] + [lang for lang in samples["a"]],
|
| 131 |
+
label="Select sample for Text A",
|
| 132 |
+
value="None",
|
| 133 |
+
)
|
| 134 |
+
with gr.Column(variant="panel"):
|
| 135 |
+
sample_dropdown_b = gr.Dropdown(
|
| 136 |
+
choices=["None"] + [lang for lang in samples["b"]],
|
| 137 |
+
label="Select sample for Text B",
|
| 138 |
+
value="None",
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
with gr.Row(variant="panel"):
|
| 142 |
with gr.Column(variant="panel"):
|
| 143 |
in_text_a = gr.TextArea(
|
| 144 |
label="Text A",
|
| 145 |
+
placeholder="Choose sample text from the dropdown above or type or paste your text.",
|
| 146 |
lines=12,
|
| 147 |
max_lines=12,
|
| 148 |
)
|
|
|
|
|
|
|
|
|
|
| 149 |
out_code_a = gr.Textbox(label="ISCC Code for Text A")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
with gr.Column(variant="panel"):
|
| 151 |
in_text_b = gr.TextArea(
|
| 152 |
label="Text B",
|
| 153 |
+
placeholder="Choose sample text from the dropdown above or type or paste your text.",
|
| 154 |
lines=12,
|
| 155 |
max_lines=12,
|
| 156 |
)
|
|
|
|
|
|
|
|
|
|
| 157 |
out_code_b = gr.Textbox(label="ISCC Code for Text B")
|
| 158 |
+
|
| 159 |
+
with gr.Row(variant="panel"):
|
| 160 |
+
with gr.Column(variant="panel"):
|
| 161 |
+
out_similarity = gr.HTML(label="Similarity")
|
| 162 |
+
|
| 163 |
+
with gr.Row(variant="panel"):
|
| 164 |
+
in_iscc_bits = gr.Slider(
|
| 165 |
+
label="ISCC Bit-Length",
|
| 166 |
+
info="NUMBER OF BITS FOR OUTPUT ISCC",
|
| 167 |
+
minimum=64,
|
| 168 |
+
maximum=256,
|
| 169 |
+
step=32,
|
| 170 |
+
value=64,
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
with gr.Row(variant="panel"):
|
| 174 |
+
with gr.Column(variant="panel"):
|
| 175 |
+
out_chunks_a = gr.HighlightedText(
|
| 176 |
+
label="Chunked Text A",
|
| 177 |
+
interactive=False,
|
| 178 |
+
elem_id="chunked-text-a",
|
| 179 |
+
)
|
| 180 |
+
with gr.Column(variant="panel"):
|
| 181 |
out_chunks_b = gr.HighlightedText(
|
| 182 |
label="Chunked Text B",
|
| 183 |
interactive=False,
|
| 184 |
elem_id="chunked-text-b",
|
| 185 |
)
|
| 186 |
|
| 187 |
+
def update_sample_text(choice, group):
    """Return the sample text that belongs to a dropdown selection.

    :param choice: Label selected in the sample dropdown; ``"None"`` or an empty
        value means that no sample is selected.
    :param group: Key of the sample group inside ``samples`` (``"a"`` or ``"b"``).
    :return: The sample text, or ``""`` when nothing (or an unknown label) is selected.
    """
    if not choice or choice == "None":
        return ""

    group_samples = samples[group]
    if choice not in group_samples:
        # reset_all() rebuilds the dropdown choices as "<group>:<label>"; accept that
        # spelling as well instead of failing with a KeyError.
        prefix = f"{group}:"
        if choice.startswith(prefix):
            choice = choice[len(prefix):]

    # Unknown labels clear the text area rather than raising inside the event handler.
    return group_samples.get(choice, "")
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
sample_dropdown_a.change(
|
| 193 |
+
lambda choice: update_sample_text(choice, "a"),
|
| 194 |
+
inputs=[sample_dropdown_a],
|
| 195 |
+
outputs=[in_text_a],
|
| 196 |
)
|
| 197 |
sample_dropdown_b.change(
|
| 198 |
+
lambda choice: update_sample_text(choice, "b"),
|
| 199 |
+
inputs=[sample_dropdown_b],
|
| 200 |
+
outputs=[in_text_b],
|
| 201 |
)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
def process_text(text, nbits, suffix):
|
| 204 |
log.debug(f"{text[:20]}")
|
| 205 |
out_code_func = globals().get(f"out_code_{suffix}")
|
|
|
|
| 211 |
out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
|
| 212 |
}
|
| 213 |
|
| 214 |
+
result = sct.gen_text_code_semantic(
|
| 215 |
+
text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
|
| 216 |
+
)
|
| 217 |
iscc = sct.Metadata(**result).to_object_format()
|
| 218 |
|
| 219 |
# Generate chunked text with simprints and overlaps
|
|
|
|
| 283 |
show_progress="full",
|
| 284 |
)
|
| 285 |
|
| 286 |
+
out_code_a.change(
|
| 287 |
+
compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
|
| 288 |
+
)
|
| 289 |
+
out_code_b.change(
|
| 290 |
+
compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
|
| 291 |
+
)
|
| 292 |
|
| 293 |
def reset_all():
|
| 294 |
return (
|
| 295 |
gr.Slider(value=128), # Reset ISCC Bit-Length
|
| 296 |
+
gr.Dropdown(
|
| 297 |
+
value="None", choices=["None"] + [f"a:{lang}" for lang in samples["a"]]
|
| 298 |
+
), # Reset sample dropdown A
|
| 299 |
+
gr.Dropdown(
|
| 300 |
+
value="None", choices=["None"] + [f"b:{lang}" for lang in samples["b"]]
|
| 301 |
+
), # Reset sample dropdown B
|
| 302 |
gr.TextArea(value=""), # Reset Text A
|
| 303 |
gr.TextArea(value=""), # Reset Text B
|
| 304 |
gr.Textbox(value=""), # Reset ISCC Code for Text A
|
iscc_sct/dev.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import pathlib
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
HERE = pathlib.Path(__file__).parent.absolute()
|
|
@@ -20,3 +21,22 @@ def convert_lf(): # pragma: no cover
|
|
| 20 |
outfile.write(content)
|
| 21 |
n += 1
|
| 22 |
print(f"{n} files converted to LF")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pathlib
|
| 2 |
+
import yaml
|
| 3 |
|
| 4 |
|
| 5 |
HERE = pathlib.Path(__file__).parent.absolute()
|
|
|
|
| 21 |
outfile.write(content)
|
| 22 |
n += 1
|
| 23 |
print(f"{n} files converted to LF")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def format_yml():
    """Rewrite every ``*.yml`` file below ``HERE`` with a uniform YAML layout.

    Each file is parsed with ``yaml.safe_load`` and written back in place with
    ``yaml.safe_dump`` using 2-space indentation, a width of 80, the ``>`` scalar
    style, unsorted keys and LF line endings.
    """
    # Use "/" in the pattern: it is accepted on every platform, whereas the former
    # backslash only acts as a path separator on Windows.
    for f in HERE.glob("**/*.yml"):
        with open(f, "rt", encoding="utf-8") as infile:
            data = yaml.safe_load(infile)
        with open(f, "wt", encoding="utf-8", newline="\n") as outf:
            yaml.safe_dump(
                data,
                outf,
                indent=2,
                width=80,
                encoding="utf-8",
                sort_keys=False,
                default_flow_style=False,
                default_style=">",
                allow_unicode=True,
                line_break="\n",
            )
|
iscc_sct/models.py
CHANGED
|
@@ -82,7 +82,9 @@ class PrettyBaseModel(BaseModel):
|
|
| 82 |
return self.pretty_repr()
|
| 83 |
|
| 84 |
def pretty_repr(self):
|
| 85 |
-
return self.model_dump_json(
|
|
|
|
|
|
|
| 86 |
|
| 87 |
|
| 88 |
class Feature(PrettyBaseModel):
|
|
@@ -132,9 +134,15 @@ class Metadata(PrettyBaseModel):
|
|
| 132 |
new_features.append(new_feature_set)
|
| 133 |
else:
|
| 134 |
new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
|
| 135 |
-
new_feature_set.offsets = [
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
new_features.append(new_feature_set)
|
| 139 |
|
| 140 |
return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
|
|
@@ -154,7 +162,9 @@ class Metadata(PrettyBaseModel):
|
|
| 154 |
# Convert to object format if in index format
|
| 155 |
feature_set = self.to_object_format().features[0]
|
| 156 |
|
| 157 |
-
if not all(
|
|
|
|
|
|
|
| 158 |
return None
|
| 159 |
|
| 160 |
# Sort features by offset
|
|
@@ -191,7 +201,9 @@ class Metadata(PrettyBaseModel):
|
|
| 191 |
# Convert to object format if in index format
|
| 192 |
feature_set = self.to_object_format().features[0]
|
| 193 |
|
| 194 |
-
if not all(
|
|
|
|
|
|
|
| 195 |
return []
|
| 196 |
|
| 197 |
# Sort features by offset
|
|
|
|
| 82 |
return self.pretty_repr()
|
| 83 |
|
| 84 |
def pretty_repr(self):
|
| 85 |
+
return self.model_dump_json(
|
| 86 |
+
indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False
|
| 87 |
+
)
|
| 88 |
|
| 89 |
|
| 90 |
class Feature(PrettyBaseModel):
|
|
|
|
| 134 |
new_features.append(new_feature_set)
|
| 135 |
else:
|
| 136 |
new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
|
| 137 |
+
new_feature_set.offsets = [
|
| 138 |
+
f.offset for f in feature_set.simprints if f.offset is not None
|
| 139 |
+
]
|
| 140 |
+
new_feature_set.sizes = [
|
| 141 |
+
f.size for f in feature_set.simprints if f.size is not None
|
| 142 |
+
]
|
| 143 |
+
new_feature_set.contents = [
|
| 144 |
+
f.content for f in feature_set.simprints if f.content is not None
|
| 145 |
+
]
|
| 146 |
new_features.append(new_feature_set)
|
| 147 |
|
| 148 |
return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
|
|
|
|
| 162 |
# Convert to object format if in index format
|
| 163 |
feature_set = self.to_object_format().features[0]
|
| 164 |
|
| 165 |
+
if not all(
|
| 166 |
+
feature.content and feature.offset is not None for feature in feature_set.simprints
|
| 167 |
+
):
|
| 168 |
return None
|
| 169 |
|
| 170 |
# Sort features by offset
|
|
|
|
| 201 |
# Convert to object format if in index format
|
| 202 |
feature_set = self.to_object_format().features[0]
|
| 203 |
|
| 204 |
+
if not all(
|
| 205 |
+
feature.content and feature.offset is not None for feature in feature_set.simprints
|
| 206 |
+
):
|
| 207 |
return []
|
| 208 |
|
| 209 |
# Sort features by offset
|
iscc_sct/options.py
CHANGED
|
@@ -29,15 +29,27 @@ class SctOptions(BaseSettings):
|
|
| 29 |
multiple_of=32,
|
| 30 |
)
|
| 31 |
|
| 32 |
-
characters: bool = Field(
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
precision: int = Field(
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
simprints: bool = Field(
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
sizes: bool = Field(
|
|
|
|
|
|
|
| 41 |
|
| 42 |
contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")
|
| 43 |
|
|
@@ -52,7 +64,9 @@ class SctOptions(BaseSettings):
|
|
| 52 |
description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
|
| 53 |
)
|
| 54 |
|
| 55 |
-
trim: bool = Field(
|
|
|
|
|
|
|
| 56 |
|
| 57 |
model_config = SettingsConfigDict(
|
| 58 |
env_file=".env",
|
|
|
|
| 29 |
multiple_of=32,
|
| 30 |
)
|
| 31 |
|
| 32 |
+
characters: bool = Field(
|
| 33 |
+
True, description="ISCC_SCT_CHARACTERS - Include document character count"
|
| 34 |
+
)
|
| 35 |
+
embedding: bool = Field(
|
| 36 |
+
False, description="ISCC_SCT_EMBEDDING - Include global document embedding"
|
| 37 |
+
)
|
| 38 |
|
| 39 |
+
precision: int = Field(
|
| 40 |
+
8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)"
|
| 41 |
+
)
|
| 42 |
|
| 43 |
+
simprints: bool = Field(
|
| 44 |
+
False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints"
|
| 45 |
+
)
|
| 46 |
+
offsets: bool = Field(
|
| 47 |
+
False, description="ISCC_SCT_OFFSETS - Include offsets of granular features"
|
| 48 |
+
)
|
| 49 |
|
| 50 |
+
sizes: bool = Field(
|
| 51 |
+
False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)"
|
| 52 |
+
)
|
| 53 |
|
| 54 |
contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")
|
| 55 |
|
|
|
|
| 64 |
description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
|
| 65 |
)
|
| 66 |
|
| 67 |
+
trim: bool = Field(
|
| 68 |
+
False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)"
|
| 69 |
+
)
|
| 70 |
|
| 71 |
model_config = SettingsConfigDict(
|
| 72 |
env_file=".env",
|
iscc_sct/samples.yml
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"samples":
|
| 2 |
+
"a":
|
| 3 |
+
"English": >
|
| 4 |
+
This document specifies the syntax and structure of the International Standard
|
| 5 |
+
Content Code (ISCC), as an identification system for digital assets (including
|
| 6 |
+
encodings of text, images, audio, video or other content across all media sectors).
|
| 7 |
+
It also describes ISCC metadata and the use of ISCC in conjunction with other
|
| 8 |
+
schemes, such as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
An ISCC applies to a specific digital asset and is a data-descriptor deterministically
|
| 12 |
+
constructed from multiple hash digests using the algorithms and rules in this
|
| 13 |
+
document. This document does not provide information on registration of ISCCs.
|
| 14 |
+
"Hungarian": >
|
| 15 |
+
Ez a dokumentum meghatározza a Nemzetközi Szabványos Tartalomkód (ISCC) szintaxisát
|
| 16 |
+
és szerkezetét, amely a digitális eszközök azonosítási rendszere (beleértve
|
| 17 |
+
a szövegek, képek, hang-, videó- vagy egyéb tartalmak kódolását minden médiaszektorban).
|
| 18 |
+
Leírja továbbá az ISCC metaadatokat és az ISCC használatát más sémákkal, mint
|
| 19 |
+
például a DOI, ISAN, ISBN, ISRC, ISSN és ISWC.
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
Az ISCC egy adott digitális eszközre vonatkozik, és több hash kivonatból determinisztikusan
|
| 23 |
+
összeállított adatleíró a jelen dokumentumban szereplő algoritmusok és szabályok
|
| 24 |
+
alapján. Ez a dokumentum nem ad tájékoztatást az ISCC-k regisztrációjáról.
|
| 25 |
+
"Bulgarian": >
|
| 26 |
+
Този документ определя синтаксиса и структурата на Международния стандартен
|
| 27 |
+
код на съдържанието (ISCC) като система за идентификация на цифрови активи (включително
|
| 28 |
+
кодиране на текст, изображения, аудио, видео или друго съдържание във всички
|
| 29 |
+
медийни сектори). Той описва също метаданните на ISCC и използването на ISCC
|
| 30 |
+
във връзка с други схеми, като DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор
|
| 34 |
+
на данни, конструиран от множество хеш-разходи, като се използват алгоритмите
|
| 35 |
+
и правилата в настоящия документ. Настоящият документ не предоставя информация
|
| 36 |
+
за регистрацията на ISCC.
|
| 37 |
+
"Finnish": >
|
| 38 |
+
Tässä asiakirjassa määritellään ISCC:n (International Standard Content Code)
|
| 39 |
+
syntaksi ja rakenne digitaalisen omaisuuden tunnistusjärjestelmänä (mukaan lukien
|
| 40 |
+
tekstin, kuvien, äänen, videon tai muun sisällön koodaukset kaikilla mediasektoreilla).
|
| 41 |
+
Siinä kuvataan myös ISCC-metatiedot ja ISCC:n käyttö muiden järjestelmien, kuten
|
| 42 |
+
DOI, ISAN, ISBN, ISRC, ISSN ja ISWC, kanssa.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
ISCC koskee tiettyä digitaalista omaisuutta, ja se on datakuvaaja, joka on deterministisesti
|
| 46 |
+
muodostettu useista hajautuskoosteista käyttämällä tämän asiakirjan algoritmeja
|
| 47 |
+
ja sääntöjä. Tämä asiakirja ei sisällä tietoja ISCC:iden rekisteröinnistä.
|
| 48 |
+
"Arabic": >
|
| 49 |
+
تحدد هذه الوثيقة بناء جملة وبنية كود المحتوى القياسي الدولي (ISCC)، كنظام تعريف
|
| 50 |
+
للأصول الرقمية (بما في ذلك ترميز النصوص أو الصور أو الصوت أو الفيديو أو أي محتوى
|
| 51 |
+
آخر عبر جميع قطاعات الوسائط). ويصف أيضًا بيانات تعريف ISCC واستخدام ISCC بالتزامن
|
| 52 |
+
مع مخططات أخرى، مثل DOI وISAN وISBN وISRC وISSN وISWC.
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
ينطبق ISCC على أصل رقمي محدد وهو عبارة عن واصف بيانات تم إنشاؤه بشكل حتمي من
|
| 56 |
+
ملخصات تجزئة متعددة باستخدام الخوارزميات والقواعد الواردة في هذه الوثيقة. لا
|
| 57 |
+
تقدم هذه الوثيقة معلومات عن تسجيل ISCCs.
|
| 58 |
+
"b":
|
| 59 |
+
"Paraphrased": >
|
| 60 |
+
In order to identify digital assets (such as encodings of text, images, music,
|
| 61 |
+
video, and other content across all media sectors), this paper outlines the
|
| 62 |
+
syntax and structure of the International Standard Content Code (ISCC). Additionally,
|
| 63 |
+
it explains how to use ISCC metadata and how to combine it with other schemes
|
| 64 |
+
like DOI, ISAN, ISBN, ISRC, ISSN, and ISWC.
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
An ISCC is a data-descriptor that is applied to a particular digital asset and
|
| 68 |
+
is deterministically built from several hash digests utilizing the rules and
|
| 69 |
+
techniques in this specification. There is no information about ISCC registration
|
| 70 |
+
in this paper.
|
| 71 |
+
"German": >
|
| 72 |
+
Dieses Dokument spezifiziert die Syntax und Struktur des International Standard
|
| 73 |
+
Content Code (ISCC) als Identifizierungssystem für digitale Inhalte (einschließlich
|
| 74 |
+
Kodierungen von Text, Bildern, Audio, Video oder anderen Inhalten in allen Medienbereichen).
|
| 75 |
+
Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung
|
| 76 |
+
mit anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor,
|
| 80 |
+
der deterministisch aus mehreren Hash-Digests unter Verwendung der Algorithmen
|
| 81 |
+
und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält keine Informationen
|
| 82 |
+
über die Registrierung von ISCCs.
|
| 83 |
+
"Chinese": >
|
| 84 |
+
本文件規定了國際標準內容編碼 (ISCC) 的語法和結構,作為數位資產 (包括所有媒體領域的文字、影像、音訊、視訊或其他內容的編碼) 的識別系統。它還介紹了
|
| 85 |
+
ISCC 元資料以及 ISCC 與其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的結合使用。
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
ISCC 適用於特定的數位資產,是使用本文件中的演算法和規則,由多個雜湊摘要(hash digests)確定地建構出來的資料描述符(data-descriptor)。本文件不提供
|
| 89 |
+
ISCC 的註冊資訊。
|
| 90 |
+
"Korean": >
|
| 91 |
+
이 문서는 디지털 자산(모든 미디어 부문의 텍스트, 이미지, 오디오, 비디오 또는 기타 콘텐츠의 인코딩 포함)에 대한 식별 시스템인 ISCC(국제
|
| 92 |
+
표준 콘텐츠 코드)의 구문과 구조를 지정합니다. 또한 ISCC 메타데이터와 DOI, ISAN, ISBN, ISRC, ISSN 및 ISWC와
|
| 93 |
+
같은 다른 체계와 함께 ISCC를 사용하는 방법에 대해 설명합니다.
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
ISCC는 특정 디지털 자산에 적용되며 이 문서의 알고리즘과 규칙을 사용하여 여러 해시 다이제스트에서 결정론적으로 구성된 데이터 설명자입니다.
|
| 97 |
+
이 문서는 ISCC 등록에 대한 정보를 제공하지 않습니다.
|
| 98 |
+
"Japanese": >
|
| 99 |
+
この文書は、デジタル資産 (すべてのメディア セクターにわたるテキスト、画像、オーディオ、ビデオ、またはその他のコンテンツのエンコードを含む) の識別システムとして、国際標準コンテンツ
|
| 100 |
+
コード (ISCC) の構文と構造を指定します。また、ISCC メタデータと、DOI、ISAN、ISBN、ISRC、ISSN、ISWC などの他のスキームと組み合わせた
|
| 101 |
+
ISCC の使用についても説明します。
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
ISCC は特定のデジタル資産に適用され、本書のアルゴリズムとルールを使用して複数のハッシュ ダイジェストから決定論的に構築されるデータ記述子です。この文書には、ISCC
|
| 105 |
+
の登録に関する情報は記載されていません。
|
poetry.lock
CHANGED
|
@@ -2796,4 +2796,4 @@ gpu = ["onnxruntime-gpu"]
|
|
| 2796 |
[metadata]
|
| 2797 |
lock-version = "2.0"
|
| 2798 |
python-versions = ">=3.9,<3.13"
|
| 2799 |
-
content-hash = "
|
|
|
|
| 2796 |
[metadata]
|
| 2797 |
lock-version = "2.0"
|
| 2798 |
python-versions = ">=3.9,<3.13"
|
| 2799 |
+
content-hash = "e4a4f012af4c1e60326f792c8801857dbf9298d8992fdd83d3b8f0688d4c04ea"
|
pyproject.toml
CHANGED
|
@@ -60,6 +60,7 @@ numpy = "<2.0.0"
|
|
| 60 |
pybase64 = "^1.4.0"
|
| 61 |
certifi = ">=2024.07.04"
|
| 62 |
gradio = { version = "*", optional = true }
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
[tool.poetry.extras]
|
|
@@ -79,7 +80,7 @@ mdformat-gfm-alerts = "*"
|
|
| 79 |
mdformat-frontmatter = "*"
|
| 80 |
|
| 81 |
[tool.ruff]
|
| 82 |
-
line-length =
|
| 83 |
|
| 84 |
[tool.ruff.format]
|
| 85 |
line-ending = "lf"
|
|
@@ -89,11 +90,12 @@ omit = ["iscc_sct/dev.py", "tests/", "iscc_sct/demo.py"]
|
|
| 89 |
|
| 90 |
[tool.poe.tasks]
|
| 91 |
format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
|
| 92 |
-
format-markdown = { cmd = "mdformat --wrap
|
|
|
|
| 93 |
convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
|
| 94 |
test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
|
| 95 |
update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
|
| 96 |
-
all = ["format-code", "format-markdown", "convert-lf", "test"]
|
| 97 |
update = ["update-dependencies", "all"]
|
| 98 |
|
| 99 |
[build-system]
|
|
|
|
| 60 |
pybase64 = "^1.4.0"
|
| 61 |
certifi = ">=2024.07.04"
|
| 62 |
gradio = { version = "*", optional = true }
|
| 63 |
+
pyyaml = "^6.0.2"
|
| 64 |
|
| 65 |
|
| 66 |
[tool.poetry.extras]
|
|
|
|
| 80 |
mdformat-frontmatter = "*"
|
| 81 |
|
| 82 |
[tool.ruff]
|
| 83 |
+
line-length = 100
|
| 84 |
|
| 85 |
[tool.ruff.format]
|
| 86 |
line-ending = "lf"
|
|
|
|
| 90 |
|
| 91 |
[tool.poe.tasks]
|
| 92 |
format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
|
| 93 |
+
format-markdown = { cmd = "mdformat --wrap 100 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
|
| 94 |
+
format-yml = { script = "iscc_sct.dev:format_yml", help = "Format YML files"}
|
| 95 |
convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
|
| 96 |
test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
|
| 97 |
update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
|
| 98 |
+
all = ["format-code", "format-markdown", "format-yml", "convert-lf", "test"]
|
| 99 |
update = ["update-dependencies", "all"]
|
| 100 |
|
| 101 |
[build-system]
|
tests/benchmark.py
CHANGED
|
@@ -32,7 +32,9 @@ def benchmark(folder):
|
|
| 32 |
elapsed_time = end_time - start_time
|
| 33 |
total_time += elapsed_time
|
| 34 |
file_count += 1
|
| 35 |
-
log.info(
|
|
|
|
|
|
|
| 36 |
|
| 37 |
if file_count > 0:
|
| 38 |
avg_time = total_time / file_count
|
|
@@ -45,7 +47,9 @@ def benchmark(folder):
|
|
| 45 |
|
| 46 |
def main():
|
| 47 |
parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
|
| 48 |
-
parser.add_argument(
|
|
|
|
|
|
|
| 49 |
args = parser.parse_args()
|
| 50 |
|
| 51 |
benchmark(args.folder)
|
|
|
|
| 32 |
elapsed_time = end_time - start_time
|
| 33 |
total_time += elapsed_time
|
| 34 |
file_count += 1
|
| 35 |
+
log.info(
|
| 36 |
+
f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}"
|
| 37 |
+
)
|
| 38 |
|
| 39 |
if file_count > 0:
|
| 40 |
avg_time = total_time / file_count
|
|
|
|
| 47 |
|
| 48 |
def main():
|
| 49 |
parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
|
| 50 |
+
parser.add_argument(
|
| 51 |
+
"folder", type=str, help="Directory containing text files for benchmarking."
|
| 52 |
+
)
|
| 53 |
args = parser.parse_args()
|
| 54 |
|
| 55 |
benchmark(args.folder)
|
tests/test_cli.py
CHANGED
|
@@ -52,7 +52,9 @@ def test_cli_generate_sct(sample_text_file):
|
|
| 52 |
|
| 53 |
|
| 54 |
def test_cli_generate_sct_granular(sample_text_file):
|
| 55 |
-
result = subprocess.run(
|
|
|
|
|
|
|
| 56 |
assert result.returncode == 0
|
| 57 |
assert "features" in result.stdout
|
| 58 |
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def test_cli_generate_sct_granular(sample_text_file):
|
| 55 |
+
result = subprocess.run(
|
| 56 |
+
[sct, str(sample_text_file), "--granular"], capture_output=True, text=True
|
| 57 |
+
)
|
| 58 |
assert result.returncode == 0
|
| 59 |
assert "features" in result.stdout
|
| 60 |
|
tests/test_iscc_sct.py
CHANGED
|
@@ -178,7 +178,9 @@ def test_embed_tokens():
|
|
| 178 |
chunks = ["Hello World", "These are chunks"]
|
| 179 |
tokens = tokenize_chunks(chunks)
|
| 180 |
embeddings = embed_tokens(tokens)
|
| 181 |
-
assert list(embeddings[0][0][:3]) == pytest.approx(
|
|
|
|
|
|
|
| 182 |
|
| 183 |
|
| 184 |
def test_embed_chunks():
|
|
|
|
| 178 |
chunks = ["Hello World", "These are chunks"]
|
| 179 |
tokens = tokenize_chunks(chunks)
|
| 180 |
embeddings = embed_tokens(tokens)
|
| 181 |
+
assert list(embeddings[0][0][:3]) == pytest.approx(
|
| 182 |
+
[0.05907335, 0.11408358, 0.12727071], rel=1e-2
|
| 183 |
+
)
|
| 184 |
|
| 185 |
|
| 186 |
def test_embed_chunks():
|
tests/test_main.py
CHANGED
|
@@ -21,7 +21,9 @@ def test_create_granular():
|
|
| 21 |
"maintype": "semantic",
|
| 22 |
"subtype": "text",
|
| 23 |
"version": 0,
|
| 24 |
-
"simprints": [
|
|
|
|
|
|
|
| 25 |
}
|
| 26 |
],
|
| 27 |
}
|
|
|
|
| 21 |
"maintype": "semantic",
|
| 22 |
"subtype": "text",
|
| 23 |
"version": 0,
|
| 24 |
+
"simprints": [
|
| 25 |
+
{"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}
|
| 26 |
+
],
|
| 27 |
}
|
| 28 |
],
|
| 29 |
}
|
tests/test_models.py
CHANGED
|
@@ -22,7 +22,11 @@ def test_feature_initialization():
|
|
| 22 |
|
| 23 |
def test_feature_set_initialization():
|
| 24 |
fs = FeatureSet()
|
| 25 |
-
assert fs.model_dump(exclude_none=True) == {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
def test_sct_meta_initialization():
|
|
@@ -33,7 +37,12 @@ def test_sct_meta_initialization():
|
|
| 33 |
assert meta.features is None
|
| 34 |
|
| 35 |
# Test initialization with all fields
|
| 36 |
-
features = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
|
| 38 |
assert meta.iscc == "ISCC1234567890"
|
| 39 |
assert meta.characters == 1000
|
|
@@ -67,7 +76,12 @@ def test_metadata_to_index_format():
|
|
| 67 |
def test_metadata_to_object_format():
|
| 68 |
# Test conversion from Index-Format to Object-Format
|
| 69 |
features = [
|
| 70 |
-
FeatureSet(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
]
|
| 72 |
meta = Metadata(iscc="ISCC1234567890", features=features)
|
| 73 |
object_meta = meta.to_object_format()
|
|
|
|
| 22 |
|
| 23 |
def test_feature_set_initialization():
|
| 24 |
fs = FeatureSet()
|
| 25 |
+
assert fs.model_dump(exclude_none=True) == {
|
| 26 |
+
"maintype": "semantic",
|
| 27 |
+
"subtype": "text",
|
| 28 |
+
"version": 0,
|
| 29 |
+
}
|
| 30 |
|
| 31 |
|
| 32 |
def test_sct_meta_initialization():
|
|
|
|
| 37 |
assert meta.features is None
|
| 38 |
|
| 39 |
# Test initialization with all fields
|
| 40 |
+
features = [
|
| 41 |
+
FeatureSet(
|
| 42 |
+
simprints=[Feature(simprint="feature1", offset=0, content="text1")],
|
| 43 |
+
embedding=[0.1, 0.2],
|
| 44 |
+
)
|
| 45 |
+
]
|
| 46 |
meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
|
| 47 |
assert meta.iscc == "ISCC1234567890"
|
| 48 |
assert meta.characters == 1000
|
|
|
|
| 76 |
def test_metadata_to_object_format():
|
| 77 |
# Test conversion from Index-Format to Object-Format
|
| 78 |
features = [
|
| 79 |
+
FeatureSet(
|
| 80 |
+
simprints=["feature1", "feature2"],
|
| 81 |
+
offsets=[0, 5],
|
| 82 |
+
sizes=[5, 5],
|
| 83 |
+
contents=["text1", "text2"],
|
| 84 |
+
)
|
| 85 |
]
|
| 86 |
meta = Metadata(iscc="ISCC1234567890", features=features)
|
| 87 |
object_meta = meta.to_object_format()
|
tests/visualize.py
CHANGED
|
@@ -49,7 +49,9 @@ def generate_html(fingerprint_data):
|
|
| 49 |
if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
|
| 50 |
overlap_end = chunks[i + 1]["offset"]
|
| 51 |
html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
|
| 52 |
-
html_content += escape_and_preserve_breaks(
|
|
|
|
|
|
|
| 53 |
else:
|
| 54 |
html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
|
| 55 |
|
|
|
|
| 49 |
if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
|
| 50 |
overlap_end = chunks[i + 1]["offset"]
|
| 51 |
html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
|
| 52 |
+
html_content += escape_and_preserve_breaks(
|
| 53 |
+
chunk["text"][overlap_end - chunk["offset"] :]
|
| 54 |
+
)
|
| 55 |
else:
|
| 56 |
html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
|
| 57 |
|