ayushKishor's picture
Keep extraction chunks free of context prefixes
0a62245
raw
history blame contribute delete
523 Bytes
from pluto.tools import strip_non_extractable_context
def test_strip_non_extractable_context_removes_extraction_metadata_prefixes():
    """Check that the bracketed metadata header lines are stripped from a chunk.

    The input consists of two bracketed header lines, a blank line, and one
    sentence of body text; the helper is expected to return only that
    sentence.
    """
    body_text = "The Transformer is a model architecture based on attention mechanisms."
    context_header = "[Context | doc:attention | chunk:C0 | section:introduction]\n"
    document_header = "[Document context: Attention Is All You Need | Domain: ML]\n\n"
    prefixed_chunk = context_header + document_header + body_text

    cleaned = strip_non_extractable_context(prefixed_chunk)

    # Only the body sentence should survive; both header lines and the blank
    # separator line must be gone.
    assert cleaned == body_text