bayan-api / src /nlp /pipeline_context.py
youssefreda9's picture
HF Deploy: Fix syntax error with smart quotes in popup.js
fe1e225
Raw
History Blame Contribute Delete
4.66 kB
"""
PipelineContext — Shared state for a single /api/analyze request.
INVARIANTS:
1. original_text is IMMUTABLE after construction
2. _offset_mappers is APPEND-ONLY — past mappers never mutated or removed
3. map_to_original() is READ-ONLY and deterministic
4. All coordinate transforms go through OffsetMapper public API
TERMINOLOGY:
original_text = ORIGINAL_TEXT (user's raw input, immutable)
current_text = CURRENT_TEXT (pipeline working copy, mutated per stage)
# FUTURE: If pipeline grows beyond 5 stages, the linear reverse_map_offset()
# chain in map_to_original() could be replaced with a precomputed cumulative
# mapping or segment-tree composition. Not needed for 3 stages.
"""
import logging
from nlp.correction_patch import PatchSet, CorrectionPatch, PRIORITY
from nlp.stage_locker import StageLocker, PIPELINE_DEBUG
logger = logging.getLogger(__name__)
class PipelineContext:
"""
Shared state object for a single /api/analyze request.
Carries all pipeline state through Spelling → Grammar → Punctuation.
Discarded after the response is sent.
"""
def __init__(self, original_text: str):
self.original_text: str = original_text # IMMUTABLE — never assign again
self.current_text: str = original_text
self.patches: PatchSet = PatchSet()
self._offset_mappers: list = [] # APPEND-ONLY
self.stage_locker: StageLocker = StageLocker()
def map_to_original(self, start: int, end: int) -> tuple:
"""
Map CURRENT_TEXT coords → ORIGINAL_TEXT coords.
Walks the OffsetMapper chain in REVERSE, calling
reverse_map_offset() on each mapper.
READ-ONLY: does not mutate any state.
DETERMINISTIC: same inputs always produce same outputs.
"""
curr_start, curr_end = start, end
for mapper in reversed(self._offset_mappers):
curr_start = mapper.reverse_map_offset(curr_start, is_end=False)
curr_end = mapper.reverse_map_offset(curr_end, is_end=True)
return curr_start, curr_end
def add_patch(self, stage: str, start_current: int, end_current: int,
replacement: str, confidence: float = 1.0,
alternatives: list = None):
"""
Create a CorrectionPatch with BOTH coordinate spaces populated.
CURRENT coords: from the diff (for StageLocker)
ORIGINAL coords: computed via reverse mapper chain (for API + overlap resolution)
"""
start_orig, end_orig = self.map_to_original(start_current, end_current)
text_len = len(self.original_text)
start_orig = max(0, min(start_orig, text_len))
end_orig = max(start_orig, min(end_orig, text_len))
patch = CorrectionPatch(
stage=stage,
start_original=start_orig,
end_original=end_orig,
start_current=start_current,
end_current=end_current,
original=self.original_text[start_orig:end_orig],
replacement=replacement,
priority=PRIORITY.get(stage, 0),
confidence=confidence,
locked=True,
alternatives=alternatives or [],
)
self.patches.add(patch)
# Lock in CURRENT_TEXT coordinates
self.stage_locker.lock(start_current, end_current, stage)
if PIPELINE_DEBUG:
logger.debug(
f"[PipelineContext] PATCH stage={stage} "
f"CURRENT=[{start_current}:{end_current}] "
f"ORIGINAL=[{start_orig}:{end_orig}] "
f"'{patch.original}' → '{replacement}'"
)
return patch
def mutate_text(self, text_after: str, OffsetMapperClass):
"""
Update CURRENT_TEXT after a stage produces a new version.
1. Creates OffsetMapper(text_before=current_text, text_after=new)
2. APPENDS to mapper chain (append-only invariant)
3. Updates StageLocker via mapper.forward_map_range()
4. Sets current_text = text_after
"""
if text_after == self.current_text:
return
mapper = OffsetMapperClass(self.current_text, text_after)
self._offset_mappers.append(mapper) # APPEND-ONLY
# StageLocker updates via mapper.forward_map_range() — direction is in the name
self.stage_locker.update_via_mapper(mapper)
if PIPELINE_DEBUG:
logger.debug(
f"[PipelineContext] MUTATE len={len(self.current_text)}{len(text_after)} "
f"mappers={len(self._offset_mappers)}"
)
self.current_text = text_after