"""
Preprocessing module — cleans and normalizes raw text in records.

Engine contract:
    run(EngineInput) -> EngineOutput

Applies basic text cleaning to each record's ``raw_text`` field:
strip whitespace, normalize Unicode, collapse whitespace runs.
"""

from __future__ import annotations

import logging
import re
import unicodedata
from typing import Any, Dict, List

from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus

# Module-level logger; the name is a fixed string (not ``__name__``), so it
# stays the same regardless of how this module is imported.
logger = logging.getLogger("modules.preprocessing")


# Compiled once at import time; ``_clean_text`` is called once per record.
_WHITESPACE_RUN = re.compile(r"\s+")


def _clean_text(text: str) -> str:
    """Return *text* with basic normalization applied.

    Steps, in order:

    1. Unicode NFKC normalization (compatibility characters are folded to
       their canonical equivalents).
    2. Leading and trailing whitespace is removed.
    3. Every remaining run of whitespace (spaces, tabs, newlines, ...) is
       replaced by a single ASCII space.

    Args:
        text: The raw string to clean.

    Returns:
        The normalized string; ``""`` when *text* is empty or contains only
        whitespace.
    """
    normalized = unicodedata.normalize("NFKC", text)
    # Strip first so the collapse step cannot leave a lone space at either end.
    trimmed = normalized.strip()
    return _WHITESPACE_RUN.sub(" ", trimmed)


def _strip_or_none(value: str | None, *, lower: bool = False) -> str | None:
    """Strip *value* (optionally lowercasing it); return ``None`` if nothing is left.

    ``None``, the empty string and whitespace-only strings all map to ``None``
    so that "no value" has a single representation after preprocessing.
    """
    if not value:
        return None
    stripped = value.strip()
    if lower:
        stripped = stripped.lower()
    return stripped or None


def run(engine_input: EngineInput) -> EngineOutput:
    """Preprocess all records: clean ``raw_text``, normalize entity fields.

    For every record in ``engine_input.records`` an updated copy is built with
    ``model_copy(update=...)``:

    * ``raw_text`` is passed through ``_clean_text``.
    * ``entity_name`` is stripped of surrounding whitespace.
    * ``entity_email`` and ``entity_domain`` are stripped and lowercased.
    * Entity fields that are missing, empty, or whitespace-only become ``None``.

    Args:
        engine_input: Stage input; only its ``records`` attribute is read.

    Returns:
        An ``EngineOutput`` for stage ``"preprocessing"``. On success it has
        status ``StageStatus.SUCCESS``, the cleaned records and a summary
        string. If anything raises while processing, the exception is logged
        with its traceback and an output with status ``StageStatus.FAILED``
        and ``error=str(exc)`` is returned instead; the exception is not
        propagated to the caller.
    """
    try:
        cleaned: List[NormalizedRecord] = [
            record.model_copy(update={
                "raw_text": _clean_text(record.raw_text),
                "entity_name": _strip_or_none(record.entity_name),
                "entity_email": _strip_or_none(record.entity_email, lower=True),
                "entity_domain": _strip_or_none(record.entity_domain, lower=True),
            })
            for record in engine_input.records
        ]
        # Built inside the ``try`` so a failure constructing the output is
        # also reported as a FAILED stage rather than raised.
        return EngineOutput(
            stage="preprocessing",
            status=StageStatus.SUCCESS,
            records=cleaned,
            summary=f"Preprocessed {len(cleaned)} records",
        )
    except Exception as exc:
        # Stage boundary: convert any failure into a FAILED result.
        logger.exception("Preprocessing failed: %s", exc)
        return EngineOutput(
            stage="preprocessing",
            status=StageStatus.FAILED,
            error=str(exc),
        )


def preprocess(text: str) -> str:
    """Legacy wrapper (deprecated). Use ``run()`` instead.

    Only strips leading/trailing whitespace and lowercases *text*. Unlike the
    record pipeline it performs no Unicode normalization and does not collapse
    interior whitespace runs.

    Args:
        text: The string to clean.

    Returns:
        The stripped, lowercased string.
    """
    trimmed = text.strip()
    return trimmed.lower()


if __name__ == "__main__":
    # Quick manual check of the legacy wrapper when run as a script.
    print(preprocess(" This is RAW DATA. "))