mod-osint / modules /preprocessing /preprocess_data.py
moddux's picture
deploy: HF sanitized GUI snapshot
b75c637
"""
Preprocessing module — cleans and normalizes raw text in records.
Engine contract:
run(EngineInput) -> EngineOutput
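Applies basic text cleaning to each record's ``raw_text`` field
(Unicode NFKC normalization, stripping of surrounding whitespace, collapsing
of whitespace runs) and trims the ``entity_name``, ``entity_email`` and
``entity_domain`` fields, lower-casing the email and domain.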
"""
from __future__ import annotations
import logging
import re
import unicodedata
from typing import Any, Dict, List
from engine.io_contract import EngineInput, EngineOutput, NormalizedRecord, StageStatus
logger = logging.getLogger("modules.preprocessing")
def _clean_text(text: str) -> str:
"""Basic text normalization."""
# Unicode NFKC normalization
text = unicodedata.normalize("NFKC", text)
# Strip leading/trailing whitespace
text = text.strip()
# Collapse multiple whitespace to single space
text = re.sub(r"\s+", " ", text)
return text
def run(engine_input: EngineInput) -> EngineOutput:
    """
    Engine entry point for the preprocessing stage.

    For every record in ``engine_input.records`` a copy is made via
    ``model_copy(update=...)`` in which ``raw_text`` has been passed through
    ``_clean_text`` and the entity fields have been tidied: ``entity_name``
    is stripped, ``entity_email`` and ``entity_domain`` are stripped and
    lower-cased. Entity values that are falsy (``None`` or empty) become
    ``None``.

    Returns an ``EngineOutput`` for stage ``"preprocessing"``: status
    ``SUCCESS`` with the cleaned records and a summary on success, or status
    ``FAILED`` with the stringified exception if anything raises.
    """

    def _tidy(value, *, fold_case):
        # Falsy entity values are reported as None rather than passed through.
        if not value:
            return None
        trimmed = value.strip()
        return trimmed.lower() if fold_case else trimmed

    try:
        processed: List[NormalizedRecord] = [
            rec.model_copy(
                update={
                    "raw_text": _clean_text(rec.raw_text),
                    "entity_name": _tidy(rec.entity_name, fold_case=False),
                    "entity_email": _tidy(rec.entity_email, fold_case=True),
                    "entity_domain": _tidy(rec.entity_domain, fold_case=True),
                }
            )
            for rec in engine_input.records
        ]
        # Built inside the try so a failure while constructing the output is
        # reported as a FAILED stage as well.
        return EngineOutput(
            stage="preprocessing",
            status=StageStatus.SUCCESS,
            records=processed,
            summary=f"Preprocessed {len(processed)} records",
        )
    except Exception as exc:
        # Stage boundary: log with traceback and report failure to the engine
        # instead of propagating.
        logger.error("Preprocessing failed: %s", exc, exc_info=True)
        return EngineOutput(
            stage="preprocessing",
            status=StageStatus.FAILED,
            error=str(exc),
        )
# ---------------------------------------------------------------------------
# Legacy compatibility
# ---------------------------------------------------------------------------
def preprocess(text: str) -> str:
    """Legacy wrapper (deprecated). Use ``run()`` instead.

    Returns *text* with surrounding whitespace removed and all characters
    lower-cased.
    """
    trimmed = text.strip()
    return trimmed.lower()
if __name__ == "__main__":
    # Manual smoke check of the legacy wrapper when run as a script.
    sample = " This is RAW DATA. "
    print(preprocess(sample))