File size: 9,232 Bytes
b75c637
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""
Normalize — canonical schema builder + entity linking keys.

Reads raw files from the input directory, detects file types, parses
them into ``NormalizedRecord`` instances with deterministic row IDs
and entity-linking fields.
"""

from __future__ import annotations

import csv
import hashlib
import io
import json
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

from engine.io_contract import FileType, InputFile, InputSpec, NormalizedRecord

logger = logging.getLogger("engine.normalize")

# ---------------------------------------------------------------------------
# File-type detection
# ---------------------------------------------------------------------------

# Lookup table from file extension to FileType. Keys are lower-case and
# include the leading dot, matching what detect_file_type() looks up
# (``path.suffix.lower()``). Both ".html" and ".htm" map to FileType.HTML.
_EXT_MAP: Dict[str, FileType] = {
    ".csv": FileType.CSV,
    ".json": FileType.JSON,
    ".txt": FileType.TXT,
    ".html": FileType.HTML,
    ".htm": FileType.HTML,
    ".log": FileType.LOG,
}


def detect_file_type(path: Path) -> FileType:
    """Map the extension of *path* to a ``FileType``.

    The suffix is lower-cased before the lookup, so ``.CSV`` and ``.csv``
    are treated alike. Extensions that are not listed in ``_EXT_MAP``
    (including an empty suffix) yield ``FileType.UNKNOWN``.
    """
    extension = path.suffix.lower()
    if extension in _EXT_MAP:
        return _EXT_MAP[extension]
    return FileType.UNKNOWN


def _file_sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is opened in binary mode and fed to the hash in 8192-byte
    pieces, so the whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()


# ---------------------------------------------------------------------------
# Build InputSpec from a directory
# ---------------------------------------------------------------------------

def build_input_spec(input_dir: Path) -> InputSpec:
    """
    Build an ``InputSpec`` describing the files found at *input_dir*.

    If *input_dir* is itself a regular file, the spec contains just that
    file and the spec's ``input_dir`` becomes the file's parent directory.
    Otherwise the directory's direct children are listed in sorted order,
    keeping regular files whose names do not start with ``"."``; the scan
    does not descend into sub-directories.

    Each ``InputFile`` records the path, the detected file type, the size
    in bytes and the SHA-256 digest of the content.
    """
    root = Path(input_dir)
    if root.is_file():
        # Single file mode: describe the file, report its parent as the dir.
        candidates = [root]
        root = root.parent
    else:
        candidates = [
            entry
            for entry in sorted(root.iterdir())
            if entry.is_file() and not entry.name.startswith(".")
        ]

    files: List[InputFile] = [
        InputFile(
            path=candidate,
            file_type=detect_file_type(candidate),
            size_bytes=candidate.stat().st_size,
            sha256=_file_sha256(candidate),
        )
        for candidate in candidates
    ]
    logger.info("InputSpec: %d files from %s", len(files), root)
    return InputSpec(input_dir=root, files=files)


# ---------------------------------------------------------------------------
# Deterministic row ID
# ---------------------------------------------------------------------------

def _make_row_id(source_file: str, index: int, content_hash: str) -> str:
    """
    Build a deterministic 12-character row identifier.

    The three inputs are joined as ``source_file:index:content_hash``,
    hashed with SHA-256, and the first 12 hex characters of the digest
    are returned. Identical inputs therefore always give the same ID.
    """
    seed = "{}:{}:{}".format(source_file, index, content_hash)
    full_digest = hashlib.sha256(seed.encode()).hexdigest()
    return full_digest[:12]


# ---------------------------------------------------------------------------
# Entity extraction helpers (lightweight, no ML)
# ---------------------------------------------------------------------------

# Email: local part, "@", one domain label, ".", then the remaining
# domain characters (letters, digits, "-" and ".").
_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# IPv4-like: four dot-separated groups of 1-3 digits. Octets are not
# range-checked, so e.g. "999.999.999.999" also matches.
_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
# Phone: optional "+", optional "1", then 9-15 digits. NOTE: because of
# the leading \b, a "+" is only captured when it directly follows a word
# character; after whitespace the match starts at the first digit instead.
_PHONE_RE = re.compile(r"\b\+?1?\d{9,15}\b")
# Domain: one or more dot-terminated labels followed by an alphabetic
# suffix of at least two letters. This is purely lexical, so dotted tokens
# such as file names ("report.csv") match as well.
_DOMAIN_RE = re.compile(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b")
# Hash: a standalone run of 32-64 hex characters (the range includes the
# MD5, SHA-1 and SHA-256 hex-digest lengths of 32, 40 and 64).
_HASH_RE = re.compile(r"\b[a-fA-F0-9]{32,64}\b")


def _extract_entities(text: str) -> Dict[str, Optional[str]]:
    """Return the first match of each entity pattern found in *text*.

    The result always has the keys ``entity_email``, ``entity_ip``,
    ``entity_phone``, ``entity_domain`` and ``entity_hash``; a key maps to
    ``None`` when its pattern does not occur in *text*.
    """
    matchers = (
        ("entity_email", _EMAIL_RE),
        ("entity_ip", _IP_RE),
        ("entity_phone", _PHONE_RE),
        ("entity_domain", _DOMAIN_RE),
        ("entity_hash", _HASH_RE),
    )
    found: Dict[str, Optional[str]] = {}
    for key, pattern in matchers:
        hit = pattern.search(text)
        found[key] = hit.group(0) if hit else None
    return found


# ---------------------------------------------------------------------------
# Parsers per file type
# ---------------------------------------------------------------------------

def _parse_csv(path: Path) -> List[NormalizedRecord]:
    """Parse a CSV file into one ``NormalizedRecord`` per data row.

    Each row (as read by ``csv.DictReader``) is serialized to JSON; that
    text is stored as ``raw_text``, scanned for entities, and hashed to
    derive the row ID. Name, phone and email are taken from well-known
    column names when present and non-empty; otherwise the values
    extracted from the row text are used. The complete row is kept in
    ``extra``.
    """

    def _first_truthy(row: Dict[str, Any], *columns: str) -> Any:
        # Return the first non-empty value among the candidate columns.
        for column in columns:
            value = row.get(column)
            if value:
                return value
        return None

    parsed: List[NormalizedRecord] = []
    with open(path, newline="", encoding="utf-8", errors="replace") as handle:
        for position, row in enumerate(csv.DictReader(handle)):
            serialized = json.dumps(row, ensure_ascii=False)
            digest = hashlib.sha256(serialized.encode()).hexdigest()[:16]
            found = _extract_entities(serialized)

            # Prefer explicit columns over values scraped from the row text.
            name = _first_truthy(row, "name", "Name", "entity_name")
            phone = _first_truthy(row, "phone", "Phone", "entity_phone")
            email = _first_truthy(row, "email", "Email", "entity_email")

            record = NormalizedRecord(
                row_id=_make_row_id(str(path), position, digest),
                source_file=str(path.name),
                source_type=FileType.CSV,
                entity_name=name or found.get("entity_name"),
                entity_phone=phone or found.get("entity_phone"),
                entity_email=email or found.get("entity_email"),
                entity_ip=found.get("entity_ip"),
                entity_domain=found.get("entity_domain"),
                entity_hash=found.get("entity_hash"),
                raw_text=serialized,
                extra=dict(row),
            )
            parsed.append(record)
    return parsed


def _parse_json(path: Path) -> List[NormalizedRecord]:
    """Parse a JSON file into ``NormalizedRecord`` instances.

    A top-level list produces one record per element; any other top-level
    value produces a single record. Dict items are re-serialized to JSON
    for ``raw_text`` and stored as ``extra``; other items are converted
    with ``str()`` and stored in ``extra`` under the key ``"value"``.
    ``entity_name`` comes from the item's ``"name"`` key when there is one.
    """
    with open(path, encoding="utf-8", errors="replace") as handle:
        payload = json.load(handle)

    # Treat a lone top-level value as a one-element list.
    if isinstance(payload, list):
        entries = payload
    else:
        entries = [payload]

    parsed: List[NormalizedRecord] = []
    for position, entry in enumerate(entries):
        if isinstance(entry, dict):
            serialized = json.dumps(entry, ensure_ascii=False)
        else:
            serialized = str(entry)
        digest = hashlib.sha256(serialized.encode()).hexdigest()[:16]
        found = _extract_entities(serialized)
        if isinstance(entry, dict):
            extra = entry
        else:
            extra = {"value": entry}
        parsed.append(NormalizedRecord(
            row_id=_make_row_id(str(path), position, digest),
            source_file=str(path.name),
            source_type=FileType.JSON,
            entity_name=extra.get("name"),
            entity_email=found.get("entity_email"),
            entity_ip=found.get("entity_ip"),
            entity_phone=found.get("entity_phone"),
            entity_domain=found.get("entity_domain"),
            entity_hash=found.get("entity_hash"),
            raw_text=serialized,
            extra=extra,
        ))
    return parsed


def _parse_text(path: Path, file_type: FileType = FileType.TXT) -> List[NormalizedRecord]:
    """Parse a line-oriented file into one record per non-blank line.

    Every line is stripped of surrounding whitespace; lines that are empty
    afterwards are skipped. The index used for the row ID is the line's
    zero-based position in the file, counting skipped lines too.
    *file_type* is recorded as each record's ``source_type``.
    """
    with open(path, encoding="utf-8", errors="replace") as handle:
        raw_lines = handle.readlines()

    parsed: List[NormalizedRecord] = []
    for position, raw_line in enumerate(raw_lines):
        stripped = raw_line.strip()
        if stripped:
            digest = hashlib.sha256(stripped.encode()).hexdigest()[:16]
            found = _extract_entities(stripped)
            parsed.append(NormalizedRecord(
                row_id=_make_row_id(str(path), position, digest),
                source_file=str(path.name),
                source_type=file_type,
                entity_email=found.get("entity_email"),
                entity_ip=found.get("entity_ip"),
                entity_phone=found.get("entity_phone"),
                entity_domain=found.get("entity_domain"),
                entity_hash=found.get("entity_hash"),
                raw_text=stripped,
            ))
    return parsed


# ---------------------------------------------------------------------------
# Main normalization entry point
# ---------------------------------------------------------------------------

def normalize_files(input_spec: InputSpec) -> List[NormalizedRecord]:
    """
    Run the matching parser over every file in *input_spec* and return all
    resulting ``NormalizedRecord`` instances as one flat list.

    CSV and JSON files use their dedicated parsers; TXT, LOG and HTML
    files are parsed line by line under their own type, and every other
    type is parsed line by line as ``FileType.UNKNOWN``. A file that
    raises while being parsed is logged as an error and skipped, so one
    bad file does not abort the whole run.
    """
    collected: List[NormalizedRecord] = []
    for entry in input_spec.files:
        try:
            kind = entry.file_type
            if kind == FileType.CSV:
                parsed = _parse_csv(entry.path)
            elif kind == FileType.JSON:
                parsed = _parse_json(entry.path)
            else:
                # Everything else goes through the line-oriented parser.
                if kind in (FileType.TXT, FileType.LOG):
                    text_kind = kind
                elif kind == FileType.HTML:
                    text_kind = FileType.HTML
                else:
                    text_kind = FileType.UNKNOWN
                parsed = _parse_text(entry.path, text_kind)
            logger.info(
                "Parsed %d records from %s (%s)",
                len(parsed),
                entry.path.name,
                entry.file_type.value,
            )
            collected.extend(parsed)
        except Exception as exc:
            logger.error("Failed to parse %s: %s", entry.path, exc)
    logger.info("Total normalized records: %d", len(collected))
    return collected