File size: 8,542 Bytes
ae91091
 
d4ff564
 
 
 
 
 
 
 
 
 
 
ae91091
 
d4ff564
ae91091
 
 
233bc02
 
d4ff564
 
 
 
233bc02
ae91091
 
 
233bc02
 
 
 
 
 
 
 
 
 
ae91091
 
 
 
 
7726529
ae91091
7726529
ae91091
 
 
 
7726529
ae91091
7726529
ae91091
 
 
 
 
 
 
 
 
 
 
 
 
d4ff564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae91091
d4ff564
 
 
 
 
 
ae91091
 
d4ff564
 
 
 
 
ae91091
 
 
 
 
 
 
d4ff564
ae91091
 
 
 
 
 
 
 
 
4c114c1
d4ff564
 
 
 
 
 
 
4c114c1
 
d4ff564
4c114c1
d4ff564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c114c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""
NER Engine — Named Entity Recognition using HuggingFace Transformers.
Wraps the Nomio4640/ner-mongolian fine-tuned model.

Long-text handling:
  BERT has a 512-token hard limit. Long social-media posts (especially
  Google reviews, long Facebook posts) are silently truncated, causing
  entities in the second half to be completely missed.

  Fix: texts longer than MAX_CHUNK_CHARS are split, at sentence boundaries
  where possible, into consecutive (non-overlapping) chunks. Each chunk is
  processed independently and its entity offsets are shifted back into the
  original text's coordinate space before merging. Entities are
  deduplicated by a (lower-cased word, start offset) key.
"""

from typing import List, Tuple
from .models import EntityResult


# HuggingFace Hub model id; NEREngine falls back to it when no explicit model
# name is given and no bundled local copy of the weights is found.
HF_MODEL_ID = "Nomio4640/ner-mongolian"

# Length threshold (in characters, not tokens) above which a text is split
# into chunks instead of being sent to the pipeline in one piece.
# ~400-450 Mongolian Cyrillic tokens ≈ 1 200-1 500 characters.
# Keeping well below 512 BERT tokens leaves room for tokenizer overhead.
MAX_CHUNK_CHARS = 1_300


class NEREngine:
    """Named Entity Recognition service using HuggingFace pipeline."""

    def __init__(self, model_name: str = None):
        import os
        # Use local model if it exists, otherwise fall back to HuggingFace Hub
        local_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "adapters", "ner_mongolian")
        if model_name:
            self.model_name = model_name
        elif os.path.exists(os.path.join(local_path, "model.safetensors")):
            self.model_name = local_path
        else:
            self.model_name = HF_MODEL_ID
        self._pipeline = None

    def _load_pipeline(self):
        """Lazy-load the NER pipeline (heavy model, load only when needed)."""
        if self._pipeline is None:
            import torch
            from transformers import pipeline
            device = 0 if torch.cuda.is_available() else -1
            self._pipeline = pipeline(
                "ner",
                model=self.model_name,
                aggregation_strategy="simple",
                device=device,
            )
            print(f"[NEREngine] Loaded on {'GPU' if device == 0 else 'CPU'}")
        return self._pipeline

    def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
        """Merge subword tokens (## prefixed) back together."""
        cleaned = []
        for ent in raw_entities:
            word = ent.get("word", "")
            if word.startswith("##") and len(cleaned) > 0:
                cleaned[-1]["word"] += word.replace("##", "")
            else:
                cleaned.append(dict(ent))
        return cleaned

    # ------------------------------------------------------------------
    # Long-text chunking
    # ------------------------------------------------------------------

    def _chunk_text(self, text: str, max_chars: int = MAX_CHUNK_CHARS) -> List[Tuple[str, int]]:
        """
        Split *text* into chunks of at most *max_chars* characters, breaking
        at sentence boundaries where possible.  Returns a list of
        (chunk_text, start_char_offset_in_original) tuples.
        """
        chunks: List[Tuple[str, int]] = []
        start = 0
        n = len(text)
        while start < n:
            end = min(start + max_chars, n)
            if end < n:
                # Try to break at a sentence boundary within the window
                for sep in (". ", "! ", "? ", "\n", " "):
                    pos = text.rfind(sep, start + max_chars // 2, end)
                    if pos != -1:
                        end = pos + len(sep)
                        break
            chunk = text[start:end].strip()
            if chunk:
                chunks.append((chunk, start))
            start = end
        return chunks or [(text, 0)]

    def _recognize_chunked(self, text: str) -> List[EntityResult]:
        """
        Run NER on *text* by splitting it into chunks, correcting entity
        character offsets back to the original text's coordinate space,
        and deduplicating entities that appear at chunk boundaries.
        """
        pipe = self._load_pipeline()
        chunks = self._chunk_text(text)
        all_results: List[EntityResult] = []
        seen: set = set()          # (word_lower, abs_start) dedup key

        for chunk_text, chunk_offset in chunks:
            if not chunk_text.strip():
                continue
            try:
                raw = pipe(chunk_text)
            except Exception:
                continue
            for ent in self._clean_entities(raw):
                word = ent.get("word", "")
                abs_start = chunk_offset + int(ent.get("start", 0))
                abs_end   = chunk_offset + int(ent.get("end", 0))
                key = (word.lower(), abs_start)
                if key in seen:
                    continue
                seen.add(key)
                all_results.append(EntityResult(
                    word=word,
                    entity_group=ent.get("entity_group", "MISC"),
                    score=float(ent.get("score", 0.0)),
                    start=abs_start,
                    end=abs_end,
                ))

        return all_results

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def recognize(self, text: str) -> List[EntityResult]:
        """
        Run NER on a single text and return cleaned entities.
        Automatically chunks texts longer than MAX_CHUNK_CHARS so that
        entities in the second half of long documents are not silently
        dropped by BERT's 512-token truncation.
        """
        if not text or not text.strip():
            return []

        # Long text → chunk-and-merge instead of letting BERT truncate
        if len(text) > MAX_CHUNK_CHARS:
            return self._recognize_chunked(text)

        pipe = self._load_pipeline()
        try:
            raw = pipe(text)
        except Exception:
            return []

        results = []
        for ent in self._clean_entities(raw):
            results.append(EntityResult(
                word=ent.get("word", ""),
                entity_group=ent.get("entity_group", "MISC"),
                score=float(ent.get("score", 0.0)),
                start=int(ent.get("start", 0)),
                end=int(ent.get("end", 0)),
            ))
        return results

    def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
        """
        Run NER on a batch of texts.

        Short texts (≤ MAX_CHUNK_CHARS) are processed together via HuggingFace
        pipeline batching for GPU efficiency.  Long texts are handled
        individually with chunk-and-merge so that no entities are missed.
        """
        if not texts:
            return []

        out: List[List[EntityResult]] = [[] for _ in texts]

        # Separate short and long texts
        short_texts:  List[str] = []
        short_indices: List[int] = []
        long_indices:  List[int] = []

        for i, text in enumerate(texts):
            if not text or not text.strip():
                continue
            if len(text) > MAX_CHUNK_CHARS:
                long_indices.append(i)
            else:
                short_texts.append(text)
                short_indices.append(i)

        # --- Batch-process short texts ---
        if short_texts:
            pipe = self._load_pipeline()
            try:
                raw_results = pipe(short_texts, batch_size=batch_size)
                for idx, raw in zip(short_indices, raw_results):
                    entity_results = []
                    for ent in self._clean_entities(raw):
                        entity_results.append(EntityResult(
                            word=ent.get("word", ""),
                            entity_group=ent.get("entity_group", "MISC"),
                            score=float(ent.get("score", 0.0)),
                            start=int(ent.get("start", 0)),
                            end=int(ent.get("end", 0)),
                        ))
                    out[idx] = entity_results
            except Exception as e:
                print(f"[NEREngine] Batch processing error: {e}")
                # Fallback to per-text processing
                for idx, text in zip(short_indices, short_texts):
                    out[idx] = self.recognize(text)

        # --- Chunk-and-merge long texts (sequential, no truncation) ---
        for idx in long_indices:
            out[idx] = self._recognize_chunked(texts[idx])

        return out