Spaces:
Sleeping
Sleeping
File size: 8,542 Bytes
ae91091 d4ff564 ae91091 d4ff564 ae91091 233bc02 d4ff564 233bc02 ae91091 233bc02 ae91091 7726529 ae91091 7726529 ae91091 7726529 ae91091 7726529 ae91091 d4ff564 ae91091 d4ff564 ae91091 d4ff564 ae91091 d4ff564 ae91091 4c114c1 d4ff564 4c114c1 d4ff564 4c114c1 d4ff564 4c114c1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | """
NER Engine — Named Entity Recognition using HuggingFace Transformers.
Wraps the Nomio4640/ner-mongolian fine-tuned model.
Long-text handling:
BERT has a 512-token hard limit. Long social-media posts (especially
Google reviews, long Facebook posts) are silently truncated, causing
entities in the second half to be completely missed.
Fix: texts longer than MAX_CHUNK_CHARS are split at sentence boundaries
into consecutive chunks. Each chunk is processed independently and the
character offsets from each chunk are corrected back into the original
text's coordinate space before merging. Duplicate entities at chunk
boundaries are deduplicated by (word, start) key.
"""
from typing import List, Optional, Tuple

from .models import EntityResult
HF_MODEL_ID = "Nomio4640/ner-mongolian"
# ~400-450 Mongolian Cyrillic tokens ≈ 1 200-1 500 characters.
# Keeping well below 512 BERT tokens leaves room for tokenizer overhead.
MAX_CHUNK_CHARS = 1_300
class NEREngine:
"""Named Entity Recognition service using HuggingFace pipeline."""
def __init__(self, model_name: str = None):
import os
# Use local model if it exists, otherwise fall back to HuggingFace Hub
local_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "adapters", "ner_mongolian")
if model_name:
self.model_name = model_name
elif os.path.exists(os.path.join(local_path, "model.safetensors")):
self.model_name = local_path
else:
self.model_name = HF_MODEL_ID
self._pipeline = None
def _load_pipeline(self):
"""Lazy-load the NER pipeline (heavy model, load only when needed)."""
if self._pipeline is None:
import torch
from transformers import pipeline
device = 0 if torch.cuda.is_available() else -1
self._pipeline = pipeline(
"ner",
model=self.model_name,
aggregation_strategy="simple",
device=device,
)
print(f"[NEREngine] Loaded on {'GPU' if device == 0 else 'CPU'}")
return self._pipeline
def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
"""Merge subword tokens (## prefixed) back together."""
cleaned = []
for ent in raw_entities:
word = ent.get("word", "")
if word.startswith("##") and len(cleaned) > 0:
cleaned[-1]["word"] += word.replace("##", "")
else:
cleaned.append(dict(ent))
return cleaned
# ------------------------------------------------------------------
# Long-text chunking
# ------------------------------------------------------------------
def _chunk_text(self, text: str, max_chars: int = MAX_CHUNK_CHARS) -> List[Tuple[str, int]]:
"""
Split *text* into chunks of at most *max_chars* characters, breaking
at sentence boundaries where possible. Returns a list of
(chunk_text, start_char_offset_in_original) tuples.
"""
chunks: List[Tuple[str, int]] = []
start = 0
n = len(text)
while start < n:
end = min(start + max_chars, n)
if end < n:
# Try to break at a sentence boundary within the window
for sep in (". ", "! ", "? ", "\n", " "):
pos = text.rfind(sep, start + max_chars // 2, end)
if pos != -1:
end = pos + len(sep)
break
chunk = text[start:end].strip()
if chunk:
chunks.append((chunk, start))
start = end
return chunks or [(text, 0)]
def _recognize_chunked(self, text: str) -> List[EntityResult]:
"""
Run NER on *text* by splitting it into chunks, correcting entity
character offsets back to the original text's coordinate space,
and deduplicating entities that appear at chunk boundaries.
"""
pipe = self._load_pipeline()
chunks = self._chunk_text(text)
all_results: List[EntityResult] = []
seen: set = set() # (word_lower, abs_start) dedup key
for chunk_text, chunk_offset in chunks:
if not chunk_text.strip():
continue
try:
raw = pipe(chunk_text)
except Exception:
continue
for ent in self._clean_entities(raw):
word = ent.get("word", "")
abs_start = chunk_offset + int(ent.get("start", 0))
abs_end = chunk_offset + int(ent.get("end", 0))
key = (word.lower(), abs_start)
if key in seen:
continue
seen.add(key)
all_results.append(EntityResult(
word=word,
entity_group=ent.get("entity_group", "MISC"),
score=float(ent.get("score", 0.0)),
start=abs_start,
end=abs_end,
))
return all_results
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
def recognize(self, text: str) -> List[EntityResult]:
"""
Run NER on a single text and return cleaned entities.
Automatically chunks texts longer than MAX_CHUNK_CHARS so that
entities in the second half of long documents are not silently
dropped by BERT's 512-token truncation.
"""
if not text or not text.strip():
return []
# Long text → chunk-and-merge instead of letting BERT truncate
if len(text) > MAX_CHUNK_CHARS:
return self._recognize_chunked(text)
pipe = self._load_pipeline()
try:
raw = pipe(text)
except Exception:
return []
results = []
for ent in self._clean_entities(raw):
results.append(EntityResult(
word=ent.get("word", ""),
entity_group=ent.get("entity_group", "MISC"),
score=float(ent.get("score", 0.0)),
start=int(ent.get("start", 0)),
end=int(ent.get("end", 0)),
))
return results
def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
"""
Run NER on a batch of texts.
Short texts (≤ MAX_CHUNK_CHARS) are processed together via HuggingFace
pipeline batching for GPU efficiency. Long texts are handled
individually with chunk-and-merge so that no entities are missed.
"""
if not texts:
return []
out: List[List[EntityResult]] = [[] for _ in texts]
# Separate short and long texts
short_texts: List[str] = []
short_indices: List[int] = []
long_indices: List[int] = []
for i, text in enumerate(texts):
if not text or not text.strip():
continue
if len(text) > MAX_CHUNK_CHARS:
long_indices.append(i)
else:
short_texts.append(text)
short_indices.append(i)
# --- Batch-process short texts ---
if short_texts:
pipe = self._load_pipeline()
try:
raw_results = pipe(short_texts, batch_size=batch_size)
for idx, raw in zip(short_indices, raw_results):
entity_results = []
for ent in self._clean_entities(raw):
entity_results.append(EntityResult(
word=ent.get("word", ""),
entity_group=ent.get("entity_group", "MISC"),
score=float(ent.get("score", 0.0)),
start=int(ent.get("start", 0)),
end=int(ent.get("end", 0)),
))
out[idx] = entity_results
except Exception as e:
print(f"[NEREngine] Batch processing error: {e}")
# Fallback to per-text processing
for idx, text in zip(short_indices, short_texts):
out[idx] = self.recognize(text)
# --- Chunk-and-merge long texts (sequential, no truncation) ---
for idx in long_indices:
out[idx] = self._recognize_chunked(texts[idx])
return out
|