Spaces:
Sleeping
Sleeping
bert chunk problem
Browse files- frontend/src/app/page.tsx +140 -27
- nlp_core/ner_engine.py +145 -43
- nlp_core/sentiment.py +9 -2
frontend/src/app/page.tsx
CHANGED
|
@@ -62,34 +62,133 @@ interface GlobalAnalysis {
|
|
| 62 |
}
|
| 63 |
|
| 64 |
function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } }) {
|
| 65 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
return (
|
| 67 |
-
<div
|
| 68 |
-
{
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
</div>
|
| 94 |
);
|
| 95 |
}
|
|
@@ -545,6 +644,20 @@ export default function Dashboard() {
|
|
| 545 |
{/* Results */}
|
| 546 |
{data && !loading && (
|
| 547 |
<>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
{/* Stats */}
|
| 549 |
<div className="stats-grid">
|
| 550 |
<div className="stat-card">
|
|
|
|
| 62 |
}
|
| 63 |
|
| 64 |
/**
 * Entity co-occurrence graph drawn as an SVG.
 *
 * The 40 most frequent nodes are placed on concentric rings, one ring per
 * entity type (PER, ORG, LOC, MISC), and the 80 heaviest edges between the
 * visible nodes are drawn underneath them. Hovering a node enlarges it and
 * highlights its edges.
 *
 * @param network  Graph payload; nodes are read for `id`, `label`,
 *                 `entity_type` and `frequency`, edges for `source`,
 *                 `target` and `weight`.
 */
function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } }) {
  const [hoveredId, setHoveredId] = useState<string | null>(null);

  const colorMap: Record<string, string> = {
    PER: "#ff6b6b", ORG: "#4ecdc4", LOC: "#ffd93d", MISC: "#a78bfa",
  };

  // Arrange nodes in concentric rings by entity type so same-type nodes
  // cluster together, making co-occurrence edges easier to read.
  const typeOrder = ["PER", "ORG", "LOC", "MISC"];
  const ringRadii = [105, 168, 225, 278];

  // Node circle radius bounds and the extra radius applied while hovered.
  const MIN_NODE_R = 14, MAX_NODE_R = 32, HOVER_GROW = 5;

  // The canvas height is derived from the ring geometry: the outermost ring
  // plus the largest (hovered) node must fit inside the viewBox, otherwise
  // nodes at the top and bottom of the outer rings are clipped.
  const outerExtent = ringRadii[ringRadii.length - 1] + MAX_NODE_R + HOVER_GROW;
  const W = 780;
  const H = 2 * (outerExtent + 5);
  const cx = W / 2, cy = H / 2;

  // Pick top nodes sorted by frequency
  const topNodes = [...network.nodes]
    .sort((a, b) => b.frequency - a.frequency)
    .slice(0, 40);

  // Bucket nodes per ring. Types outside typeOrder (or missing) go to the
  // MISC ring so that every selected node actually gets a position.
  const grouped: Record<string, typeof topNodes> = {};
  for (const node of topNodes) {
    const ringType = typeOrder.includes(node.entity_type) ? node.entity_type : "MISC";
    if (!grouped[ringType]) grouped[ringType] = [];
    grouped[ringType].push(node);
  }

  const posMap = new Map<string, { x: number; y: number }>();
  typeOrder.forEach((type, ti) => {
    const group = grouped[type] || [];
    const r = ringRadii[Math.min(ti, ringRadii.length - 1)];
    // Offset each ring's start angle slightly so labels don't collide
    const offset = (ti * Math.PI) / 4;
    group.forEach((node, i) => {
      const angle = offset + (2 * Math.PI * i) / Math.max(group.length, 1);
      posMap.set(node.id, { x: cx + r * Math.cos(angle), y: cy + r * Math.sin(angle) });
    });
  });

  // Show top edges by weight (only between visible nodes, no self-loops)
  const topEdges = [...network.edges]
    .filter(e => posMap.has(e.source) && posMap.has(e.target) && e.source !== e.target)
    .sort((a, b) => b.weight - a.weight)
    .slice(0, 80);

  // Heaviest edge is first after sorting; fall back to 1 so the ratios below
  // never divide by zero or by a missing weight.
  const maxWeight = topEdges.length > 0 && topEdges[0].weight > 0 ? topEdges[0].weight : 1;

  return (
    <div>
      <div style={{ overflowX: "auto" }}>
        <svg
          width="100%"
          viewBox={`0 0 ${W} ${H}`}
          style={{ display: "block", margin: "0 auto", minWidth: 340 }}
        >
          {/* Edges */}
          {topEdges.map((edge, i) => {
            const s = posMap.get(edge.source)!;
            const t = posMap.get(edge.target)!;
            const isHighlighted = hoveredId === edge.source || hoveredId === edge.target;
            const ratio = edge.weight / maxWeight;
            const opacity = isHighlighted ? 0.7 : 0.12 + ratio * 0.18;
            const strokeW = isHighlighted
              ? Math.max(2, ratio * 4)
              : Math.max(0.5, ratio * 1.5);
            return (
              <line
                key={i}
                x1={s.x} y1={s.y} x2={t.x} y2={t.y}
                stroke={isHighlighted ? "rgba(255,255,255,0.55)" : "rgba(255,255,255,0.25)"}
                strokeWidth={strokeW}
                strokeOpacity={opacity}
              />
            );
          })}

          {/* Nodes */}
          {topNodes.map(node => {
            const pos = posMap.get(node.id);
            if (!pos) return null;
            const r = Math.max(MIN_NODE_R, Math.min(MAX_NODE_R, 10 + node.frequency * 1.2));
            // Unknown types sit on the MISC ring but keep a distinct fallback colour.
            const color = colorMap[node.entity_type] || "#6c63ff";
            const isHovered = hoveredId === node.id;
            const label = node.label.length > 11 ? node.label.slice(0, 9) + "…" : node.label;
            return (
              <g
                key={node.id}
                style={{ cursor: "pointer" }}
                onMouseEnter={() => setHoveredId(node.id)}
                onMouseLeave={() => setHoveredId(null)}
              >
                <circle
                  cx={pos.x} cy={pos.y}
                  r={isHovered ? r + HOVER_GROW : r}
                  fill={`${color}22`}
                  stroke={color}
                  strokeWidth={isHovered ? 3 : 1.8}
                  style={{ transition: "r 0.15s, stroke-width 0.15s" }}
                />
                <text
                  x={pos.x} y={pos.y}
                  textAnchor="middle"
                  dominantBaseline="middle"
                  fill={isHovered ? "#fff" : color}
                  fontSize={Math.max(8, Math.min(11, r * 0.62))}
                  fontWeight={600}
                >
                  {label}
                </text>
                <title>{`${node.label} (${node.entity_type}) — ${node.frequency}×`}</title>
              </g>
            );
          })}
        </svg>
      </div>

      {/* Legend */}
      <div style={{
        display: "flex", gap: "1.25rem", justifyContent: "center",
        marginTop: "0.75rem", flexWrap: "wrap",
      }}>
        {Object.entries(colorMap).map(([type, color]) => (
          <div key={type} style={{ display: "flex", alignItems: "center", gap: "0.35rem", fontSize: "0.72rem" }}>
            <div style={{ width: 10, height: 10, borderRadius: "50%", background: color, opacity: 0.8 }} />
            <span style={{ color: "var(--text-muted)" }}>{type}</span>
          </div>
        ))}
        <span style={{ color: "var(--text-muted)", fontSize: "0.7rem" }}>
          — {network.nodes.length} зангилаа · {network.edges.length} холбоос (шилдэг 40/80 харуулав)
        </span>
      </div>
    </div>
  );
}
|
|
|
|
| 644 |
{/* Results */}
|
| 645 |
{data && !loading && (
|
| 646 |
<>
|
| 647 |
+
{/* Toolbar: new analysis + active file info */}
|
| 648 |
+
<div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "1rem", flexWrap: "wrap", gap: "0.5rem" }}>
|
| 649 |
+
<span style={{ fontSize: "0.8rem", color: "var(--text-muted)" }}>
|
| 650 |
+
📄 {data.total_documents} нийтлэл шинжлэгдлээ
|
| 651 |
+
</span>
|
| 652 |
+
<button
|
| 653 |
+
className="btn btn-secondary"
|
| 654 |
+
style={{ fontSize: "0.8rem" }}
|
| 655 |
+
onClick={() => { setData(null); setInsights([]); setError(""); setActiveTab("overview"); }}
|
| 656 |
+
>
|
| 657 |
+
+ Шинэ шинжилгээ
|
| 658 |
+
</button>
|
| 659 |
+
</div>
|
| 660 |
+
|
| 661 |
{/* Stats */}
|
| 662 |
<div className="stats-grid">
|
| 663 |
<div className="stat-card">
|
nlp_core/ner_engine.py
CHANGED
|
@@ -1,14 +1,28 @@
|
|
| 1 |
"""
|
| 2 |
NER Engine — Named Entity Recognition using HuggingFace Transformers.
|
| 3 |
-
Wraps the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
-
from typing import List
|
| 7 |
from .models import EntityResult
|
| 8 |
|
| 9 |
|
| 10 |
HF_MODEL_ID = "Nomio4640/ner-mongolian"
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
class NEREngine:
|
| 14 |
"""Named Entity Recognition service using HuggingFace pipeline."""
|
|
@@ -51,19 +65,96 @@ class NEREngine:
|
|
| 51 |
cleaned.append(dict(ent))
|
| 52 |
return cleaned
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
def recognize(self, text: str) -> List[EntityResult]:
|
| 55 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
if not text or not text.strip():
|
| 57 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
pipe = self._load_pipeline()
|
| 59 |
try:
|
| 60 |
raw = pipe(text)
|
| 61 |
except Exception:
|
| 62 |
return []
|
| 63 |
|
| 64 |
-
cleaned = self._clean_entities(raw)
|
| 65 |
results = []
|
| 66 |
-
for ent in
|
| 67 |
results.append(EntityResult(
|
| 68 |
word=ent.get("word", ""),
|
| 69 |
entity_group=ent.get("entity_group", "MISC"),
|
|
@@ -74,45 +165,56 @@ class NEREngine:
|
|
| 74 |
return results
|
| 75 |
|
| 76 |
def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
|
| 77 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
if not texts:
|
| 79 |
return []
|
| 80 |
-
|
| 81 |
-
# Filter empty texts to avoid pipeline errors
|
| 82 |
-
valid_texts = []
|
| 83 |
-
valid_indices = []
|
| 84 |
-
for i, text in enumerate(texts):
|
| 85 |
-
if text and text.strip():
|
| 86 |
-
valid_texts.append(text)
|
| 87 |
-
valid_indices.append(i)
|
| 88 |
-
|
| 89 |
-
# Preallocate empty results for all texts
|
| 90 |
out: List[List[EntityResult]] = [[] for _ in texts]
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
return out
|
|
|
|
| 1 |
"""
|
| 2 |
NER Engine — Named Entity Recognition using HuggingFace Transformers.
|
| 3 |
+
Wraps the Nomio4640/ner-mongolian fine-tuned model.
|
| 4 |
+
|
| 5 |
+
Long-text handling:
|
| 6 |
+
BERT has a 512-token hard limit. Long social-media posts (especially
|
| 7 |
+
Google reviews, long Facebook posts) are silently truncated, causing
|
| 8 |
+
entities in the second half to be completely missed.
|
| 9 |
+
|
| 10 |
+
Fix: texts longer than MAX_CHUNK_CHARS are split at sentence boundaries
|
| 11 |
+
into consecutive, non-overlapping chunks. Each chunk is processed independently and the
|
| 12 |
+
character offsets from each chunk are corrected before merging. Duplicate
|
| 13 |
+
entities at chunk boundaries are deduplicated by (word, start) key.
|
| 14 |
"""
|
| 15 |
|
| 16 |
+
from typing import List, Tuple
|
| 17 |
from .models import EntityResult
|
| 18 |
|
| 19 |
|
| 20 |
HF_MODEL_ID = "Nomio4640/ner-mongolian"
|
| 21 |
|
| 22 |
+
# ~400-450 Mongolian Cyrillic tokens ≈ 1 200-1 500 characters.
|
| 23 |
+
# Keeping well below 512 BERT tokens leaves room for tokenizer overhead.
|
| 24 |
+
MAX_CHUNK_CHARS = 1_300
|
| 25 |
+
|
| 26 |
|
| 27 |
class NEREngine:
|
| 28 |
"""Named Entity Recognition service using HuggingFace pipeline."""
|
|
|
|
| 65 |
cleaned.append(dict(ent))
|
| 66 |
return cleaned
|
| 67 |
|
| 68 |
+
# ------------------------------------------------------------------
|
| 69 |
+
# Long-text chunking
|
| 70 |
+
# ------------------------------------------------------------------
|
| 71 |
+
|
| 72 |
+
def _chunk_text(self, text: str, max_chars: int = MAX_CHUNK_CHARS) -> List[Tuple[str, int]]:
    """
    Split *text* into consecutive, non-overlapping chunks.

    Each window is at most *max_chars* characters long. When a window does
    not reach the end of the text, it is shortened to end just after the
    last sentence separator (". ", "! ", "? ", newline, then plain space)
    found in its second half, so chunks break at sentence boundaries where
    possible.

    Args:
        text: The text to split.
        max_chars: Maximum window size in characters; must be >= 1.

    Returns:
        A list of (chunk_text, start_offset) tuples. chunk_text has its
        surrounding whitespace removed, and start_offset is the index in
        *text* of chunk_text's first character, so that
        ``text[start_offset:start_offset + len(chunk_text)] == chunk_text``.
        Whitespace-only windows are dropped; if nothing remains, the
        result is ``[(text, 0)]``.

    Raises:
        ValueError: If *max_chars* is smaller than 1 (the scan could not
            make progress).
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks: List[Tuple[str, int]] = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chars, n)
        if end < n:
            # Try to break at a sentence boundary within the window; only
            # the second half is searched so chunks never get too short.
            for sep in (". ", "! ", "? ", "\n", " "):
                pos = text.rfind(sep, start + max_chars // 2, end)
                if pos != -1:
                    end = pos + len(sep)
                    break
        window = text[start:end]
        without_lead = window.lstrip()
        # Leading whitespace is removed from the chunk, so the recorded
        # offset must skip it too; otherwise every entity offset computed
        # from this chunk would be shifted left.
        lead = len(window) - len(without_lead)
        chunk = without_lead.rstrip()
        if chunk:
            chunks.append((chunk, start + lead))
        start = end
    return chunks or [(text, 0)]
|
| 95 |
+
|
| 96 |
+
def _recognize_chunked(self, text: str) -> List[EntityResult]:
    """
    Run NER on *text* chunk by chunk and merge the results.

    The text is split with ``_chunk_text``; each chunk is passed to the
    pipeline on its own, and the entity offsets reported for the chunk are
    shifted by the chunk's start offset so that they refer to positions in
    the original *text*.

    Entities are deduplicated on (lower-cased word, absolute start). A
    chunk whose pipeline call raises is reported on stdout and skipped, so
    one bad chunk does not discard the entities found in the others.

    Args:
        text: The full input text.

    Returns:
        The merged list of EntityResult objects, in chunk order.
    """
    pipe = self._load_pipeline()
    all_results: List[EntityResult] = []
    seen: set = set()  # (word_lower, abs_start) dedup key

    for chunk_text, chunk_offset in self._chunk_text(text):
        if not chunk_text.strip():
            continue
        try:
            raw = pipe(chunk_text)
        except Exception as e:
            # Best effort: report the failure (same style as
            # recognize_batch) instead of dropping the chunk silently.
            print(f"[NEREngine] Chunk processing error at offset {chunk_offset}: {e}")
            continue

        for ent in self._clean_entities(raw):
            word = ent.get("word", "")
            # `or 0` also covers an explicit None offset, which int()
            # would reject. NOTE(review): presumably the pipeline reports
            # None when the tokenizer provides no offsets; confirm.
            abs_start = chunk_offset + int(ent.get("start") or 0)
            abs_end = chunk_offset + int(ent.get("end") or 0)
            key = (word.lower(), abs_start)
            if key in seen:
                continue
            seen.add(key)
            all_results.append(EntityResult(
                word=word,
                entity_group=ent.get("entity_group", "MISC"),
                score=float(ent.get("score", 0.0)),
                start=abs_start,
                end=abs_end,
            ))

    return all_results
|
| 131 |
+
|
| 132 |
+
# ------------------------------------------------------------------
|
| 133 |
+
# Public API
|
| 134 |
+
# ------------------------------------------------------------------
|
| 135 |
+
|
| 136 |
def recognize(self, text: str) -> List[EntityResult]:
|
| 137 |
+
"""
|
| 138 |
+
Run NER on a single text and return cleaned entities.
|
| 139 |
+
Automatically chunks texts longer than MAX_CHUNK_CHARS so that
|
| 140 |
+
entities in the second half of long documents are not silently
|
| 141 |
+
dropped by BERT's 512-token truncation.
|
| 142 |
+
"""
|
| 143 |
if not text or not text.strip():
|
| 144 |
return []
|
| 145 |
+
|
| 146 |
+
# Long text → chunk-and-merge instead of letting BERT truncate
|
| 147 |
+
if len(text) > MAX_CHUNK_CHARS:
|
| 148 |
+
return self._recognize_chunked(text)
|
| 149 |
+
|
| 150 |
pipe = self._load_pipeline()
|
| 151 |
try:
|
| 152 |
raw = pipe(text)
|
| 153 |
except Exception:
|
| 154 |
return []
|
| 155 |
|
|
|
|
| 156 |
results = []
|
| 157 |
+
for ent in self._clean_entities(raw):
|
| 158 |
results.append(EntityResult(
|
| 159 |
word=ent.get("word", ""),
|
| 160 |
entity_group=ent.get("entity_group", "MISC"),
|
|
|
|
| 165 |
return results
|
| 166 |
|
| 167 |
def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
    """
    Run NER over several texts and return one entity list per input.

    Inputs are routed by length: texts of at most MAX_CHUNK_CHARS
    characters go through a single batched pipeline call, while longer
    texts are processed one at a time with ``_recognize_chunked``. Empty or
    whitespace-only inputs keep an empty result. If the batched call (or
    the conversion of its output) raises, the error is printed and every
    short text is retried individually through ``recognize``.

    Args:
        texts: Input texts; the result list has the same length and order.
        batch_size: Batch size forwarded to the pipeline call.

    Returns:
        A list where element *i* holds the entities found in ``texts[i]``.
    """
    if not texts:
        return []

    out: List[List[EntityResult]] = [[] for _ in texts]

    # Route every non-blank text to either the batched or the chunked path.
    short_texts: List[str] = []
    short_indices: List[int] = []
    long_indices: List[int] = []
    for position, candidate in enumerate(texts):
        if not candidate or not candidate.strip():
            continue
        if len(candidate) > MAX_CHUNK_CHARS:
            long_indices.append(position)
            continue
        short_texts.append(candidate)
        short_indices.append(position)

    # --- Short texts: one batched pipeline call ---
    if short_texts:
        pipe = self._load_pipeline()
        try:
            raw_results = pipe(short_texts, batch_size=batch_size)
            for idx, raw in zip(short_indices, raw_results):
                out[idx] = [
                    EntityResult(
                        word=ent.get("word", ""),
                        entity_group=ent.get("entity_group", "MISC"),
                        score=float(ent.get("score", 0.0)),
                        start=int(ent.get("start", 0)),
                        end=int(ent.get("end", 0)),
                    )
                    for ent in self._clean_entities(raw)
                ]
        except Exception as e:
            print(f"[NEREngine] Batch processing error: {e}")
            # Fall back to handling each short text on its own.
            for idx, text in zip(short_indices, short_texts):
                out[idx] = self.recognize(text)

    # --- Long texts: sequential chunk-and-merge ---
    for idx in long_indices:
        out[idx] = self._recognize_chunked(texts[idx])

    return out
|
nlp_core/sentiment.py
CHANGED
|
@@ -7,15 +7,22 @@ from typing import List, Optional
|
|
| 7 |
from .models import SentimentResult
|
| 8 |
|
| 9 |
|
| 10 |
-
# Map model labels to human-readable labels
|
|
|
|
|
|
|
|
|
|
| 11 |
LABEL_MAP = {
|
| 12 |
"positive": "positive",
|
| 13 |
"neutral": "neutral",
|
| 14 |
"negative": "negative",
|
| 15 |
-
#
|
| 16 |
"LABEL_0": "negative",
|
| 17 |
"LABEL_1": "neutral",
|
| 18 |
"LABEL_2": "positive",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
}
|
| 20 |
|
| 21 |
|
|
|
|
| 7 |
from .models import SentimentResult
|
| 8 |
|
| 9 |
|
| 10 |
+
# Map model labels to human-readable labels.
|
| 11 |
+
# Keys include both original-case and .lower() forms because we call
|
| 12 |
+
# result["label"].lower() before the lookup — the uppercase forms would
|
| 13 |
+
# never match after lowercasing.
|
| 14 |
LABEL_MAP = {
|
| 15 |
"positive": "positive",
|
| 16 |
"neutral": "neutral",
|
| 17 |
"negative": "negative",
|
| 18 |
+
# Original-case (kept for safety if .lower() is ever removed)
|
| 19 |
"LABEL_0": "negative",
|
| 20 |
"LABEL_1": "neutral",
|
| 21 |
"LABEL_2": "positive",
|
| 22 |
+
# Lowercased forms — these are what actually get looked up
|
| 23 |
+
"label_0": "negative",
|
| 24 |
+
"label_1": "neutral",
|
| 25 |
+
"label_2": "positive",
|
| 26 |
}
|
| 27 |
|
| 28 |
|