Nomio4640 committed on
Commit
d4ff564
·
1 Parent(s): 729849e

bert chunk problem

Browse files
frontend/src/app/page.tsx CHANGED
@@ -62,34 +62,133 @@ interface GlobalAnalysis {
62
  }
63
 
64
  function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } }) {
65
- const colorMap: Record<string, string> = { PER: "#ff6b6b", ORG: "#4ecdc4", LOC: "#ffd93d", MISC: "#a78bfa" };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  return (
67
- <div className="network-container" style={{ padding: "2rem", display: "flex", flexWrap: "wrap", gap: "0.5rem", justifyContent: "center", alignItems: "center" }}>
68
- {network.nodes
69
- .sort((a, b) => b.frequency - a.frequency)
70
- .slice(0, 30)
71
- .map((node) => {
72
- const size = Math.max(40, Math.min(120, node.frequency * 5));
73
- const c = colorMap[node.entity_type] || "#6c63ff";
74
- return (
75
- <div
76
- key={node.id}
77
- style={{
78
- width: size, height: size, borderRadius: "50%",
79
- background: `${c}20`, border: `2px solid ${c}`,
80
- display: "flex", alignItems: "center", justifyContent: "center",
81
- fontSize: Math.max(9, Math.min(14, node.frequency)),
82
- color: c, fontWeight: 600, textAlign: "center", padding: "4px",
83
- cursor: "pointer", transition: "transform 0.2s",
84
- }}
85
- title={`${node.label} (${node.entity_type}) — ${node.frequency}x`}
86
- onMouseOver={(e) => (e.currentTarget.style.transform = "scale(1.15)")}
87
- onMouseOut={(e) => (e.currentTarget.style.transform = "scale(1)")}
88
- >
89
- {node.label.length > 12 ? node.label.slice(0, 10) + "…" : node.label}
90
- </div>
91
- );
92
- })}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  </div>
94
  );
95
  }
@@ -545,6 +644,20 @@ export default function Dashboard() {
545
  {/* Results */}
546
  {data && !loading && (
547
  <>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  {/* Stats */}
549
  <div className="stats-grid">
550
  <div className="stat-card">
 
62
  }
63
 
64
function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } }) {
  const [hoveredId, setHoveredId] = useState<string | null>(null);

  // Logical canvas size; the <svg> scales responsively via viewBox.
  const W = 780, H = 500;
  const cx = W / 2, cy = H / 2;
  const colorMap: Record<string, string> = {
    PER: "#ff6b6b", ORG: "#4ecdc4", LOC: "#ffd93d", MISC: "#a78bfa",
  };

  // Pick top nodes sorted by frequency (copy before sorting — never mutate the prop).
  const topNodes = [...network.nodes]
    .sort((a, b) => b.frequency - a.frequency)
    .slice(0, 40);

  // Arrange nodes in concentric rings by entity type so same-type nodes
  // cluster together, making co-occurrence edges easier to read.
  const typeOrder = ["PER", "ORG", "LOC", "MISC"];
  const ringRadii = [105, 168, 225, 278];
  const grouped: Record<string, typeof topNodes> = {};
  for (const node of topNodes) {
    // FIX: an entity_type outside typeOrder (e.g. "DATE") previously landed in
    // its own grouped[] bucket that the ring layout below never visits, so the
    // node got no position and was silently dropped from the graph. Fold every
    // unknown type into the MISC ring instead.
    const t = typeOrder.includes(node.entity_type) ? node.entity_type : "MISC";
    if (!grouped[t]) grouped[t] = [];
    grouped[t].push(node);
  }

  const posMap = new Map<string, { x: number; y: number }>();
  typeOrder.forEach((type, ti) => {
    const group = grouped[type] || [];
    const r = ringRadii[Math.min(ti, ringRadii.length - 1)];
    group.forEach((node, i) => {
      // Offset each ring's start angle slightly so labels don't collide
      const offset = (ti * Math.PI) / 4;
      const angle = offset + (2 * Math.PI * i) / Math.max(group.length, 1);
      posMap.set(node.id, { x: cx + r * Math.cos(angle), y: cy + r * Math.sin(angle) });
    });
  });

  // Show top edges by weight (only between visible nodes; drop self-loops)
  const topEdges = [...network.edges]
    .filter(e => posMap.has(e.source) && posMap.has(e.target) && e.source !== e.target)
    .sort((a, b) => b.weight - a.weight)
    .slice(0, 80);

  const maxWeight = topEdges.length > 0 ? topEdges[0].weight : 1;

  return (
    <div>
      <div style={{ overflowX: "auto" }}>
        <svg
          width="100%"
          viewBox={`0 0 ${W} ${H}`}
          style={{ display: "block", margin: "0 auto", minWidth: 340 }}
        >
          {/* Edges */}
          {topEdges.map((edge, i) => {
            const s = posMap.get(edge.source)!;
            const t = posMap.get(edge.target)!;
            const isHighlighted = hoveredId === edge.source || hoveredId === edge.target;
            const opacity = isHighlighted ? 0.7 : 0.12 + (edge.weight / maxWeight) * 0.18;
            const strokeW = isHighlighted
              ? Math.max(2, (edge.weight / maxWeight) * 4)
              : Math.max(0.5, (edge.weight / maxWeight) * 1.5);
            return (
              <line
                // Composite key: bare index keys break hover highlighting if the
                // edge list is ever re-sorted; pair endpoints with the index.
                key={`${edge.source}-${edge.target}-${i}`}
                x1={s.x} y1={s.y} x2={t.x} y2={t.y}
                stroke={isHighlighted ? "rgba(255,255,255,0.55)" : "rgba(255,255,255,0.25)"}
                strokeWidth={strokeW}
                strokeOpacity={opacity}
              />
            );
          })}

          {/* Nodes */}
          {topNodes.map(node => {
            const pos = posMap.get(node.id);
            if (!pos) return null;
            const r = Math.max(14, Math.min(32, 10 + node.frequency * 1.2));
            const color = colorMap[node.entity_type] || "#6c63ff";
            const isHovered = hoveredId === node.id;
            const label = node.label.length > 11 ? node.label.slice(0, 9) + "…" : node.label;
            return (
              <g
                key={node.id}
                style={{ cursor: "pointer" }}
                onMouseEnter={() => setHoveredId(node.id)}
                onMouseLeave={() => setHoveredId(null)}
              >
                <circle
                  cx={pos.x} cy={pos.y}
                  r={isHovered ? r + 5 : r}
                  fill={`${color}22`}
                  stroke={color}
                  strokeWidth={isHovered ? 3 : 1.8}
                  style={{ transition: "r 0.15s, stroke-width 0.15s" }}
                />
                <text
                  x={pos.x} y={pos.y}
                  textAnchor="middle"
                  dominantBaseline="middle"
                  fill={isHovered ? "#fff" : color}
                  fontSize={Math.max(8, Math.min(11, r * 0.62))}
                  fontWeight={600}
                >
                  {label}
                </text>
                <title>{`${node.label} (${node.entity_type}) — ${node.frequency}×`}</title>
              </g>
            );
          })}
        </svg>
      </div>

      {/* Legend */}
      <div style={{
        display: "flex", gap: "1.25rem", justifyContent: "center",
        marginTop: "0.75rem", flexWrap: "wrap",
      }}>
        {Object.entries(colorMap).map(([type, color]) => (
          <div key={type} style={{ display: "flex", alignItems: "center", gap: "0.35rem", fontSize: "0.72rem" }}>
            <div style={{ width: 10, height: 10, borderRadius: "50%", background: color, opacity: 0.8 }} />
            <span style={{ color: "var(--text-muted)" }}>{type}</span>
          </div>
        ))}
        <span style={{ color: "var(--text-muted)", fontSize: "0.7rem" }}>
          — {network.nodes.length} зангилаа · {network.edges.length} холбоос (шилдэг 40/80 харуулав)
        </span>
      </div>
    </div>
  );
}
 
644
  {/* Results */}
645
  {data && !loading && (
646
  <>
647
+ {/* Toolbar: new analysis + active file info */}
648
+ <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "1rem", flexWrap: "wrap", gap: "0.5rem" }}>
649
+ <span style={{ fontSize: "0.8rem", color: "var(--text-muted)" }}>
650
+ 📄 {data.total_documents} нийтлэл шинжлэгдлээ
651
+ </span>
652
+ <button
653
+ className="btn btn-secondary"
654
+ style={{ fontSize: "0.8rem" }}
655
+ onClick={() => { setData(null); setInsights([]); setError(""); setActiveTab("overview"); }}
656
+ >
657
+ + Шинэ шинжилгээ
658
+ </button>
659
+ </div>
660
+
661
  {/* Stats */}
662
  <div className="stats-grid">
663
  <div className="stat-card">
nlp_core/ner_engine.py CHANGED
@@ -1,14 +1,28 @@
1
  """
2
  NER Engine — Named Entity Recognition using HuggingFace Transformers.
3
- Wraps the Davlan/bert-base-multilingual-cased-ner-hrl model.
 
 
 
 
 
 
 
 
 
 
4
  """
5
 
6
- from typing import List
7
  from .models import EntityResult
8
 
9
 
10
  HF_MODEL_ID = "Nomio4640/ner-mongolian"
11
 
 
 
 
 
12
 
13
  class NEREngine:
14
  """Named Entity Recognition service using HuggingFace pipeline."""
@@ -51,19 +65,96 @@ class NEREngine:
51
  cleaned.append(dict(ent))
52
  return cleaned
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def recognize(self, text: str) -> List[EntityResult]:
55
- """Run NER on a single text and return cleaned entities."""
 
 
 
 
 
56
  if not text or not text.strip():
57
  return []
 
 
 
 
 
58
  pipe = self._load_pipeline()
59
  try:
60
  raw = pipe(text)
61
  except Exception:
62
  return []
63
 
64
- cleaned = self._clean_entities(raw)
65
  results = []
66
- for ent in cleaned:
67
  results.append(EntityResult(
68
  word=ent.get("word", ""),
69
  entity_group=ent.get("entity_group", "MISC"),
@@ -74,45 +165,56 @@ class NEREngine:
74
  return results
75
 
76
  def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
77
- """Run NER on a batch of texts utilizing Hugging Face pipeline batching."""
 
 
 
 
 
 
78
  if not texts:
79
  return []
80
-
81
- # Filter empty texts to avoid pipeline errors
82
- valid_texts = []
83
- valid_indices = []
84
- for i, text in enumerate(texts):
85
- if text and text.strip():
86
- valid_texts.append(text)
87
- valid_indices.append(i)
88
-
89
- # Preallocate empty results for all texts
90
  out: List[List[EntityResult]] = [[] for _ in texts]
91
-
92
- if not valid_texts:
93
- return out
94
-
95
- pipe = self._load_pipeline()
96
- try:
97
- # Send batch directly to pipeline
98
- raw_results = pipe(valid_texts, batch_size=batch_size)
99
-
100
- for idx, raw in zip(valid_indices, raw_results):
101
- cleaned = self._clean_entities(raw)
102
- entity_results = []
103
- for ent in cleaned:
104
- entity_results.append(EntityResult(
105
- word=ent.get("word", ""),
106
- entity_group=ent.get("entity_group", "MISC"),
107
- score=float(ent.get("score", 0.0)),
108
- start=int(ent.get("start", 0)),
109
- end=int(ent.get("end", 0)),
110
- ))
111
- out[idx] = entity_results
112
- except Exception as e:
113
- print(f"[NEREngine] Batch processing error: {e}")
114
- # Fallback to single text processing if pipeline batch fails
115
- for idx, text in zip(valid_indices, valid_texts):
116
- out[idx] = self.recognize(text)
117
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  return out
 
1
  """
2
  NER Engine — Named Entity Recognition using HuggingFace Transformers.
3
+ Wraps the Nomio4640/ner-mongolian fine-tuned model.
4
+
5
+ Long-text handling:
6
+ BERT has a 512-token hard limit. Long social-media posts (especially
7
+ Google reviews, long Facebook posts) are silently truncated, causing
8
+ entities in the second half to be completely missed.
9
+
10
+ Fix: texts longer than MAX_CHUNK_CHARS are split at sentence boundaries
11
+ into non-overlapping chunks. Each chunk is processed independently and the
12
+ character offsets from each chunk are corrected before merging. Duplicate
13
+ entities at chunk boundaries are deduplicated by (word, start) key.
14
  """
15
 
16
+ from typing import List, Tuple
17
  from .models import EntityResult
18
 
19
 
20
  HF_MODEL_ID = "Nomio4640/ner-mongolian"
21
 
22
+ # ~400-450 Mongolian Cyrillic tokens ≈ 1 200-1 500 characters.
23
+ # Keeping well below 512 BERT tokens leaves room for tokenizer overhead.
24
+ MAX_CHUNK_CHARS = 1_300
25
+
26
 
27
  class NEREngine:
28
  """Named Entity Recognition service using HuggingFace pipeline."""
 
65
  cleaned.append(dict(ent))
66
  return cleaned
67
 
68
+ # ------------------------------------------------------------------
69
+ # Long-text chunking
70
+ # ------------------------------------------------------------------
71
+
72
+ def _chunk_text(self, text: str, max_chars: int = MAX_CHUNK_CHARS) -> List[Tuple[str, int]]:
73
+ """
74
+ Split *text* into chunks of at most *max_chars* characters, breaking
75
+ at sentence boundaries where possible. Returns a list of
76
+ (chunk_text, start_char_offset_in_original) tuples.
77
+ """
78
+ chunks: List[Tuple[str, int]] = []
79
+ start = 0
80
+ n = len(text)
81
+ while start < n:
82
+ end = min(start + max_chars, n)
83
+ if end < n:
84
+ # Try to break at a sentence boundary within the window
85
+ for sep in (". ", "! ", "? ", "\n", " "):
86
+ pos = text.rfind(sep, start + max_chars // 2, end)
87
+ if pos != -1:
88
+ end = pos + len(sep)
89
+ break
90
+ chunk = text[start:end].strip()
91
+ if chunk:
92
+ chunks.append((chunk, start))
93
+ start = end
94
+ return chunks or [(text, 0)]
95
+
96
def _recognize_chunked(self, text: str) -> List[EntityResult]:
    """
    NER for texts longer than one model window.

    The text is split into chunks, each chunk runs through the
    pipeline independently, every entity's character offsets are
    shifted back into the original text's coordinate space, and
    duplicates that can appear at chunk boundaries are removed
    (keyed on lowercased word + absolute start position).
    """
    pipe = self._load_pipeline()
    merged: List[EntityResult] = []
    seen_keys: set = set()  # (word_lower, abs_start) pairs already emitted

    for piece, offset in self._chunk_text(text):
        if not piece.strip():
            continue
        try:
            raw = pipe(piece)
        except Exception:
            # Best-effort: a failing chunk is skipped, not fatal.
            continue
        for ent in self._clean_entities(raw):
            token = ent.get("word", "")
            begin = offset + int(ent.get("start", 0))
            finish = offset + int(ent.get("end", 0))
            key = (token.lower(), begin)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            merged.append(EntityResult(
                word=token,
                entity_group=ent.get("entity_group", "MISC"),
                score=float(ent.get("score", 0.0)),
                start=begin,
                end=finish,
            ))

    return merged
131
+
132
+ # ------------------------------------------------------------------
133
+ # Public API
134
+ # ------------------------------------------------------------------
135
+
136
def recognize(self, text: str) -> List[EntityResult]:
    """
    Run NER on a single text and return cleaned entities.

    Texts longer than MAX_CHUNK_CHARS are routed through the
    chunk-and-merge path so entities past BERT's token window are not
    silently lost to truncation. Blank input and pipeline failures
    both yield an empty list.
    """
    if not text or not text.strip():
        return []

    # Long input → chunked processing instead of silent truncation.
    if len(text) > MAX_CHUNK_CHARS:
        return self._recognize_chunked(text)

    pipe = self._load_pipeline()
    try:
        raw = pipe(text)
    except Exception:
        # Model/tokenizer failure is treated as "no entities found".
        return []

    return [
        EntityResult(
            word=ent.get("word", ""),
            entity_group=ent.get("entity_group", "MISC"),
            score=float(ent.get("score", 0.0)),
            start=int(ent.get("start", 0)),
            end=int(ent.get("end", 0)),
        )
        for ent in self._clean_entities(raw)
    ]
166
 
167
def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
    """
    Run NER on a batch of texts.

    Texts at or under MAX_CHUNK_CHARS go through HuggingFace pipeline
    batching for throughput; longer texts are handled one at a time
    with chunk-and-merge so nothing is truncated away. Blank texts
    keep their preallocated empty result list.
    """
    if not texts:
        return []

    out: List[List[EntityResult]] = [[] for _ in texts]

    # Partition inputs: (index, text) pairs that fit one model window,
    # and indices of texts that need chunked handling.
    brief: List[Tuple[int, str]] = []
    oversized: List[int] = []
    for i, text in enumerate(texts):
        if not text or not text.strip():
            continue
        if len(text) > MAX_CHUNK_CHARS:
            oversized.append(i)
        else:
            brief.append((i, text))

    # --- Batch-process the short texts ---
    if brief:
        pipe = self._load_pipeline()
        try:
            raw_batch = pipe([t for _, t in brief], batch_size=batch_size)
            for (idx, _), raw in zip(brief, raw_batch):
                out[idx] = [
                    EntityResult(
                        word=ent.get("word", ""),
                        entity_group=ent.get("entity_group", "MISC"),
                        score=float(ent.get("score", 0.0)),
                        start=int(ent.get("start", 0)),
                        end=int(ent.get("end", 0)),
                    )
                    for ent in self._clean_entities(raw)
                ]
        except Exception as e:
            print(f"[NEREngine] Batch processing error: {e}")
            # Degrade gracefully to per-text processing.
            for idx, text in brief:
                out[idx] = self.recognize(text)

    # --- Chunk-and-merge the long texts (sequential, no truncation) ---
    for idx in oversized:
        out[idx] = self._recognize_chunked(texts[idx])

    return out
nlp_core/sentiment.py CHANGED
@@ -7,15 +7,22 @@ from typing import List, Optional
7
  from .models import SentimentResult
8
 
9
 
10
- # Map model labels to human-readable labels
 
 
 
11
  LABEL_MAP = {
12
  "positive": "positive",
13
  "neutral": "neutral",
14
  "negative": "negative",
15
- # Some models output LABEL_0, LABEL_1, LABEL_2
16
  "LABEL_0": "negative",
17
  "LABEL_1": "neutral",
18
  "LABEL_2": "positive",
 
 
 
 
19
  }
20
 
21
 
 
7
  from .models import SentimentResult
8
 
9
 
10
# Map raw model output labels to human-readable sentiment labels.
# Per the lookup in this module, result["label"] is lowercased before the
# lookup, so the lowercased "label_N" keys are the ones that actually
# match; the original-case "LABEL_N" keys are kept defensively in case
# the .lower() call is ever removed upstream.
LABEL_MAP = {
    "positive": "positive",
    "neutral": "neutral",
    "negative": "negative",
    # Original-case variants (defensive only)
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
    # Lowercased variants — the keys the runtime lookup really hits
    "label_0": "negative",
    "label_1": "neutral",
    "label_2": "positive",
}
27
 
28