Nomio4640 committed on
Commit
d4ff564
·
1 Parent(s): 729849e

bert chunk problem

Browse files
frontend/src/app/page.tsx CHANGED
@@ -62,34 +62,133 @@ interface GlobalAnalysis {
62
  }
63
 
64
  function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } }) {
65
- const colorMap: Record<string, string> = { PER: "#ff6b6b", ORG: "#4ecdc4", LOC: "#ffd93d", MISC: "#a78bfa" };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  return (
67
- <div className="network-container" style={{ padding: "2rem", display: "flex", flexWrap: "wrap", gap: "0.5rem", justifyContent: "center", alignItems: "center" }}>
68
- {network.nodes
69
- .sort((a, b) => b.frequency - a.frequency)
70
- .slice(0, 30)
71
- .map((node) => {
72
- const size = Math.max(40, Math.min(120, node.frequency * 5));
73
- const c = colorMap[node.entity_type] || "#6c63ff";
74
- return (
75
- <div
76
- key={node.id}
77
- style={{
78
- width: size, height: size, borderRadius: "50%",
79
- background: `${c}20`, border: `2px solid ${c}`,
80
- display: "flex", alignItems: "center", justifyContent: "center",
81
- fontSize: Math.max(9, Math.min(14, node.frequency)),
82
- color: c, fontWeight: 600, textAlign: "center", padding: "4px",
83
- cursor: "pointer", transition: "transform 0.2s",
84
- }}
85
- title={`${node.label} (${node.entity_type}) — ${node.frequency}x`}
86
- onMouseOver={(e) => (e.currentTarget.style.transform = "scale(1.15)")}
87
- onMouseOut={(e) => (e.currentTarget.style.transform = "scale(1)")}
88
- >
89
- {node.label.length > 12 ? node.label.slice(0, 10) + "…" : node.label}
90
- </div>
91
- );
92
- })}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  </div>
94
  );
95
  }
@@ -545,6 +644,20 @@ export default function Dashboard() {
545
  {/* Results */}
546
  {data && !loading && (
547
  <>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  {/* Stats */}
549
  <div className="stats-grid">
550
  <div className="stat-card">
 
62
  }
63
 
64
function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } }) {
  const [hoveredId, setHoveredId] = useState<string | null>(null);

  // Logical canvas size; the <svg> scales responsively via viewBox.
  const W = 780, H = 500;
  const cx = W / 2, cy = H / 2;
  const colorMap: Record<string, string> = {
    PER: "#ff6b6b", ORG: "#4ecdc4", LOC: "#ffd93d", MISC: "#a78bfa",
  };

  // Pick top nodes sorted by frequency (copy before sorting — never mutate the prop).
  const topNodes = [...network.nodes]
    .sort((a, b) => b.frequency - a.frequency)
    .slice(0, 40);

  // Arrange nodes in concentric rings by entity type so same-type nodes
  // cluster together, making co-occurrence edges easier to read.
  const typeOrder = ["PER", "ORG", "LOC", "MISC"];
  const ringRadii = [105, 168, 225, 278];
  const grouped: Record<string, typeof topNodes> = {};
  for (const node of topNodes) {
    // FIX: an entity_type outside typeOrder (e.g. "DATE") previously landed in
    // its own grouped[] bucket that the ring layout below never visits, so the
    // node got no position and was silently dropped from the graph. Fold every
    // unknown type into the MISC ring instead.
    const t = typeOrder.includes(node.entity_type) ? node.entity_type : "MISC";
    if (!grouped[t]) grouped[t] = [];
    grouped[t].push(node);
  }

  const posMap = new Map<string, { x: number; y: number }>();
  typeOrder.forEach((type, ti) => {
    const group = grouped[type] || [];
    const r = ringRadii[Math.min(ti, ringRadii.length - 1)];
    group.forEach((node, i) => {
      // Offset each ring's start angle slightly so labels don't collide
      const offset = (ti * Math.PI) / 4;
      const angle = offset + (2 * Math.PI * i) / Math.max(group.length, 1);
      posMap.set(node.id, { x: cx + r * Math.cos(angle), y: cy + r * Math.sin(angle) });
    });
  });

  // Show top edges by weight (only between visible nodes; drop self-loops)
  const topEdges = [...network.edges]
    .filter(e => posMap.has(e.source) && posMap.has(e.target) && e.source !== e.target)
    .sort((a, b) => b.weight - a.weight)
    .slice(0, 80);

  const maxWeight = topEdges.length > 0 ? topEdges[0].weight : 1;

  return (
    <div>
      <div style={{ overflowX: "auto" }}>
        <svg
          width="100%"
          viewBox={`0 0 ${W} ${H}`}
          style={{ display: "block", margin: "0 auto", minWidth: 340 }}
        >
          {/* Edges */}
          {topEdges.map((edge, i) => {
            const s = posMap.get(edge.source)!;
            const t = posMap.get(edge.target)!;
            const isHighlighted = hoveredId === edge.source || hoveredId === edge.target;
            const opacity = isHighlighted ? 0.7 : 0.12 + (edge.weight / maxWeight) * 0.18;
            const strokeW = isHighlighted
              ? Math.max(2, (edge.weight / maxWeight) * 4)
              : Math.max(0.5, (edge.weight / maxWeight) * 1.5);
            return (
              <line
                // Composite key: bare index keys break hover highlighting if the
                // edge list is ever re-sorted; pair endpoints with the index.
                key={`${edge.source}-${edge.target}-${i}`}
                x1={s.x} y1={s.y} x2={t.x} y2={t.y}
                stroke={isHighlighted ? "rgba(255,255,255,0.55)" : "rgba(255,255,255,0.25)"}
                strokeWidth={strokeW}
                strokeOpacity={opacity}
              />
            );
          })}

          {/* Nodes */}
          {topNodes.map(node => {
            const pos = posMap.get(node.id);
            if (!pos) return null;
            const r = Math.max(14, Math.min(32, 10 + node.frequency * 1.2));
            const color = colorMap[node.entity_type] || "#6c63ff";
            const isHovered = hoveredId === node.id;
            const label = node.label.length > 11 ? node.label.slice(0, 9) + "…" : node.label;
            return (
              <g
                key={node.id}
                style={{ cursor: "pointer" }}
                onMouseEnter={() => setHoveredId(node.id)}
                onMouseLeave={() => setHoveredId(null)}
              >
                <circle
                  cx={pos.x} cy={pos.y}
                  r={isHovered ? r + 5 : r}
                  fill={`${color}22`}
                  stroke={color}
                  strokeWidth={isHovered ? 3 : 1.8}
                  style={{ transition: "r 0.15s, stroke-width 0.15s" }}
                />
                <text
                  x={pos.x} y={pos.y}
                  textAnchor="middle"
                  dominantBaseline="middle"
                  fill={isHovered ? "#fff" : color}
                  fontSize={Math.max(8, Math.min(11, r * 0.62))}
                  fontWeight={600}
                >
                  {label}
                </text>
                <title>{`${node.label} (${node.entity_type}) — ${node.frequency}×`}</title>
              </g>
            );
          })}
        </svg>
      </div>

      {/* Legend */}
      <div style={{
        display: "flex", gap: "1.25rem", justifyContent: "center",
        marginTop: "0.75rem", flexWrap: "wrap",
      }}>
        {Object.entries(colorMap).map(([type, color]) => (
          <div key={type} style={{ display: "flex", alignItems: "center", gap: "0.35rem", fontSize: "0.72rem" }}>
            <div style={{ width: 10, height: 10, borderRadius: "50%", background: color, opacity: 0.8 }} />
            <span style={{ color: "var(--text-muted)" }}>{type}</span>
          </div>
        ))}
        <span style={{ color: "var(--text-muted)", fontSize: "0.7rem" }}>
          — {network.nodes.length} зангилаа · {network.edges.length} холбоос (шилдэг 40/80 харуулав)
        </span>
      </div>
    </div>
  );
}
 
644
  {/* Results */}
645
  {data && !loading && (
646
  <>
647
+ {/* Toolbar: new analysis + active file info */}
648
+ <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "1rem", flexWrap: "wrap", gap: "0.5rem" }}>
649
+ <span style={{ fontSize: "0.8rem", color: "var(--text-muted)" }}>
650
+ 📄 {data.total_documents} нийтлэл шинжлэгдлээ
651
+ </span>
652
+ <button
653
+ className="btn btn-secondary"
654
+ style={{ fontSize: "0.8rem" }}
655
+ onClick={() => { setData(null); setInsights([]); setError(""); setActiveTab("overview"); }}
656
+ >
657
+ + Шинэ шинжилгээ
658
+ </button>
659
+ </div>
660
+
661
  {/* Stats */}
662
  <div className="stats-grid">
663
  <div className="stat-card">
nlp_core/ner_engine.py CHANGED
@@ -1,14 +1,28 @@
1
  """
2
  NER Engine — Named Entity Recognition using HuggingFace Transformers.
3
- Wraps the Davlan/bert-base-multilingual-cased-ner-hrl model.
 
 
 
 
 
 
 
 
 
 
4
  """
5
 
6
- from typing import List
7
  from .models import EntityResult
8
 
9
 
10
  HF_MODEL_ID = "Nomio4640/ner-mongolian"
11
 
 
 
 
 
12
 
13
  class NEREngine:
14
  """Named Entity Recognition service using HuggingFace pipeline."""
@@ -51,19 +65,96 @@ class NEREngine:
51
  cleaned.append(dict(ent))
52
  return cleaned
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def recognize(self, text: str) -> List[EntityResult]:
55
- """Run NER on a single text and return cleaned entities."""
 
 
 
 
 
56
  if not text or not text.strip():
57
  return []
 
 
 
 
 
58
  pipe = self._load_pipeline()
59
  try:
60
  raw = pipe(text)
61
  except Exception:
62
  return []
63
 
64
- cleaned = self._clean_entities(raw)
65
  results = []
66
- for ent in cleaned:
67
  results.append(EntityResult(
68
  word=ent.get("word", ""),
69
  entity_group=ent.get("entity_group", "MISC"),
@@ -74,45 +165,56 @@ class NEREngine:
74
  return results
75
 
76
  def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
77
- """Run NER on a batch of texts utilizing Hugging Face pipeline batching."""
 
 
 
 
 
 
78
  if not texts:
79
  return []
80
-
81
- # Filter empty texts to avoid pipeline errors
82
- valid_texts = []
83
- valid_indices = []
84
- for i, text in enumerate(texts):
85
- if text and text.strip():
86
- valid_texts.append(text)
87
- valid_indices.append(i)
88
-
89
- # Preallocate empty results for all texts
90
  out: List[List[EntityResult]] = [[] for _ in texts]
91
-
92
- if not valid_texts:
93
- return out
94
-
95
- pipe = self._load_pipeline()
96
- try:
97
- # Send batch directly to pipeline
98
- raw_results = pipe(valid_texts, batch_size=batch_size)
99
-
100
- for idx, raw in zip(valid_indices, raw_results):
101
- cleaned = self._clean_entities(raw)
102
- entity_results = []
103
- for ent in cleaned:
104
- entity_results.append(EntityResult(
105
- word=ent.get("word", ""),
106
- entity_group=ent.get("entity_group", "MISC"),
107
- score=float(ent.get("score", 0.0)),
108
- start=int(ent.get("start", 0)),
109
- end=int(ent.get("end", 0)),
110
- ))
111
- out[idx] = entity_results
112
- except Exception as e:
113
- print(f"[NEREngine] Batch processing error: {e}")
114
- # Fallback to single text processing if pipeline batch fails
115
- for idx, text in zip(valid_indices, valid_texts):
116
- out[idx] = self.recognize(text)
117
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  return out
 
1
  """
2
  NER Engine — Named Entity Recognition using HuggingFace Transformers.
3
+ Wraps the Nomio4640/ner-mongolian fine-tuned model.
4
+
5
+ Long-text handling:
6
+ BERT has a 512-token hard limit. Long social-media posts (especially
7
+ Google reviews, long Facebook posts) are silently truncated, causing
8
+ entities in the second half to be completely missed.
9
+
10
+ Fix: texts longer than MAX_CHUNK_CHARS are split at sentence boundaries
11
+ into non-overlapping chunks. Each chunk is processed independently and the
12
+ character offsets from each chunk are corrected before merging. Duplicate
13
+ entities at chunk boundaries are deduplicated by (word, start) key.
14
  """
15
 
16
+ from typing import List, Tuple
17
  from .models import EntityResult
18
 
19
 
20
  HF_MODEL_ID = "Nomio4640/ner-mongolian"
21
 
22
+ # ~400-450 Mongolian Cyrillic tokens ≈ 1 200-1 500 characters.
23
+ # Keeping well below 512 BERT tokens leaves room for tokenizer overhead.
24
+ MAX_CHUNK_CHARS = 1_300
25
+
26
 
27
  class NEREngine:
28
  """Named Entity Recognition service using HuggingFace pipeline."""
 
65
  cleaned.append(dict(ent))
66
  return cleaned
67
 
68
+ # ------------------------------------------------------------------
69
+ # Long-text chunking
70
+ # ------------------------------------------------------------------
71
+
72
+ def _chunk_text(self, text: str, max_chars: int = MAX_CHUNK_CHARS) -> List[Tuple[str, int]]:
73
+ """
74
+ Split *text* into chunks of at most *max_chars* characters, breaking
75
+ at sentence boundaries where possible. Returns a list of
76
+ (chunk_text, start_char_offset_in_original) tuples.
77
+ """
78
+ chunks: List[Tuple[str, int]] = []
79
+ start = 0
80
+ n = len(text)
81
+ while start < n:
82
+ end = min(start + max_chars, n)
83
+ if end < n:
84
+ # Try to break at a sentence boundary within the window
85
+ for sep in (". ", "! ", "? ", "\n", " "):
86
+ pos = text.rfind(sep, start + max_chars // 2, end)
87
+ if pos != -1:
88
+ end = pos + len(sep)
89
+ break
90
+ chunk = text[start:end].strip()
91
+ if chunk:
92
+ chunks.append((chunk, start))
93
+ start = end
94
+ return chunks or [(text, 0)]
95
+
96
def _recognize_chunked(self, text: str) -> List[EntityResult]:
    """
    NER for texts longer than one model window.

    The text is split into chunks, each chunk runs through the
    pipeline independently, every entity's character offsets are
    shifted back into the original text's coordinate space, and
    duplicates that can appear at chunk boundaries are removed
    (keyed on lowercased word + absolute start position).
    """
    pipe = self._load_pipeline()
    merged: List[EntityResult] = []
    seen_keys: set = set()  # (word_lower, abs_start) pairs already emitted

    for piece, offset in self._chunk_text(text):
        if not piece.strip():
            continue
        try:
            raw = pipe(piece)
        except Exception:
            # Best-effort: a failing chunk is skipped, not fatal.
            continue
        for ent in self._clean_entities(raw):
            token = ent.get("word", "")
            begin = offset + int(ent.get("start", 0))
            finish = offset + int(ent.get("end", 0))
            key = (token.lower(), begin)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            merged.append(EntityResult(
                word=token,
                entity_group=ent.get("entity_group", "MISC"),
                score=float(ent.get("score", 0.0)),
                start=begin,
                end=finish,
            ))

    return merged
131
+
132
+ # ------------------------------------------------------------------
133
+ # Public API
134
+ # ------------------------------------------------------------------
135
+
136
def recognize(self, text: str) -> List[EntityResult]:
    """
    Run NER on a single text and return cleaned entities.

    Texts longer than MAX_CHUNK_CHARS are routed through the
    chunk-and-merge path so entities past BERT's token window are not
    silently lost to truncation. Blank input and pipeline failures
    both yield an empty list.
    """
    if not text or not text.strip():
        return []

    # Long input → chunked processing instead of silent truncation.
    if len(text) > MAX_CHUNK_CHARS:
        return self._recognize_chunked(text)

    pipe = self._load_pipeline()
    try:
        raw = pipe(text)
    except Exception:
        # Model/tokenizer failure is treated as "no entities found".
        return []

    return [
        EntityResult(
            word=ent.get("word", ""),
            entity_group=ent.get("entity_group", "MISC"),
            score=float(ent.get("score", 0.0)),
            start=int(ent.get("start", 0)),
            end=int(ent.get("end", 0)),
        )
        for ent in self._clean_entities(raw)
    ]
166
 
167
def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
    """
    Run NER on a batch of texts.

    Texts at or under MAX_CHUNK_CHARS go through HuggingFace pipeline
    batching for throughput; longer texts are handled one at a time
    with chunk-and-merge so nothing is truncated away. Blank texts
    keep their preallocated empty result list.
    """
    if not texts:
        return []

    out: List[List[EntityResult]] = [[] for _ in texts]

    # Partition inputs: (index, text) pairs that fit one model window,
    # and indices of texts that need chunked handling.
    brief: List[Tuple[int, str]] = []
    oversized: List[int] = []
    for i, text in enumerate(texts):
        if not text or not text.strip():
            continue
        if len(text) > MAX_CHUNK_CHARS:
            oversized.append(i)
        else:
            brief.append((i, text))

    # --- Batch-process the short texts ---
    if brief:
        pipe = self._load_pipeline()
        try:
            raw_batch = pipe([t for _, t in brief], batch_size=batch_size)
            for (idx, _), raw in zip(brief, raw_batch):
                out[idx] = [
                    EntityResult(
                        word=ent.get("word", ""),
                        entity_group=ent.get("entity_group", "MISC"),
                        score=float(ent.get("score", 0.0)),
                        start=int(ent.get("start", 0)),
                        end=int(ent.get("end", 0)),
                    )
                    for ent in self._clean_entities(raw)
                ]
        except Exception as e:
            print(f"[NEREngine] Batch processing error: {e}")
            # Degrade gracefully to per-text processing.
            for idx, text in brief:
                out[idx] = self.recognize(text)

    # --- Chunk-and-merge the long texts (sequential, no truncation) ---
    for idx in oversized:
        out[idx] = self._recognize_chunked(texts[idx])

    return out
nlp_core/sentiment.py CHANGED
@@ -7,15 +7,22 @@ from typing import List, Optional
7
  from .models import SentimentResult
8
 
9
 
10
- # Map model labels to human-readable labels
 
 
 
11
  LABEL_MAP = {
12
  "positive": "positive",
13
  "neutral": "neutral",
14
  "negative": "negative",
15
- # Some models output LABEL_0, LABEL_1, LABEL_2
16
  "LABEL_0": "negative",
17
  "LABEL_1": "neutral",
18
  "LABEL_2": "positive",
 
 
 
 
19
  }
20
 
21
 
 
7
  from .models import SentimentResult
8
 
9
 
10
# Map raw model output labels to human-readable sentiment labels.
# Per the lookup in this module, result["label"] is lowercased before the
# lookup, so the lowercased "label_N" keys are the ones that actually
# match; the original-case "LABEL_N" keys are kept defensively in case
# the .lower() call is ever removed upstream.
LABEL_MAP = {
    "positive": "positive",
    "neutral": "neutral",
    "negative": "negative",
    # Original-case variants (defensive only)
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
    # Lowercased variants — the keys the runtime lookup really hits
    "label_0": "negative",
    "label_1": "neutral",
    "label_2": "positive",
}
27
 
28