Spaces:
Sleeping
Sleeping
added test for NER
Browse files- adapters/api/main.py +23 -5
- adapters/api/routers/analysis.py +20 -0
- data/test.txt +0 -0
- data/train.txt +0 -0
- data/valid.txt +0 -0
- eval/evaluate.py +133 -0
- frontend/next.config.ts +7 -2
- frontend/src/app/page.tsx +80 -10
- nlp_core/ner_engine.py +6 -0
adapters/api/main.py
CHANGED
|
@@ -1,10 +1,6 @@
|
|
| 1 |
-
"""
|
| 2 |
-
FastAPI adapter — REST API entry point.
|
| 3 |
-
This is the outer adapter that wraps the NLP core domain layer.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
import logging
|
| 7 |
import traceback
|
|
|
|
| 8 |
|
| 9 |
from fastapi import FastAPI, Request
|
| 10 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -56,6 +52,7 @@ async def root():
|
|
| 56 |
"name": "NLP Intelligence API",
|
| 57 |
"version": "1.0.0",
|
| 58 |
"endpoints": {
|
|
|
|
| 59 |
"upload": "POST /api/upload",
|
| 60 |
"analyze": "POST /api/analyze",
|
| 61 |
"network": "POST /api/network",
|
|
@@ -65,3 +62,24 @@ async def root():
|
|
| 65 |
"admin_stopwords": "GET/POST /api/admin/stopwords",
|
| 66 |
},
|
| 67 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
import traceback
|
| 3 |
+
import torch
|
| 4 |
|
| 5 |
from fastapi import FastAPI, Request
|
| 6 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 52 |
"name": "NLP Intelligence API",
|
| 53 |
"version": "1.0.0",
|
| 54 |
"endpoints": {
|
| 55 |
+
"health": "GET /api/health",
|
| 56 |
"upload": "POST /api/upload",
|
| 57 |
"analyze": "POST /api/analyze",
|
| 58 |
"network": "POST /api/network",
|
|
|
|
| 62 |
"admin_stopwords": "GET/POST /api/admin/stopwords",
|
| 63 |
},
|
| 64 |
}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.get("/api/health")
async def health():
    """
    Quick health check used by the frontend on page load.

    Returns GPU availability and which NLP models are currently loaded.
    The engines expose no public "is loaded" API, so we peek at their lazy
    pipeline/model attributes; ``getattr`` guards against attribute renames
    so the health check itself can never raise and break the page load.
    """
    # Lazy import — presumably avoids a circular import at module load
    # time (NOTE(review): confirm against adapters.api.services).
    from adapters.api import services

    gpu = torch.cuda.is_available()
    gpu_name = torch.cuda.get_device_name(0) if gpu else None
    return {
        "status": "ok",
        "gpu": gpu,
        "gpu_name": gpu_name,
        "models": {
            # NOTE(review): these read private attrs; confirm the engine
            # classes keep lazy `_pipeline` / `_model` fields.
            "ner": getattr(services.ner, "_pipeline", None) is not None,
            "sentiment": getattr(services.sentiment, "_pipeline", None) is not None,
            "topic": getattr(services.topic, "_model", None) is not None,
        },
    }
|
adapters/api/routers/analysis.py
CHANGED
|
@@ -16,6 +16,7 @@ import csv
|
|
| 16 |
import io
|
| 17 |
import json
|
| 18 |
import logging
|
|
|
|
| 19 |
import uuid
|
| 20 |
from typing import List
|
| 21 |
|
|
@@ -321,6 +322,7 @@ def _run_analysis(
|
|
| 321 |
run_sentiment: bool,
|
| 322 |
run_topics: bool,
|
| 323 |
) -> AnalysisResponse:
|
|
|
|
| 324 |
preprocessor = services.preprocessor
|
| 325 |
kb = services.kb
|
| 326 |
|
|
@@ -328,6 +330,8 @@ def _run_analysis(
|
|
| 328 |
ids = [row.get("ID", str(i)) for i, row in enumerate(rows)]
|
| 329 |
sources = [row.get("Source", "") for row in rows]
|
| 330 |
|
|
|
|
|
|
|
| 331 |
# Dual preprocessing — one pass, two outputs
|
| 332 |
nlp_texts: List[str] = []
|
| 333 |
tm_texts: List[str] = []
|
|
@@ -335,11 +339,15 @@ def _run_analysis(
|
|
| 335 |
nlp, tm = preprocessor.preprocess_dual(raw)
|
| 336 |
nlp_texts.append(nlp)
|
| 337 |
tm_texts.append(tm)
|
|
|
|
| 338 |
|
| 339 |
# NER
|
| 340 |
ner_results = []
|
| 341 |
if run_ner:
|
|
|
|
| 342 |
ner_results = services.ner.recognize_batch(nlp_texts)
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# Entity relabeling from admin custom labels
|
| 345 |
custom_labels = kb.get_labels(label_type="entity") if run_ner else {}
|
|
@@ -347,18 +355,30 @@ def _run_analysis(
|
|
| 347 |
# Sentiment
|
| 348 |
sentiment_results = []
|
| 349 |
if run_sentiment:
|
|
|
|
| 350 |
sentiment_results = services.sentiment.analyze_batch(nlp_texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
# Topic modeling — now works from 3 documents via KMeans fallback
|
| 353 |
topic_results = []
|
| 354 |
topic_summary = []
|
| 355 |
if run_topics:
|
|
|
|
|
|
|
| 356 |
if len(tm_texts) >= MIN_TOPICS_DOCS:
|
| 357 |
try:
|
|
|
|
| 358 |
topic_results, topic_summary = services.topic.fit_transform(tm_texts)
|
|
|
|
|
|
|
| 359 |
except Exception as exc:
|
|
|
|
| 360 |
topic_summary = [{"error": f"Topic modeling failed: {exc}"}]
|
| 361 |
else:
|
|
|
|
| 362 |
topic_summary = [{
|
| 363 |
"info": (
|
| 364 |
f"Topic modeling needs at least {MIN_TOPICS_DOCS} documents. "
|
|
|
|
| 16 |
import io
|
| 17 |
import json
|
| 18 |
import logging
|
| 19 |
+
import time
|
| 20 |
import uuid
|
| 21 |
from typing import List
|
| 22 |
|
|
|
|
| 322 |
run_sentiment: bool,
|
| 323 |
run_topics: bool,
|
| 324 |
) -> AnalysisResponse:
|
| 325 |
+
t0 = time.time()
|
| 326 |
preprocessor = services.preprocessor
|
| 327 |
kb = services.kb
|
| 328 |
|
|
|
|
| 330 |
ids = [row.get("ID", str(i)) for i, row in enumerate(rows)]
|
| 331 |
sources = [row.get("Source", "") for row in rows]
|
| 332 |
|
| 333 |
+
logger.info(f"[Pipeline] Starting analysis: {len(raw_texts)} rows, NER={run_ner}, Sentiment={run_sentiment}, Topics={run_topics}")
|
| 334 |
+
|
| 335 |
# Dual preprocessing — one pass, two outputs
|
| 336 |
nlp_texts: List[str] = []
|
| 337 |
tm_texts: List[str] = []
|
|
|
|
| 339 |
nlp, tm = preprocessor.preprocess_dual(raw)
|
| 340 |
nlp_texts.append(nlp)
|
| 341 |
tm_texts.append(tm)
|
| 342 |
+
logger.info(f"[Pipeline] Preprocessing done in {(time.time()-t0)*1000:.0f}ms")
|
| 343 |
|
| 344 |
# NER
|
| 345 |
ner_results = []
|
| 346 |
if run_ner:
|
| 347 |
+
t1 = time.time()
|
| 348 |
ner_results = services.ner.recognize_batch(nlp_texts)
|
| 349 |
+
total_ents = sum(len(r) for r in ner_results)
|
| 350 |
+
logger.info(f"[Pipeline] NER done in {(time.time()-t1)*1000:.0f}ms — found {total_ents} entities total")
|
| 351 |
|
| 352 |
# Entity relabeling from admin custom labels
|
| 353 |
custom_labels = kb.get_labels(label_type="entity") if run_ner else {}
|
|
|
|
| 355 |
# Sentiment
|
| 356 |
sentiment_results = []
|
| 357 |
if run_sentiment:
|
| 358 |
+
t1 = time.time()
|
| 359 |
sentiment_results = services.sentiment.analyze_batch(nlp_texts)
|
| 360 |
+
pos = sum(1 for s in sentiment_results if s.label == "positive")
|
| 361 |
+
neg = sum(1 for s in sentiment_results if s.label == "negative")
|
| 362 |
+
neu = sum(1 for s in sentiment_results if s.label == "neutral")
|
| 363 |
+
logger.info(f"[Pipeline] Sentiment done in {(time.time()-t1)*1000:.0f}ms — pos={pos} neu={neu} neg={neg}")
|
| 364 |
|
| 365 |
# Topic modeling — now works from 3 documents via KMeans fallback
|
| 366 |
topic_results = []
|
| 367 |
topic_summary = []
|
| 368 |
if run_topics:
|
| 369 |
+
non_empty_tm = [t for t in tm_texts if t.strip()]
|
| 370 |
+
logger.info(f"[Pipeline] Topic modeling: {len(non_empty_tm)} non-empty TM texts (need >={MIN_TOPICS_DOCS})")
|
| 371 |
if len(tm_texts) >= MIN_TOPICS_DOCS:
|
| 372 |
try:
|
| 373 |
+
t1 = time.time()
|
| 374 |
topic_results, topic_summary = services.topic.fit_transform(tm_texts)
|
| 375 |
+
real_topics = [t for t in topic_summary if isinstance(t, dict) and t.get("topic_id", -1) >= 0]
|
| 376 |
+
logger.info(f"[Pipeline] Topics done in {(time.time()-t1)*1000:.0f}ms — {len(real_topics)} real topics, summary={topic_summary}")
|
| 377 |
except Exception as exc:
|
| 378 |
+
logger.error(f"[Pipeline] Topic modeling FAILED: {exc}", exc_info=True)
|
| 379 |
topic_summary = [{"error": f"Topic modeling failed: {exc}"}]
|
| 380 |
else:
|
| 381 |
+
logger.info(f"[Pipeline] Skipping topics — only {len(tm_texts)} docs (need {MIN_TOPICS_DOCS}+)")
|
| 382 |
topic_summary = [{
|
| 383 |
"info": (
|
| 384 |
f"Topic modeling needs at least {MIN_TOPICS_DOCS} documents. "
|
data/test.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/train.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/valid.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval/evaluate.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
# Add the project root to the python path so we can import nlp_core
|
| 6 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 7 |
+
|
| 8 |
+
from nlp_core.ner_engine import NEREngine
|
| 9 |
+
from nlp_core.preprocessing import Preprocessor
|
| 10 |
+
|
| 11 |
+
def extract_entities_from_conll(lines):
    """Parse one sentence's CoNLL rows into (text, entities).

    Each row is "word ... BIO-tag" (last column is the NER tag); rows with
    fewer than 4 columns are skipped. Returns the space-joined sentence and
    a list of (entity_type, entity_string) tuples in order of appearance.
    """
    tokens = []
    spans = []
    open_type = None
    open_words = []

    def _flush():
        # Close the entity currently being built, if any.
        nonlocal open_type, open_words
        if open_type:
            spans.append((open_type, " ".join(open_words)))
        open_type = None
        open_words = []

    for row in lines:
        cols = row.strip().split()
        if len(cols) < 4:
            continue
        token, tag = cols[0], cols[-1]
        tokens.append(token)

        if tag.startswith("B-"):
            # Explicit entity start: close any open span, begin a new one.
            _flush()
            open_type = tag[2:]
            open_words = [token]
        elif tag.startswith("I-") and open_type == tag[2:]:
            # Continuation of the open entity.
            open_words.append(token)
        elif tag.startswith("I-"):
            # I- with a mismatched (or no) open entity: treat as a new start.
            _flush()
            open_type = tag[2:]
            open_words = [token]
        else:
            # "O" (or anything else) terminates the open entity.
            _flush()

    _flush()  # don't drop an entity that runs to the end of the sentence
    return " ".join(tokens), spans
| 54 |
+
|
| 55 |
+
def evaluate_ner(test_file_path, limit=None):
    """Evaluate the NEREngine on a CoNLL test file (entity-level exact match).

    Args:
        test_file_path: Path to a CoNLL file with blank-line-separated sentences.
        limit: If truthy, evaluate only the first ``limit`` sentences.

    Returns:
        dict with ``sentences``, ``true_positives``, ``false_positives``,
        ``false_negatives``, ``precision``, ``recall`` and ``f1``.
        (Fix: the metrics were previously only printed, so callers could not
        use them programmatically; the printed report is unchanged.)
    """
    print(f"Loading test data from {test_file_path}...")

    with open(test_file_path, "r", encoding="utf-8") as f:
        blocks = f.read().split("\n\n")

    sentences = []
    for block in blocks:
        if not block.strip():
            continue
        text, true_ents = extract_entities_from_conll(block.split("\n"))
        if text:
            sentences.append((text, true_ents))

    if limit:
        sentences = sentences[:limit]

    print(f"Loaded {len(sentences)} test sentences.")

    preprocessor = Preprocessor()
    ner = NEREngine()

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    print("Running NER evaluation (this may take a while)...")
    for i, (text, true_ents) in enumerate(sentences):
        if i > 0 and i % 50 == 0:
            print(f"Processed {i}/{len(sentences)} sentences...")

        # Clean text specifically for NER
        clean_text = preprocessor.preprocess_nlp(text)

        predicted_results = ner.recognize(clean_text)

        # Format predictions into (type, string) lowercased for fair comparison.
        # Spaces are stripped because subword tokenization can alter spacing.
        pred_ents = [(res.entity_group, res.word.replace(" ", "").lower()) for res in predicted_results]
        true_ents_formatted = [(t, w.replace(" ", "").lower()) for t, w in true_ents]

        # Match each gold entity at most once; remove matched predictions so
        # duplicates are not double-counted.
        for true_e in true_ents_formatted:
            if true_e in pred_ents:
                true_positives += 1
                pred_ents.remove(true_e)
            else:
                false_negatives += 1

        # Whatever is left in pred_ents are false positives
        false_positives += len(pred_ents)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("\n" + "="*40)
    print("NER EVALUATION RESULTS (Entity-Level Exact Match)")
    print("="*40)
    print(f"Sentences Evaluated: {len(sentences)}")
    print(f"True Positives: {true_positives}")
    print(f"False Positives: {false_positives}")
    print(f"False Negatives: {false_negatives}")
    print("-" * 40)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("="*40)

    return {
        "sentences": len(sentences),
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
test_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "test.txt")
|
| 128 |
+
if not os.path.exists(test_path):
|
| 129 |
+
print(f"Error: Could not find CoNLL test file at {test_path}")
|
| 130 |
+
else:
|
| 131 |
+
# Run on the first 500 sentences to get a quick estimate.
|
| 132 |
+
# Change limit=None to run on the entire test set.
|
| 133 |
+
evaluate_ner(test_path, limit=500)
|
frontend/next.config.ts
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
import type { NextConfig } from "next";
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
const nextConfig: NextConfig = {
|
| 4 |
async rewrites() {
|
| 5 |
return [
|
| 6 |
{
|
| 7 |
source: "/api/:path*",
|
| 8 |
-
//
|
| 9 |
-
destination: "https://joye-tetracid-trevor.ngrok-free.dev/api/:path*",
|
| 10 |
},
|
| 11 |
];
|
| 12 |
},
|
|
|
|
| 1 |
import type { NextConfig } from "next";
|
| 2 |
|
| 3 |
+
// Set NEXT_PUBLIC_API_URL in .env.local to point to your backend.
|
| 4 |
+
// Example for Colab: NEXT_PUBLIC_API_URL=https://your-url.ngrok-free.dev
|
| 5 |
+
// Example for local: NEXT_PUBLIC_API_URL=http://localhost:8000
|
| 6 |
+
// If not set, defaults to localhost:8000
|
| 7 |
+
const API_URL = process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000";
|
| 8 |
+
|
| 9 |
const nextConfig: NextConfig = {
|
| 10 |
async rewrites() {
|
| 11 |
return [
|
| 12 |
{
|
| 13 |
source: "/api/:path*",
|
| 14 |
+
destination: `${API_URL}/api/:path*`,
|
|
|
|
| 15 |
},
|
| 16 |
];
|
| 17 |
},
|
frontend/src/app/page.tsx
CHANGED
|
@@ -94,6 +94,9 @@ function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } })
|
|
| 94 |
);
|
| 95 |
}
|
| 96 |
|
|
|
|
|
|
|
|
|
|
| 97 |
export default function Dashboard() {
|
| 98 |
const [data, setData] = useState<AnalysisResult | null>(null);
|
| 99 |
const [insights, setInsights] = useState<InsightItem[]>([]);
|
|
@@ -117,14 +120,36 @@ export default function Dashboard() {
|
|
| 117 |
// Annotation editor
|
| 118 |
const [editingDoc, setEditingDoc] = useState<DocForEditor | null>(null);
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
const loadHistory = useCallback(async () => {
|
| 121 |
setHistoryLoading(true);
|
|
|
|
| 122 |
try {
|
| 123 |
-
const res = await fetch(`${API_BASE}/api/history?limit=50`, { headers:
|
|
|
|
| 124 |
if (res.ok) setHistory(await res.json());
|
| 125 |
-
}
|
| 126 |
-
|
| 127 |
-
}
|
| 128 |
}, []);
|
| 129 |
|
| 130 |
useEffect(() => {
|
|
@@ -194,25 +219,38 @@ export default function Dashboard() {
|
|
| 194 |
const uploadCSV = useCallback(async (file: File) => {
|
| 195 |
setLoading(true);
|
| 196 |
setError("");
|
|
|
|
| 197 |
try {
|
| 198 |
const formData = new FormData();
|
| 199 |
formData.append("file", file);
|
|
|
|
|
|
|
|
|
|
| 200 |
const res = await fetch(`${API_BASE}/api/upload?run_ner=true&run_sentiment=true&run_topics=true`, {
|
| 201 |
method: "POST",
|
|
|
|
| 202 |
body: formData,
|
| 203 |
});
|
|
|
|
| 204 |
if (!res.ok) {
|
| 205 |
-
const err = await res.json();
|
| 206 |
throw new Error(err.detail || "Upload failed");
|
| 207 |
}
|
| 208 |
const result: AnalysisResult = await res.json();
|
|
|
|
| 209 |
setData(result);
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
if (insightsRes.ok) setInsights(await insightsRes.json());
|
| 212 |
} catch (e: any) {
|
|
|
|
| 213 |
setError(e.message || "Error uploading file");
|
| 214 |
} finally {
|
| 215 |
setLoading(false);
|
|
|
|
| 216 |
}
|
| 217 |
}, []);
|
| 218 |
|
|
@@ -220,19 +258,25 @@ export default function Dashboard() {
|
|
| 220 |
if (!textInput.trim()) return;
|
| 221 |
setLoading(true);
|
| 222 |
setError("");
|
|
|
|
| 223 |
try {
|
| 224 |
const res = await fetch(`${API_BASE}/api/analyze`, {
|
| 225 |
method: "POST",
|
| 226 |
-
headers: {
|
| 227 |
body: JSON.stringify({ text: textInput }),
|
| 228 |
});
|
|
|
|
| 229 |
if (!res.ok) throw new Error("Analysis failed");
|
| 230 |
const result: AnalysisResult = await res.json();
|
|
|
|
|
|
|
| 231 |
setData(result);
|
| 232 |
} catch (e: any) {
|
|
|
|
| 233 |
setError(e.message);
|
| 234 |
} finally {
|
| 235 |
setLoading(false);
|
|
|
|
| 236 |
}
|
| 237 |
}, [textInput]);
|
| 238 |
|
|
@@ -276,6 +320,30 @@ export default function Dashboard() {
|
|
| 276 |
|
| 277 |
return (
|
| 278 |
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
{/* Annotation editor modal */}
|
| 280 |
{editingDoc && (
|
| 281 |
<AnnotationEditor
|
|
@@ -287,6 +355,7 @@ export default function Dashboard() {
|
|
| 287 |
|
| 288 |
{/* Upload Section */}
|
| 289 |
{!data && !loading && (
|
|
|
|
| 290 |
<section style={{ marginBottom: "2rem" }}>
|
| 291 |
<div
|
| 292 |
className={`upload-area ${dragging ? "dragging" : ""}`}
|
|
@@ -299,9 +368,10 @@ export default function Dashboard() {
|
|
| 299 |
<p className="upload-text">
|
| 300 |
<strong>CSV файл чирж оруулах</strong> эсвэл дарж сонгох
|
| 301 |
</p>
|
| 302 |
-
<
|
| 303 |
-
|
| 304 |
-
|
|
|
|
| 305 |
<input
|
| 306 |
ref={fileInputRef}
|
| 307 |
type="file"
|
|
|
|
| 94 |
);
|
| 95 |
}
|
| 96 |
|
| 97 |
+
// Standard headers needed for all API calls when going through Ngrok
|
| 98 |
+
const NGROK_HEADERS = { "ngrok-skip-browser-warning": "true" };
|
| 99 |
+
|
| 100 |
export default function Dashboard() {
|
| 101 |
const [data, setData] = useState<AnalysisResult | null>(null);
|
| 102 |
const [insights, setInsights] = useState<InsightItem[]>([]);
|
|
|
|
| 120 |
// Annotation editor
|
| 121 |
const [editingDoc, setEditingDoc] = useState<DocForEditor | null>(null);
|
| 122 |
|
| 123 |
+
// Backend health check
|
| 124 |
+
const [backendOk, setBackendOk] = useState<boolean | null>(null); // null = checking
|
| 125 |
+
|
| 126 |
+
// Health check on mount — tells you immediately if backend is reachable
|
| 127 |
+
useEffect(() => {
|
| 128 |
+
const check = async () => {
|
| 129 |
+
console.group("[NLP] Backend health check");
|
| 130 |
+
try {
|
| 131 |
+
const res = await fetch(`${API_BASE}/api/health`, { headers: NGROK_HEADERS });
|
| 132 |
+
const ok = res.ok;
|
| 133 |
+
setBackendOk(ok);
|
| 134 |
+
console.log(ok ? "✅ Backend reachable" : `❌ Backend returned ${res.status}`);
|
| 135 |
+
} catch (e) {
|
| 136 |
+
setBackendOk(false);
|
| 137 |
+
console.error("❌ Backend unreachable:", e);
|
| 138 |
+
}
|
| 139 |
+
console.groupEnd();
|
| 140 |
+
};
|
| 141 |
+
check();
|
| 142 |
+
}, []);
|
| 143 |
+
|
| 144 |
const loadHistory = useCallback(async () => {
|
| 145 |
setHistoryLoading(true);
|
| 146 |
+
console.group("[NLP] Load history");
|
| 147 |
try {
|
| 148 |
+
const res = await fetch(`${API_BASE}/api/history?limit=50`, { headers: NGROK_HEADERS });
|
| 149 |
+
console.log(`→ GET /api/history status=${res.status}`);
|
| 150 |
if (res.ok) setHistory(await res.json());
|
| 151 |
+
} catch (e) { console.error(e); }
|
| 152 |
+
finally { setHistoryLoading(false); console.groupEnd(); }
|
|
|
|
| 153 |
}, []);
|
| 154 |
|
| 155 |
useEffect(() => {
|
|
|
|
| 219 |
const uploadCSV = useCallback(async (file: File) => {
|
| 220 |
setLoading(true);
|
| 221 |
setError("");
|
| 222 |
+
console.group(`[NLP] CSV Upload — ${file.name} (${(file.size/1024).toFixed(1)} KB)`);
|
| 223 |
try {
|
| 224 |
const formData = new FormData();
|
| 225 |
formData.append("file", file);
|
| 226 |
+
// ⚠️ IMPORTANT: ngrok-skip-browser-warning header MUST be included here.
|
| 227 |
+
// Without it, Ngrok returns an HTML warning page instead of forwarding
|
| 228 |
+
// the request to FastAPI → FastAPI tries to parse HTML as CSV → 500 error.
|
| 229 |
const res = await fetch(`${API_BASE}/api/upload?run_ner=true&run_sentiment=true&run_topics=true`, {
|
| 230 |
method: "POST",
|
| 231 |
+
headers: NGROK_HEADERS, // ← THE FIX
|
| 232 |
body: formData,
|
| 233 |
});
|
| 234 |
+
console.log(`→ POST /api/upload status=${res.status}`);
|
| 235 |
if (!res.ok) {
|
| 236 |
+
const err = await res.json().catch(() => ({ detail: `HTTP ${res.status}` }));
|
| 237 |
throw new Error(err.detail || "Upload failed");
|
| 238 |
}
|
| 239 |
const result: AnalysisResult = await res.json();
|
| 240 |
+
console.log(`← ${result.total_documents} documents, topics=${result.topic_summary?.length}`);
|
| 241 |
setData(result);
|
| 242 |
+
setActiveTab("overview"); // Auto-switch to results
|
| 243 |
+
|
| 244 |
+
// Immediately fetch insights after upload
|
| 245 |
+
const insightsRes = await fetch(`${API_BASE}/api/insights`, { headers: NGROK_HEADERS, method: "POST" });
|
| 246 |
+
console.log(`→ POST /api/insights status=${insightsRes.status}`);
|
| 247 |
if (insightsRes.ok) setInsights(await insightsRes.json());
|
| 248 |
} catch (e: any) {
|
| 249 |
+
console.error("Upload error:", e);
|
| 250 |
setError(e.message || "Error uploading file");
|
| 251 |
} finally {
|
| 252 |
setLoading(false);
|
| 253 |
+
console.groupEnd();
|
| 254 |
}
|
| 255 |
}, []);
|
| 256 |
|
|
|
|
| 258 |
if (!textInput.trim()) return;
|
| 259 |
setLoading(true);
|
| 260 |
setError("");
|
| 261 |
+
console.group(`[NLP] Analyze text (${textInput.length} chars)`);
|
| 262 |
try {
|
| 263 |
const res = await fetch(`${API_BASE}/api/analyze`, {
|
| 264 |
method: "POST",
|
| 265 |
+
headers: { ...NGROK_HEADERS, "Content-Type": "application/json" },
|
| 266 |
body: JSON.stringify({ text: textInput }),
|
| 267 |
});
|
| 268 |
+
console.log(`→ POST /api/analyze status=${res.status}`);
|
| 269 |
if (!res.ok) throw new Error("Analysis failed");
|
| 270 |
const result: AnalysisResult = await res.json();
|
| 271 |
+
console.log(`← entities:`, result.documents[0]?.entities?.length ?? 0,
|
| 272 |
+
`sentiment:`, result.documents[0]?.sentiment?.label);
|
| 273 |
setData(result);
|
| 274 |
} catch (e: any) {
|
| 275 |
+
console.error(e);
|
| 276 |
setError(e.message);
|
| 277 |
} finally {
|
| 278 |
setLoading(false);
|
| 279 |
+
console.groupEnd();
|
| 280 |
}
|
| 281 |
}, [textInput]);
|
| 282 |
|
|
|
|
| 320 |
|
| 321 |
return (
|
| 322 |
<div>
|
| 323 |
+
{/* Backend status banner */}
|
| 324 |
+
{backendOk === false && (
|
| 325 |
+
<div style={{
|
| 326 |
+
background: "rgba(255,80,80,0.15)", border: "1px solid var(--negative)",
|
| 327 |
+
borderRadius: "0.5rem", padding: "0.6rem 1rem", marginBottom: "1rem",
|
| 328 |
+
display: "flex", alignItems: "center", gap: "0.5rem", fontSize: "0.85rem",
|
| 329 |
+
}}>
|
| 330 |
+
<span>🔴</span>
|
| 331 |
+
<span style={{ color: "var(--negative)", fontWeight: 600 }}>Backend холболт алдаатай.</span>
|
| 332 |
+
<span style={{ color: "var(--text-muted)" }}>
|
| 333 |
+
Colab дээрх сервер ажиллаж байгаа эсэхийг шалгаад, Ngrok URL зөв эсэхийг .env.local файлд шинэчилнэ үү.
|
| 334 |
+
</span>
|
| 335 |
+
</div>
|
| 336 |
+
)}
|
| 337 |
+
{backendOk === null && (
|
| 338 |
+
<div style={{
|
| 339 |
+
background: "rgba(100,100,200,0.1)", border: "1px solid rgba(100,100,255,0.3)",
|
| 340 |
+
borderRadius: "0.5rem", padding: "0.4rem 1rem", marginBottom: "0.75rem",
|
| 341 |
+
fontSize: "0.8rem", color: "var(--text-muted)",
|
| 342 |
+
}}>
|
| 343 |
+
⏳ Backend холболт шалгаж байна...
|
| 344 |
+
</div>
|
| 345 |
+
)}
|
| 346 |
+
|
| 347 |
{/* Annotation editor modal */}
|
| 348 |
{editingDoc && (
|
| 349 |
<AnnotationEditor
|
|
|
|
| 355 |
|
| 356 |
{/* Upload Section */}
|
| 357 |
{!data && !loading && (
|
| 358 |
+
|
| 359 |
<section style={{ marginBottom: "2rem" }}>
|
| 360 |
<div
|
| 361 |
className={`upload-area ${dragging ? "dragging" : ""}`}
|
|
|
|
| 368 |
<p className="upload-text">
|
| 369 |
<strong>CSV файл чирж оруулах</strong> эсвэл дарж сонгох
|
| 370 |
</p>
|
| 371 |
+
<div style={{ fontSize: "0.75rem", color: "var(--text-muted)", marginTop: "0.5rem" }}>
|
| 372 |
+
<p>⚠️ <strong>Санамж:</strong> Шинжлэх өгөгдөл тань заавал <code>text</code> эсвэл <code>Text</code> гэсэн нэртэй баганад байх ёстой.</p>
|
| 373 |
+
<p>Хэрэв таны багана <code>Текст</code>, <code>Мессеж</code> гэх мэт Монгол нэртэй бол файлаа оруулахаас өмнө нэрийг нь <code>text</code> болгож өөрчилнө үү.</p>
|
| 374 |
+
</div>
|
| 375 |
<input
|
| 376 |
ref={fileInputRef}
|
| 377 |
type="file"
|
nlp_core/ner_engine.py
CHANGED
|
@@ -17,12 +17,18 @@ class NEREngine:
|
|
| 17 |
def _load_pipeline(self):
|
| 18 |
"""Lazy-load the NER pipeline (heavy model, load only when needed)."""
|
| 19 |
if self._pipeline is None:
|
|
|
|
| 20 |
from transformers import pipeline
|
|
|
|
| 21 |
self._pipeline = pipeline(
|
| 22 |
"ner",
|
| 23 |
model=self.model_name,
|
| 24 |
aggregation_strategy="simple",
|
|
|
|
|
|
|
|
|
|
| 25 |
)
|
|
|
|
| 26 |
return self._pipeline
|
| 27 |
|
| 28 |
def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
|
|
|
|
| 17 |
def _load_pipeline(self):
    """Create and cache the HuggingFace NER pipeline on first use.

    The model is heavy, so loading is deferred until actually needed.
    Runs on GPU (device 0) when CUDA is available, otherwise on CPU.
    Inputs are truncated to 512 tokens to stay within the model limit.
    """
    if self._pipeline is not None:
        return self._pipeline

    # Heavy imports kept local so importing this module stays cheap.
    import torch
    from transformers import pipeline

    device = 0 if torch.cuda.is_available() else -1
    self._pipeline = pipeline(
        "ner",
        model=self.model_name,
        aggregation_strategy="simple",
        truncation=True,
        max_length=512,
        device=device,
    )
    print(f"[NEREngine] Loaded on {'GPU' if device == 0 else 'CPU'}")
    return self._pipeline
|
| 33 |
|
| 34 |
def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
|