Spaces:
Sleeping
Sleeping
added test for NER
Browse files- adapters/api/main.py +23 -5
- adapters/api/routers/analysis.py +20 -0
- data/test.txt +0 -0
- data/train.txt +0 -0
- data/valid.txt +0 -0
- eval/evaluate.py +133 -0
- frontend/next.config.ts +7 -2
- frontend/src/app/page.tsx +80 -10
- nlp_core/ner_engine.py +6 -0
adapters/api/main.py
CHANGED
|
@@ -1,10 +1,6 @@
|
|
| 1 |
-
"""
|
| 2 |
-
FastAPI adapter — REST API entry point.
|
| 3 |
-
This is the outer adapter that wraps the NLP core domain layer.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
import logging
|
| 7 |
import traceback
|
|
|
|
| 8 |
|
| 9 |
from fastapi import FastAPI, Request
|
| 10 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -56,6 +52,7 @@ async def root():
|
|
| 56 |
"name": "NLP Intelligence API",
|
| 57 |
"version": "1.0.0",
|
| 58 |
"endpoints": {
|
|
|
|
| 59 |
"upload": "POST /api/upload",
|
| 60 |
"analyze": "POST /api/analyze",
|
| 61 |
"network": "POST /api/network",
|
|
@@ -65,3 +62,24 @@ async def root():
|
|
| 65 |
"admin_stopwords": "GET/POST /api/admin/stopwords",
|
| 66 |
},
|
| 67 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
import traceback
|
| 3 |
+
import torch
|
| 4 |
|
| 5 |
from fastapi import FastAPI, Request
|
| 6 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 52 |
"name": "NLP Intelligence API",
|
| 53 |
"version": "1.0.0",
|
| 54 |
"endpoints": {
|
| 55 |
+
"health": "GET /api/health",
|
| 56 |
"upload": "POST /api/upload",
|
| 57 |
"analyze": "POST /api/analyze",
|
| 58 |
"network": "POST /api/network",
|
|
|
|
| 62 |
"admin_stopwords": "GET/POST /api/admin/stopwords",
|
| 63 |
},
|
| 64 |
}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.get("/api/health")
async def health():
    """
    Quick health check used by the frontend on page load.

    Returns GPU availability and which NLP models are currently loaded.
    The engines expose no public "is loaded" API, so we peek at their lazy
    pipeline/model attributes; ``getattr`` guards against attribute renames
    so the health check itself can never raise and break the page load.
    """
    # Lazy import — presumably avoids a circular import at module load
    # time (NOTE(review): confirm against adapters.api.services).
    from adapters.api import services

    gpu = torch.cuda.is_available()
    gpu_name = torch.cuda.get_device_name(0) if gpu else None
    return {
        "status": "ok",
        "gpu": gpu,
        "gpu_name": gpu_name,
        "models": {
            # NOTE(review): these read private attrs; confirm the engine
            # classes keep lazy `_pipeline` / `_model` fields.
            "ner": getattr(services.ner, "_pipeline", None) is not None,
            "sentiment": getattr(services.sentiment, "_pipeline", None) is not None,
            "topic": getattr(services.topic, "_model", None) is not None,
        },
    }
|
adapters/api/routers/analysis.py
CHANGED
|
@@ -16,6 +16,7 @@ import csv
|
|
| 16 |
import io
|
| 17 |
import json
|
| 18 |
import logging
|
|
|
|
| 19 |
import uuid
|
| 20 |
from typing import List
|
| 21 |
|
|
@@ -321,6 +322,7 @@ def _run_analysis(
|
|
| 321 |
run_sentiment: bool,
|
| 322 |
run_topics: bool,
|
| 323 |
) -> AnalysisResponse:
|
|
|
|
| 324 |
preprocessor = services.preprocessor
|
| 325 |
kb = services.kb
|
| 326 |
|
|
@@ -328,6 +330,8 @@ def _run_analysis(
|
|
| 328 |
ids = [row.get("ID", str(i)) for i, row in enumerate(rows)]
|
| 329 |
sources = [row.get("Source", "") for row in rows]
|
| 330 |
|
|
|
|
|
|
|
| 331 |
# Dual preprocessing — one pass, two outputs
|
| 332 |
nlp_texts: List[str] = []
|
| 333 |
tm_texts: List[str] = []
|
|
@@ -335,11 +339,15 @@ def _run_analysis(
|
|
| 335 |
nlp, tm = preprocessor.preprocess_dual(raw)
|
| 336 |
nlp_texts.append(nlp)
|
| 337 |
tm_texts.append(tm)
|
|
|
|
| 338 |
|
| 339 |
# NER
|
| 340 |
ner_results = []
|
| 341 |
if run_ner:
|
|
|
|
| 342 |
ner_results = services.ner.recognize_batch(nlp_texts)
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# Entity relabeling from admin custom labels
|
| 345 |
custom_labels = kb.get_labels(label_type="entity") if run_ner else {}
|
|
@@ -347,18 +355,30 @@ def _run_analysis(
|
|
| 347 |
# Sentiment
|
| 348 |
sentiment_results = []
|
| 349 |
if run_sentiment:
|
|
|
|
| 350 |
sentiment_results = services.sentiment.analyze_batch(nlp_texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
# Topic modeling — now works from 3 documents via KMeans fallback
|
| 353 |
topic_results = []
|
| 354 |
topic_summary = []
|
| 355 |
if run_topics:
|
|
|
|
|
|
|
| 356 |
if len(tm_texts) >= MIN_TOPICS_DOCS:
|
| 357 |
try:
|
|
|
|
| 358 |
topic_results, topic_summary = services.topic.fit_transform(tm_texts)
|
|
|
|
|
|
|
| 359 |
except Exception as exc:
|
|
|
|
| 360 |
topic_summary = [{"error": f"Topic modeling failed: {exc}"}]
|
| 361 |
else:
|
|
|
|
| 362 |
topic_summary = [{
|
| 363 |
"info": (
|
| 364 |
f"Topic modeling needs at least {MIN_TOPICS_DOCS} documents. "
|
|
|
|
| 16 |
import io
|
| 17 |
import json
|
| 18 |
import logging
|
| 19 |
+
import time
|
| 20 |
import uuid
|
| 21 |
from typing import List
|
| 22 |
|
|
|
|
| 322 |
run_sentiment: bool,
|
| 323 |
run_topics: bool,
|
| 324 |
) -> AnalysisResponse:
|
| 325 |
+
t0 = time.time()
|
| 326 |
preprocessor = services.preprocessor
|
| 327 |
kb = services.kb
|
| 328 |
|
|
|
|
| 330 |
ids = [row.get("ID", str(i)) for i, row in enumerate(rows)]
|
| 331 |
sources = [row.get("Source", "") for row in rows]
|
| 332 |
|
| 333 |
+
logger.info(f"[Pipeline] Starting analysis: {len(raw_texts)} rows, NER={run_ner}, Sentiment={run_sentiment}, Topics={run_topics}")
|
| 334 |
+
|
| 335 |
# Dual preprocessing — one pass, two outputs
|
| 336 |
nlp_texts: List[str] = []
|
| 337 |
tm_texts: List[str] = []
|
|
|
|
| 339 |
nlp, tm = preprocessor.preprocess_dual(raw)
|
| 340 |
nlp_texts.append(nlp)
|
| 341 |
tm_texts.append(tm)
|
| 342 |
+
logger.info(f"[Pipeline] Preprocessing done in {(time.time()-t0)*1000:.0f}ms")
|
| 343 |
|
| 344 |
# NER
|
| 345 |
ner_results = []
|
| 346 |
if run_ner:
|
| 347 |
+
t1 = time.time()
|
| 348 |
ner_results = services.ner.recognize_batch(nlp_texts)
|
| 349 |
+
total_ents = sum(len(r) for r in ner_results)
|
| 350 |
+
logger.info(f"[Pipeline] NER done in {(time.time()-t1)*1000:.0f}ms — found {total_ents} entities total")
|
| 351 |
|
| 352 |
# Entity relabeling from admin custom labels
|
| 353 |
custom_labels = kb.get_labels(label_type="entity") if run_ner else {}
|
|
|
|
| 355 |
# Sentiment
|
| 356 |
sentiment_results = []
|
| 357 |
if run_sentiment:
|
| 358 |
+
t1 = time.time()
|
| 359 |
sentiment_results = services.sentiment.analyze_batch(nlp_texts)
|
| 360 |
+
pos = sum(1 for s in sentiment_results if s.label == "positive")
|
| 361 |
+
neg = sum(1 for s in sentiment_results if s.label == "negative")
|
| 362 |
+
neu = sum(1 for s in sentiment_results if s.label == "neutral")
|
| 363 |
+
logger.info(f"[Pipeline] Sentiment done in {(time.time()-t1)*1000:.0f}ms — pos={pos} neu={neu} neg={neg}")
|
| 364 |
|
| 365 |
# Topic modeling — now works from 3 documents via KMeans fallback
|
| 366 |
topic_results = []
|
| 367 |
topic_summary = []
|
| 368 |
if run_topics:
|
| 369 |
+
non_empty_tm = [t for t in tm_texts if t.strip()]
|
| 370 |
+
logger.info(f"[Pipeline] Topic modeling: {len(non_empty_tm)} non-empty TM texts (need >={MIN_TOPICS_DOCS})")
|
| 371 |
if len(tm_texts) >= MIN_TOPICS_DOCS:
|
| 372 |
try:
|
| 373 |
+
t1 = time.time()
|
| 374 |
topic_results, topic_summary = services.topic.fit_transform(tm_texts)
|
| 375 |
+
real_topics = [t for t in topic_summary if isinstance(t, dict) and t.get("topic_id", -1) >= 0]
|
| 376 |
+
logger.info(f"[Pipeline] Topics done in {(time.time()-t1)*1000:.0f}ms — {len(real_topics)} real topics, summary={topic_summary}")
|
| 377 |
except Exception as exc:
|
| 378 |
+
logger.error(f"[Pipeline] Topic modeling FAILED: {exc}", exc_info=True)
|
| 379 |
topic_summary = [{"error": f"Topic modeling failed: {exc}"}]
|
| 380 |
else:
|
| 381 |
+
logger.info(f"[Pipeline] Skipping topics — only {len(tm_texts)} docs (need {MIN_TOPICS_DOCS}+)")
|
| 382 |
topic_summary = [{
|
| 383 |
"info": (
|
| 384 |
f"Topic modeling needs at least {MIN_TOPICS_DOCS} documents. "
|
data/test.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/train.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/valid.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval/evaluate.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
# Add the project root to the python path so we can import nlp_core
|
| 6 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 7 |
+
|
| 8 |
+
from nlp_core.ner_engine import NEREngine
|
| 9 |
+
from nlp_core.preprocessing import Preprocessor
|
| 10 |
+
|
| 11 |
+
def extract_entities_from_conll(lines):
    """Parse one sentence's CoNLL rows into (text, entities).

    Each row is "word ... BIO-tag" (last column is the NER tag); rows with
    fewer than 4 columns are skipped. Returns the space-joined sentence and
    a list of (entity_type, entity_string) tuples in order of appearance.
    """
    tokens = []
    spans = []
    open_type = None
    open_words = []

    def _flush():
        # Close the entity currently being built, if any.
        nonlocal open_type, open_words
        if open_type:
            spans.append((open_type, " ".join(open_words)))
        open_type = None
        open_words = []

    for row in lines:
        cols = row.strip().split()
        if len(cols) < 4:
            continue
        token, tag = cols[0], cols[-1]
        tokens.append(token)

        if tag.startswith("B-"):
            # Explicit entity start: close any open span, begin a new one.
            _flush()
            open_type = tag[2:]
            open_words = [token]
        elif tag.startswith("I-") and open_type == tag[2:]:
            # Continuation of the open entity.
            open_words.append(token)
        elif tag.startswith("I-"):
            # I- with a mismatched (or no) open entity: treat as a new start.
            _flush()
            open_type = tag[2:]
            open_words = [token]
        else:
            # "O" (or anything else) terminates the open entity.
            _flush()

    _flush()  # don't drop an entity that runs to the end of the sentence
    return " ".join(tokens), spans
| 54 |
+
|
| 55 |
+
def evaluate_ner(test_file_path, limit=None):
    """Evaluate the NEREngine on a CoNLL test file (entity-level exact match).

    Args:
        test_file_path: Path to a CoNLL file with blank-line-separated sentences.
        limit: If truthy, evaluate only the first ``limit`` sentences.

    Returns:
        dict with ``sentences``, ``true_positives``, ``false_positives``,
        ``false_negatives``, ``precision``, ``recall`` and ``f1``.
        (Fix: the metrics were previously only printed, so callers could not
        use them programmatically; the printed report is unchanged.)
    """
    print(f"Loading test data from {test_file_path}...")

    with open(test_file_path, "r", encoding="utf-8") as f:
        blocks = f.read().split("\n\n")

    sentences = []
    for block in blocks:
        if not block.strip():
            continue
        text, true_ents = extract_entities_from_conll(block.split("\n"))
        if text:
            sentences.append((text, true_ents))

    if limit:
        sentences = sentences[:limit]

    print(f"Loaded {len(sentences)} test sentences.")

    preprocessor = Preprocessor()
    ner = NEREngine()

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    print("Running NER evaluation (this may take a while)...")
    for i, (text, true_ents) in enumerate(sentences):
        if i > 0 and i % 50 == 0:
            print(f"Processed {i}/{len(sentences)} sentences...")

        # Clean text specifically for NER
        clean_text = preprocessor.preprocess_nlp(text)

        predicted_results = ner.recognize(clean_text)

        # Format predictions into (type, string) lowercased for fair comparison.
        # Spaces are stripped because subword tokenization can alter spacing.
        pred_ents = [(res.entity_group, res.word.replace(" ", "").lower()) for res in predicted_results]
        true_ents_formatted = [(t, w.replace(" ", "").lower()) for t, w in true_ents]

        # Match each gold entity at most once; remove matched predictions so
        # duplicates are not double-counted.
        for true_e in true_ents_formatted:
            if true_e in pred_ents:
                true_positives += 1
                pred_ents.remove(true_e)
            else:
                false_negatives += 1

        # Whatever is left in pred_ents are false positives
        false_positives += len(pred_ents)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("\n" + "="*40)
    print("NER EVALUATION RESULTS (Entity-Level Exact Match)")
    print("="*40)
    print(f"Sentences Evaluated: {len(sentences)}")
    print(f"True Positives: {true_positives}")
    print(f"False Positives: {false_positives}")
    print(f"False Negatives: {false_negatives}")
    print("-" * 40)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("="*40)

    return {
        "sentences": len(sentences),
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
test_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "test.txt")
|
| 128 |
+
if not os.path.exists(test_path):
|
| 129 |
+
print(f"Error: Could not find CoNLL test file at {test_path}")
|
| 130 |
+
else:
|
| 131 |
+
# Run on the first 500 sentences to get a quick estimate.
|
| 132 |
+
# Change limit=None to run on the entire test set.
|
| 133 |
+
evaluate_ner(test_path, limit=500)
|
frontend/next.config.ts
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
import type { NextConfig } from "next";
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
const nextConfig: NextConfig = {
|
| 4 |
async rewrites() {
|
| 5 |
return [
|
| 6 |
{
|
| 7 |
source: "/api/:path*",
|
| 8 |
-
//
|
| 9 |
-
destination: "https://joye-tetracid-trevor.ngrok-free.dev/api/:path*",
|
| 10 |
},
|
| 11 |
];
|
| 12 |
},
|
|
|
|
| 1 |
import type { NextConfig } from "next";
|
| 2 |
|
| 3 |
+
// Set NEXT_PUBLIC_API_URL in .env.local to point to your backend.
|
| 4 |
+
// Example for Colab: NEXT_PUBLIC_API_URL=https://your-url.ngrok-free.dev
|
| 5 |
+
// Example for local: NEXT_PUBLIC_API_URL=http://localhost:8000
|
| 6 |
+
// If not set, defaults to localhost:8000
|
| 7 |
+
const API_URL = process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000";
|
| 8 |
+
|
| 9 |
const nextConfig: NextConfig = {
|
| 10 |
async rewrites() {
|
| 11 |
return [
|
| 12 |
{
|
| 13 |
source: "/api/:path*",
|
| 14 |
+
destination: `${API_URL}/api/:path*`,
|
|
|
|
| 15 |
},
|
| 16 |
];
|
| 17 |
},
|
frontend/src/app/page.tsx
CHANGED
|
@@ -94,6 +94,9 @@ function NetworkGraph({ network }: { network: { nodes: any[]; edges: any[] } })
|
|
| 94 |
);
|
| 95 |
}
|
| 96 |
|
|
|
|
|
|
|
|
|
|
| 97 |
export default function Dashboard() {
|
| 98 |
const [data, setData] = useState<AnalysisResult | null>(null);
|
| 99 |
const [insights, setInsights] = useState<InsightItem[]>([]);
|
|
@@ -117,14 +120,36 @@ export default function Dashboard() {
|
|
| 117 |
// Annotation editor
|
| 118 |
const [editingDoc, setEditingDoc] = useState<DocForEditor | null>(null);
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
const loadHistory = useCallback(async () => {
|
| 121 |
setHistoryLoading(true);
|
|
|
|
| 122 |
try {
|
| 123 |
-
const res = await fetch(`${API_BASE}/api/history?limit=50`, { headers:
|
|
|
|
| 124 |
if (res.ok) setHistory(await res.json());
|
| 125 |
-
}
|
| 126 |
-
|
| 127 |
-
}
|
| 128 |
}, []);
|
| 129 |
|
| 130 |
useEffect(() => {
|
|
@@ -194,25 +219,38 @@ export default function Dashboard() {
|
|
| 194 |
const uploadCSV = useCallback(async (file: File) => {
|
| 195 |
setLoading(true);
|
| 196 |
setError("");
|
|
|
|
| 197 |
try {
|
| 198 |
const formData = new FormData();
|
| 199 |
formData.append("file", file);
|
|
|
|
|
|
|
|
|
|
| 200 |
const res = await fetch(`${API_BASE}/api/upload?run_ner=true&run_sentiment=true&run_topics=true`, {
|
| 201 |
method: "POST",
|
|
|
|
| 202 |
body: formData,
|
| 203 |
});
|
|
|
|
| 204 |
if (!res.ok) {
|
| 205 |
-
const err = await res.json();
|
| 206 |
throw new Error(err.detail || "Upload failed");
|
| 207 |
}
|
| 208 |
const result: AnalysisResult = await res.json();
|
|
|
|
| 209 |
setData(result);
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
if (insightsRes.ok) setInsights(await insightsRes.json());
|
| 212 |
} catch (e: any) {
|
|
|
|
| 213 |
setError(e.message || "Error uploading file");
|
| 214 |
} finally {
|
| 215 |
setLoading(false);
|
|
|
|
| 216 |
}
|
| 217 |
}, []);
|
| 218 |
|
|
@@ -220,19 +258,25 @@ export default function Dashboard() {
|
|
| 220 |
if (!textInput.trim()) return;
|
| 221 |
setLoading(true);
|
| 222 |
setError("");
|
|
|
|
| 223 |
try {
|
| 224 |
const res = await fetch(`${API_BASE}/api/analyze`, {
|
| 225 |
method: "POST",
|
| 226 |
-
headers: {
|
| 227 |
body: JSON.stringify({ text: textInput }),
|
| 228 |
});
|
|
|
|
| 229 |
if (!res.ok) throw new Error("Analysis failed");
|
| 230 |
const result: AnalysisResult = await res.json();
|
|
|
|
|
|
|
| 231 |
setData(result);
|
| 232 |
} catch (e: any) {
|
|
|
|
| 233 |
setError(e.message);
|
| 234 |
} finally {
|
| 235 |
setLoading(false);
|
|
|
|
| 236 |
}
|
| 237 |
}, [textInput]);
|
| 238 |
|
|
@@ -276,6 +320,30 @@ export default function Dashboard() {
|
|
| 276 |
|
| 277 |
return (
|
| 278 |
<div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
{/* Annotation editor modal */}
|
| 280 |
{editingDoc && (
|
| 281 |
<AnnotationEditor
|
|
@@ -287,6 +355,7 @@ export default function Dashboard() {
|
|
| 287 |
|
| 288 |
{/* Upload Section */}
|
| 289 |
{!data && !loading && (
|
|
|
|
| 290 |
<section style={{ marginBottom: "2rem" }}>
|
| 291 |
<div
|
| 292 |
className={`upload-area ${dragging ? "dragging" : ""}`}
|
|
@@ -299,9 +368,10 @@ export default function Dashboard() {
|
|
| 299 |
<p className="upload-text">
|
| 300 |
<strong>CSV файл чирж оруулах</strong> эсвэл дарж сонгох
|
| 301 |
</p>
|
| 302 |
-
<
|
| 303 |
-
|
| 304 |
-
|
|
|
|
| 305 |
<input
|
| 306 |
ref={fileInputRef}
|
| 307 |
type="file"
|
|
|
|
| 94 |
);
|
| 95 |
}
|
| 96 |
|
| 97 |
+
// Standard headers needed for all API calls when going through Ngrok
|
| 98 |
+
const NGROK_HEADERS = { "ngrok-skip-browser-warning": "true" };
|
| 99 |
+
|
| 100 |
export default function Dashboard() {
|
| 101 |
const [data, setData] = useState<AnalysisResult | null>(null);
|
| 102 |
const [insights, setInsights] = useState<InsightItem[]>([]);
|
|
|
|
| 120 |
// Annotation editor
|
| 121 |
const [editingDoc, setEditingDoc] = useState<DocForEditor | null>(null);
|
| 122 |
|
| 123 |
+
// Backend health check
|
| 124 |
+
const [backendOk, setBackendOk] = useState<boolean | null>(null); // null = checking
|
| 125 |
+
|
| 126 |
+
// Health check on mount — tells you immediately if backend is reachable
|
| 127 |
+
useEffect(() => {
|
| 128 |
+
const check = async () => {
|
| 129 |
+
console.group("[NLP] Backend health check");
|
| 130 |
+
try {
|
| 131 |
+
const res = await fetch(`${API_BASE}/api/health`, { headers: NGROK_HEADERS });
|
| 132 |
+
const ok = res.ok;
|
| 133 |
+
setBackendOk(ok);
|
| 134 |
+
console.log(ok ? "✅ Backend reachable" : `❌ Backend returned ${res.status}`);
|
| 135 |
+
} catch (e) {
|
| 136 |
+
setBackendOk(false);
|
| 137 |
+
console.error("❌ Backend unreachable:", e);
|
| 138 |
+
}
|
| 139 |
+
console.groupEnd();
|
| 140 |
+
};
|
| 141 |
+
check();
|
| 142 |
+
}, []);
|
| 143 |
+
|
| 144 |
const loadHistory = useCallback(async () => {
|
| 145 |
setHistoryLoading(true);
|
| 146 |
+
console.group("[NLP] Load history");
|
| 147 |
try {
|
| 148 |
+
const res = await fetch(`${API_BASE}/api/history?limit=50`, { headers: NGROK_HEADERS });
|
| 149 |
+
console.log(`→ GET /api/history status=${res.status}`);
|
| 150 |
if (res.ok) setHistory(await res.json());
|
| 151 |
+
} catch (e) { console.error(e); }
|
| 152 |
+
finally { setHistoryLoading(false); console.groupEnd(); }
|
|
|
|
| 153 |
}, []);
|
| 154 |
|
| 155 |
useEffect(() => {
|
|
|
|
| 219 |
const uploadCSV = useCallback(async (file: File) => {
|
| 220 |
setLoading(true);
|
| 221 |
setError("");
|
| 222 |
+
console.group(`[NLP] CSV Upload — ${file.name} (${(file.size/1024).toFixed(1)} KB)`);
|
| 223 |
try {
|
| 224 |
const formData = new FormData();
|
| 225 |
formData.append("file", file);
|
| 226 |
+
// ⚠️ IMPORTANT: ngrok-skip-browser-warning header MUST be included here.
|
| 227 |
+
// Without it, Ngrok returns an HTML warning page instead of forwarding
|
| 228 |
+
// the request to FastAPI → FastAPI tries to parse HTML as CSV → 500 error.
|
| 229 |
const res = await fetch(`${API_BASE}/api/upload?run_ner=true&run_sentiment=true&run_topics=true`, {
|
| 230 |
method: "POST",
|
| 231 |
+
headers: NGROK_HEADERS, // ← THE FIX
|
| 232 |
body: formData,
|
| 233 |
});
|
| 234 |
+
console.log(`→ POST /api/upload status=${res.status}`);
|
| 235 |
if (!res.ok) {
|
| 236 |
+
const err = await res.json().catch(() => ({ detail: `HTTP ${res.status}` }));
|
| 237 |
throw new Error(err.detail || "Upload failed");
|
| 238 |
}
|
| 239 |
const result: AnalysisResult = await res.json();
|
| 240 |
+
console.log(`← ${result.total_documents} documents, topics=${result.topic_summary?.length}`);
|
| 241 |
setData(result);
|
| 242 |
+
setActiveTab("overview"); // Auto-switch to results
|
| 243 |
+
|
| 244 |
+
// Immediately fetch insights after upload
|
| 245 |
+
const insightsRes = await fetch(`${API_BASE}/api/insights`, { headers: NGROK_HEADERS, method: "POST" });
|
| 246 |
+
console.log(`→ POST /api/insights status=${insightsRes.status}`);
|
| 247 |
if (insightsRes.ok) setInsights(await insightsRes.json());
|
| 248 |
} catch (e: any) {
|
| 249 |
+
console.error("Upload error:", e);
|
| 250 |
setError(e.message || "Error uploading file");
|
| 251 |
} finally {
|
| 252 |
setLoading(false);
|
| 253 |
+
console.groupEnd();
|
| 254 |
}
|
| 255 |
}, []);
|
| 256 |
|
|
|
|
| 258 |
if (!textInput.trim()) return;
|
| 259 |
setLoading(true);
|
| 260 |
setError("");
|
| 261 |
+
console.group(`[NLP] Analyze text (${textInput.length} chars)`);
|
| 262 |
try {
|
| 263 |
const res = await fetch(`${API_BASE}/api/analyze`, {
|
| 264 |
method: "POST",
|
| 265 |
+
headers: { ...NGROK_HEADERS, "Content-Type": "application/json" },
|
| 266 |
body: JSON.stringify({ text: textInput }),
|
| 267 |
});
|
| 268 |
+
console.log(`→ POST /api/analyze status=${res.status}`);
|
| 269 |
if (!res.ok) throw new Error("Analysis failed");
|
| 270 |
const result: AnalysisResult = await res.json();
|
| 271 |
+
console.log(`← entities:`, result.documents[0]?.entities?.length ?? 0,
|
| 272 |
+
`sentiment:`, result.documents[0]?.sentiment?.label);
|
| 273 |
setData(result);
|
| 274 |
} catch (e: any) {
|
| 275 |
+
console.error(e);
|
| 276 |
setError(e.message);
|
| 277 |
} finally {
|
| 278 |
setLoading(false);
|
| 279 |
+
console.groupEnd();
|
| 280 |
}
|
| 281 |
}, [textInput]);
|
| 282 |
|
|
|
|
| 320 |
|
| 321 |
return (
|
| 322 |
<div>
|
| 323 |
+
{/* Backend status banner */}
|
| 324 |
+
{backendOk === false && (
|
| 325 |
+
<div style={{
|
| 326 |
+
background: "rgba(255,80,80,0.15)", border: "1px solid var(--negative)",
|
| 327 |
+
borderRadius: "0.5rem", padding: "0.6rem 1rem", marginBottom: "1rem",
|
| 328 |
+
display: "flex", alignItems: "center", gap: "0.5rem", fontSize: "0.85rem",
|
| 329 |
+
}}>
|
| 330 |
+
<span>🔴</span>
|
| 331 |
+
<span style={{ color: "var(--negative)", fontWeight: 600 }}>Backend холболт алдаатай.</span>
|
| 332 |
+
<span style={{ color: "var(--text-muted)" }}>
|
| 333 |
+
Colab дээрх сервер ажиллаж байгаа эсэхийг шалгаад, Ngrok URL зөв эсэхийг .env.local файлд шинэчилнэ үү.
|
| 334 |
+
</span>
|
| 335 |
+
</div>
|
| 336 |
+
)}
|
| 337 |
+
{backendOk === null && (
|
| 338 |
+
<div style={{
|
| 339 |
+
background: "rgba(100,100,200,0.1)", border: "1px solid rgba(100,100,255,0.3)",
|
| 340 |
+
borderRadius: "0.5rem", padding: "0.4rem 1rem", marginBottom: "0.75rem",
|
| 341 |
+
fontSize: "0.8rem", color: "var(--text-muted)",
|
| 342 |
+
}}>
|
| 343 |
+
⏳ Backend холболт шалгаж байна...
|
| 344 |
+
</div>
|
| 345 |
+
)}
|
| 346 |
+
|
| 347 |
{/* Annotation editor modal */}
|
| 348 |
{editingDoc && (
|
| 349 |
<AnnotationEditor
|
|
|
|
| 355 |
|
| 356 |
{/* Upload Section */}
|
| 357 |
{!data && !loading && (
|
| 358 |
+
|
| 359 |
<section style={{ marginBottom: "2rem" }}>
|
| 360 |
<div
|
| 361 |
className={`upload-area ${dragging ? "dragging" : ""}`}
|
|
|
|
| 368 |
<p className="upload-text">
|
| 369 |
<strong>CSV файл чирж оруулах</strong> эсвэл дарж сонгох
|
| 370 |
</p>
|
| 371 |
+
<div style={{ fontSize: "0.75rem", color: "var(--text-muted)", marginTop: "0.5rem" }}>
|
| 372 |
+
<p>⚠️ <strong>Санамж:</strong> Шинжлэх өгөгдөл тань заавал <code>text</code> эсвэл <code>Text</code> гэсэн нэртэй баганад байх ёстой.</p>
|
| 373 |
+
<p>Хэрэв таны багана <code>Текст</code>, <code>Мессеж</code> гэх мэт Монгол нэртэй бол файлаа оруулахаас өмнө нэрийг нь <code>text</code> болгож өөрчилнө үү.</p>
|
| 374 |
+
</div>
|
| 375 |
<input
|
| 376 |
ref={fileInputRef}
|
| 377 |
type="file"
|
nlp_core/ner_engine.py
CHANGED
|
@@ -17,12 +17,18 @@ class NEREngine:
|
|
| 17 |
def _load_pipeline(self):
|
| 18 |
"""Lazy-load the NER pipeline (heavy model, load only when needed)."""
|
| 19 |
if self._pipeline is None:
|
|
|
|
| 20 |
from transformers import pipeline
|
|
|
|
| 21 |
self._pipeline = pipeline(
|
| 22 |
"ner",
|
| 23 |
model=self.model_name,
|
| 24 |
aggregation_strategy="simple",
|
|
|
|
|
|
|
|
|
|
| 25 |
)
|
|
|
|
| 26 |
return self._pipeline
|
| 27 |
|
| 28 |
def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
|
|
|
|
| 17 |
def _load_pipeline(self):
    """Create and cache the HuggingFace NER pipeline on first use.

    The model is heavy, so loading is deferred until actually needed.
    Runs on GPU (device 0) when CUDA is available, otherwise on CPU.
    Inputs are truncated to 512 tokens to stay within the model limit.
    """
    if self._pipeline is not None:
        return self._pipeline

    # Heavy imports kept local so importing this module stays cheap.
    import torch
    from transformers import pipeline

    device = 0 if torch.cuda.is_available() else -1
    self._pipeline = pipeline(
        "ner",
        model=self.model_name,
        aggregation_strategy="simple",
        truncation=True,
        max_length=512,
        device=device,
    )
    print(f"[NEREngine] Loaded on {'GPU' if device == 0 else 'CPU'}")
    return self._pipeline
|
| 33 |
|
| 34 |
def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
|