Nomio4640 committed on
Commit
4c114c1
·
1 Parent(s): 59f9987

added hf configs

Browse files
Files changed (5) hide show
  1. Dockerfile +37 -0
  2. nginx.conf +28 -0
  3. nlp_core/ner_engine.py +43 -3
  4. requirements.txt +5 -1
  5. start.sh +29 -0
Dockerfile ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ---- Stage 1: build the Next.js frontend ----
FROM node:20-slim AS frontend-builder

WORKDIR /app/frontend
# Copy manifests first so the npm install layer is cached across code changes.
COPY frontend/package*.json ./
RUN npm ci
COPY frontend/ ./
# NOTE(review): NEXT_PUBLIC_* vars are baked into the bundle at build time.
# This points the browser at localhost:8000 directly, bypassing the nginx
# proxy on 7860 — confirm the frontend uses relative /api URLs when deployed,
# otherwise API calls will fail off-box.
ENV NEXT_PUBLIC_API_URL=http://localhost:8000
RUN npm run build

# ---- Stage 2: runtime image (Python backend + Node runtime + nginx) ----
FROM python:3.11-slim

# gcc is required to compile hdbscan from source
# nodejs/npm are needed at runtime to serve the Next.js app; nginx fronts
# both services on a single port (7860, the Hugging Face Spaces convention).
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc g++ nodejs npm nginx curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python deps before copying source so this layer caches well.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY nlp_core/ ./nlp_core/
COPY adapters/ ./adapters/

# Bring over the built frontend plus its runtime dependencies.
COPY --from=frontend-builder /app/frontend/.next ./frontend/.next
COPY --from=frontend-builder /app/frontend/public ./frontend/public
COPY --from=frontend-builder /app/frontend/package*.json ./frontend/
COPY --from=frontend-builder /app/frontend/node_modules ./frontend/node_modules

COPY nginx.conf /etc/nginx/sites-available/default

# Single public port, multiplexed by nginx (see nginx.conf).
EXPOSE 7860

COPY start.sh .
RUN chmod +x start.sh

CMD ["./start.sh"]
nginx.conf ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single entry point on 7860: /api and API docs go to FastAPI (:8000),
# everything else goes to the Next.js frontend (:3000).
server {
    listen 7860;
    server_name _;

    # Allow large uploads and slow model inference round-trips.
    client_max_body_size 100M;
    proxy_read_timeout 300s;
    proxy_send_timeout 300s;

    # FastAPI backend.
    location /api/ {
        proxy_pass http://localhost:8000/api/;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        # Empty Connection header enables upstream keep-alive.
        proxy_set_header Connection "";
    }

    # Swagger UI page.
    location /docs {
        proxy_pass http://localhost:8000/docs;
    }

    # Fix: Swagger UI at /docs fetches the schema from /openapi.json.
    # Without this route the request fell through to `location /` and hit
    # the Next.js frontend, breaking the docs page.
    location /openapi.json {
        proxy_pass http://localhost:8000/openapi.json;
    }

    # Next.js frontend; Upgrade/Connection headers support websockets.
    location / {
        proxy_pass http://localhost:3000;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
    }
}
nlp_core/ner_engine.py CHANGED
@@ -58,6 +58,46 @@ class NEREngine:
58
  ))
59
  return results
60
 
61
- def recognize_batch(self, texts: List[str]) -> List[List[EntityResult]]:
62
- """Run NER on a batch of texts."""
63
- return [self.recognize(t) for t in texts]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  ))
59
  return results
60
 
61
+ def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
62
+ """Run NER on a batch of texts utilizing Hugging Face pipeline batching."""
63
+ if not texts:
64
+ return []
65
+
66
+ # Filter empty texts to avoid pipeline errors
67
+ valid_texts = []
68
+ valid_indices = []
69
+ for i, text in enumerate(texts):
70
+ if text and text.strip():
71
+ valid_texts.append(text)
72
+ valid_indices.append(i)
73
+
74
+ # Preallocate empty results for all texts
75
+ out: List[List[EntityResult]] = [[] for _ in texts]
76
+
77
+ if not valid_texts:
78
+ return out
79
+
80
+ pipe = self._load_pipeline()
81
+ try:
82
+ # Send batch directly to pipeline
83
+ raw_results = pipe(valid_texts, batch_size=batch_size)
84
+
85
+ for idx, raw in zip(valid_indices, raw_results):
86
+ cleaned = self._clean_entities(raw)
87
+ entity_results = []
88
+ for ent in cleaned:
89
+ entity_results.append(EntityResult(
90
+ word=ent.get("word", ""),
91
+ entity_group=ent.get("entity_group", "MISC"),
92
+ score=float(ent.get("score", 0.0)),
93
+ start=int(ent.get("start", 0)),
94
+ end=int(ent.get("end", 0)),
95
+ ))
96
+ out[idx] = entity_results
97
+ except Exception as e:
98
+ print(f"[NEREngine] Batch processing error: {e}")
99
+ # Fallback to single text processing if pipeline batch fails
100
+ for idx, text in zip(valid_indices, valid_texts):
101
+ out[idx] = self.recognize(text)
102
+
103
+ return out
requirements.txt CHANGED
@@ -7,9 +7,10 @@ python-multipart>=0.0.6
7
 
8
  # NLP Core
9
  transformers>=4.35.0
10
- torch>=2.0.0
11
  sentence-transformers>=2.2.0
12
  bertopic>=0.16.0
 
13
  networkx>=3.0
14
 
15
  # Utilities
@@ -17,3 +18,6 @@ pandas>=2.0.0
17
  tiktoken>=0.5.0
18
  sentencepiece>=0.1.99
19
  protobuf>=3.20.0
 
 
 
 
7
 
8
  # NLP Core
9
  transformers>=4.35.0
10
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch==2.2.0+cpu
11
  sentence-transformers>=2.2.0
12
  bertopic>=0.16.0
13
+ hdbscan
14
  networkx>=3.0
15
 
16
  # Utilities
 
18
  tiktoken>=0.5.0
19
  sentencepiece>=0.1.99
20
  protobuf>=3.20.0
21
+ scikit-learn
22
+
23
+
start.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch FastAPI (:8000), Next.js (:3000) and nginx (:7860) in one container,
# shutting everything down together if any one of them dies.
set -e

echo "=== Starting NLP Intelligence ==="

cd /app
PYTHONPATH=/app uvicorn adapters.api.main:app \
    --host 0.0.0.0 \
    --port 8000 \
    --workers 1 \
    --timeout-keep-alive 120 &
FASTAPI_PID=$!
echo "FastAPI started (PID $FASTAPI_PID)"

cd /app/frontend
npm run start -- --port 3000 &
NEXTJS_PID=$!
echo "Next.js started (PID $NEXTJS_PID)"

# Give the upstreams a moment to bind before nginx starts proxying.
sleep 5
nginx -g "daemon off;" &
NGINX_PID=$!
echo "nginx started — app on port 7860"

# Stop every child; `|| true` keeps cleanup going for already-dead PIDs.
shutdown() {
    kill "$FASTAPI_PID" "$NEXTJS_PID" "$NGINX_PID" 2>/dev/null || true
    wait
}

# Fix: this script runs as PID 1, and bash does not forward signals to
# background jobs — without a trap, `docker stop` hangs until SIGKILL.
trap 'shutdown; exit 0' TERM INT

# Block until the FIRST child exits (wait -n with PID args needs bash >= 5.1;
# the python:3.11-slim base ships bash 5.2), then take the rest down and
# exit nonzero so the supervisor/restart policy sees the failure.
wait -n "$FASTAPI_PID" "$NEXTJS_PID" "$NGINX_PID"
echo "A process exited — shutting down"
shutdown
exit 1