Spaces:
Sleeping
Sleeping
added hf configs
Browse files- Dockerfile +37 -0
- nginx.conf +28 -0
- nlp_core/ner_engine.py +43 -3
- requirements.txt +5 -1
- start.sh +29 -0
Dockerfile
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Stage 1: build the Next.js frontend ---
FROM node:20-slim AS frontend-builder

WORKDIR /app/frontend
COPY frontend/package*.json ./
RUN npm ci
COPY frontend/ ./
# NOTE(review): NEXT_PUBLIC_* values are baked into the client bundle at
# build time; a remote browser cannot reach localhost:8000 — confirm the
# frontend actually routes API calls through the nginx /api/ proxy.
ENV NEXT_PUBLIC_API_URL=http://localhost:8000
RUN npm run build

# --- Stage 2: runtime image (FastAPI + Next.js + nginx in one container) ---
FROM python:3.11-slim

# gcc is required to compile hdbscan from source
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc g++ nodejs npm nginx curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python deps first so code-only changes reuse this cached layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY nlp_core/ ./nlp_core/
COPY adapters/ ./adapters/

# Bring the pre-built frontend artifacts over from the builder stage.
COPY --from=frontend-builder /app/frontend/.next ./frontend/.next
COPY --from=frontend-builder /app/frontend/public ./frontend/public
COPY --from=frontend-builder /app/frontend/package*.json ./frontend/
COPY --from=frontend-builder /app/frontend/node_modules ./frontend/node_modules

COPY nginx.conf /etc/nginx/sites-available/default

# Hugging Face Spaces expects the app on port 7860 (served by nginx).
EXPOSE 7860

COPY start.sh .
RUN chmod +x start.sh

CMD ["./start.sh"]
|
nginx.conf
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Reverse proxy fronting both app servers on the single public port (7860)
# that Hugging Face Spaces exposes.
server {
    listen 7860;
    server_name _;

    # Allow large document uploads and long-running NLP requests.
    client_max_body_size 100M;
    proxy_read_timeout 300s;
    proxy_send_timeout 300s;

    # FastAPI backend.
    location /api/ {
        proxy_pass http://localhost:8000/api/;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        # Empty Connection header enables keep-alive to the upstream.
        proxy_set_header Connection "";
    }

    # Swagger UI served by FastAPI.
    location /docs {
        proxy_pass http://localhost:8000/docs;
    }

    # Everything else goes to the Next.js frontend; the Upgrade/Connection
    # headers keep websocket connections working.
    location / {
        proxy_pass http://localhost:3000;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
    }
}
|
nlp_core/ner_engine.py
CHANGED
|
@@ -58,6 +58,46 @@ class NEREngine:
|
|
| 58 |
))
|
| 59 |
return results
|
| 60 |
|
| 61 |
-
def recognize_batch(self, texts: List[str]) -> List[List[EntityResult]]:
|
| 62 |
-
"""Run NER on a batch of texts."""
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
))
|
| 59 |
return results
|
| 60 |
|
| 61 |
+
def recognize_batch(self, texts: List[str], batch_size: int = 16) -> List[List[EntityResult]]:
    """Run NER on a batch of texts using Hugging Face pipeline batching.

    Args:
        texts: Input documents; blank/empty entries produce empty results.
        batch_size: Number of texts per forward pass through the pipeline.

    Returns:
        One list of EntityResult per input text, index-aligned with ``texts``.
    """
    if not texts:
        return []

    # Blank strings can crash HF pipelines, so only forward non-empty
    # texts and remember their original positions.
    valid_texts = []
    valid_indices = []
    for i, text in enumerate(texts):
        if text and text.strip():
            valid_texts.append(text)
            valid_indices.append(i)

    # Preallocate an empty slot for every input so the output stays
    # index-aligned with `texts` (skipped texts keep []).
    out: List[List[EntityResult]] = [[] for _ in texts]

    if not valid_texts:
        return out

    pipe = self._load_pipeline()
    try:
        # Send the whole batch to the pipeline in one call.
        raw_results = pipe(valid_texts, batch_size=batch_size)

        for idx, raw in zip(valid_indices, raw_results):
            out[idx] = [
                EntityResult(
                    word=ent.get("word", ""),
                    entity_group=ent.get("entity_group", "MISC"),
                    score=float(ent.get("score", 0.0)),
                    start=int(ent.get("start", 0)),
                    end=int(ent.get("end", 0)),
                )
                for ent in self._clean_entities(raw)
            ]
    except Exception as e:  # best-effort: a batch failure degrades to per-text
        print(f"[NEREngine] Batch processing error: {e}")
        # Fallback to single-text processing if the batched call fails.
        # Fix: guard each fallback call individually — previously one
        # pathological text re-raised and lost results for the whole batch.
        for idx, text in zip(valid_indices, valid_texts):
            try:
                out[idx] = self.recognize(text)
            except Exception as single_err:
                print(f"[NEREngine] Fallback error at index {idx}: {single_err}")
                out[idx] = []

    return out
|
requirements.txt
CHANGED
|
@@ -7,9 +7,10 @@ python-multipart>=0.0.6
|
|
| 7 |
|
| 8 |
# NLP Core
|
| 9 |
transformers>=4.35.0
|
| 10 |
-
torch
|
| 11 |
sentence-transformers>=2.2.0
|
| 12 |
bertopic>=0.16.0
|
|
|
|
| 13 |
networkx>=3.0
|
| 14 |
|
| 15 |
# Utilities
|
|
@@ -17,3 +18,6 @@ pandas>=2.0.0
|
|
| 17 |
tiktoken>=0.5.0
|
| 18 |
sentencepiece>=0.1.99
|
| 19 |
protobuf>=3.20.0
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# NLP Core
|
| 9 |
transformers>=4.35.0
|
| 10 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.2.0+cpu
|
| 11 |
sentence-transformers>=2.2.0
|
| 12 |
bertopic>=0.16.0
|
| 13 |
+
hdbscan
|
| 14 |
networkx>=3.0
|
| 15 |
|
| 16 |
# Utilities
|
|
|
|
| 18 |
tiktoken>=0.5.0
|
| 19 |
sentencepiece>=0.1.99
|
| 20 |
protobuf>=3.20.0
|
| 21 |
+
scikit-learn
|
| 22 |
+
|
| 23 |
+
|
start.sh
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Launch FastAPI (8000), Next.js (3000) and nginx (7860, public) in one
# container; exit as soon as any one of them dies.
set -e

# Fix: kill all background children when this script exits — including on
# SIGTERM from the container runtime — instead of orphaning the servers,
# and return a failure status so the platform restarts the Space.
cleanup() {
    kill $(jobs -p) 2>/dev/null || true
}
trap cleanup EXIT INT TERM

echo "=== Starting NLP Intelligence ==="

cd /app
PYTHONPATH=/app uvicorn adapters.api.main:app \
    --host 0.0.0.0 \
    --port 8000 \
    --workers 1 \
    --timeout-keep-alive 120 &

FASTAPI_PID=$!
echo "FastAPI started (PID $FASTAPI_PID)"

cd /app/frontend
npm run start -- --port 3000 &

NEXTJS_PID=$!
echo "Next.js started (PID $NEXTJS_PID)"

# Give the app servers a moment to bind before nginx starts proxying.
sleep 5
nginx -g "daemon off;" &

NGINX_PID=$!
echo "nginx started — app on port 7860"

# Block until the FIRST of the three processes exits (`wait -n` with PID
# arguments requires bash >= 5.1), then tear everything down via the trap.
wait -n $FASTAPI_PID $NEXTJS_PID $NGINX_PID
echo "A process exited — shutting down"
exit 1
|