Spaces:
Running
Running
Commit Β·
622b002
1
Parent(s): 7ddb619
feat: add /embed endpoint v1.4.0
Browse files- app.py +36 -4
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -4,6 +4,7 @@ from transformers import (
|
|
| 4 |
pipeline,
|
| 5 |
AutoTokenizer,
|
| 6 |
)
|
|
|
|
| 7 |
import torch
|
| 8 |
import re
|
| 9 |
import hashlib
|
|
@@ -19,7 +20,7 @@ logger = logging.getLogger(__name__)
|
|
| 19 |
app = FastAPI(
|
| 20 |
title="CivicPulse NLP API",
|
| 21 |
description="NLP microservice for Civic Pulse Engine — Municipality of Pulilan, Bulacan",
|
| 22 |
-
version="1.
|
| 23 |
)
|
| 24 |
|
| 25 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -61,6 +62,11 @@ logger.info("Loading spaCy NER model: en_core_web_sm...")
|
|
| 61 |
nlp_spacy = spacy.load("en_core_web_sm")
|
| 62 |
logger.info("spaCy NER model loaded.")
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 65 |
# CONSTANTS
|
| 66 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -188,6 +194,13 @@ class PreprocessResponse(BaseModel):
|
|
| 188 |
language: str
|
| 189 |
is_spam: bool
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
class SentimentRequest(BaseModel):
|
| 192 |
text: str
|
| 193 |
comment_id: str | None = None
|
|
@@ -242,9 +255,9 @@ class TopicResponse(BaseModel):
|
|
| 242 |
def root():
|
| 243 |
return {
|
| 244 |
"service": "CivicPulse NLP API",
|
| 245 |
-
"version": "1.
|
| 246 |
"status": "running",
|
| 247 |
-
"endpoints": ["/preprocess", "/sentiment", "/claim-detection", "/topic-classification", "/health"],
|
| 248 |
}
|
| 249 |
|
| 250 |
@app.get("/health")
|
|
@@ -252,7 +265,7 @@ def health():
|
|
| 252 |
"""Keep-alive endpoint. GitHub Actions pings this every 25 min."""
|
| 253 |
return {
|
| 254 |
"status": "ok",
|
| 255 |
-
"models_loaded": ["sentiment", "claim-tokenizer", "topic-classification", "spacy-ner"],
|
| 256 |
"claim_detection_mode": CLAIM_DETECTION_MODE,
|
| 257 |
}
|
| 258 |
|
|
@@ -290,6 +303,25 @@ def preprocess(request: PreprocessRequest):
|
|
| 290 |
is_spam=spam,
|
| 291 |
)
|
| 292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
# ββ Sentiment ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 294 |
@app.post("/sentiment", response_model=SentimentResponse)
|
| 295 |
def analyze_sentiment(request: SentimentRequest):
|
|
|
|
| 4 |
pipeline,
|
| 5 |
AutoTokenizer,
|
| 6 |
)
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
import torch
|
| 9 |
import re
|
| 10 |
import hashlib
|
|
|
|
| 20 |
app = FastAPI(
|
| 21 |
title="CivicPulse NLP API",
|
| 22 |
description="NLP microservice for Civic Pulse Engine — Municipality of Pulilan, Bulacan",
|
| 23 |
+
version="1.4.0",
|
| 24 |
)
|
| 25 |
|
| 26 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 62 |
nlp_spacy = spacy.load("en_core_web_sm")
|
| 63 |
logger.info("spaCy NER model loaded.")
|
| 64 |
|
| 65 |
+
# ── 5. Embedding Model (for RAG pipeline) ──────────────────────────────────
|
| 66 |
+
logger.info("Loading embedding model: meedan/paraphrase-filipino-mpnet-base-v2...")
|
| 67 |
+
embedding_model = SentenceTransformer("meedan/paraphrase-filipino-mpnet-base-v2")
|
| 68 |
+
logger.info("Embedding model loaded.")
|
| 69 |
+
|
| 70 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 71 |
# CONSTANTS
|
| 72 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 194 |
language: str
|
| 195 |
is_spam: bool
|
| 196 |
|
class EmbedRequest(BaseModel):
    """Request payload for the /embed endpoint."""

    # Raw input text; the endpoint handler rejects empty/whitespace-only values.
    text: str


class EmbedResponse(BaseModel):
    """Response payload for the /embed endpoint."""

    # Dense vector produced by the sentence-transformer embedding model.
    embedding: list[float]
    # Length of `embedding` (the model's output dimensionality).
    dimensions: int
| 204 |
class SentimentRequest(BaseModel):
|
| 205 |
text: str
|
| 206 |
comment_id: str | None = None
|
|
|
|
def root():
    """Service metadata endpoint: name, version, status, and available routes."""
    return {
        "service": "CivicPulse NLP API",
        "version": "1.4.0",
        "status": "running",
        "endpoints": ["/preprocess", "/embed", "/sentiment", "/claim-detection", "/topic-classification", "/health"],
    }
|
| 262 |
|
| 263 |
@app.get("/health")
|
|
|
|
| 265 |
"""Keep-alive endpoint. GitHub Actions pings this every 25 min."""
|
| 266 |
return {
|
| 267 |
"status": "ok",
|
| 268 |
+
"models_loaded": ["sentiment", "claim-tokenizer", "topic-classification", "spacy-ner", "embedding"],
|
| 269 |
"claim_detection_mode": CLAIM_DETECTION_MODE,
|
| 270 |
}
|
| 271 |
|
|
|
|
| 303 |
is_spam=spam,
|
| 304 |
)
|
| 305 |
|
# ── Embed ──────────────────────────────────────────────────────────────────
@app.post("/embed", response_model=EmbedResponse)
def embed(request: EmbedRequest):
    """
    Generate a 768-dimension dense vector embedding for a text string.

    Used for: (1) embedding lgu_documents into pgvector, and
              (2) embedding flagged claims for cosine similarity search.
    Model: meedan/paraphrase-filipino-mpnet-base-v2

    Raises:
        HTTPException: 422 when `text` is empty or whitespace-only;
            500 when the embedding model fails for any reason.
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=422, detail="text field cannot be empty.")
    try:
        # normalize_embeddings=True yields unit-length vectors, so downstream
        # cosine similarity reduces to a plain dot product.
        vector = embedding_model.encode(text, normalize_embeddings=True).tolist()
        return EmbedResponse(embedding=vector, dimensions=len(vector))
    except Exception as e:
        # logger.exception records the full traceback (logger.error did not),
        # and `from e` chains the original cause instead of discarding it.
        logger.exception("Embedding error: %s", e)
        raise HTTPException(status_code=500, detail=f"Embedding error: {e}") from e
| 325 |
# ── Sentiment ──────────────────────────────────────────────────────────────
|
| 326 |
@app.post("/sentiment", response_model=SentimentResponse)
|
| 327 |
def analyze_sentiment(request: SentimentRequest):
|
requirements.txt
CHANGED
|
@@ -6,4 +6,5 @@ pydantic==2.7.0
|
|
| 6 |
sentencepiece==0.2.0
|
| 7 |
scipy==1.13.0
|
| 8 |
spacy==3.8.11
|
| 9 |
-
langdetect==1.0.9
|
|
|
|
|
|
| 6 |
sentencepiece==0.2.0
|
| 7 |
scipy==1.13.0
|
| 8 |
spacy==3.8.11
|
| 9 |
+
langdetect==1.0.9
|
| 10 |
+
sentence-transformers==3.0.1
|