rayubaldo44 committed on
Commit
622b002
·
1 Parent(s): 7ddb619

feat: add /embed endpoint v1.4.0

Browse files
Files changed (2) hide show
  1. app.py +36 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,6 +4,7 @@ from transformers import (
4
  pipeline,
5
  AutoTokenizer,
6
  )
 
7
  import torch
8
  import re
9
  import hashlib
@@ -19,7 +20,7 @@ logger = logging.getLogger(__name__)
19
  app = FastAPI(
20
  title="CivicPulse NLP API",
21
  description="NLP microservice for Civic Pulse Engine — Municipality of Pulilan, Bulacan",
22
- version="1.3.0",
23
  )
24
 
25
  # ═══════════════════════════════════════════════════════════════════════════
@@ -61,6 +62,11 @@ logger.info("Loading spaCy NER model: en_core_web_sm...")
61
  nlp_spacy = spacy.load("en_core_web_sm")
62
  logger.info("spaCy NER model loaded.")
63
 
 
 
 
 
 
64
  # ═══════════════════════════════════════════════════════════════════════════
65
  # CONSTANTS
66
  # ═══════════════════════════════════════════════════════════════════════════
@@ -188,6 +194,13 @@ class PreprocessResponse(BaseModel):
188
  language: str
189
  is_spam: bool
190
 
 
 
 
 
 
 
 
191
  class SentimentRequest(BaseModel):
192
  text: str
193
  comment_id: str | None = None
@@ -242,9 +255,9 @@ class TopicResponse(BaseModel):
242
  def root():
243
  return {
244
  "service": "CivicPulse NLP API",
245
- "version": "1.3.0",
246
  "status": "running",
247
- "endpoints": ["/preprocess", "/sentiment", "/claim-detection", "/topic-classification", "/health"],
248
  }
249
 
250
  @app.get("/health")
@@ -252,7 +265,7 @@ def health():
252
  """Keep-alive endpoint. GitHub Actions pings this every 25 min."""
253
  return {
254
  "status": "ok",
255
- "models_loaded": ["sentiment", "claim-tokenizer", "topic-classification", "spacy-ner"],
256
  "claim_detection_mode": CLAIM_DETECTION_MODE,
257
  }
258
 
@@ -290,6 +303,25 @@ def preprocess(request: PreprocessRequest):
290
  is_spam=spam,
291
  )
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  # ── Sentiment ──────────────────────────────────────────────────────────────
294
  @app.post("/sentiment", response_model=SentimentResponse)
295
  def analyze_sentiment(request: SentimentRequest):
 
4
  pipeline,
5
  AutoTokenizer,
6
  )
7
+ from sentence_transformers import SentenceTransformer
8
  import torch
9
  import re
10
  import hashlib
 
20
  app = FastAPI(
21
  title="CivicPulse NLP API",
22
  description="NLP microservice for Civic Pulse Engine — Municipality of Pulilan, Bulacan",
23
+ version="1.4.0",
24
  )
25
 
26
  # ═══════════════════════════════════════════════════════════════════════════
 
62
  nlp_spacy = spacy.load("en_core_web_sm")
63
  logger.info("spaCy NER model loaded.")
64
 
65
+ # ── 5. Embedding Model (for RAG pipeline) ──────────────────────────────────
66
+ logger.info("Loading embedding model: meedan/paraphrase-filipino-mpnet-base-v2...")
67
+ embedding_model = SentenceTransformer("meedan/paraphrase-filipino-mpnet-base-v2")
68
+ logger.info("Embedding model loaded.")
69
+
70
  # ═══════════════════════════════════════════════════════════════════════════
71
  # CONSTANTS
72
  # ═══════════════════════════════════════════════════════════════════════════
 
194
  language: str
195
  is_spam: bool
196
 
197
+ class EmbedRequest(BaseModel):
198
+ text: str
199
+
200
+ class EmbedResponse(BaseModel):
201
+ embedding: list[float]
202
+ dimensions: int
203
+
204
  class SentimentRequest(BaseModel):
205
  text: str
206
  comment_id: str | None = None
 
255
  def root():
256
  return {
257
  "service": "CivicPulse NLP API",
258
+ "version": "1.4.0",
259
  "status": "running",
260
+ "endpoints": ["/preprocess", "/embed", "/sentiment", "/claim-detection", "/topic-classification", "/health"],
261
  }
262
 
263
  @app.get("/health")
 
265
  """Keep-alive endpoint. GitHub Actions pings this every 25 min."""
266
  return {
267
  "status": "ok",
268
+ "models_loaded": ["sentiment", "claim-tokenizer", "topic-classification", "spacy-ner", "embedding"],
269
  "claim_detection_mode": CLAIM_DETECTION_MODE,
270
  }
271
 
 
303
  is_spam=spam,
304
  )
305
 
306
+ # ── Embed ──────────────────────────────────────────────────────────────────
307
+ @app.post("/embed", response_model=EmbedResponse)
308
+ def embed(request: EmbedRequest):
309
+ """
310
+ Generate a 768-dimension dense vector embedding for a text string.
311
+ Used for: (1) embedding lgu_documents into pgvector, and
312
+ (2) embedding flagged claims for cosine similarity search.
313
+ Model: meedan/paraphrase-filipino-mpnet-base-v2
314
+ """
315
+ text = request.text.strip()
316
+ if not text:
317
+ raise HTTPException(status_code=422, detail="text field cannot be empty.")
318
+ try:
319
+ vector = embedding_model.encode(text, normalize_embeddings=True).tolist()
320
+ return EmbedResponse(embedding=vector, dimensions=len(vector))
321
+ except Exception as e:
322
+ logger.error(f"Embedding error: {e}")
323
+ raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}")
324
+
325
  # ── Sentiment ──────────────────────────────────────────────────────────────
326
  @app.post("/sentiment", response_model=SentimentResponse)
327
  def analyze_sentiment(request: SentimentRequest):
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pydantic==2.7.0
6
  sentencepiece==0.2.0
7
  scipy==1.13.0
8
  spacy==3.8.11
9
- langdetect==1.0.9
 
 
6
  sentencepiece==0.2.0
7
  scipy==1.13.0
8
  spacy==3.8.11
9
+ langdetect==1.0.9
10
+ sentence-transformers==3.0.1