Spaces:

bichnhan2701
/

PhoWhisperBaseAPI

Sleeping

App Files Files Community

bichnhan2701 committed on Dec 18, 2025

Commit

65f24bc

1 Parent(s): c22837f

Update logic NLP

Browse files

Files changed (1) hide show

app/services/nlp_postprocess.py +69 -111

app/services/nlp_postprocess.py CHANGED Viewed

@@ -1,85 +1,31 @@
-from app.infra.redis_client import redis_client
-from app.utils.hashing import sha256
-from app.config.settings import GEMINI_API_KEY
-import google.generativeai as genai
 import asyncio
 import json
-# CACHE_TTL = 60 * 60 * 24 * 3  # 3 days
-# if GEMINI_API_KEY:
-#     genai.configure(api_key=GEMINI_API_KEY)
-#     _model = genai.GenerativeModel("gemini-pro")
-# else:
-#     _model = None
-# async def normalize_and_extract(raw_text: str) -> dict:
-#     """
-#     return {
-#       "normalized_text": "...",
-#       "keywords": [...]
-#     }
-#     """
-#     cache_key = f"nlp:{sha256(raw_text)}"
-#     cached = redis_client.get(cache_key)
-#     if cached:
-#         return json.loads(cached)
-#     prompt = f"""
-# Bạn là một hệ thống Xử lý Hậu kỳ NLP (NLP Post-Processing) Tiếng Việt.
-# Đầu vào là văn bản thô (raw transcript), có thể thiếu dấu câu và sai chính tả do nhận dạng giọng nói (ví dụ: 'ăn chứa' -> 'ăn chưa').
-# Nhiệm vụ (Trả về JSON duy nhất):
-# 1. [ASR Correction & Punctuation]: Sửa lỗi chính tả ASR, thêm dấu câu, viết hoa chuẩn xác.
-# Văn bản đầu vào: \"\"\"{raw_text}\"\"\"
-# Cấu trúc JSON bắt buộc:
-# {{
-#     "normalizedText": "Văn bản đã sửa hoàn chỉnh...",
-#     "keywords": ["Từ khóa 1", "Từ khóa 2", "..."]
-# }}
-# """
-#     result = {
-#         "normalized_text": raw_text,
-#         "keywords": []
-#     }
-#     if _model:
-#         loop = asyncio.get_event_loop()
-#         def call():
-#             r = _model.generate_content(prompt)
-#             return r.text
-#         text = await loop.run_in_executor(None, call)
-#         # clean JSON
-#         start = text.find("{")
-#         end = text.rfind("}")
-#         if start != -1 and end != -1:
-#             data = json.loads(text[start:end+1])
-#             result = {
-#                 "normalized_text": data.get("normalizedText", raw_text),
-#                 "keywords": data.get("keywords", [])
-#             }
-#     redis_client.setex(cache_key, CACHE_TTL, json.dumps(result))
-#     return result
-import logging
-import redis  # để bắt lỗi ConnectionError nếu cần
 CACHE_TTL = 60 * 60 * 24 * 3  # 3 days
 if GEMINI_API_KEY:
-    genai.configure(api_key=GEMINI_API_KEY)
-    _model = genai.GenerativeModel("gemini-pro")
 else:
-    _model = None
 async def normalize_and_extract(raw_text: str) -> dict:
     """
@@ -90,65 +36,77 @@ async def normalize_and_extract(raw_text: str) -> dict:
     """
     cache_key = f"nlp:{sha256(raw_text)}"
-    # 1) Thử đọc cache từ Redis, nhưng không để lỗi Redis làm vỡ pipeline
     try:
         cached = redis_client.get(cache_key)
         if cached:
             return json.loads(cached)
     except Exception as e:
-        logging.warning(f"Redis GET failed in normalize_and_extract, skip cache: {e}")
-    prompt = f"""
 Bạn là một hệ thống Xử lý Hậu kỳ NLP (NLP Post-Processing) Tiếng Việt.
-Đầu vào là văn bản thô (raw transcript), có thể thiếu dấu câu và sai chính tả do nhận dạng giọng nói (ví dụ: 'ăn chứa' -> 'ăn chưa').
 Nhiệm vụ (Trả về JSON duy nhất):
-1. [ASR Correction & Punctuation]: Sửa lỗi chính tả ASR, thêm dấu câu, viết hoa chuẩn xác, loại bỏ các từ bị lỗi lặp lại.
-Văn bản đầu vào: \"\"\"{raw_text}\"\"\"
-Cấu trúc JSON bắt buộc:
 {{
-    "normalizedText": "Văn bản đã sửa hoàn chỉnh...",
-    "keywords": ["Từ khóa 1", "Từ khóa 2", "..."]
 }}
 """
-    # 2) Giá trị mặc định nếu không có model
-    result = {
-        "normalized_text": raw_text,
-        "keywords": []
-    }
-    # 3) Gọi Gemini nếu có cấu hình
-    if _model:
         loop = asyncio.get_event_loop()
         def call():
-            r = _model.generate_content(prompt)
-            return r.text
-        text = await loop.run_in_executor(None, call)
-        # clean JSON
-        start = text.find("{")
-        end = text.rfind("}")
-        if start != -1 and end != -1:
-            try:
-                data = json.loads(text[start:end + 1])
-                result = {
-                    "normalized_text": data.get("normalizedText", raw_text),
-                    "keywords": data.get("keywords", []),
-                }
-            except Exception as e:
-                logging.warning(f"Failed to parse Gemini JSON, fallback to raw_text: {e}")
-    # 4) Thử ghi cache lại, cũng không để lỗi Redis làm vỡ pipeline
     try:
         redis_client.setex(cache_key, CACHE_TTL, json.dumps(result))
     except Exception as e:
-        logging.warning(f"Redis SETEX failed in normalize_and_extract, skip cache: {e}")
     return result

 import asyncio
 import json
+import logging
+from app.infra.redis_client import redis_client
+from app.utils.hashing import sha256
+from app.config.settings import GEMINI_API_KEY
+# New official client
+import google.genai as genai
+from google.api_core.exceptions import GoogleAPIError  # optional but useful
 CACHE_TTL = 60 * 60 * 24 * 3  # 3 days, in seconds (TTL for cached results)
+# Build the Gemini client only when an API key is configured
+_gemini_client = None  # stays None when the key is missing or client creation fails
+_GEMINI_MODEL = "gemini-1.5-flash"  # or "gemini-1.5-flash-latest"
 if GEMINI_API_KEY:
+    try:
+        _gemini_client = genai.Client(api_key=GEMINI_API_KEY)
+        logging.info(f"[nlp_postprocess] Initialized google.genai client with model={_GEMINI_MODEL}")
+    except Exception as e:
+        logging.exception(f"[nlp_postprocess] Failed to init google.genai client: {e}")
+        _gemini_client = None
 else:
+    logging.warning("[nlp_postprocess] GEMINI_API_KEY is not set, using raw_text as normalization fallback")
 def _build_prompt(raw_text: str) -> str:
     """Return the Vietnamese post-processing prompt with *raw_text* embedded."""
     # Assembled line by line so the text sent to the model does not depend on
     # how this source file happens to be indented.
     return "\n".join(
         (
             "",
             "Bạn là một hệ thống Xử lý Hậu kỳ NLP (NLP Post-Processing) Tiếng Việt.",
             "Đầu vào là văn bản thô (raw transcript), có thể thiếu dấu câu và sai chính tả do nhận dạng giọng nói.",
             "Nhiệm vụ (Trả về JSON duy nhất):",
             "1. Sửa lỗi chính tả ASR, thêm dấu câu, viết hoa chuẩn xác, loại bỏ các từ bị lặp lại vô nghĩa.",
             "2. Trích xuất danh sách từ khóa quan trọng (keywords) liên quan đến chủ đề, độ dài từ 1-4 từ.",
             "Văn bản đầu vào:",
             f'"""{raw_text}"""',
             "Cấu trúc JSON bắt buộc (chỉ trả JSON, không giải thích thêm):",
             "{",
             '  "normalizedText": "Văn bản đã sửa hoàn chỉnh...",',
             '  "keywords": ["Từ khóa 1", "Từ khóa 2", "..."]',
             "}",
             "",
         )
     )

 def _parse_model_reply(text, raw_text: str):
     """Turn the model reply into the result dict, or return None if unusable."""
     if not text:
         return None
     # The reply may wrap the JSON in a code fence or extra prose, so keep only
     # the span from the first "{" to the last "}".
     start = text.find("{")
     end = text.rfind("}")
     if start == -1 or end < start:
         logging.warning("[nlp_postprocess] Gemini response has no JSON block, fallback to raw_text")
         return None
     try:
         data = json.loads(text[start:end + 1])
     except ValueError as e:
         logging.warning("[nlp_postprocess] Failed to parse Gemini JSON, fallback to raw_text: %s", e)
         return None
     if not isinstance(data, dict):
         logging.warning(
             "[nlp_postprocess] Failed to parse Gemini JSON, fallback to raw_text: %s",
             "top-level JSON value is not an object",
         )
         return None
     # A null, missing, blank or non-string "normalizedText" falls back to the input.
     normalized = data.get("normalizedText")
     if not isinstance(normalized, str) or not normalized.strip():
         normalized = raw_text
     # Anything other than a list of strings is reduced to the usable part.
     keywords = data.get("keywords")
     if isinstance(keywords, list):
         keywords = [kw for kw in keywords if isinstance(kw, str)]
     else:
         keywords = []
     return {"normalized_text": normalized, "keywords": keywords}

 async def normalize_and_extract(raw_text: str) -> dict:
     """Normalize an ASR transcript and extract keywords via Gemini.

     The Redis entry ``nlp:<sha256(raw_text)>`` is consulted first. On a miss,
     and only if a Gemini client was created at import time, the model is
     called in the default executor and its JSON reply is parsed.

     Redis and Gemini failures are logged and do not propagate. In those
     cases, and when no client is configured, the input text is returned
     unchanged with an empty keyword list. That fallback is never written to
     the cache, so a temporary outage cannot pin the un-normalized text for
     ``CACHE_TTL`` seconds.

     Args:
         raw_text: Raw transcript text to post-process.

     Returns:
         A dict of the form
         ``{"normalized_text": str, "keywords": list[str]}``, or whatever
         JSON value was previously cached under the same key.
     """
     cache_key = f"nlp:{sha256(raw_text)}"
     # 1) Best-effort cache read.
     try:
         cached = redis_client.get(cache_key)
         if cached:
             return json.loads(cached)
     except Exception as e:
         logging.warning("[nlp_postprocess] Redis GET failed, skip cache: %s", e)
     # 2) Fallback used whenever the model is unavailable or its reply is unusable.
     fallback = {"normalized_text": raw_text, "keywords": []}
     if not _gemini_client:
         return fallback
     prompt = _build_prompt(raw_text)

     def call():
         # Blocking SDK call; runs in the executor so the event loop stays free.
         resp = _gemini_client.models.generate_content(
             model=_GEMINI_MODEL,
             contents=prompt,
         )
         return resp.text

     # 3) Call Gemini; any failure leaves ``result`` as None.
     result = None
     try:
         # get_running_loop() is the supported way to reach the loop from a coroutine.
         text = await asyncio.get_running_loop().run_in_executor(None, call)
         result = _parse_model_reply(text, raw_text)
     except GoogleAPIError as e:
         # NOTE(review): the google.genai SDK appears to raise its own error types
         # (google.genai.errors) -- confirm this branch is ever taken.
         logging.error("[nlp_postprocess] Gemini API error: %s", e)
     except Exception as e:
         logging.exception("[nlp_postprocess] Gemini call failed, fallback to raw_text: %s", e)
     if result is None:
         return fallback
     # 4) Best-effort cache write, only for a genuine model result.
     try:
         redis_client.setex(cache_key, CACHE_TTL, json.dumps(result))
     except Exception as e:
         logging.warning("[nlp_postprocess] Redis SETEX failed, skip cache: %s", e)
     return result