Spaces:

ethonmax
/

picpocket

Sleeping

App Files Files Community

chenchaoyun committed 26 days ago

Commit

172ad88

1 Parent(s): bd33028

Isolate DeepFace and anime model execution

Browse files

Files changed (3) hide show

Dockerfile +1 -0
api_routes.py +104 -22
config.py +3 -0

Dockerfile CHANGED Viewed

@@ -7,6 +7,7 @@ ENV TZ=Asia/Shanghai \
     IMAGES_DIR=/opt/data/images \
     MODELS_PATH=/opt/data/models \
     DEEPFACE_HOME=/opt/data/models \
     FAISS_INDEX_DIR=/opt/data/faiss \
     CELEBRITY_SOURCE_DIR=/opt/data/chinese_celeb_dataset

     IMAGES_DIR=/opt/data/images \
     MODELS_PATH=/opt/data/models \
     DEEPFACE_HOME=/opt/data/models \
+    TF_USE_LEGACY_KERAS=1 \
     FAISS_INDEX_DIR=/opt/data/faiss \
     CELEBRITY_SOURCE_DIR=/opt/data/chinese_celeb_dataset

api_routes.py CHANGED Viewed

@@ -16,6 +16,8 @@ from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Tuple
 import cv2
 import numpy as np
 from fastapi import APIRouter, File, UploadFile, HTTPException, Query, Request, \
@@ -254,8 +256,12 @@ if CLIP_AVAILABLE:
         logger.error(f"CLIP function import failed: {e}")
         CLIP_AVAILABLE = False
-# 创建线程池执行器用于异步处理CPU密集型任务
 executor = ThreadPoolExecutor(max_workers=4)
 def _log_stage_duration(stage: str, start_time: float, extra: str | None = None) -> float:
@@ -282,6 +288,18 @@ async def process_cpu_intensive_task(func, *args, **kwargs):
     return await loop.run_in_executor(executor, lambda: func(*args, **kwargs))
 def _keep_cpu_busy(duration: float, inner_loops: int = 5000) -> Dict[str, Any]:
     """
     在给定时间内执行纯CPU计算，用于防止服务器进入空闲态。
@@ -356,6 +374,21 @@ def _reset_deepface_model_cache(model_name: str = "VGG-Face") -> None:
         logger.info(f"已清除DeepFace缓存模型: {model_name}")
 def _recover_deepface_model(model_name: str = "VGG-Face") -> None:
     """组合清理动作，尽量恢复DeepFace模型可用状态。"""
     cleared = _clear_keras_session()
@@ -901,7 +934,7 @@ async def _refresh_celebrity_cache(sample_image_path: str,
     lock = _ensure_deepface_lock()
     async with lock:
         try:
-            await process_cpu_intensive_task(
                 deepface_module.find,
                 img_path=sample_image_path,
                 db_path=db_path,
@@ -913,12 +946,12 @@ async def _refresh_celebrity_cache(sample_image_path: str,
                 refresh_database=True,
             )
         except (AttributeError, RuntimeError) as attr_exc:
-            if "numpy" in str(attr_exc) or "SymbolicTensor" in str(attr_exc):
                 logger.warning(
-                    f"刷新明星向量缓存遇到 numpy/SymbolicTensor 异常，尝试恢复后重试: {attr_exc}")
                 _recover_deepface_model()
                 try:
-                    await process_cpu_intensive_task(
                         deepface_module.find,
                         img_path=sample_image_path,
                         db_path=db_path,
@@ -938,7 +971,28 @@ async def _refresh_celebrity_cache(sample_image_path: str,
                 f"刷新明星向量缓存遇到模型状态异常，尝试恢复后重试: {exc}")
             _recover_deepface_model()
             try:
-                await process_cpu_intensive_task(
                     deepface_module.find,
                     img_path=sample_image_path,
                     db_path=db_path,
@@ -951,8 +1005,6 @@ async def _refresh_celebrity_cache(sample_image_path: str,
                 )
             except Exception as retry_exc:
                 logger.warning(f"恢复后重新刷新明星缓存仍失败: {retry_exc}")
-        except Exception as e:
-            logger.warning(f"Refresh celebrity cache failed: {e}")
 async def _log_progress(task_name: str,
@@ -2976,7 +3028,7 @@ async def anime_stylize_photo(
         # 使用AnimeStylizer对图像进行动漫风格化
         logger.info(f"Starting to stylize image with anime style, style: {style_description}...")
         try:
-            stylized_image = await process_cpu_intensive_task(anime_stylizer.stylize_image, image, style_type)
             logger.info("Anime stylization processing completed")
         except Exception as e:
             logger.error(f"Anime stylization processing failed: {e}")
@@ -4549,16 +4601,16 @@ async def match_celebrity_face(
         lock = _ensure_deepface_lock()
         async with lock:
             try:
-                find_result = await process_cpu_intensive_task(
                     deepface_module.find,
                     **_build_find_kwargs(refresh=False),
                 )
             except (AttributeError, RuntimeError) as attr_err:
-                if "numpy" in str(attr_err) or "SymbolicTensor" in str(attr_err):
                     logger.warning(
-                        f"DeepFace find encountered numpy/SymbolicTensor error, 尝试清理模型后刷新缓存: {attr_err}")
                     _recover_deepface_model()
-                    find_result = await process_cpu_intensive_task(
                         deepface_module.find,
                         **_build_find_kwargs(refresh=True),
                     )
@@ -4573,7 +4625,17 @@ async def match_celebrity_face(
                     logger.warning(
                         f"DeepFace find failed without refresh: {ve}, 尝试清理模型后刷新缓存。")
                 _recover_deepface_model()
-                find_result = await process_cpu_intensive_task(
                     deepface_module.find,
                     **_build_find_kwargs(refresh=True),
                 )
@@ -4907,7 +4969,7 @@ async def face_similarity_verification(
         async with lock:
             try:
                 # 使用ArcFace模型进行人脸比对
-                verification_result = await process_cpu_intensive_task(
                     deepface_module.verify,
                     img1_path=original_path1,
                     img2_path=original_path2,
@@ -4918,12 +4980,12 @@ async def face_similarity_verification(
                 logger.info(
                     f"DeepFace verification completed result:{json.dumps(verification_result, ensure_ascii=False)}")
             except (AttributeError, RuntimeError) as attr_err:
-                if "numpy" in str(attr_err) or "SymbolicTensor" in str(attr_err):
                     logger.warning(
-                        f"DeepFace verification 遇到 numpy/SymbolicTensor 异常，尝试恢复后重试: {attr_err}")
                     _recover_deepface_model()
                     try:
-                        verification_result = await process_cpu_intensive_task(
                             deepface_module.verify,
                             img1_path=original_path1,
                             img2_path=original_path2,
@@ -4945,7 +5007,7 @@ async def face_similarity_verification(
                     f"DeepFace verification 遇到模型状态异常，尝试恢复后重试: {ve}")
                 _recover_deepface_model()
                 try:
-                    verification_result = await process_cpu_intensive_task(
                         deepface_module.verify,
                         img1_path=original_path1,
                         img2_path=original_path2,
@@ -4961,9 +5023,29 @@ async def face_similarity_verification(
                     raise HTTPException(status_code=500,
                                         detail=f"人脸比对失败: {str(retry_error)}") from retry_error
             except Exception as e:
-                logger.error(f"DeepFace verification failed: {e}")
-                raise HTTPException(status_code=500,
-                                    detail=f"人脸比对失败: {str(e)}") from e
         # 提取比对结果
         verified = verification_result["verified"]

 from datetime import datetime
 from typing import Any, Dict, List, Optional, Tuple
+os.environ.setdefault("TF_USE_LEGACY_KERAS", "1")
 import cv2
 import numpy as np
 from fastapi import APIRouter, File, UploadFile, HTTPException, Query, Request, \
         logger.error(f"CLIP function import failed: {e}")
         CLIP_AVAILABLE = False
+# Thread pool executors for running CPU-intensive tasks asynchronously.
+# TensorFlow/Keras models are sensitive to thread context, so DeepFace must be
+# pinned to its own dedicated thread to keep ModelScope/Anime Style model
+# loading from polluting its runtime state.
 executor = ThreadPoolExecutor(max_workers=4)
+deepface_executor = ThreadPoolExecutor(max_workers=1)
+anime_style_executor = ThreadPoolExecutor(max_workers=1)
 def _log_stage_duration(stage: str, start_time: float, extra: str | None = None) -> float:
     return await loop.run_in_executor(executor, lambda: func(*args, **kwargs))
+async def process_deepface_task(func, *args, **kwargs):
+    """Run ``func(*args, **kwargs)`` on the dedicated DeepFace executor.
+
+    ``deepface_executor`` is used instead of the shared ``executor``; the
+    intent is to avoid reusing Keras models across threads.
+
+    Returns:
+        Whatever ``func`` returns.
+    """
+    loop = asyncio.get_event_loop()
+    # run_in_executor forwards positional arguments only, so the call is
+    # wrapped in a lambda to carry the keyword arguments along.
+    return await loop.run_in_executor(deepface_executor, lambda: func(*args, **kwargs))
+async def process_anime_style_task(func, *args, **kwargs):
+    """Run ``func(*args, **kwargs)`` on the dedicated Anime Style executor.
+
+    ``anime_style_executor`` is used instead of the shared ``executor``; the
+    intent is to isolate the ModelScope pipeline.
+
+    Returns:
+        Whatever ``func`` returns.
+    """
+    loop = asyncio.get_event_loop()
+    # run_in_executor forwards positional arguments only, so the call is
+    # wrapped in a lambda to carry the keyword arguments along.
+    return await loop.run_in_executor(anime_style_executor, lambda: func(*args, **kwargs))
 def _keep_cpu_busy(duration: float, inner_loops: int = 5000) -> Dict[str, Any]:
     """
     在给定时间内执行纯CPU计算，用于防止服务器进入空闲态。
         logger.info(f"已清除DeepFace缓存模型: {model_name}")
+def _is_deepface_model_state_error(exc: BaseException) -> bool:
+    """Return True if ``exc`` looks like a Keras/DeepFace model-state error.
+
+    The check is a case-sensitive substring search of ``str(exc)`` against a
+    fixed tuple of markers. It is meant to identify errors that warrant
+    cleaning up the model state and retrying.
+    """
+    message = str(exc)
+    # Message fragments treated as signs of a polluted model state.
+    error_markers = (
+        "numpy",
+        "SymbolicTensor",
+        "EagerTensor",
+        "Attempting to capture",
+        "without building a function",
+        "conv2d",
+        "Graph execution error",
+    )
+    return any(marker in message for marker in error_markers)
 def _recover_deepface_model(model_name: str = "VGG-Face") -> None:
     """组合清理动作，尽量恢复DeepFace模型可用状态。"""
     cleared = _clear_keras_session()
     lock = _ensure_deepface_lock()
     async with lock:
         try:
+            await process_deepface_task(
                 deepface_module.find,
                 img_path=sample_image_path,
                 db_path=db_path,
                 refresh_database=True,
             )
         except (AttributeError, RuntimeError) as attr_exc:
+            if _is_deepface_model_state_error(attr_exc):
                 logger.warning(
+                    f"刷新明星向量缓存遇到 DeepFace 模型状态异常，尝试恢复后重试: {attr_exc}")
                 _recover_deepface_model()
                 try:
+                    await process_deepface_task(
                         deepface_module.find,
                         img_path=sample_image_path,
                         db_path=db_path,
                 f"刷新明星向量缓存遇到模型状态异常，尝试恢复后重试: {exc}")
             _recover_deepface_model()
             try:
+                await process_deepface_task(
+                    deepface_module.find,
+                    img_path=sample_image_path,
+                    db_path=db_path,
+                    model_name="VGG-Face",
+                    detector_backend="yolov11n",
+                    distance_metric="cosine",
+                    enforce_detection=True,
+                    silent=True,
+                    refresh_database=True,
+                )
+            except Exception as retry_exc:
+                logger.warning(f"恢复后重新刷新明星缓存仍失败: {retry_exc}")
+        except Exception as exc:
+            if not _is_deepface_model_state_error(exc):
+                logger.warning(f"Refresh celebrity cache failed: {exc}")
+                return
+            logger.warning(
+                f"刷新明星向量缓存遇到 DeepFace 底层状态异常，尝试恢复后重试: {exc}")
+            _recover_deepface_model()
+            try:
+                await process_deepface_task(
                     deepface_module.find,
                     img_path=sample_image_path,
                     db_path=db_path,
                 )
             except Exception as retry_exc:
                 logger.warning(f"恢复后重新刷新明星缓存仍失败: {retry_exc}")
 async def _log_progress(task_name: str,
         # 使用AnimeStylizer对图像进行动漫风格化
         logger.info(f"Starting to stylize image with anime style, style: {style_description}...")
         try:
+            stylized_image = await process_anime_style_task(anime_stylizer.stylize_image, image, style_type)
             logger.info("Anime stylization processing completed")
         except Exception as e:
             logger.error(f"Anime stylization processing failed: {e}")
         lock = _ensure_deepface_lock()
         async with lock:
             try:
+                find_result = await process_deepface_task(
                     deepface_module.find,
                     **_build_find_kwargs(refresh=False),
                 )
             except (AttributeError, RuntimeError) as attr_err:
+                if _is_deepface_model_state_error(attr_err):
                     logger.warning(
+                        f"DeepFace find encountered model state error, 尝试清理模型后刷新缓存: {attr_err}")
                     _recover_deepface_model()
+                    find_result = await process_deepface_task(
                         deepface_module.find,
                         **_build_find_kwargs(refresh=True),
                     )
                     logger.warning(
                         f"DeepFace find failed without refresh: {ve}, 尝试清理模型后刷新缓存。")
                 _recover_deepface_model()
+                find_result = await process_deepface_task(
+                    deepface_module.find,
+                    **_build_find_kwargs(refresh=True),
+                )
+            except Exception as exc:
+                if not _is_deepface_model_state_error(exc):
+                    raise
+                logger.warning(
+                    f"DeepFace find 遇到底层模型状态异常，尝试清理模型后刷新缓存: {exc}")
+                _recover_deepface_model()
+                find_result = await process_deepface_task(
                     deepface_module.find,
                     **_build_find_kwargs(refresh=True),
                 )
         async with lock:
             try:
                 # 使用ArcFace模型进行人脸比对
+                verification_result = await process_deepface_task(
                     deepface_module.verify,
                     img1_path=original_path1,
                     img2_path=original_path2,
                 logger.info(
                     f"DeepFace verification completed result:{json.dumps(verification_result, ensure_ascii=False)}")
             except (AttributeError, RuntimeError) as attr_err:
+                if _is_deepface_model_state_error(attr_err):
                     logger.warning(
+                        f"DeepFace verification 遇到模型状态异常，尝试恢复后重试: {attr_err}")
                     _recover_deepface_model()
                     try:
+                        verification_result = await process_deepface_task(
                             deepface_module.verify,
                             img1_path=original_path1,
                             img2_path=original_path2,
                     f"DeepFace verification 遇到模型状态异常，尝试恢复后重试: {ve}")
                 _recover_deepface_model()
                 try:
+                    verification_result = await process_deepface_task(
                         deepface_module.verify,
                         img1_path=original_path1,
                         img2_path=original_path2,
                     raise HTTPException(status_code=500,
                                         detail=f"人脸比对失败: {str(retry_error)}") from retry_error
             except Exception as e:
+                if not _is_deepface_model_state_error(e):
+                    logger.error(f"DeepFace verification failed: {e}")
+                    raise HTTPException(status_code=500,
+                                        detail=f"人脸比对失败: {str(e)}") from e
+                logger.warning(
+                    f"DeepFace verification 遇到底层模型状态异常，尝试恢复后重试: {e}")
+                _recover_deepface_model()
+                try:
+                    verification_result = await process_deepface_task(
+                        deepface_module.verify,
+                        img1_path=original_path1,
+                        img2_path=original_path2,
+                        model_name="VGG-Face",
+                        detector_backend="yolov11n",
+                        distance_metric="cosine"
+                    )
+                    logger.info(
+                        f"DeepFace verification completed after recovery: {json.dumps(verification_result, ensure_ascii=False)}")
+                except Exception as retry_error:
+                    logger.error(
+                        f"DeepFace verification failed after recovery attempt: {retry_error}")
+                    raise HTTPException(status_code=500,
+                                        detail=f"人脸比对失败: {str(retry_error)}") from retry_error
         # 提取比对结果
         verified = verification_result["verified"]

config.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import logging
 import os
 # 解决OpenMP库冲突问题
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 # 设置CPU线程数为CPU核心数，提高CPU利用率

 import logging
 import os
+# DeepFace 仍依赖 tf-keras 行为；必须在任何 TensorFlow/Keras/DeepFace import 前设置。
+os.environ.setdefault("TF_USE_LEGACY_KERAS", "1")
 # 解决OpenMP库冲突问题
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 # 设置CPU线程数为CPU核心数，提高CPU利用率