chenchaoyun committed on
Commit
172ad88
·
1 Parent(s): bd33028

Isolate DeepFace and anime model execution

Browse files
Files changed (3)
  1. Dockerfile +1 -0
  2. api_routes.py +104 -22
  3. config.py +3 -0
Dockerfile CHANGED
@@ -7,6 +7,7 @@ ENV TZ=Asia/Shanghai \
7
  IMAGES_DIR=/opt/data/images \
8
  MODELS_PATH=/opt/data/models \
9
  DEEPFACE_HOME=/opt/data/models \
 
10
  FAISS_INDEX_DIR=/opt/data/faiss \
11
  CELEBRITY_SOURCE_DIR=/opt/data/chinese_celeb_dataset
12
 
 
7
  IMAGES_DIR=/opt/data/images \
8
  MODELS_PATH=/opt/data/models \
9
  DEEPFACE_HOME=/opt/data/models \
10
+ TF_USE_LEGACY_KERAS=1 \
11
  FAISS_INDEX_DIR=/opt/data/faiss \
12
  CELEBRITY_SOURCE_DIR=/opt/data/chinese_celeb_dataset
13
 
api_routes.py CHANGED
@@ -16,6 +16,8 @@ from concurrent.futures import ThreadPoolExecutor
16
  from datetime import datetime
17
  from typing import Any, Dict, List, Optional, Tuple
18
 
 
 
19
  import cv2
20
  import numpy as np
21
  from fastapi import APIRouter, File, UploadFile, HTTPException, Query, Request, \
@@ -254,8 +256,12 @@ if CLIP_AVAILABLE:
254
  logger.error(f"CLIP function import failed: {e}")
255
  CLIP_AVAILABLE = False
256
 
257
- # 创建线程池执行器用于异步处理CPU密集型任务
 
 
258
  executor = ThreadPoolExecutor(max_workers=4)
 
 
259
 
260
 
261
  def _log_stage_duration(stage: str, start_time: float, extra: str | None = None) -> float:
@@ -282,6 +288,18 @@ async def process_cpu_intensive_task(func, *args, **kwargs):
282
  return await loop.run_in_executor(executor, lambda: func(*args, **kwargs))
283
 
284
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  def _keep_cpu_busy(duration: float, inner_loops: int = 5000) -> Dict[str, Any]:
286
  """
287
  在给定时间内执行纯CPU计算,用于防止服务器进入空闲态。
@@ -356,6 +374,21 @@ def _reset_deepface_model_cache(model_name: str = "VGG-Face") -> None:
356
  logger.info(f"已清除DeepFace缓存模型: {model_name}")
357
 
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  def _recover_deepface_model(model_name: str = "VGG-Face") -> None:
360
  """组合清理动作,尽量恢复DeepFace模型可用状态。"""
361
  cleared = _clear_keras_session()
@@ -901,7 +934,7 @@ async def _refresh_celebrity_cache(sample_image_path: str,
901
  lock = _ensure_deepface_lock()
902
  async with lock:
903
  try:
904
- await process_cpu_intensive_task(
905
  deepface_module.find,
906
  img_path=sample_image_path,
907
  db_path=db_path,
@@ -913,12 +946,12 @@ async def _refresh_celebrity_cache(sample_image_path: str,
913
  refresh_database=True,
914
  )
915
  except (AttributeError, RuntimeError) as attr_exc:
916
- if "numpy" in str(attr_exc) or "SymbolicTensor" in str(attr_exc):
917
  logger.warning(
918
- f"刷新明星向量缓存遇到 numpy/SymbolicTensor 异常,尝试恢复后重试: {attr_exc}")
919
  _recover_deepface_model()
920
  try:
921
- await process_cpu_intensive_task(
922
  deepface_module.find,
923
  img_path=sample_image_path,
924
  db_path=db_path,
@@ -938,7 +971,28 @@ async def _refresh_celebrity_cache(sample_image_path: str,
938
  f"刷新明星向量缓存遇到模型状态异常,尝试恢复后重试: {exc}")
939
  _recover_deepface_model()
940
  try:
941
- await process_cpu_intensive_task(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942
  deepface_module.find,
943
  img_path=sample_image_path,
944
  db_path=db_path,
@@ -951,8 +1005,6 @@ async def _refresh_celebrity_cache(sample_image_path: str,
951
  )
952
  except Exception as retry_exc:
953
  logger.warning(f"恢复后重新刷新明星缓存仍失败: {retry_exc}")
954
- except Exception as e:
955
- logger.warning(f"Refresh celebrity cache failed: {e}")
956
 
957
 
958
  async def _log_progress(task_name: str,
@@ -2976,7 +3028,7 @@ async def anime_stylize_photo(
2976
  # 使用AnimeStylizer对图像进行动漫风格化
2977
  logger.info(f"Starting to stylize image with anime style, style: {style_description}...")
2978
  try:
2979
- stylized_image = await process_cpu_intensive_task(anime_stylizer.stylize_image, image, style_type)
2980
  logger.info("Anime stylization processing completed")
2981
  except Exception as e:
2982
  logger.error(f"Anime stylization processing failed: {e}")
@@ -4549,16 +4601,16 @@ async def match_celebrity_face(
4549
  lock = _ensure_deepface_lock()
4550
  async with lock:
4551
  try:
4552
- find_result = await process_cpu_intensive_task(
4553
  deepface_module.find,
4554
  **_build_find_kwargs(refresh=False),
4555
  )
4556
  except (AttributeError, RuntimeError) as attr_err:
4557
- if "numpy" in str(attr_err) or "SymbolicTensor" in str(attr_err):
4558
  logger.warning(
4559
- f"DeepFace find encountered numpy/SymbolicTensor error, 尝试清理模型后刷新缓存: {attr_err}")
4560
  _recover_deepface_model()
4561
- find_result = await process_cpu_intensive_task(
4562
  deepface_module.find,
4563
  **_build_find_kwargs(refresh=True),
4564
  )
@@ -4573,7 +4625,17 @@ async def match_celebrity_face(
4573
  logger.warning(
4574
  f"DeepFace find failed without refresh: {ve}, 尝试清理模型后刷新缓存。")
4575
  _recover_deepface_model()
4576
- find_result = await process_cpu_intensive_task(
 
 
 
 
 
 
 
 
 
 
4577
  deepface_module.find,
4578
  **_build_find_kwargs(refresh=True),
4579
  )
@@ -4907,7 +4969,7 @@ async def face_similarity_verification(
4907
  async with lock:
4908
  try:
4909
  # 使用ArcFace模型进行人脸比对
4910
- verification_result = await process_cpu_intensive_task(
4911
  deepface_module.verify,
4912
  img1_path=original_path1,
4913
  img2_path=original_path2,
@@ -4918,12 +4980,12 @@ async def face_similarity_verification(
4918
  logger.info(
4919
  f"DeepFace verification completed result:{json.dumps(verification_result, ensure_ascii=False)}")
4920
  except (AttributeError, RuntimeError) as attr_err:
4921
- if "numpy" in str(attr_err) or "SymbolicTensor" in str(attr_err):
4922
  logger.warning(
4923
- f"DeepFace verification 遇到 numpy/SymbolicTensor 异常,尝试恢复后重试: {attr_err}")
4924
  _recover_deepface_model()
4925
  try:
4926
- verification_result = await process_cpu_intensive_task(
4927
  deepface_module.verify,
4928
  img1_path=original_path1,
4929
  img2_path=original_path2,
@@ -4945,7 +5007,7 @@ async def face_similarity_verification(
4945
  f"DeepFace verification 遇到模型状态异常,尝试恢复后重试: {ve}")
4946
  _recover_deepface_model()
4947
  try:
4948
- verification_result = await process_cpu_intensive_task(
4949
  deepface_module.verify,
4950
  img1_path=original_path1,
4951
  img2_path=original_path2,
@@ -4961,9 +5023,29 @@ async def face_similarity_verification(
4961
  raise HTTPException(status_code=500,
4962
  detail=f"人脸比对失败: {str(retry_error)}") from retry_error
4963
  except Exception as e:
4964
- logger.error(f"DeepFace verification failed: {e}")
4965
- raise HTTPException(status_code=500,
4966
- detail=f"人脸比对失败: {str(e)}") from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4967
 
4968
  # 提取比对结果
4969
  verified = verification_result["verified"]
 
16
  from datetime import datetime
17
  from typing import Any, Dict, List, Optional, Tuple
18
 
19
+ os.environ.setdefault("TF_USE_LEGACY_KERAS", "1")
20
+
21
  import cv2
22
  import numpy as np
23
  from fastapi import APIRouter, File, UploadFile, HTTPException, Query, Request, \
 
256
  logger.error(f"CLIP function import failed: {e}")
257
  CLIP_AVAILABLE = False
258
 
259
+ # 创建线程池执行器用于异步处理CPU密集型任务
260
+ # TensorFlow/Keras 模型对线程上下文敏感,DeepFace 必须固定到单独线程,
261
+ # 避免被 ModelScope/Anime Style 的模型加载污染运行状态。
262
  executor = ThreadPoolExecutor(max_workers=4)
263
+ deepface_executor = ThreadPoolExecutor(max_workers=1)
264
+ anime_style_executor = ThreadPoolExecutor(max_workers=1)
265
 
266
 
267
  def _log_stage_duration(stage: str, start_time: float, extra: str | None = None) -> float:
 
288
  return await loop.run_in_executor(executor, lambda: func(*args, **kwargs))
289
 
290
 
291
+ async def process_deepface_task(func, *args, **kwargs):
292
+ """在 DeepFace 专用单线程 executor 中执行,避免跨线程复用 Keras 模型。"""
293
+ loop = asyncio.get_event_loop()
294
+ return await loop.run_in_executor(deepface_executor, lambda: func(*args, **kwargs))
295
+
296
+
297
+ async def process_anime_style_task(func, *args, **kwargs):
298
+ """在 Anime Style 专用单线程 executor 中执行,隔离 ModelScope pipeline。"""
299
+ loop = asyncio.get_event_loop()
300
+ return await loop.run_in_executor(anime_style_executor, lambda: func(*args, **kwargs))
301
+
302
+
303
  def _keep_cpu_busy(duration: float, inner_loops: int = 5000) -> Dict[str, Any]:
304
  """
305
  在给定时间内执行纯CPU计算,用于防止服务器进入空闲态。
 
374
  logger.info(f"已清除DeepFace缓存模型: {model_name}")
375
 
376
 
377
+ def _is_deepface_model_state_error(exc: BaseException) -> bool:
378
+ """识别 Keras/DeepFace 模型状态污染类异常,触发清理并重试。"""
379
+ message = str(exc)
380
+ error_markers = (
381
+ "numpy",
382
+ "SymbolicTensor",
383
+ "EagerTensor",
384
+ "Attempting to capture",
385
+ "without building a function",
386
+ "conv2d",
387
+ "Graph execution error",
388
+ )
389
+ return any(marker in message for marker in error_markers)
390
+
391
+
392
  def _recover_deepface_model(model_name: str = "VGG-Face") -> None:
393
  """组合清理动作,尽量恢复DeepFace模型可用状态。"""
394
  cleared = _clear_keras_session()
 
934
  lock = _ensure_deepface_lock()
935
  async with lock:
936
  try:
937
+ await process_deepface_task(
938
  deepface_module.find,
939
  img_path=sample_image_path,
940
  db_path=db_path,
 
946
  refresh_database=True,
947
  )
948
  except (AttributeError, RuntimeError) as attr_exc:
949
+ if _is_deepface_model_state_error(attr_exc):
950
  logger.warning(
951
+ f"刷新明星向量缓存遇到 DeepFace 模型状态异常,尝试恢复后重试: {attr_exc}")
952
  _recover_deepface_model()
953
  try:
954
+ await process_deepface_task(
955
  deepface_module.find,
956
  img_path=sample_image_path,
957
  db_path=db_path,
 
971
  f"刷新明星向量缓存遇到模型状态异常,尝试恢复后重试: {exc}")
972
  _recover_deepface_model()
973
  try:
974
+ await process_deepface_task(
975
+ deepface_module.find,
976
+ img_path=sample_image_path,
977
+ db_path=db_path,
978
+ model_name="VGG-Face",
979
+ detector_backend="yolov11n",
980
+ distance_metric="cosine",
981
+ enforce_detection=True,
982
+ silent=True,
983
+ refresh_database=True,
984
+ )
985
+ except Exception as retry_exc:
986
+ logger.warning(f"恢复后重新刷新明星缓存仍失败: {retry_exc}")
987
+ except Exception as exc:
988
+ if not _is_deepface_model_state_error(exc):
989
+ logger.warning(f"Refresh celebrity cache failed: {exc}")
990
+ return
991
+ logger.warning(
992
+ f"刷新明星向量缓存遇到 DeepFace 底层状态异常,尝试恢复后重试: {exc}")
993
+ _recover_deepface_model()
994
+ try:
995
+ await process_deepface_task(
996
  deepface_module.find,
997
  img_path=sample_image_path,
998
  db_path=db_path,
 
1005
  )
1006
  except Exception as retry_exc:
1007
  logger.warning(f"恢复后重新刷新明星缓存仍失败: {retry_exc}")
 
 
1008
 
1009
 
1010
  async def _log_progress(task_name: str,
 
3028
  # 使用AnimeStylizer对图像进行动漫风格化
3029
  logger.info(f"Starting to stylize image with anime style, style: {style_description}...")
3030
  try:
3031
+ stylized_image = await process_anime_style_task(anime_stylizer.stylize_image, image, style_type)
3032
  logger.info("Anime stylization processing completed")
3033
  except Exception as e:
3034
  logger.error(f"Anime stylization processing failed: {e}")
 
4601
  lock = _ensure_deepface_lock()
4602
  async with lock:
4603
  try:
4604
+ find_result = await process_deepface_task(
4605
  deepface_module.find,
4606
  **_build_find_kwargs(refresh=False),
4607
  )
4608
  except (AttributeError, RuntimeError) as attr_err:
4609
+ if _is_deepface_model_state_error(attr_err):
4610
  logger.warning(
4611
+ f"DeepFace find encountered model state error, 尝试清理模型后刷新缓存: {attr_err}")
4612
  _recover_deepface_model()
4613
+ find_result = await process_deepface_task(
4614
  deepface_module.find,
4615
  **_build_find_kwargs(refresh=True),
4616
  )
 
4625
  logger.warning(
4626
  f"DeepFace find failed without refresh: {ve}, 尝试清理模型后刷新缓存。")
4627
  _recover_deepface_model()
4628
+ find_result = await process_deepface_task(
4629
+ deepface_module.find,
4630
+ **_build_find_kwargs(refresh=True),
4631
+ )
4632
+ except Exception as exc:
4633
+ if not _is_deepface_model_state_error(exc):
4634
+ raise
4635
+ logger.warning(
4636
+ f"DeepFace find 遇到底层模型状态异常,尝试清理模型后刷新缓存: {exc}")
4637
+ _recover_deepface_model()
4638
+ find_result = await process_deepface_task(
4639
  deepface_module.find,
4640
  **_build_find_kwargs(refresh=True),
4641
  )
 
4969
  async with lock:
4970
  try:
4971
  # 使用ArcFace模型进行人脸比对
4972
+ verification_result = await process_deepface_task(
4973
  deepface_module.verify,
4974
  img1_path=original_path1,
4975
  img2_path=original_path2,
 
4980
  logger.info(
4981
  f"DeepFace verification completed result:{json.dumps(verification_result, ensure_ascii=False)}")
4982
  except (AttributeError, RuntimeError) as attr_err:
4983
+ if _is_deepface_model_state_error(attr_err):
4984
  logger.warning(
4985
+ f"DeepFace verification 遇到模型状态异常,尝试恢复后重试: {attr_err}")
4986
  _recover_deepface_model()
4987
  try:
4988
+ verification_result = await process_deepface_task(
4989
  deepface_module.verify,
4990
  img1_path=original_path1,
4991
  img2_path=original_path2,
 
5007
  f"DeepFace verification 遇到模型状态异常,尝试恢复后重试: {ve}")
5008
  _recover_deepface_model()
5009
  try:
5010
+ verification_result = await process_deepface_task(
5011
  deepface_module.verify,
5012
  img1_path=original_path1,
5013
  img2_path=original_path2,
 
5023
  raise HTTPException(status_code=500,
5024
  detail=f"人脸比对失败: {str(retry_error)}") from retry_error
5025
  except Exception as e:
5026
+ if not _is_deepface_model_state_error(e):
5027
+ logger.error(f"DeepFace verification failed: {e}")
5028
+ raise HTTPException(status_code=500,
5029
+ detail=f"人脸比对失败: {str(e)}") from e
5030
+ logger.warning(
5031
+ f"DeepFace verification 遇到底层模型状态异常,尝试恢复后重试: {e}")
5032
+ _recover_deepface_model()
5033
+ try:
5034
+ verification_result = await process_deepface_task(
5035
+ deepface_module.verify,
5036
+ img1_path=original_path1,
5037
+ img2_path=original_path2,
5038
+ model_name="VGG-Face",
5039
+ detector_backend="yolov11n",
5040
+ distance_metric="cosine"
5041
+ )
5042
+ logger.info(
5043
+ f"DeepFace verification completed after recovery: {json.dumps(verification_result, ensure_ascii=False)}")
5044
+ except Exception as retry_error:
5045
+ logger.error(
5046
+ f"DeepFace verification failed after recovery attempt: {retry_error}")
5047
+ raise HTTPException(status_code=500,
5048
+ detail=f"人脸比对失败: {str(retry_error)}") from retry_error
5049
 
5050
  # 提取比对结果
5051
  verified = verification_result["verified"]
config.py CHANGED
@@ -1,6 +1,9 @@
1
  import logging
2
  import os
3
 
 
 
 
4
  # 解决OpenMP库冲突问题
5
  os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
6
  # 设置CPU线程数为CPU核心数,提高CPU利用率
 
1
  import logging
2
  import os
3
 
4
+ # DeepFace 仍依赖 tf-keras 行为;必须在任何 TensorFlow/Keras/DeepFace import 前设置。
5
+ os.environ.setdefault("TF_USE_LEGACY_KERAS", "1")
6
+
7
  # 解决OpenMP库冲突问题
8
  os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
9
  # 设置CPU线程数为CPU核心数,提高CPU利用率