Spaces:
Running
Running
水平線の遥か上を飛んで征く、
Browse files- app.py +37 -25
- src/mfa_runner.py +11 -0
app.py
CHANGED
|
@@ -274,7 +274,12 @@ def download_all_models():
|
|
| 274 |
def download_pkuseg_models() -> bool:
|
| 275 |
"""下载 pkuseg 中文分词模型,返回是否成功
|
| 276 |
|
| 277 |
-
spacy-pkuseg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
"""
|
| 279 |
logger.info("\n【下载 pkuseg 模型】")
|
| 280 |
|
|
@@ -284,12 +289,18 @@ def download_pkuseg_models() -> bool:
|
|
| 284 |
pkuseg_model_dir = pkuseg_home / "spacy_ontonotes"
|
| 285 |
postag_model_dir = pkuseg_home / "postag"
|
| 286 |
|
| 287 |
-
# 检查
|
| 288 |
-
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
return True
|
| 291 |
|
| 292 |
-
# 检查是否有文件被错误解压到根目录
|
| 293 |
root_msgpack = pkuseg_home / "features.msgpack"
|
| 294 |
if root_msgpack.exists():
|
| 295 |
logger.info("检测到模型文件在根目录,移动到正确位置...")
|
|
@@ -311,23 +322,23 @@ def download_pkuseg_models() -> bool:
|
|
| 311 |
dst = postag_model_dir / filename
|
| 312 |
src.rename(dst)
|
| 313 |
logger.info(f"移动 {filename} -> postag/")
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
logger.info(f"spacy_ontonotes 内容: {files}")
|
| 325 |
return True
|
| 326 |
|
| 327 |
# 需要下载模型
|
| 328 |
-
logger.info("下载 pkuseg 模型
|
| 329 |
|
| 330 |
# 使用 spacy-pkuseg 的模型(新格式 msgpack)
|
|
|
|
| 331 |
models = [
|
| 332 |
{
|
| 333 |
"name": "spacy_ontonotes",
|
|
@@ -351,17 +362,18 @@ def download_pkuseg_models() -> bool:
|
|
| 351 |
|
| 352 |
for model in models:
|
| 353 |
model_name = model["name"]
|
| 354 |
-
model_dir = pkuseg_home / model_name
|
| 355 |
-
check_file = model_dir / model["check_file"]
|
| 356 |
|
| 357 |
-
|
| 358 |
-
|
| 359 |
continue
|
| 360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
downloaded = False
|
| 362 |
for url in model["urls"]:
|
| 363 |
logger.info(f"下载 {model_name}: {url}")
|
| 364 |
-
zip_path = pkuseg_home / f"{model_name}.zip"
|
| 365 |
|
| 366 |
try:
|
| 367 |
# 下载
|
|
@@ -392,12 +404,12 @@ def download_pkuseg_models() -> bool:
|
|
| 392 |
logger.warning(f"unzip 解压失败: {result.stderr}")
|
| 393 |
continue
|
| 394 |
|
| 395 |
-
#
|
| 396 |
-
zip_path
|
| 397 |
|
| 398 |
# 验证
|
| 399 |
if check_file.exists():
|
| 400 |
-
logger.info(f"{model_name} 下载并解压成功")
|
| 401 |
files = [f.name for f in model_dir.iterdir()]
|
| 402 |
logger.info(f"{model_name} 目录内容: {files}")
|
| 403 |
downloaded = True
|
|
|
|
| 274 |
def download_pkuseg_models() -> bool:
|
| 275 |
"""下载 pkuseg 中文分词模型,返回是否成功
|
| 276 |
|
| 277 |
+
spacy-pkuseg 检查模型的逻辑:
|
| 278 |
+
1. 先检查 PKUSEG_HOME/<model_name>.zip 是否存在
|
| 279 |
+
2. 如果 zip 存在,解压到 PKUSEG_HOME/<model_name>/ 目录
|
| 280 |
+
3. 如果 zip 不存在,从 GitHub 下载
|
| 281 |
+
|
| 282 |
+
因此我们需要保留 .zip 文件,否则 spacy_pkuseg 会尝试重新下载
|
| 283 |
"""
|
| 284 |
logger.info("\n【下载 pkuseg 模型】")
|
| 285 |
|
|
|
|
| 289 |
pkuseg_model_dir = pkuseg_home / "spacy_ontonotes"
|
| 290 |
postag_model_dir = pkuseg_home / "postag"
|
| 291 |
|
| 292 |
+
# 关键:检查 .zip 文件是否存在(spacy_pkuseg 的检查逻辑)
|
| 293 |
+
spacy_ontonotes_zip = pkuseg_home / "spacy_ontonotes.zip"
|
| 294 |
+
postag_zip = pkuseg_home / "postag.zip"
|
| 295 |
+
|
| 296 |
+
if spacy_ontonotes_zip.exists() and postag_zip.exists():
|
| 297 |
+
logger.info(f"pkuseg 模型 zip 文件已存在: {pkuseg_home}")
|
| 298 |
+
# 列出目录内容供调试
|
| 299 |
+
files = [f.name for f in pkuseg_home.iterdir()]
|
| 300 |
+
logger.info(f"pkuseg 目录内容: {files}")
|
| 301 |
return True
|
| 302 |
|
| 303 |
+
# 检查是否有文件被错误解压到根目录(旧版本遗留问题)
|
| 304 |
root_msgpack = pkuseg_home / "features.msgpack"
|
| 305 |
if root_msgpack.exists():
|
| 306 |
logger.info("检测到模型文件在根目录,移动到正确位置...")
|
|
|
|
| 322 |
dst = postag_model_dir / filename
|
| 323 |
src.rename(dst)
|
| 324 |
logger.info(f"移动 {filename} -> postag/")
|
| 325 |
+
|
| 326 |
+
# 再次检查(如果有解压后的目录但没有 zip,需要重新下载 zip)
|
| 327 |
+
need_download = []
|
| 328 |
+
if not spacy_ontonotes_zip.exists():
|
| 329 |
+
need_download.append("spacy_ontonotes")
|
| 330 |
+
if not postag_zip.exists():
|
| 331 |
+
need_download.append("postag")
|
| 332 |
+
|
| 333 |
+
if not need_download:
|
| 334 |
+
logger.info(f"pkuseg 模型已就绪: {pkuseg_home}")
|
|
|
|
| 335 |
return True
|
| 336 |
|
| 337 |
# 需要下载模型
|
| 338 |
+
logger.info(f"需要下载 pkuseg 模型: {need_download}")
|
| 339 |
|
| 340 |
# 使用 spacy-pkuseg 的模型(新格式 msgpack)
|
| 341 |
+
# 注意:必须保留 .zip 文件,spacy_pkuseg 会检查 zip 是否存在
|
| 342 |
models = [
|
| 343 |
{
|
| 344 |
"name": "spacy_ontonotes",
|
|
|
|
| 362 |
|
| 363 |
for model in models:
|
| 364 |
model_name = model["name"]
|
|
|
|
|
|
|
| 365 |
|
| 366 |
+
# 跳过不需要下载的模型
|
| 367 |
+
if model_name not in need_download:
|
| 368 |
continue
|
| 369 |
|
| 370 |
+
model_dir = pkuseg_home / model_name
|
| 371 |
+
zip_path = pkuseg_home / f"{model_name}.zip"
|
| 372 |
+
check_file = model_dir / model["check_file"]
|
| 373 |
+
|
| 374 |
downloaded = False
|
| 375 |
for url in model["urls"]:
|
| 376 |
logger.info(f"下载 {model_name}: {url}")
|
|
|
|
| 377 |
|
| 378 |
try:
|
| 379 |
# 下载
|
|
|
|
| 404 |
logger.warning(f"unzip 解压失败: {result.stderr}")
|
| 405 |
continue
|
| 406 |
|
| 407 |
+
# 重要:保留 zip 文件!spacy_pkuseg 会检查 zip 是否存在
|
| 408 |
+
# 不要删除 zip_path
|
| 409 |
|
| 410 |
# 验证
|
| 411 |
if check_file.exists():
|
| 412 |
+
logger.info(f"{model_name} 下载并解压成功(保留 zip 文件)")
|
| 413 |
files = [f.name for f in model_dir.iterdir()]
|
| 414 |
logger.info(f"{model_name} 目录内容: {files}")
|
| 415 |
downloaded = True
|
src/mfa_runner.py
CHANGED
|
@@ -100,6 +100,17 @@ def _build_mfa_env() -> dict:
|
|
| 100 |
pkuseg_home.mkdir(parents=True, exist_ok=True)
|
| 101 |
env["PKUSEG_HOME"] = str(pkuseg_home)
|
| 102 |
logger.info(f"设置 PKUSEG_HOME: {pkuseg_home}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# 确保从系统环境继承 PKUSEG_HOME(如果已设置)
|
| 105 |
if "PKUSEG_HOME" not in env and os.environ.get("PKUSEG_HOME"):
|
|
|
|
| 100 |
pkuseg_home.mkdir(parents=True, exist_ok=True)
|
| 101 |
env["PKUSEG_HOME"] = str(pkuseg_home)
|
| 102 |
logger.info(f"设置 PKUSEG_HOME: {pkuseg_home}")
|
| 103 |
+
|
| 104 |
+
# 验证 pkuseg 模型是否存在(检查 zip 文件,这是 spacy_pkuseg 的检查方式)
|
| 105 |
+
spacy_ontonotes_zip = pkuseg_home / "spacy_ontonotes.zip"
|
| 106 |
+
if spacy_ontonotes_zip.exists():
|
| 107 |
+
logger.info(f"pkuseg 模型 zip 已存在: {spacy_ontonotes_zip}")
|
| 108 |
+
else:
|
| 109 |
+
logger.warning(f"pkuseg 模型 zip 不存在: {spacy_ontonotes_zip}")
|
| 110 |
+
# 列出目录内容供调试
|
| 111 |
+
if pkuseg_home.exists():
|
| 112 |
+
files = list(pkuseg_home.iterdir())
|
| 113 |
+
logger.info(f"pkuseg 目录内容: {[f.name for f in files]}")
|
| 114 |
|
| 115 |
# 确保从系统环境继承 PKUSEG_HOME(如果已设置)
|
| 116 |
if "PKUSEG_HOME" not in env and os.environ.get("PKUSEG_HOME"):
|