Sixparticle committed on
Commit
99a461a
·
1 Parent(s): e20ba09

Fix HF Space tokenizer startup crash

Browse files
Files changed (2) hide show
  1. app.py +59 -6
  2. requirements.txt +4 -3
app.py CHANGED
@@ -1,31 +1,84 @@
1
  import gradio as gr
2
  import os
 
 
 
 
3
  from huggingface_hub import snapshot_download
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, RobertaTokenizer
5
  import torch
6
 
 
 
 
 
 
7
  # Load the CodeT5+ model
8
  model_name = "Salesforce/codet5p-220m"
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
def prepare_local_model(repo_id: str, local_dir: str = "./model_cache") -> str:
    """Download a Hub repo snapshot and drop its ``added_tokens.json``.

    Args:
        repo_id: Hub repository id handed to ``snapshot_download``.
        local_dir: Directory the snapshot is downloaded into.

    Returns:
        ``local_dir``, unchanged, so it can be passed to ``from_pretrained``.
    """
    snapshot_download(repo_id=repo_id, local_dir=local_dir)

    # Workaround for a transformers/tokenizers incompatibility with this repo:
    # its added_tokens.json is an empty dict, which can crash tokenizer init
    # in some versions, so the file is deleted when present.
    tokens_path = os.path.join(local_dir, "added_tokens.json")
    if os.path.exists(tokens_path):
        os.remove(tokens_path)

    return local_dir
21
 
22
 
 
23
# Fetch the model files once; tokenizer and model are both loaded from disk.
local_model_dir = prepare_local_model(model_name)

try:
    tokenizer = AutoTokenizer.from_pretrained(
        local_model_dir,
        use_fast=False,
        trust_remote_code=False,
    )
except Exception:
    # Fall back to the explicit slow tokenizer class to bypass tokenizers
    # fast-path issues.
    tokenizer = RobertaTokenizer.from_pretrained(
        local_model_dir,
        trust_remote_code=False,
    )

model = AutoModelForSeq2SeqLM.from_pretrained(
    local_model_dir,
    trust_remote_code=False,
)
31
 
 
1
  import gradio as gr
2
  import os
3
+ import json
4
+ import logging
5
+ import transformers
6
+ import huggingface_hub
7
  from huggingface_hub import snapshot_download
8
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, RobertaTokenizer
9
  import torch
10
 
11
+ try:
12
+ import tokenizers
13
+ except Exception: # pragma: no cover - diagnostics only
14
+ tokenizers = None
15
+
16
# Hub repository id of the CodeT5+ model to load.
model_name = "Salesforce/codet5p-220m"

# Configure root logging at INFO, then create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
21
+
22
+
23
def log_runtime_versions() -> None:
    """Log the transformers, huggingface_hub and tokenizers versions at INFO.

    Intended as a startup diagnostic so a failing Space can be matched to the
    package versions it actually ran with.
    """
    logger.info("transformers version: %s", transformers.__version__)
    logger.info("huggingface_hub version: %s", huggingface_hub.__version__)
    # `tokenizers` is bound to None when its guarded import failed; getattr
    # then yields the placeholder instead of raising.
    logger.info(
        "tokenizers version: %s",
        getattr(tokenizers, "__version__", "not-installed"),
    )
29
+
30
+
31
def sanitize_added_tokens_file(added_tokens_file: str) -> None:
    """Normalize a malformed ``added_tokens.json`` in place.

    transformers stores this file as a JSON object mapping each added token
    string to its integer id, and its legacy loader iterates the payload with
    ``.items()``. The sanitized result is therefore always a mapping: valid
    ``token -> id`` entries are kept, anything else is dropped, and a payload
    that is not an object at all (a list, a scalar, unparseable text) becomes
    ``{}``, because no ids can be recovered from it.

    A file that is already well-formed is left untouched. A missing file is a
    no-op.

    Args:
        added_tokens_file: Path to the ``added_tokens.json`` file to check.
    """
    if not os.path.exists(added_tokens_file):
        return

    try:
        with open(added_tokens_file, "r", encoding="utf-8") as fp:
            data = json.load(fp)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logging.getLogger(__name__).warning(
            "Could not read %s (%s); resetting it to an empty mapping.",
            added_tokens_file,
            exc,
        )
        data = None

    if isinstance(data, dict):
        sanitized = {
            token: index
            for token, index in data.items()
            # bool is a subclass of int but is never a valid token id.
            if isinstance(index, int) and not isinstance(index, bool)
        }
    else:
        sanitized = {}

    # Already a clean mapping: avoid a pointless rewrite of the cached file.
    if isinstance(data, dict) and sanitized == data:
        return

    with open(added_tokens_file, "w", encoding="utf-8") as fp:
        json.dump(sanitized, fp, ensure_ascii=True)
52
+
53
 
54
def prepare_local_model(repo_id: str, local_dir: str = "./model_cache") -> str:
    """Download a Hub repo snapshot and sanitize its ``added_tokens.json``.

    Args:
        repo_id: Hub repository id handed to ``snapshot_download``.
        local_dir: Directory the snapshot is downloaded into.

    Returns:
        ``local_dir``, unchanged, so it can be passed to ``from_pretrained``.
    """
    snapshot_download(repo_id=repo_id, local_dir=local_dir)

    # Work around tokenizer metadata incompatibility in some runtime combos.
    sanitize_added_tokens_file(os.path.join(local_dir, "added_tokens.json"))

    return local_dir
62
 
63
 
64
# Log package versions first so a later startup failure can be matched to them.
log_runtime_versions()
local_model_dir = prepare_local_model(model_name)

# Holds the first failure so it can be reported if the fallback fails too.
auto_error = None
try:
    tokenizer = AutoTokenizer.from_pretrained(
        local_model_dir,
        use_fast=False,
        trust_remote_code=False,
    )
    logger.info("Tokenizer loaded with AutoTokenizer (slow mode).")
except Exception as exc:
    auto_error = exc
    logger.warning("AutoTokenizer load failed, trying RobertaTokenizer fallback: %s", exc)
    # Fall back to the explicit slow tokenizer class to bypass tokenizers
    # fast-path issues.
    try:
        tokenizer = RobertaTokenizer.from_pretrained(
            local_model_dir,
            trust_remote_code=False,
        )
        logger.info("Tokenizer loaded with RobertaTokenizer fallback.")
    except Exception as roberta_error:
        # Both loaders failed: surface both messages in a single error.
        failure_message = (
            "Tokenizer initialization failed for both AutoTokenizer and RobertaTokenizer. "
            f"AutoTokenizer error: {auto_error}; RobertaTokenizer error: {roberta_error}"
        )
        raise RuntimeError(failure_message) from roberta_error

model = AutoModelForSeq2SeqLM.from_pretrained(
    local_model_dir,
    trust_remote_code=False,
)
84
 
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- transformers>=4.40.0
2
- huggingface_hub>=0.23.0
3
- torch>=2.0.0
 
4
  sentencepiece>=0.1.96
5
  accelerate>=0.20.0
6
  datasets>=2.0.0
 
1
+ transformers==4.40.2
2
+ huggingface_hub==0.23.2
3
+ tokenizers==0.19.1
4
+ torch==2.1.2
5
  sentencepiece>=0.1.96
6
  accelerate>=0.20.0
7
  datasets>=2.0.0