Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
README.md
CHANGED
|
@@ -22,6 +22,6 @@ A Gradio Space that applies the Appendix-style prompt: the model must prioritize
|
|
| 22 |
- `HF_TOKEN` — required if the model is gated.
|
| 23 |
|
| 24 |
**Files**
|
| 25 |
-
- `app.py` — Gradio app
|
| 26 |
- `requirements.txt` — dependencies (pins transformers 4.43.3, accelerate 0.32.1)
|
| 27 |
- `examples/` — (optional) assets/presets
|
|
|
|
| 22 |
- `HF_TOKEN` — required if the model is gated.
|
| 23 |
|
| 24 |
**Files**
|
| 25 |
+
- `app.py` — Gradio app (slow tokenizer forced to avoid tokenizer.json schema mismatches)
|
| 26 |
- `requirements.txt` — dependencies (pins transformers 4.43.3, accelerate 0.32.1)
|
| 27 |
- `examples/` — (optional) assets/presets
|
app.py
CHANGED
|
@@ -60,7 +60,7 @@ def load_model(model_id: str = DEFAULT_MODEL):
|
|
| 60 |
return _tokenizer, _model
|
| 61 |
|
| 62 |
auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and len(USE_AUTH_TOKEN.strip()) > 0) else None
|
| 63 |
-
_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=TRUST_REMOTE_CODE)
|
| 64 |
_model = AutoModelForCausalLM.from_pretrained(
|
| 65 |
model_id,
|
| 66 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
|
@@ -69,11 +69,9 @@ def load_model(model_id: str = DEFAULT_MODEL):
|
|
| 69 |
trust_remote_code=TRUST_REMOTE_CODE,
|
| 70 |
)
|
| 71 |
|
| 72 |
-
# Safety: ensure pad_token_id is set
|
| 73 |
if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
|
| 74 |
_tokenizer.pad_token_id = _tokenizer.eos_token_id
|
| 75 |
|
| 76 |
-
# Prefer static cache if available to avoid DynamicCache issues in some remote code
|
| 77 |
try:
|
| 78 |
_model.generation_config.cache_implementation = "static"
|
| 79 |
except Exception:
|
|
@@ -93,7 +91,7 @@ def generate_text(question: str, context: str, temperature: float, top_p: float,
|
|
| 93 |
top_p=top_p,
|
| 94 |
max_new_tokens=max_new_tokens,
|
| 95 |
pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
|
| 96 |
-
use_cache=False,
|
| 97 |
)
|
| 98 |
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 99 |
|
|
|
|
| 60 |
return _tokenizer, _model
|
| 61 |
|
| 62 |
auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and len(USE_AUTH_TOKEN.strip()) > 0) else None
|
| 63 |
+
_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=TRUST_REMOTE_CODE, use_fast=False)
|
| 64 |
_model = AutoModelForCausalLM.from_pretrained(
|
| 65 |
model_id,
|
| 66 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
|
|
|
| 69 |
trust_remote_code=TRUST_REMOTE_CODE,
|
| 70 |
)
|
| 71 |
|
|
|
|
| 72 |
if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
|
| 73 |
_tokenizer.pad_token_id = _tokenizer.eos_token_id
|
| 74 |
|
|
|
|
| 75 |
try:
|
| 76 |
_model.generation_config.cache_implementation = "static"
|
| 77 |
except Exception:
|
|
|
|
| 91 |
top_p=top_p,
|
| 92 |
max_new_tokens=max_new_tokens,
|
| 93 |
pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
|
| 94 |
+
use_cache=False,
|
| 95 |
)
|
| 96 |
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 97 |
|