Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
README.md
CHANGED
|
@@ -22,6 +22,6 @@ A Gradio Space that applies the Appendix-style prompt: the model must prioritize
|
|
| 22 |
- `HF_TOKEN` — required if the model is gated.
|
| 23 |
|
| 24 |
**Files**
|
| 25 |
-
- `app.py` — Gradio app
|
| 26 |
- `requirements.txt` — dependencies (pins transformers 4.43.3, accelerate 0.32.1)
|
| 27 |
- `examples/` — (optional) assets/presets
|
|
|
|
| 22 |
- `HF_TOKEN` — required if the model is gated.
|
| 23 |
|
| 24 |
**Files**
|
| 25 |
+
- `app.py` — Gradio app (slow tokenizer forced to avoid tokenizer.json schema mismatches)
|
| 26 |
- `requirements.txt` — dependencies (pins transformers 4.43.3, accelerate 0.32.1)
|
| 27 |
- `examples/` — (optional) assets/presets
|
app.py
CHANGED
|
@@ -60,7 +60,7 @@ def load_model(model_id: str = DEFAULT_MODEL):
|
|
| 60 |
return _tokenizer, _model
|
| 61 |
|
| 62 |
auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and len(USE_AUTH_TOKEN.strip()) > 0) else None
|
| 63 |
-
_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=TRUST_REMOTE_CODE)
|
| 64 |
_model = AutoModelForCausalLM.from_pretrained(
|
| 65 |
model_id,
|
| 66 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
|
@@ -69,11 +69,9 @@ def load_model(model_id: str = DEFAULT_MODEL):
|
|
| 69 |
trust_remote_code=TRUST_REMOTE_CODE,
|
| 70 |
)
|
| 71 |
|
| 72 |
-
# Safety: ensure pad_token_id is set
|
| 73 |
if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
|
| 74 |
_tokenizer.pad_token_id = _tokenizer.eos_token_id
|
| 75 |
|
| 76 |
-
# Prefer static cache if available to avoid DynamicCache issues in some remote code
|
| 77 |
try:
|
| 78 |
_model.generation_config.cache_implementation = "static"
|
| 79 |
except Exception:
|
|
@@ -93,7 +91,7 @@ def generate_text(question: str, context: str, temperature: float, top_p: float,
|
|
| 93 |
top_p=top_p,
|
| 94 |
max_new_tokens=max_new_tokens,
|
| 95 |
pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
|
| 96 |
-
use_cache=False,
|
| 97 |
)
|
| 98 |
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 99 |
|
|
|
|
| 60 |
return _tokenizer, _model
|
| 61 |
|
| 62 |
auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and len(USE_AUTH_TOKEN.strip()) > 0) else None
|
| 63 |
+
_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=TRUST_REMOTE_CODE, use_fast=False)
|
| 64 |
_model = AutoModelForCausalLM.from_pretrained(
|
| 65 |
model_id,
|
| 66 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
|
|
|
| 69 |
trust_remote_code=TRUST_REMOTE_CODE,
|
| 70 |
)
|
| 71 |
|
|
|
|
| 72 |
if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
|
| 73 |
_tokenizer.pad_token_id = _tokenizer.eos_token_id
|
| 74 |
|
|
|
|
| 75 |
try:
|
| 76 |
_model.generation_config.cache_implementation = "static"
|
| 77 |
except Exception:
|
|
|
|
| 91 |
top_p=top_p,
|
| 92 |
max_new_tokens=max_new_tokens,
|
| 93 |
pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
|
| 94 |
+
use_cache=False,
|
| 95 |
)
|
| 96 |
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 97 |
|