Default to flash_attention_2; document bf16 dtype contract and lang RoPE numerics

Browse files

Files changed (2) hide show

README.md +14 -3
config.json +2 -1

README.md CHANGED Viewed

@@ -41,15 +41,26 @@
 > model = AutoModel.from_pretrained(
 >     "path/to/onevision-encoder-large-lang-tf57",
 >     trust_remote_code=True,
->     attn_implementation="eager",  # or "sdpa", "flash_attention_2", "flex_attention"
-> )
 > # default grid path
 > out = model(pixel_values=images)
 > # explicit per-patch positions (lang-only)
 > out = model(pixel_values=images, patch_positions=patch_positions)
 > ```
 >
-> Tested with `transformers==5.7.0`, `torch>=2.4`.
 >
 > ## Equivalence verification
 >

 > model = AutoModel.from_pretrained(
 >     "path/to/onevision-encoder-large-lang-tf57",
 >     trust_remote_code=True,
+> )  # default attn_implementation = "flash_attention_2" (set in config.json)
+>
 > # default grid path
 > out = model(pixel_values=images)
 > # explicit per-patch positions (lang-only)
 > out = model(pixel_values=images, patch_positions=patch_positions)
 > ```
 >
+> Override the default if you need a different backend:
+>
+> ```python
+> model = AutoModel.from_pretrained(..., attn_implementation="sdpa")
+> # supported: "flash_attention_2" (default), "sdpa", "eager", "flex_attention"
+> ```
+>
+> **Dtype contract**: weights are saved in `bfloat16`. The default `flash_attention_2` backend requires `fp16`/`bf16` inputs. If you must run in `fp32`, override the backend with `attn_implementation="sdpa"` or `attn_implementation="eager"`.
+>
+> **Numerical note (lang variant)**: Unlike the `large` variant, attention backends are NOT numerically equivalent in `bf16` for this model: outputs from `eager` and from `flash_attention_2`/`sdpa` can differ by a maximum absolute difference (`max_diff`) of up to several hundred, while the mean diff stays below 0.1 and the std is preserved. This is because the lang variant intentionally keeps RoPE `cos`/`sin` in `q.dtype` (bf16) instead of upcasting them to `fp32` as the `large` variant does. The model still trains and serves correctly on any backend, but if you need strict numerical reproducibility against the upstream model, use `attn_implementation="eager"` in `bf16`, or run in `fp32` with a backend that supports it (`"sdpa"` or `"eager"`; `flash_attention_2` requires `fp16`/`bf16`).
+>
+> Tested with `transformers==5.7.0`, `torch>=2.4`, `flash-attn>=2.7`.
 >
 > ## Equivalence verification
 >

config.json CHANGED Viewed

@@ -23,5 +23,6 @@
   "auto_map": {
     "AutoConfig": "configuration_onevision_encoder.OneVisionEncoderConfig",
     "AutoModel": "modeling_onevision_encoder.OneVisionEncoderModel"
-  }
 }

   "auto_map": {
     "AutoConfig": "configuration_onevision_encoder.OneVisionEncoderConfig",
     "AutoModel": "modeling_onevision_encoder.OneVisionEncoderModel"
+  },
+  "_attn_implementation": "flash_attention_2"
 }