microsoft
/

phi-1

@@ -8,8 +8,7 @@ from __future__ import annotations
 import math
 from dataclasses import dataclass, field
-from functools import wraps
-from typing import Any, Callable, Dict, Optional, Tuple, Union
 import torch
 import torch.nn as nn
@@ -32,18 +31,6 @@ except:
     FusedDense = None
-def disable_autocast(device_type: str = "cuda") -> None:
-    def _disable_autocast(f: Callable) -> Callable:
-        @wraps(f)
-        def __disable_autocast(*args, **kwargs) -> Callable:
-            with torch.autocast(device_type, enabled=False):
-                return f(*args, **kwargs)
-        return __disable_autocast
-    return _disable_autocast
 @dataclass
 class InferenceParams:
     """Inference parameters passed to model to efficiently calculate
@@ -359,7 +346,8 @@ class SelfAttention(nn.Module):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
-    @disable_autocast
     def forward(
         self,
         qkv: torch.FloatTensor,
@@ -418,7 +406,8 @@ class CrossAttention(nn.Module):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
-    @disable_autocast
     def forward(
         self,
         q: torch.FloatTensor,

 import math
 from dataclasses import dataclass, field
+from typing import Any, Dict, Optional, Tuple, Union
 import torch
 import torch.nn as nn
     FusedDense = None
 @dataclass
 class InferenceParams:
     """Inference parameters passed to model to efficiently calculate
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
+    @torch.autocast("cpu", enabled=False)
+    @torch.autocast("cuda", enabled=False)
     def forward(
         self,
         qkv: torch.FloatTensor,
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
+    @torch.autocast("cpu", enabled=False)
+    @torch.autocast("cuda", enabled=False)
     def forward(
         self,
         q: torch.FloatTensor,