aeb56 committed on
Commit 2900b36 · 1 Parent(s): b705945

Monkey-patch transformers to disable flash attention via wrapper script
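
Besides the monkey-patch, the new wrapper adds `attn_implementation=eager` to `--model_args`; lm_eval's `hf` backend generally forwards such extra model_args to `transformers.AutoModelForCausalLM.from_pretrained`, which is what actually selects the eager attention path. A minimal sketch of that mechanism outside the harness (not part of this commit; `MODEL_NAME` below is a hypothetical placeholder for the value used in `app.py`):

```python
# Sketch only: load the checkpoint with eager attention, the same effect the
# attn_implementation=eager entry in --model_args requests from lm_eval.
import torch
from transformers import AutoModelForCausalLM

MODEL_NAME = "some-org/some-model"  # hypothetical placeholder

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    attn_implementation="eager",  # bypass flash-attn kernels entirely
)

# The selected attention backend is recorded on the config; "eager" confirms the fallback.
print(model.config._attn_implementation)
```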

Files changed (1)
  1. app.py +31 -57
app.py CHANGED
@@ -215,80 +215,54 @@ class ChatBot:
         logs += f"⏱️ Estimated time: 30-60 minutes\n\n"
         yield status_table, logs
 
-        # Create a fake flash_attn package to avoid import errors
-        # This will fallback to standard PyTorch attention
-        fake_flash_dir = f"/tmp/flash_attn_{timestamp}"
-        os.makedirs(fake_flash_dir, exist_ok=True)
-
-        with open(os.path.join(fake_flash_dir, "__init__.py"), 'w') as f:
-            f.write("""
-# Fake flash_attn module that falls back to standard PyTorch attention
-import torch
+        # Create a wrapper script that disables flash attention before running lm_eval
+        wrapper_script = f"/tmp/run_eval_{timestamp}.py"
+        with open(wrapper_script, 'w') as f:
+            f.write(f"""
+import sys
+import os
 
-def flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False, **kwargs):
-    '''Fallback to standard PyTorch attention (slower but works without flash-attn)'''
-    if softmax_scale is None:
-        softmax_scale = 1.0 / (q.size(-1) ** 0.5)
-
-    # Standard attention: softmax(Q @ K.T) @ V
-    attn_weights = torch.matmul(q, k.transpose(-2, -1)) * softmax_scale
-
-    if causal:
-        seq_len = attn_weights.size(-1)
-        causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=attn_weights.device), diagonal=1).bool()
-        attn_weights = attn_weights.masked_fill(causal_mask, float('-inf'))
-
-    attn_weights = torch.softmax(attn_weights, dim=-1)
-
-    if dropout_p > 0:
-        attn_weights = torch.nn.functional.dropout(attn_weights, p=dropout_p)
-
-    output = torch.matmul(attn_weights, v)
-    return output, None  # Return None for attention weights
+# Monkey-patch transformers to disable flash attention
+import transformers.modeling_flash_attention_utils as flash_utils
+
+def disabled_lazy_import(*args, **kwargs):
+    raise ImportError("Flash attention disabled - using eager attention")
+
+flash_utils.lazy_import_flash_attention = disabled_lazy_import
 
-def flash_attn_varlen_func(*args, **kwargs):
-    return flash_attn_func(*args, **kwargs)
+# Now run lm_eval
+sys.argv = [
+    'lm_eval',
+    '--model', 'hf',
+    '--model_args', 'pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True,attn_implementation=eager',
+    '--tasks', '{task_string}',
+    '--batch_size', '1',
+    '--output_path', '{output_dir}',
+    '--log_samples'
+]
 
-__version__ = "2.5.0"
+from lm_eval.__main__ import cli_evaluate
+cli_evaluate()
 """)
 
-        # Add fake package to Python path for subprocess
-        import sys
-        if f"/tmp" not in sys.path:
-            sys.path.insert(0, "/tmp")
-
-        # Set PYTHONPATH environment variable so subprocess can find fake flash_attn
-        env = os.environ.copy()
-        pythonpath = env.get('PYTHONPATH', '')
-        env['PYTHONPATH'] = f"/tmp:{pythonpath}" if pythonpath else "/tmp"
-
-        logs += "⚠️ **Note:** Using fallback PyTorch attention (slower than flash-attn)\n\n"
+        logs += "⚠️ **Note:** Flash attention disabled, using eager attention (slower but compatible)\n\n"
         yield status_table, logs
 
-        # Run lm_eval
-        cmd = [
-            "lm_eval",
-            "--model", "hf",
-            "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
-            "--tasks", task_string,
-            "--batch_size", "1",
-            "--output_path", output_dir,
-            "--log_samples"
-        ]
+        # Run lm_eval via wrapper script
+        cmd = ["python3", wrapper_script]
 
         status_table = self._create_status_table(tasks_to_run, "🔄 Running")
-        logs += f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\n"
+        logs += f"🔄 **Running lm_eval...**\n\nTasks: {task_string}\n\n"
         logs += "---\n\n### 📜 Live Logs (last 15 lines):\n\n```\n"
         yield status_table, logs
 
-        # Run evaluation with custom environment
+        # Run evaluation
         process = subprocess.Popen(
             cmd,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             text=True,
-            bufsize=1,
-            env=env  # Pass custom environment with PYTHONPATH
+            bufsize=1
         )
 
         output_lines = []
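
A caveat on the patch itself: `lazy_import_flash_attention` is an internal symbol of `transformers.modeling_flash_attention_utils`, so whether the assignment in the wrapper script takes effect depends on the installed transformers version. A defensive variant (a sketch, not part of the commit) would guard the attribute before patching:

```python
# Sketch only: apply the wrapper script's patch defensively. The attribute name
# comes from the diff above and may not exist in every transformers release.
import transformers.modeling_flash_attention_utils as flash_utils


def _disabled_lazy_import(*args, **kwargs):
    # Make the flash-attn import path fail so callers fall back to eager attention.
    raise ImportError("Flash attention disabled - using eager attention")


if hasattr(flash_utils, "lazy_import_flash_attention"):
    flash_utils.lazy_import_flash_attention = _disabled_lazy_import
```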