Spaces:

nebulaResearch
/

zagros-1.0-quick

Paused

nebulaResearch committed on Sep 24, 2025

Commit

744d3ff

verified ·

1 Parent(s): 6a508ef

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -41,26 +41,26 @@ def compile_and_load_model():
     )
     temp_model = temp_model.to('cuda')
-    # Capture example inputs (dummy prompt for export; adjust if needed for your model)
-    example_prompt = "Hello world"  # Simple example; use a representative prompt for better optimization
     example_inputs = tokenizer(example_prompt, return_tensors="pt").to('cuda')
-    # Capture the computation graph without executing
-    with spaces.aoti_capture():
-        _ = temp_model(**example_inputs)
-    # Export the model graph
     exported_model = export(
-        temp_model,
-        example_inputs['input_ids'],
-        example_inputs['attention_mask']
     )
     # Compile to optimized binary
     compiled_model = spaces.aoti_compile(exported_model)
     # Apply the compiled model (patches the forward method)
-    spaces.aoti_apply(temp_model, compiled_model)
     model = temp_model
     print("Model compiled and ready for inference!")

     )
     temp_model = temp_model.to('cuda')
+    # Example inputs for capture (representative prompt)
+    example_prompt = "Hello world"  # Simple; adjust for better optimization if needed
     example_inputs = tokenizer(example_prompt, return_tensors="pt").to('cuda')
+    # Capture the computation graph (on the core transformer module for efficiency)
+    with spaces.aoti_capture(temp_model.model) as call:
+        _ = temp_model.model(**example_inputs)  # Forward pass only (not generate)
+    # Export the captured graph
     exported_model = export(
+        temp_model.model,
+        args=call.args,
+        kwargs=call.kwargs
     )
     # Compile to optimized binary
     compiled_model = spaces.aoti_compile(exported_model)
     # Apply the compiled model (patches the forward method)
+    spaces.aoti_apply(compiled_model, temp_model.model)
     model = temp_model
     print("Model compiled and ready for inference!")