Update app.py
Browse files
app.py
CHANGED
|
@@ -41,26 +41,26 @@ def compile_and_load_model():
|
|
| 41 |
)
|
| 42 |
temp_model = temp_model.to('cuda')
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
example_prompt = "Hello world" # Simple
|
| 46 |
example_inputs = tokenizer(example_prompt, return_tensors="pt").to('cuda')
|
| 47 |
|
| 48 |
-
# Capture the computation graph
|
| 49 |
-
with spaces.aoti_capture():
|
| 50 |
-
_ = temp_model(**example_inputs)
|
| 51 |
|
| 52 |
-
# Export the
|
| 53 |
exported_model = export(
|
| 54 |
-
temp_model,
|
| 55 |
-
|
| 56 |
-
|
| 57 |
)
|
| 58 |
|
| 59 |
# Compile to optimized binary
|
| 60 |
compiled_model = spaces.aoti_compile(exported_model)
|
| 61 |
|
| 62 |
# Apply the compiled model (patches the forward method)
|
| 63 |
-
spaces.aoti_apply(
|
| 64 |
|
| 65 |
model = temp_model
|
| 66 |
print("Model compiled and ready for inference!")
|
|
|
|
| 41 |
)
|
| 42 |
temp_model = temp_model.to('cuda')
|
| 43 |
|
| 44 |
+
# Example inputs for capture (representative prompt)
|
| 45 |
+
example_prompt = "Hello world" # Simple; adjust for better optimization if needed
|
| 46 |
example_inputs = tokenizer(example_prompt, return_tensors="pt").to('cuda')
|
| 47 |
|
| 48 |
+
# Capture the computation graph (on the core transformer module for efficiency)
|
| 49 |
+
with spaces.aoti_capture(temp_model.model) as call:
|
| 50 |
+
_ = temp_model.model(**example_inputs) # Forward pass only (not generate)
|
| 51 |
|
| 52 |
+
# Export the captured graph
|
| 53 |
exported_model = export(
|
| 54 |
+
temp_model.model,
|
| 55 |
+
args=call.args,
|
| 56 |
+
kwargs=call.kwargs
|
| 57 |
)
|
| 58 |
|
| 59 |
# Compile to optimized binary
|
| 60 |
compiled_model = spaces.aoti_compile(exported_model)
|
| 61 |
|
| 62 |
# Apply the compiled model (patches the forward method)
|
| 63 |
+
spaces.aoti_apply(compiled_model, temp_model.model)
|
| 64 |
|
| 65 |
model = temp_model
|
| 66 |
print("Model compiled and ready for inference!")
|