Spaces:

harmdevries
/

transformer_inference

Runtime error

App Files Files Community

harmdevries committed on Oct 23, 2022

Commit

a21c8ab

1 Parent(s): 9ff92c0

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -4

app.py CHANGED Viewed

@@ -1,7 +1,74 @@
 import streamlit as st
 def calc_exec_time(comp_flop, mem_bytes):
   return (comp_flop/TFLOPS + mem_bytes/GB_S)*1000
 def print_kernel_execution(c1, c2, comp_flop, mem_bytes):
   arith_int = comp_flop/mem_bytes
@@ -14,15 +81,12 @@ def print_kernel_execution(c1, c2, comp_flop, mem_bytes):
   c2.write(str(comp_flop))
   c1.write("MB: ")
   c2.write(str(mem_bytes))
-  c1.write("Arithm. intensity:")
-  c2.write(str(arith_int))
   c1.write("Time (ms):")
   c2.write(str(exec_time))
   return exec_time
-TFLOPS = 312e12
-GB_S = 1935e9
 st.sidebar.header("Transformer parameters")
 col1, col2 = st.sidebar.columns([2, 4])
@@ -31,11 +95,29 @@ bs = st.sidebar.number_input('Batch size', value=10)
 h = st.sidebar.number_input('Num heads',value=16)
 d = st.sidebar.number_input('Dimension', value=768)
 l = st.sidebar.number_input('Num layers', value=24)
 n_start = st.sidebar.number_input('Start seq', value=1)
 n = st.sidebar.number_input('End seq', value=1024)
 st.sidebar.header("GPU parameters")
 st.header('Attention layer')
 st.subheader('QKV projection')

 import streamlit as st
# A100 specs. Despite the names, both values are stored in base units so that
# calc_exec_time can divide raw FLOP and byte counts by them directly.
TFLOPS = 312e12  # compute throughput, FLOP per second
GB_S = 1935e9  # memory bandwidth, bytes per second
# in ms; not referenced by any code visible in this part of the file
THREAD_OVERHEAD = 0.005


def calc_exec_time(comp_flop, mem_bytes, *, flops_per_s=None, bytes_per_s=None):
  """Estimate a kernel's execution time in milliseconds.

  The estimate is the sum of a compute term (comp_flop / throughput) and a
  memory term (mem_bytes / bandwidth), scaled by 1000.

  Args:
    comp_flop: Number of floating-point operations the kernel performs.
    mem_bytes: Number of bytes the kernel moves to and from memory.
    flops_per_s: Optional compute throughput in FLOP/s. Defaults to the
      module-level TFLOPS constant.
    bytes_per_s: Optional memory bandwidth in bytes/s. Defaults to the
      module-level GB_S constant.

  Returns:
    The estimated time in milliseconds.
  """
  # Resolve defaults at call time so a later change to the module constants
  # is picked up by existing callers.
  if flops_per_s is None:
    flops_per_s = TFLOPS
  if bytes_per_s is None:
    bytes_per_s = GB_S
  return (comp_flop/flops_per_s + mem_bytes/bytes_per_s)*1000
def qkv_mha_exec(bs, h, n, d):
  """Cost model for the QKV projection, multi-head (MHA) variant.

  h and n are accepted for a uniform signature but are not used here.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  matmul_flop = 2*bs*1*d*3*d
  # Three traffic terms; presumably input activations, projection weights and
  # output activations at 2 bytes per element -- TODO confirm.
  in_bytes = 2*bs*1*d
  weight_bytes = 2*3*d*d
  out_bytes = 2*bs*1*3*d
  traffic = in_bytes + weight_bytes + out_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
def qkv_mqa_exec(bs, h, n, d):
  """Cost model for the QKV projection, multi-query (MQA) variant.

  n is accepted for a uniform signature but is not used here.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  kv_share = 2/h
  matmul_flop = 2*bs*1*d*(1+kv_share)*d
  # NOTE(review): the FLOP count scales with (1 + 2/h) while the weight and
  # output byte terms below scale with 2/h only -- confirm this is intended.
  in_bytes = 2*bs*1*d
  weight_bytes = 2*kv_share*d*d
  out_bytes = 2*bs*1*kv_share*d
  traffic = in_bytes + weight_bytes + out_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
def att1_mha_exec(bs, h, n, d):
  """Cost model for the first attention step, multi-head (MHA) variant.

  Presumably the query-key score product over n positions -- TODO confirm.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  head_dim = d/h
  matmul_flop = 2*bs*h*head_dim*n
  # Byte terms in source order; the middle one is the only term that scales
  # with both n and the per-head width.
  first_bytes = 2*bs*h*head_dim
  second_bytes = 2*bs*h*n*head_dim
  third_bytes = 2*bs*h*n
  traffic = first_bytes + second_bytes + third_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
def att1_mqa_exec(bs, h, n, d):
  """Cost model for the first attention step, multi-query (MQA) variant.

  Same FLOP count as att1_mha_exec; the middle byte term drops the factor h.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  head_dim = d/h
  matmul_flop = 2*bs*h*head_dim*n
  first_bytes = 2*bs*h*head_dim
  # No h factor here, unlike the MHA variant.
  second_bytes = 2*bs*n*head_dim
  third_bytes = 2*bs*h*n
  traffic = first_bytes + second_bytes + third_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
def att2_mha_exec(bs, h, n, d):
  """Cost model for the second attention step, multi-head (MHA) variant.

  Presumably the product of attention weights with the values -- TODO confirm.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  head_dim = d/h
  matmul_flop = 2*bs*h*n*head_dim
  first_bytes = 2*bs*h*n
  second_bytes = 2*bs*h*n*head_dim
  third_bytes = 2*bs*h*head_dim
  traffic = first_bytes + second_bytes + third_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
def att2_mqa_exec(bs, h, n, d):
  """Cost model for the second attention step, multi-query (MQA) variant.

  Same FLOP count as att2_mha_exec; the first two byte terms are identical
  and carry no factor h.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  head_dim = d/h
  matmul_flop = 2*bs*h*n*head_dim
  # NOTE(review): the MHA variant's first term is 2*bs*h*n; here it has the
  # same shape as the second term -- confirm this is intended.
  first_bytes = 2*bs*n*head_dim
  second_bytes = 2*bs*n*head_dim
  third_bytes = 2*bs*h*head_dim
  traffic = first_bytes + second_bytes + third_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
def out_exec(bs, h, n, d):
  """Cost model for the d-by-d output projection.

  h and n are accepted for a uniform signature but are not used here.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  matmul_flop = 2*bs*1*d*d
  # Presumably input activations, weights, output activations -- TODO confirm.
  in_bytes = 2*bs*1*d
  weight_bytes = 2*d*d
  out_bytes = 2*bs*1*d
  traffic = in_bytes + weight_bytes + out_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
def softmax_exec(bs, h, n, d):
  """Cost model for the softmax: zero FLOP, memory traffic only.

  d is accepted for a uniform signature but is not used here.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  # The same bs*h*n-sized term is counted twice; presumably one read and one
  # write of the score tensor -- TODO confirm.
  one_pass_bytes = 2*bs*h*n
  traffic = one_pass_bytes + one_pass_bytes
  no_flop = 0
  return no_flop, traffic, calc_exec_time(no_flop, traffic)
def ln_exec(bs, h, n, d):
  """Cost model for a layer norm: zero FLOP, memory traffic only.

  h and n are accepted for a uniform signature but are not used here.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  # The same bs*d-sized term is counted twice; presumably one read and one
  # write of the activations -- TODO confirm.
  one_pass_bytes = 2*bs*1*d
  traffic = one_pass_bytes + one_pass_bytes
  no_flop = 0
  return no_flop, traffic, calc_exec_time(no_flop, traffic)
def mlp_exec(bs, h, n, d):
  """Cost model for one d-by-4d MLP matrix multiplication.

  h and n are accepted for a uniform signature but are not used here.

  Returns:
    A (flop, nbytes, exec_time) tuple, where exec_time is whatever
    calc_exec_time reports for those two counts.
  """
  matmul_flop = 2*bs*1*d*4*d
  # Presumably input activations, weights, output activations -- TODO confirm.
  in_bytes = 2*bs*1*d
  weight_bytes = 2*d*4*d
  out_bytes = 2*bs*1*4*d
  traffic = in_bytes + weight_bytes + out_bytes
  return matmul_flop, traffic, calc_exec_time(matmul_flop, traffic)
 def print_kernel_execution(c1, c2, comp_flop, mem_bytes):
   arith_int = comp_flop/mem_bytes
   c2.write(str(comp_flop))
   c1.write("MB: ")
   c2.write(str(mem_bytes))
   c1.write("Time (ms):")
   c2.write(str(exec_time))
   return exec_time
 st.sidebar.header("Transformer parameters")
 col1, col2 = st.sidebar.columns([2, 4])
 h = st.sidebar.number_input('Num heads',value=16)
 d = st.sidebar.number_input('Dimension', value=768)
 l = st.sidebar.number_input('Num layers', value=24)
 n_start = st.sidebar.number_input('Start seq', value=1)
 n = st.sidebar.number_input('End seq', value=1024)
 st.sidebar.header("GPU parameters")
st.header("Total execution time")
# Accumulate the estimated time of all l layers for every sequence position in
# [n_start, n), once for multi-head (MHA) and once for multi-query (MQA)
# attention.
# NOTE(review): bs, h, d, l, n_start and n are expected to come from the
# sidebar inputs defined earlier in the file -- confirm bs is defined there.
mqa_total_time = 0.
mha_total_time = 0.
for i in range(n_start, n):
  # Each *_exec helper returns (flop, nbytes, exec_time); index [2] selects
  # the time. Kernels whose cost does not depend on the attention variant
  # are summed once and reused for both totals.
  shared_time = (out_exec(bs, h, i, d)[2] + softmax_exec(bs, h, i, d)[2]
                 + 2*ln_exec(bs, h, i, d)[2] + 2*mlp_exec(bs, h, i, d)[2]
                 + 3*ln_exec(bs, h, i, d)[2])
  mha_time = (shared_time + qkv_mha_exec(bs, h, i, d)[2]
              + att1_mha_exec(bs, h, i, d)[2] + att2_mha_exec(bs, h, i, d)[2])
  mha_total_time += l*mha_time
  mqa_time = (shared_time + qkv_mqa_exec(bs, h, i, d)[2]
              + att1_mqa_exec(bs, h, i, d)[2] + att2_mqa_exec(bs, h, i, d)[2])
  mqa_total_time += l*mqa_time
st.write("MHA exec time: " + str(mha_total_time))
st.write("MQA exec time: " + str(mqa_total_time))
 st.header('Attention layer')
 st.subheader('QKV projection')