Spaces:

macrocosm-os
/

Sn25

Sleeping

App Files Files Community

Booty-szy committed on Aug 20, 2024

Commit

6391563

verified ·

1 Parent(s): 5a605f5

dashboard-api (#2)

Browse files

- Updated the dashboard to run via an api, reworked api to use fastapi (f40a2d92fd262fdf7d7198e7a6130c04ed0d993d)

Files changed (4) hide show

api.py +27 -84
app.py +55 -60
classes.py +17 -0
utils.py +123 -40

api.py CHANGED Viewed

@@ -2,25 +2,36 @@
 import atexit
 import datetime
-from flask import Flask, request, jsonify
 from apscheduler.schedulers.background import BackgroundScheduler
 import utils
-app = Flask(__name__)
 # Global variables (saves time on loading data)
 state_vars = None
 reload_timestamp = datetime.datetime.now().strftime('%D %T')
 def load_data():
     """
     Reload the state variables
     """
-    global state_vars, reload_timestamp
-    state_vars = utils.load_state_vars()
     reload_timestamp = datetime.datetime.now().strftime('%D %T')
@@ -36,110 +47,42 @@ def start_scheduler():
     atexit.register(lambda: scheduler.shutdown())
-@app.route('/', methods=['GET'])
 def home():
     return "Welcome to the Bittensor Protein Folding Leaderboard API!"
-@app.route('/updated', methods=['GET'])
 def updated():
     return reload_timestamp
-@app.route('/data', methods=['GET'])
-@app.route('/data/<period>', methods=['GET'])
-def data(period=None):
-    """
-    Get the productivity metrics
-    """
-    assert period in ('24h', None), f"Invalid period: {period}. Must be '24h' or None."
-    df = state_vars["dataframe_24h"] if period == '24h' else state_vars["dataframe"]
-    return jsonify(
-        df.astype(str).to_dict(orient='records')
-    )
-@app.route('/productivity', methods=['GET'])
-@app.route('/productivity/<period>', methods=['GET'])
-def productivity_metrics(period=None):
     """
     Get the productivity metrics
     """
-    assert period in ('24h', None), f"Invalid period: {period}. Must be '24h' or None."
-    df = state_vars["dataframe_24h"] if period == '24h' else state_vars["dataframe"]
-    return jsonify(
-        utils.get_productivity(df)
-    )
-@app.route('/throughput', methods=['GET'])
-@app.route('/throughput/<period>', methods=['GET'])
-def throughput_metrics(period=None):
     """
     Get the throughput metrics
     """
-    assert period in ('24h', None), f"Invalid period: {period}. Must be '24h' or None."
-    df = state_vars["dataframe_24h"] if period == '24h' else state_vars["dataframe"]
-    return jsonify(utils.get_data_transferred(df))
-@app.route('/metagraph', methods=['GET'])
-def metagraph():
-    """
-    Get the metagraph data
-    Returns:
-    - metagraph_data: List of dicts (from pandas DataFrame)
-    """
-    df_m = state_vars["metagraph"]
-    return jsonify(
-        df_m.to_dict(orient='records')
-    )
-@app.route('/leaderboard', methods=['GET'])
-@app.route('/leaderboard/<entity>', methods=['GET'])
-@app.route('/leaderboard/<entity>/<ntop>', methods=['GET'])
-def leaderboard(entity='identity',ntop=10):
-    """
-    Get the leaderboard data
-    Returns:
-    - leaderboard_data: List of dicts (from pandas DataFrame)
-    """
-    assert entity in utils.ENTITY_CHOICES, f"Invalid entity choice: {entity}"
-    df_miners = utils.get_leaderboard(
-        state_vars["metagraph"],
-        ntop=int(ntop),
-        entity_choice=entity
-        )
-    return jsonify(
-        df_miners.to_dict(orient='records')
-    )
-@app.route('/validator', methods=['GET'])
-def validator():
-    """
-    Get the validator data
-    Returns:
-    - validator_data: List of dicts (from pandas DataFrame)
-    """
-    df_m = state_vars["metagraph"]
-    df_validators = df_m.loc[df_m.validator_trust > 0]
-    return jsonify(
-        df_validators.to_dict(orient='records')
-    )
 if __name__ == '__main__':
     load_data()
     start_scheduler()
-    app.run(host='0.0.0.0', port=5001, debug=True)
     # to test locally

 import atexit
 import datetime
 from apscheduler.schedulers.background import BackgroundScheduler
+from fastapi import FastAPI
 import utils
+import pandas as pd
+import uvicorn
+from classes import Productivity, Throughput
 # Global variables (saves time on loading data)
 state_vars = None
 reload_timestamp = datetime.datetime.now().strftime('%D %T')
+data_all = None
+data_24h = None
+app = FastAPI()
 def load_data():
     """
+    Refresh the cached run data served by the API.
+
+    Fetches new runs via ``utils.fetch_new_runs``, reloads the combined frame
+    via ``utils.preload_data`` into ``data_all``, stores the rows whose
+    ``updated_at`` is less than one day old in ``data_24h``, and records the
+    reload time in ``reload_timestamp``.
     """
+    global data_all, data_24h, reload_timestamp
+    utils.fetch_new_runs()
+    data_all = utils.preload_data()
+    # Build the boolean mask first, then index with it: the metric helpers
+    # read DataFrame columns (e.g. ``df.pdb_id``), so ``data_24h`` has to be a
+    # DataFrame and not the mask itself.
+    updated_at = data_all['updated_at'].apply(lambda x: pd.Timestamp(x))
+    is_recent = (pd.Timestamp.now() - updated_at) < pd.Timedelta('1 days')
+    data_24h = data_all[is_recent]
     reload_timestamp = datetime.datetime.now().strftime('%D %T')
     atexit.register(lambda: scheduler.shutdown())
+@app.get("/")
 def home():
+    """Return the welcome message served at the API root."""
+    greeting = "Welcome to the Bittensor Protein Folding Leaderboard API!"
+    return greeting
+@app.get("/updated")
 def updated():
+    """Return the current value of the module-level ``reload_timestamp`` string."""
+    last_reload = reload_timestamp
+    return last_reload
+@app.get("/productivity", response_model=Productivity)
+def productivity_metrics():
     """
+    Return the productivity metrics for all loaded data and for the 24h subset.
     """
+    overall = utils.get_productivity(data_all)
+    recent = utils.get_productivity(data_24h)
+    return Productivity(all_time=overall, last_24h=recent)
+@app.get("/throughput", response_model=Throughput)
+def throughput_metrics():
     """
+    Return the throughput metrics for all loaded data and for the 24h subset.
     """
+    overall = utils.get_data_transferred(data_all)
+    recent = utils.get_data_transferred(data_24h)
+    return Throughput(all_time=overall, last_24h=recent)
 if __name__ == '__main__':
     load_data()
     start_scheduler()
+    uvicorn.run(app, host='0.0.0.0', port=5001)
     # to test locally

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import time
 import pandas as pd
 import streamlit as st
 import plotly.express as px
 import utils
@@ -14,21 +15,13 @@ Simulation duration distribution
 """
 UPDATE_INTERVAL = 3600
 st.title('Folding Subnet Dashboard')
 st.markdown('<br>', unsafe_allow_html=True)
-# reload data periodically
-df = utils.build_data(time.time()//UPDATE_INTERVAL)
-st.toast(f'Loaded {len(df)} runs')
-# TODO: fix the factor for 24 hours ago
-runs_alive_24h_ago = (df.last_event_at > pd.Timestamp.now() - pd.Timedelta('1d'))
-df_24h = df.loc[runs_alive_24h_ago]
-# correction factor to account for the fact that the data straddles the 24h boundary
-# correction factor is based on the fraction of the run which occurred in the last 24h
-# factor = (df_24h.last_event_at - pd.Timestamp.now() + pd.Timedelta('1d')) / pd.Timedelta('1d')
 #### ------ PRODUCTIVITY ------
@@ -37,34 +30,35 @@ df_24h = df.loc[runs_alive_24h_ago]
 st.subheader('Productivity overview')
 st.info('Productivity metrics show how many proteins have been folded, which is the primary goal of the subnet. Metrics are estimated using weights and biases data combined with heuristics.')
-productivity = utils.get_productivity(df)
-productivity_24h = utils.get_productivity(df_24h)
-m1, m2, m3 = st.columns(3)
-m1.metric('Unique proteins folded', f'{productivity.get("unique_folded"):,.0f}', delta=f'{productivity_24h.get("unique_folded"):,.0f} (24h)')
-m2.metric('Total proteins folded', f'{productivity.get("total_simulations"):,.0f}', delta=f'{productivity_24h.get("total_simulations"):,.0f} (24h)')
-m3.metric('Total simulation steps', f'{productivity.get("total_md_steps"):,.0f}', delta=f'{productivity_24h.get("total_md_steps"):,.0f} (24h)')
-st.markdown('<br>', unsafe_allow_html=True)
-time_binned_data = df.set_index('last_event_at').groupby(pd.Grouper(freq='12h'))
-PROD_CHOICES = {
-    'Unique proteins folded': 'unique_pdbs',
-    'Total simulations': 'total_pdbs',
-    'Total simulation steps': 'total_md_steps',
-}
-prod_choice_label = st.radio('Select productivity metric', list(PROD_CHOICES.keys()), index=0, horizontal=True)
-prod_choice = PROD_CHOICES[prod_choice_label]
-steps_running_total = time_binned_data[prod_choice].sum().cumsum()
-st.plotly_chart(
-    # add fillgradient to make it easier to see the trend
-    px.area(steps_running_total, y=prod_choice,
-            labels={'last_event_at':'', prod_choice: prod_choice_label},
-    ).update_traces(fill='tozeroy'),
-    use_container_width=True,
-)
 st.markdown('<br>', unsafe_allow_html=True)
@@ -75,26 +69,27 @@ st.subheader('Throughput overview')
 st.info('Throughput metrics show the total amount of data sent and received by the validators. This is a measure of the network activity and the amount of data that is being processed by the subnet.')
 MEM_UNIT = 'GB' #st.radio('Select memory unit', ['TB','GB', 'MB'], index=0, horizontal=True)
-data_transferred = utils.get_data_transferred(df,unit=MEM_UNIT)
-data_transferred_24h = utils.get_data_transferred(df_24h, unit=MEM_UNIT)
 m1, m2, m3 = st.columns(3)
-m1.metric(f'Total sent data ({MEM_UNIT})', f'{data_transferred.get("sent"):,.0f}', delta=f'{data_transferred_24h.get("sent"):,.0f} (24h)')
-m2.metric(f'Total received data ({MEM_UNIT})', f'{data_transferred.get("received"):,.0f}', delta=f'{data_transferred_24h.get("received"):,.0f} (24h)')
-m3.metric(f'Total transferred data ({MEM_UNIT})', f'{data_transferred.get("total"):,.0f}', delta=f'{data_transferred_24h.get("total"):,.0f} (24h)')
-IO_CHOICES = {'total_data_sent':'Sent', 'total_data_received':'Received'}
-io_running_total = time_binned_data[list(IO_CHOICES.keys())].sum().rename(columns=IO_CHOICES).cumsum().melt(ignore_index=False)
-io_running_total['value'] = io_running_total['value'].apply(utils.convert_unit, args=(utils.BASE_UNITS, MEM_UNIT))
-st.plotly_chart(
-    px.area(io_running_total, y='value', color='variable',
-            labels={'last_event_at':'', 'value': f'Data transferred ({MEM_UNIT})', 'variable':'Direction'},
-    ),
-    use_container_width=True,
-)
 st.markdown('<br>', unsafe_allow_html=True)
@@ -127,14 +122,14 @@ st.markdown('<br>', unsafe_allow_html=True)
 #### ------ LOGGED RUNS ------
-st.subheader('Logged runs')
-st.info('The timeline shows the creation and last event time of each run.')
-st.plotly_chart(
-    px.timeline(df, x_start='created_at', x_end='last_event_at', y='username', color='state',
-                labels={'created_at':'Created at', 'last_event_at':'Last event at', 'username':''},
-                ),
-    use_container_width=True
-)
-with st.expander('Show raw run data'):
-    st.dataframe(df)

 import pandas as pd
 import streamlit as st
 import plotly.express as px
+import requests
 import utils
 """
 UPDATE_INTERVAL = 3600
+BASE_URL = 'API_URL'
 st.title('Folding Subnet Dashboard')
 st.markdown('<br>', unsafe_allow_html=True)
 #### ------ PRODUCTIVITY ------
 st.subheader('Productivity overview')
 st.info('Productivity metrics show how many proteins have been folded, which is the primary goal of the subnet. Metrics are estimated using weights and biases data combined with heuristics.')
+productivity_all = requests.get(f'{BASE_URL}/productivity').json()
+productivity = productivity_all['all_time']
+productivity_24h = productivity_all['last_24h']
+m1, m2 = st.columns(2)
+m1.metric('Unique proteins folded', f'{productivity["unique_folded"]:,.0f}', delta=f'{productivity_24h["unique_folded"]:,.0f} (24h)')
+m2.metric('Total jobs completed', f'{productivity["total_completed_jobs"]:,.0f}', delta=f'{productivity_24h["total_completed_jobs"]:,.0f} (24h)')
+# m3.metric('Total simulation steps', f'{productivity.get("total_md_steps"):,.0f}', delta=f'{productivity_24h.get("total_md_steps"):,.0f} (24h)')
+# st.markdown('<br>', unsafe_allow_html=True)
+# time_binned_data = df.set_index('last_event_at').groupby(pd.Grouper(freq='12h'))
+# PROD_CHOICES = {
+#     'Unique proteins folded': 'unique_pdbs',
+#     'Total simulations': 'total_pdbs',
+#     'Total simulation steps': 'total_md_steps',
+# }
+# prod_choice_label = st.radio('Select productivity metric', list(PROD_CHOICES.keys()), index=0, horizontal=True)
+# prod_choice = PROD_CHOICES[prod_choice_label]
+# steps_running_total = time_binned_data[prod_choice].sum().cumsum()
+# st.plotly_chart(
+#     # add fillgradient to make it easier to see the trend
+#     px.area(steps_running_total, y=prod_choice,
+#             labels={'last_event_at':'', prod_choice: prod_choice_label},
+#     ).update_traces(fill='tozeroy'),
+#     use_container_width=True,
+# )
 st.markdown('<br>', unsafe_allow_html=True)
 st.info('Throughput metrics show the total amount of data sent and received by the validators. This is a measure of the network activity and the amount of data that is being processed by the subnet.')
 MEM_UNIT = 'GB' #st.radio('Select memory unit', ['TB','GB', 'MB'], index=0, horizontal=True)
+throughput = requests.get(f'{BASE_URL}/throughput').json()
+data_transferred = throughput['all_time']
+data_transferred_24h = throughput['last_24h']
 m1, m2, m3 = st.columns(3)
+m1.metric(f'Total validator data sent ({MEM_UNIT})', f'{data_transferred["validator_sent"]:,.0f}', delta=f'{data_transferred_24h["validator_sent"]:,.0f} (24h)')
+m2.metric(f'Total received data ({MEM_UNIT})', f'{data_transferred["miner_sent"]:,.0f}', delta=f'{data_transferred_24h["miner_sent"]:,.0f} (24h)')
+m3.metric(f'Total transferred data ({MEM_UNIT})', f'{data_transferred["validator_sent"]+data_transferred["miner_sent"]:,.0f}', delta=f'{data_transferred_24h["validator_sent"]+data_transferred_24h["miner_sent"]:,.0f} (24h)')
+# IO_CHOICES = {'total_data_sent':'Sent', 'total_data_received':'Received'}
+# io_running_total = time_binned_data[list(IO_CHOICES.keys())].sum().rename(columns=IO_CHOICES).cumsum().melt(ignore_index=False)
+# io_running_total['value'] = io_running_total['value'].apply(utils.convert_unit, args=(utils.BASE_UNITS, MEM_UNIT))
+# st.plotly_chart(
+#     px.area(io_running_total, y='value', color='variable',
+#             labels={'last_event_at':'', 'value': f'Data transferred ({MEM_UNIT})', 'variable':'Direction'},
+#     ),
+#     use_container_width=True,
+# )
 st.markdown('<br>', unsafe_allow_html=True)
 #### ------ LOGGED RUNS ------
+# st.subheader('Logged runs')
+# st.info('The timeline shows the creation and last event time of each run.')
+# st.plotly_chart(
+#     px.timeline(df, x_start='created_at', x_end='last_event_at', y='username', color='state',
+#                 labels={'created_at':'Created at', 'last_event_at':'Last event at', 'username':''},
+#                 ),
+#     use_container_width=True
+# )
+# with st.expander('Show raw run data'):
+#     st.dataframe(df)

classes.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from pydantic import BaseModel
+# Productivity counters for one reporting window.
+# Field names mirror the keys of the dict returned by utils.get_productivity.
+class ProductivityData(BaseModel):
+    unique_folded: int
+    total_completed_jobs: int
+# Response model for the /productivity endpoint: the same counters reported
+# for the full data set (all_time) and for the last-24h subset (last_24h).
+class Productivity(BaseModel):
+    all_time: ProductivityData
+    last_24h: ProductivityData
+# Data-transfer totals for one reporting window.
+# Field names mirror the keys of the dict returned by utils.get_data_transferred,
+# which converts byte totals with convert_unit before returning them.
+class ThroughputData(BaseModel):
+    validator_sent: float
+    miner_sent: float
+# Response model for the /throughput endpoint: the same totals reported
+# for the full data set (all_time) and for the last-24h subset (last_24h).
+class Throughput(BaseModel):
+    all_time: ThroughputData
+    last_24h: ThroughputData

utils.py CHANGED Viewed

@@ -5,6 +5,7 @@ import wandb
 import streamlit as st
 import pandas as pd
 import bittensor as bt
 # TODO: Store the runs dataframe (as in sn1 dashboard) and top up with the ones created since the last snapshot
@@ -15,7 +16,7 @@ import bittensor as bt
 MIN_STEPS = 12 # minimum number of steps in wandb run in order to be worth analyzing
 MAX_RUNS = 100#0000
 NETUID = 25
-BASE_PATHS = ['macrocosmos/folding-validators--moved', 'macrocosmos/folding-validators'] # added historical data from otf wandb and current data
 NETWORK = 'finney'
 KEYS = None
 ABBREV_CHARS = 8
@@ -23,7 +24,12 @@ ENTITY_CHOICES = ('identity', 'hotkey', 'coldkey')
 PDBS_PER_RUN_STEP = 0.083
 AVG_MD_STEPS = 30_000
-BASE_UNITS = 'MB'
 api = wandb.Api(timeout=120, api_key='cdcbe340bb7937d3a289d39632491d12b39231b7')
@@ -47,24 +53,24 @@ EXTRACTORS = {
     'run_id': lambda x: x.id,
     'user': lambda x: x.user.name[:16],
     'username': lambda x: x.user.username[:16],
-    'created_at': lambda x: pd.Timestamp(x.created_at),
-    'last_event_at': lambda x: pd.Timestamp(x.summary.get('_timestamp'), unit='s'),
     'netuid': lambda x: x.config.get('netuid'),
     'mock': lambda x: x.config.get('neuron').get('mock'),
     'sample_size': lambda x: x.config.get('neuron').get('sample_size'),
     'queue_size': lambda x: x.config.get('neuron').get('queue_size'),
     'timeout': lambda x: x.config.get('neuron').get('timeout'),
-    'update_interval': lambda x: x.config.get('neuron').get('update_interval'),
     'epoch_length': lambda x: x.config.get('neuron').get('epoch_length'),
     'disable_set_weights': lambda x: x.config.get('neuron').get('disable_set_weights'),
     # This stuff is from the last logged event
     'num_steps': lambda x: x.summary.get('_step'),
-    'runtime': lambda x: x.summary.get('_runtime'),
-    'init_energy': lambda x: x.summary.get('init_energy'),
-    'best_energy': lambda x: x.summary.get('best_loss'),
-    'pdb_id': lambda x: x.summary.get('pdb_id'),
     'pdb_updates': lambda x: x.summary.get('updated_count'),
     'total_returned_sizes': lambda x: get_total_file_sizes(x),
     'total_sent_sizes': lambda x: get_total_md_input_sizes(x),
@@ -74,10 +80,12 @@ EXTRACTORS = {
     'version': lambda x: x.tags[0],
     'spec_version': lambda x: x.tags[1],
     'vali_hotkey': lambda x: x.tags[2],
     # System metrics
     'disk_read': lambda x: x.system_metrics.get('system.disk.in'),
     'disk_write': lambda x: x.system_metrics.get('system.disk.out'),
     # Really slow stuff below
     # 'started_at': lambda x: x.metadata.get('startedAt'),
     # 'disk_used': lambda x: x.metadata.get('disk').get('/').get('used'),
@@ -135,32 +143,30 @@ def get_total_md_input_sizes(run):
 def get_data_transferred(df, unit='GB'):
-    factor = convert_unit(1, from_unit=BASE_UNITS, to_unit=unit)
-    sent = df.total_data_sent.sum()
-    received = df.total_data_received.sum()
     return {
-        'sent':sent * factor,
-        'received':received * factor,
-        'total': (sent + received) * factor,
-        'read':df.disk_read.sum() * factor,
-        'write':df.disk_write.sum() * factor,
-        }
 def get_productivity(df):
     # Estimate the number of unique pdbs folded using our heuristic
-    unique_folded = df.unique_pdbs.sum().round()
-    # Estimate the total number of simulations completed using our heuristic
-    total_simulations = df.total_pdbs.sum().round()
-    # Estimate the total number of simulation steps completed using our heuristic
-    total_md_steps = df.total_md_steps.sum().round()
     return {
         'unique_folded': unique_folded,
-        'total_simulations': total_simulations,
-        'total_md_steps': total_md_steps,
     }
 def get_leaderboard(df, ntop=10, entity_choice='identity'):
@@ -169,6 +175,83 @@ def get_leaderboard(df, ntop=10, entity_choice='identity'):
     df.index = range(df.shape[0])
     return df.groupby(entity_choice).I.sum().sort_values().reset_index().tail(ntop)
 @st.cache_data()
 def get_metagraph(time):
     print(f'Loading metagraph with time {time}')
@@ -188,20 +271,26 @@ def get_metagraph(time):
     return df_m
-@st.cache_data()
-def load_run(run_path, keys=KEYS):
     print('Loading run:', run_path)
     run = api.run(run_path)
-    df = pd.DataFrame(list(run.scan_history(keys=keys)))
     for col in ['updated_at', 'best_loss_at', 'created_at']:
         if col in df.columns:
             df[col] = pd.to_datetime(df[col])
-    print(f'+ Loaded {len(df)} records')
-    return df
 @st.cache_data(show_spinner=False)
-def build_data(timestamp=None, paths=BASE_PATHS, min_steps=MIN_STEPS, use_cache=True):
     save_path = '_saved_runs.csv'
     filters = {}
@@ -272,10 +361,4 @@ def load_state_vars():
     }
-if __name__ == '__main__':
-    print('Loading runs')
-    df = load_runs()
-    df.to_csv('test_wandb_data.csv', index=False)
-    print(df)

 import streamlit as st
 import pandas as pd
 import bittensor as bt
+import ast
 # TODO: Store the runs dataframe (as in sn1 dashboard) and top up with the ones created since the last snapshot
 MIN_STEPS = 12 # minimum number of steps in wandb run in order to be worth analyzing
 MAX_RUNS = 100#0000
 NETUID = 25
+BASE_PATH = 'macrocosmos/folding-validators' # added historical data from otf wandb and current data
 NETWORK = 'finney'
 KEYS = None
 ABBREV_CHARS = 8
 PDBS_PER_RUN_STEP = 0.083
 AVG_MD_STEPS = 30_000
+BASE_UNITS = 'GB'
+SAVE_PATH = 'current_runs/'
+# Make sure the per-run CSV directory exists. exist_ok=True replaces the
+# separate exists() check, so there is no window between checking and creating.
+os.makedirs(SAVE_PATH, exist_ok=True)
 api = wandb.Api(timeout=120, api_key='cdcbe340bb7937d3a289d39632491d12b39231b7')
     'run_id': lambda x: x.id,
     'user': lambda x: x.user.name[:16],
     'username': lambda x: x.user.username[:16],
+    # 'created_at': lambda x: pd.Timestamp(x.created_at),
+    'last_event_at': lambda x: pd.to_datetime(x.summary.get('_timestamp'), errors='coerce'),
     'netuid': lambda x: x.config.get('netuid'),
     'mock': lambda x: x.config.get('neuron').get('mock'),
     'sample_size': lambda x: x.config.get('neuron').get('sample_size'),
     'queue_size': lambda x: x.config.get('neuron').get('queue_size'),
     'timeout': lambda x: x.config.get('neuron').get('timeout'),
+    # 'update_interval': lambda x: x.config.get('neuron').get('update_interval'),
     'epoch_length': lambda x: x.config.get('neuron').get('epoch_length'),
     'disable_set_weights': lambda x: x.config.get('neuron').get('disable_set_weights'),
     # This stuff is from the last logged event
     'num_steps': lambda x: x.summary.get('_step'),
+    # 'runtime': lambda x: x.summary.get('_runtime'),
+    # 'init_energy': lambda x: x.summary.get('init_energy'),
+    # 'best_energy': lambda x: x.summary.get('best_loss'),
+    # 'pdb_id': lambda x: x.summary.get('pdb_id'),
     'pdb_updates': lambda x: x.summary.get('updated_count'),
     'total_returned_sizes': lambda x: get_total_file_sizes(x),
     'total_sent_sizes': lambda x: get_total_md_input_sizes(x),
     'version': lambda x: x.tags[0],
     'spec_version': lambda x: x.tags[1],
     'vali_hotkey': lambda x: x.tags[2],
     # System metrics
     'disk_read': lambda x: x.system_metrics.get('system.disk.in'),
     'disk_write': lambda x: x.system_metrics.get('system.disk.out'),
+    'network_sent': lambda x: x.system_metrics.get('system.network.sent'),
+    'network_recv': lambda x: x.system_metrics.get('system.network.recv'),
     # Really slow stuff below
     # 'started_at': lambda x: x.metadata.get('startedAt'),
     # 'disk_used': lambda x: x.metadata.get('disk').get('/').get('used'),
 def get_data_transferred(df, unit='GB'):
+    """
+    Sum the payload sizes recorded in the run history and convert them to ``unit``.
+
+    Args:
+        df: frame with ``md_inputs_sizes`` and ``response_returned_files_sizes``
+            columns whose non-null cells are string-encoded Python literals
+            (parsed with ``ast.literal_eval``).
+            NOTE(review): presumably lists of byte counts, nested one level
+            deeper for the returned files -- confirm against the logged data.
+        unit: target unit handed to ``convert_unit``. Previously this argument
+            was ignored and ``BASE_UNITS`` was always used; the default matches
+            ``BASE_UNITS`` so default callers see the same values.
+
+    Returns:
+        dict with ``validator_sent`` and ``miner_sent`` totals in ``unit``.
+    """
+    input_sizes = df.md_inputs_sizes.dropna().apply(ast.literal_eval)
+    returned_sizes = df.response_returned_files_sizes.dropna().apply(ast.literal_eval)
+    validator_sent = input_sizes.explode().sum()
+    # Exploded twice because each parsed cell is flattened through two levels.
+    miner_sent = returned_sizes.explode().explode().sum()
     return {
+        'validator_sent': convert_unit(validator_sent, from_unit='B', to_unit=unit),
+        'miner_sent': convert_unit(miner_sent, from_unit='B', to_unit=unit),
+    }
 def get_productivity(df):
+    """
+    Count distinct proteins and completed jobs in the run history.
+
+    Returns a dict with ``unique_folded`` (number of distinct non-null
+    ``pdb_id`` values) and ``total_completed_jobs`` (rows whose ``active`` is
+    False, plus one job per ten rows that have no ``active`` value).
+    """
+    unique_folded = df.pdb_id.nunique()
+    # Rows explicitly flagged inactive are counted as completed jobs
+    completed_jobs = int((df.active == False).sum())
+    # Rows without an `active` value are treated as historical run updates
+    historical_updates = df.active.isna().sum()
+    historical_completed = historical_updates // 10  # this is an estimate based on minimum number of updates per pdb
     return {
         'unique_folded': unique_folded,
+        'total_completed_jobs': (completed_jobs + historical_completed).item(),
     }
 def get_leaderboard(df, ntop=10, entity_choice='identity'):
     df.index = range(df.shape[0])
     return df.groupby(entity_choice).I.sum().sort_values().reset_index().tail(ntop)
+def fetch_new_runs(base_path: str = BASE_PATH, netuid: int = 25, min_steps: int = 10, save_path: str = SAVE_PATH, extractors: dict = EXTRACTORS):
+    """
+    Record the currently running W&B runs and refresh their cached CSV files.
+
+    Appends one row per running run to ``runs_checker.csv`` under a new
+    ``check_ticker``, saves the history of runs that were listed at the
+    previous ticker but are not listed now, then re-saves every running run.
+
+    Args:
+        base_path: W&B project path passed to ``api.runs``.
+        netuid: only runs whose config ``netuid`` equals this value are used.
+        min_steps: runs with fewer logged steps (or no ``_step``) are skipped.
+        save_path: directory that receives one ``<run_id>.csv`` per run.
+        extractors: mapping of column name -> callable(run), forwarded to
+            ``load_run``.
+    """
+    runs_checker = pd.read_csv('runs_checker.csv')
+    current_time = pd.to_datetime(time.time(), unit='s')
+    new_ticker = runs_checker.check_ticker.max() + 1
+
+    # List the project once and keep the qualifying running runs, so the
+    # bookkeeping rows and the CSV refresh below work from the same snapshot.
+    running_runs = []
+    for run in api.runs(base_path):
+        if run.config.get('netuid') != netuid:
+            continue
+        num_steps = run.summary.get('_step')
+        if num_steps is None or num_steps < min_steps:
+            continue
+        if run.state == 'running':
+            running_runs.append((run, num_steps))
+
+    new_rows_list = [
+        {
+            'run_id': run.id,
+            'state': run.state,
+            'step': num_steps,
+            'check_time': current_time,
+            'check_ticker': new_ticker,
+            'user': run.user.name[:16],
+            'username': run.user.username[:16],
+        }
+        for run, num_steps in running_runs
+    ]
+    if new_rows_list:
+        runs_checker = pd.concat([runs_checker, pd.DataFrame(new_rows_list)], ignore_index=True)
+        runs_checker.to_csv('runs_checker.csv', index=False)
+
+    bt.logging.info(f'Cross checking runs for ticker {new_ticker} against previous ticker')
+    previous_check = runs_checker[runs_checker.check_ticker == new_ticker - 1]
+    current_check = runs_checker[runs_checker.check_ticker == new_ticker]
+    # `value in series` tests the index labels, not the values, so membership
+    # is checked against a set of the run ids instead.
+    current_ids = set(current_check.run_id)
+
+    # save ended runs from last check
+    for run_id in previous_check.run_id:
+        if run_id not in current_ids:
+            frame = load_run(f'{base_path}/{run_id}', extractors=extractors)
+            frame.to_csv(os.path.join(save_path, f"{run_id}.csv"))
+
+    # save the runs that are still running
+    for run, _ in running_runs:
+        frame = load_run(run_path='/'.join(run.path), extractors=extractors)
+        frame.to_csv(os.path.join(save_path, f"{run.id}.csv"))
+def preload_data():
+    """Read every file in SAVE_PATH as CSV and return them concatenated into one frame."""
+    csv_paths = [os.path.join(SAVE_PATH, name) for name in os.listdir(SAVE_PATH)]
+    frames = [pd.read_csv(csv_path, low_memory=False) for csv_path in csv_paths]
+    return pd.concat(frames, ignore_index=True)
 @st.cache_data()
 def get_metagraph(time):
     print(f'Loading metagraph with time {time}')
     return df_m
+def load_run(run_path: str, extractors: dict):
+    """
+    Load one W&B run's history and attach run-level columns to every row.
+
+    Args:
+        run_path: path passed to ``api.run``.
+        extractors: mapping of column name -> callable(run); each callable is
+            evaluated once and its value repeated for every history row.
+
+    Returns:
+        DataFrame of the scanned history joined column-wise with the
+        extractor columns.
+    """
     print('Loading run:', run_path)
     run = api.run(run_path)
+    history = pd.DataFrame(list(run.scan_history()))
+    for col in ('updated_at', 'best_loss_at', 'created_at'):
+        if col not in history.columns:
+            continue
+        history[col] = pd.to_datetime(history[col])
+    n_rows = len(history)
+    # One value per extractor, wrapped in a list and repeated so it lines up
+    # with the history rows
+    run_level = pd.DataFrame({key: [func(run)] * n_rows for key, func in extractors.items()})
+    return pd.concat([history, run_level], axis=1)
 @st.cache_data(show_spinner=False)
+def build_data(timestamp=None, paths=BASE_PATH, min_steps=MIN_STEPS, use_cache=True):
     save_path = '_saved_runs.csv'
     filters = {}
     }