Spaces:
Running
Running
File size: 2,727 Bytes
7924d77 1f9efe8 3bd4bd2 415d76d 52cb009 3bd4bd2 415d76d 3bd4bd2 7924d77 1f9efe8 415d76d 52cb009 ac98601 415d76d 52cb009 1decac6 52cb009 ac98601 52cb009 ac98601 1decac6 52cb009 a386c0e 415d76d a386c0e 415d76d 1f9efe8 52cb009 a386c0e 1f9efe8 ac98601 52cb009 a386c0e 415d76d ac98601 415d76d 1f9efe8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import base64
import mimetypes
import os
import shutil

import gradio as gr
import nbformat
from datasets import load_dataset
from nbconvert import HTMLExporter
from traitlets.config import Config
# Configure the HTML exporter. nbconvert's ExtractOutputPreprocessor is enabled;
# downstream code (embed_figures) reads the extracted outputs from
# resources['outputs'] after each export.
config = Config()
config.HTMLExporter.preprocessors = ["nbconvert.preprocessors.ExtractOutputPreprocessor"]
html_exporter = HTMLExporter(config=config, template_name="classic")
# Load the train split of the notebooks dataset with streaming=True and keep a
# single module-level iterator; parse_notebook advances it with next() on every
# call, so the position in the stream is shared across all calls.
ds = load_dataset("data-agents/kaggle-notebooks", split="train", streaming=True)
ds_iter = iter(ds)
# Scratch directory that reset_tmp_folder deletes and recreates.
TMP_DIR = './tmp/'
def reset_tmp_folder():
    """Recreate ``TMP_DIR`` as an empty directory.

    If something already exists at ``TMP_DIR`` the whole tree is removed
    first, then the directory is created again.
    """
    already_present = os.path.exists(TMP_DIR)
    if already_present:
        # Remove the directory together with everything inside it.
        shutil.rmtree(TMP_DIR)
    os.makedirs(TMP_DIR)
def embed_figures(html_body, resources):
    """Inline extracted notebook outputs into the HTML as base64 data URIs.

    Every key of ``resources["outputs"]`` that occurs in ``html_body`` is
    replaced by a ``data:<mime>;base64,...`` URI built from the matching
    value, so the returned HTML no longer references external files.

    Args:
        html_body: HTML text produced by the exporter.
        resources: Exporter resources mapping. Its optional ``"outputs"``
            entry maps output names to their raw content (``bytes``, or
            ``str`` which is encoded as UTF-8).

    Returns:
        The HTML text with all output references replaced by data URIs.
        If there are no outputs the input is returned unchanged.
    """
    for key, value in resources.get("outputs", {}).items():
        # Output names are expected to be filenames (e.g. "*.png", "*.jpeg",
        # "*.svg" as assigned by nbconvert); derive the MIME type from the
        # extension instead of labelling everything as PNG. Fall back to the
        # previous PNG default when the type cannot be guessed.
        mime_type = mimetypes.guess_type(key)[0] or "image/png"
        if isinstance(value, str):
            value = value.encode("utf-8")
        b64_figure = base64.b64encode(value).decode("ascii")
        html_body = html_body.replace(key, f"data:{mime_type};base64,{b64_figure}")
    return html_body
def _export_notebook(notebook_string):
    """Parse a notebook JSON string and return the exporter's ``(html, resources)``."""
    notebook_parsed = nbformat.reads(notebook_string, as_version=4)
    return html_exporter.from_notebook_node(notebook_parsed)


def parse_notebook(filter_options):
    """Fetch the next notebook matching a filter and render it to HTML.

    Notebooks are pulled from the shared ``ds_iter`` stream until one
    satisfies the filter. The raw notebook text is saved under ``TMP_DIR``
    and the notebook is exported to HTML with its outputs inlined.

    Args:
        filter_options: One of ``"none"`` (accept the next notebook),
            ``">1MB"`` (notebook text longer than 1 * 1024 * 1024
            characters) or ``"has outputs"`` (export yields at least one
            extracted output). ``None`` is treated as ``"none"``.

    Returns:
        A ``(notebook_body, out_path)`` tuple: the rendered HTML and the
        path of the saved notebook file.

    Raises:
        ValueError: If ``filter_options`` is not a known filter.
        StopIteration: If the stream ends before a match is found.
    """
    # A radio input with nothing selected arrives as None; without this the
    # loop below could never match and would drain the whole stream.
    if filter_options is None:
        filter_options = "none"
    if filter_options not in ("none", ">1MB", "has outputs"):
        raise ValueError(f"Unknown notebook filter: {filter_options!r}")

    reset_tmp_folder()

    counter = 0
    while True:
        counter += 1
        print(counter, filter_options)
        notebook_data = next(ds_iter)
        notebook_string = notebook_data["text"]
        exported = None

        if filter_options == "none":
            break
        if filter_options == ">1MB":
            if len(notebook_string) > 1 * 1024 * 1024:
                break
        else:  # "has outputs"
            exported = _export_notebook(notebook_string)
            # The resources mapping itself is never empty; only the
            # "outputs" entry says whether anything was extracted.
            if exported[1].get("outputs"):
                break

    notebook_id = notebook_data["id"].split("/")[-1]
    out_path = os.path.join(TMP_DIR, notebook_id)
    # Save the notebook string to a file
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(notebook_string)

    # Reuse the export done by the "has outputs" filter instead of
    # converting the same notebook a second time.
    if exported is None:
        exported = _export_notebook(notebook_string)
    notebook_body, resources = exported
    notebook_body = embed_figures(notebook_body, resources)
    print("Resources:", resources["outputs"].keys())
    return notebook_body, out_path
# Build the UI: a filter selector, a "next" button, a download slot for the raw
# notebook file and an HTML pane for the rendered notebook.
with gr.Blocks() as demo:
    gr.Markdown("# Kaggle Notebooks")
    # No trailing comma after the constructor call: a trailing comma would bind
    # a one-element tuple instead of the Radio component and break the
    # inputs=[...] wiring below. value="none" gives the page-load call a
    # concrete filter instead of None.
    filter_options = gr.Radio(
        ["none", "has outputs", ">1MB"],
        value="none",
        label="Notebook filters",
        info="A lot of notebooks are short or have the outputs stripped - filters help finding interesting ones.",
    )
    button = gr.Button("Show next!")
    file = gr.File()
    html = gr.HTML("")
    # parse_notebook returns (html_body, file_path), matching outputs=[html, file].
    button.click(fn=parse_notebook, inputs=[filter_options], outputs=[html, file])
    # Also show a notebook as soon as the page loads.
    demo.load(fn=parse_notebook, inputs=[filter_options], outputs=[html, file])

demo.launch()