File size: 2,727 Bytes
7924d77
1f9efe8
 
 
3bd4bd2
415d76d
 
52cb009
3bd4bd2
415d76d
3bd4bd2
 
 
7924d77
1f9efe8
 
 
415d76d
 
 
 
 
 
 
52cb009
 
 
 
 
 
 
 
ac98601
415d76d
52cb009
 
 
 
 
1decac6
52cb009
 
 
 
ac98601
52cb009
ac98601
 
 
 
 
 
 
1decac6
52cb009
a386c0e
415d76d
 
a386c0e
415d76d
 
1f9efe8
 
52cb009
 
a386c0e
1f9efe8
 
 
 
ac98601
 
52cb009
 
 
a386c0e
415d76d
ac98601
 
415d76d
1f9efe8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import base64
import mimetypes
import os
import shutil

import gradio as gr
import nbformat
from datasets import load_dataset
from nbconvert import HTMLExporter
from traitlets.config import Config

# Configuration for HTMLExporter: register ExtractOutputPreprocessor so the
# exporter fills resources["outputs"], which embed_figures() later inlines
# into the rendered HTML.
config = Config()
config.HTMLExporter.preprocessors = ["nbconvert.preprocessors.ExtractOutputPreprocessor"]
# Single shared exporter instance, reused for every notebook conversion.
html_exporter = HTMLExporter(config=config, template_name="classic")

# Open the "train" split of the notebook dataset with streaming=True and keep
# one module-level iterator; parse_notebook() advances it with next(), so each
# call continues where the previous one stopped.
ds = load_dataset("data-agents/kaggle-notebooks", split="train", streaming=True)
ds_iter = iter(ds)

# Scratch folder for the notebook file handed to the UI; it is wiped and
# recreated by reset_tmp_folder().
TMP_DIR = './tmp/'

def reset_tmp_folder():
    """Recreate TMP_DIR so that it exists and is empty."""
    folder_present = os.path.exists(TMP_DIR)
    if folder_present:
        # Drop the whole tree, including files left over from a previous call.
        shutil.rmtree(TMP_DIR)
    os.makedirs(TMP_DIR)


def embed_figures(html_body, resources):
    """Inline extracted figures into the HTML as base64 data URIs.

    Args:
        html_body: HTML text in which figures are referenced by the keys of
            ``resources["outputs"]``.
        resources: Mapping whose optional ``"outputs"`` entry maps each figure
            key (a file name such as ``output_0_0.png``) to its raw bytes.

    Returns:
        ``html_body`` with every occurrence of each figure key replaced by a
        ``data:<mime>;base64,...`` URI. The text is returned unchanged when
        there are no outputs.
    """
    # A missing or empty "outputs" entry simply means there is nothing to embed.
    outputs = resources.get("outputs") or {}
    for name, payload in outputs.items():
        # Derive the MIME type from the file extension so jpeg/svg figures are
        # not mislabeled; fall back to PNG when the extension is unknown.
        mime_type, _ = mimetypes.guess_type(name)
        if mime_type is None:
            mime_type = "image/png"
        encoded = base64.b64encode(payload).decode("ascii")
        html_body = html_body.replace(name, f"data:{mime_type};base64,{encoded}")
    return html_body

def parse_notebook(filter_options):
    """Fetch the next notebook that passes the filter and render it to HTML.

    Notebooks are pulled from the shared ``ds_iter`` iterator until one
    satisfies ``filter_options``. The raw notebook text is written to
    ``TMP_DIR`` and the notebook is converted to HTML with its figures inlined.

    Args:
        filter_options: ``"none"`` (accept the next notebook), ``">1MB"``
            (notebook text longer than 1024 * 1024 characters) or
            ``"has outputs"`` (conversion produced at least one extracted
            output). ``None`` is treated as ``"none"``.

    Returns:
        A ``(notebook_html, out_path)`` tuple: the rendered HTML and the path
        of the saved notebook file.

    Raises:
        ValueError: If ``filter_options`` is not a known filter.
        StopIteration: If the dataset iterator is exhausted before a matching
            notebook is found.
    """
    # An unselected radio button arrives as None; without this normalization
    # no branch below would ever match and the loop would drain the stream.
    if filter_options is None:
        filter_options = "none"
    if filter_options not in ("none", "has outputs", ">1MB"):
        raise ValueError(f"Unknown notebook filter: {filter_options!r}")

    reset_tmp_folder()

    counter = 0
    while True:
        counter += 1
        print(counter, filter_options)
        notebook_data = next(ds_iter)
        notebook_string = notebook_data["text"]
        # Conversion results are only available when the filter needed them.
        notebook_body = None
        resources = None

        if filter_options == "none":
            break
        if filter_options == ">1MB":
            if len(notebook_string) > 1 * 1024 * 1024:
                break
        else:  # "has outputs"
            notebook_parsed = nbformat.reads(notebook_string, as_version=4)
            notebook_body, resources = html_exporter.from_notebook_node(notebook_parsed)
            # The resources dict always carries other entries, so its length
            # says nothing; only the extracted outputs matter here.
            if resources.get("outputs"):
                break

    notebook_id = notebook_data["id"].split("/")[-1]
    out_path = os.path.join(TMP_DIR, notebook_id)

    # Save the notebook string to a file; explicit UTF-8 avoids encode errors
    # on platforms whose default encoding cannot represent the text.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(notebook_string)

    # Reuse the conversion done by the "has outputs" filter instead of
    # converting the same notebook a second time.
    if notebook_body is None:
        notebook_parsed = nbformat.reads(notebook_string, as_version=4)
        notebook_body, resources = html_exporter.from_notebook_node(notebook_parsed)

    outputs = resources.get("outputs") or {}
    if outputs:
        notebook_body = embed_figures(notebook_body, resources)
    print("Resources:", outputs.keys())
    return notebook_body, out_path


# Build the UI: a filter selector, a button to advance to the next notebook,
# a download widget for the saved notebook file and the rendered HTML view.
with gr.Blocks() as demo:
    gr.Markdown("# Kaggle Notebooks")
    # No trailing comma after the call: a trailing comma would bind a 1-tuple
    # instead of the Radio component and break `inputs=[filter_options]`.
    # `value="none"` gives the initial demo.load call a valid filter instead
    # of None.
    filter_options = gr.Radio(
        ["none", "has outputs", ">1MB"],
        value="none",
        label="Notebook filters",
        info="A lot of notebooks are short or have the outputs stripped - filters help finding interesting ones.",
    )

    button = gr.Button("Show next!")
    file = gr.File()
    html = gr.HTML("")

    # Show a notebook on every click and once when the page first loads.
    button.click(fn=parse_notebook, inputs=[filter_options], outputs=[html, file])
    demo.load(fn=parse_notebook, inputs=[filter_options], outputs=[html, file])

demo.launch()