File size: 2,941 Bytes
7924d77
1f9efe8
 
 
3bd4bd2
415d76d
 
52cb009
3bd4bd2
415d76d
3bd4bd2
 
 
7924d77
eee62c1
706aa29
1f9efe8
415d76d
 
 
 
 
 
 
52cb009
 
 
 
 
 
 
 
706aa29
 
 
 
 
 
 
 
 
e476ef0
415d76d
706aa29
e476ef0
706aa29
e476ef0
706aa29
 
 
52cb009
a386c0e
415d76d
 
a386c0e
415d76d
 
1f9efe8
 
52cb009
706aa29
a386c0e
1f9efe8
 
 
 
0452707
 
706aa29
 
52cb009
 
a386c0e
706aa29
 
 
 
 
 
 
 
 
 
 
415d76d
e476ef0
 
415d76d
1f9efe8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import base64
import mimetypes
import os
import shutil

import gradio as gr
import nbformat
from datasets import load_dataset
from nbconvert import HTMLExporter
from traitlets.config import Config

# Configuration for HTMLExporter: run ExtractOutputPreprocessor during export.
# embed_figures() reads the exporter's resources['outputs'] mapping and inlines
# those entries into the generated HTML.
config = Config()
config.HTMLExporter.preprocessors = ["nbconvert.preprocessors.ExtractOutputPreprocessor"]
# Single module-level exporter, reused by parse_notebook() for every render.
html_exporter = HTMLExporter(config=config, template_name="classic")

# Loaded once at import time. The rest of the file indexes it as
# ds[<score option>][<row index>].
ds = load_dataset("data-agents/kaggle-notebooks-edu-v0")
# Same dataset restricted to rows whose "contains_outputs" field is truthy.
ds_out = ds.filter(lambda x: x["contains_outputs"])

# Scratch directory for the downloadable notebook file; wiped and recreated by
# reset_tmp_folder() on every parse_notebook() call.
TMP_DIR = './tmp/'

def reset_tmp_folder():
    """Leave TMP_DIR as a freshly created, empty directory.

    Any existing TMP_DIR tree is removed first, then the directory is
    created again.
    """
    stale_dir_present = os.path.exists(TMP_DIR)
    if stale_dir_present:
        # Drop everything left over from the previous request.
        shutil.rmtree(TMP_DIR)
    os.makedirs(TMP_DIR)


def embed_figures(html_body, resources):
    """Inline extracted notebook outputs into the HTML as base64 data URIs.

    Every occurrence of an output's filename in ``html_body`` is replaced by
    a ``data:<mime>;base64,<payload>`` URI so the page needs no side files.

    Args:
        html_body: HTML text that references outputs by filename.
        resources: Mapping that may hold an ``'outputs'`` entry of
            ``{filename: raw_bytes}``.

    Returns:
        The HTML text with each referenced filename replaced by its data URI.
        If ``resources`` has no (or an empty) ``'outputs'`` entry the input is
        returned unchanged.
    """
    # .get() avoids failing when no outputs mapping was produced at all
    # (e.g. nothing was extracted); `or {}` also covers a falsy placeholder.
    outputs = resources.get('outputs') or {}
    for filename, payload in outputs.items():
        # Extracted outputs are not only PNGs (jpeg, svg, ... are possible), so
        # derive the MIME type from the filename. Fall back to the previous
        # hard-coded PNG type when the extension is unknown.
        mime_type = mimetypes.guess_type(filename)[0] or 'image/png'
        encoded = base64.b64encode(payload).decode('utf-8')
        html_body = html_body.replace(filename, f'data:{mime_type};base64,{encoded}')
    return html_body


def update_max_index(score_option, output_option):
    """Recompute the index slider's range for the selected score and filter.

    Args:
        score_option: Key of the dataset split to measure (the notebook score
            chosen in the dropdown).
        output_option: ``"All"`` measures the full dataset; any other value
            measures the outputs-only filtered dataset.

    Returns:
        A pair of component updates: one setting the new maximum, and one
        setting the new maximum and resetting the value to 0. Both are wired
        to the index slider by the UI.
    """
    source = ds if output_option == "All" else ds_out
    # Rows are addressed 0..len-1, so the largest selectable index is len-1.
    # Using len() here let the slider reach an index that raises IndexError.
    # Clamp at 0 so an empty split still yields a valid range.
    max_index = max(len(source[score_option]) - 1, 0)
    # NOTE(review): the second update is built as gr.Number although it is
    # applied to the slider; kept as-is so the two-output wiring is unchanged.
    return gr.Slider(maximum=max_index), gr.Number(value=0, maximum=max_index)


def parse_notebook(score_options, output_options, index):
    """Render one dataset notebook to HTML and save its raw source for download.

    Args:
        score_options: Key of the dataset split to read from (the notebook
            score chosen in the dropdown).
        output_options: ``"All"`` reads from the full dataset; any other value
            reads from the outputs-only filtered dataset.
        index: Zero-based row position inside the chosen split.

    Returns:
        A ``(notebook_body, out_path)`` tuple: the rendered HTML with outputs
        embedded inline, and the path of the notebook source written under
        TMP_DIR.

    Raises:
        IndexError: If ``index`` is outside the chosen split.
    """
    reset_tmp_folder()

    source = ds if output_options == "All" else ds_out
    # The original indexed with an undefined name `i`, which raised NameError
    # on every call; use the `index` argument. int() guards against the UI
    # delivering the slider value as a float.
    sample = source[score_options][int(index)]

    notebook_string = sample["text"]
    # Keep only the last path component of the id as the file name.
    notebook_id = sample["id"].split("/")[-1]

    out_path = os.path.join(TMP_DIR, notebook_id)

    # Save the notebook string to a file. Explicit UTF-8 so non-ASCII notebook
    # content does not depend on the platform's default encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(notebook_string)

    notebook_parsed = nbformat.reads(notebook_string, as_version=4)
    (notebook_body, resources) = html_exporter.from_notebook_node(notebook_parsed)
    notebook_body = embed_figures(notebook_body, resources)

    return notebook_body, out_path


with gr.Blocks() as demo:
    gr.Markdown("# Kaggle Notebooks")

    # Controls: score split, output filter, and row index within the split.
    score_options = gr.Dropdown(
        ["error", "0", "1", "2", "3", "4", "5"],
        value="5",
        label="Notebook score",
        info="Select the assigned notebook score.",
    )
    output_options = gr.Radio(
        ["Outputs only", "All"],
        value="Outputs only",
        label="Output filter",
        info="Many notebooks contain no outputs.",
    )
    index_slider = gr.Slider(minimum=0, maximum=100, step=1, value=0, label="Index")

    # Result widgets: downloadable notebook file and rendered HTML.
    file = gr.File()
    html = gr.HTML("")

    # Changing either filter recomputes the slider range. Registration order
    # (score first, then output filter) matches the original wiring.
    for selector in (score_options, output_options):
        selector.change(
            fn=update_max_index,
            inputs=[score_options, output_options],
            outputs=[index_slider, index_slider],
        )

    # Render a notebook whenever the index changes, and once on page load.
    index_slider.change(
        fn=parse_notebook,
        inputs=[score_options, output_options, index_slider],
        outputs=[html, file],
    )
    demo.load(
        fn=parse_notebook,
        inputs=[score_options, output_options, index_slider],
        outputs=[html, file],
    )

demo.launch()