# NOTE: text scraped from the repository viewer (file size "6,941 Bytes", blame
# revisions 9f213d8 / 9356144 / 341dd67, and the 1-194 line-number gutter) was
# not part of the program and has been reduced to this comment.
# app.py
import os
import io
import sys
import zipfile
import tempfile
import subprocess
from pathlib import Path

import gradio as gr
import pandas as pd

import barzooka as bz


def _check_pdftocairo() -> str:
    """Report whether the Poppler ``pdftocairo`` utility can be run.

    Runs ``pdftocairo -v`` with stdout and stderr captured together.

    Returns:
        When the tool ran (exit code 0, or its name appears in the output):
        the output line containing "version", else the first non-empty output
        line, else a generic "available" notice if nothing was printed.
        Otherwise a human-readable "not found" message.
    """
    try:
        proc = subprocess.run(
            ["pdftocairo", "-v"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
    except FileNotFoundError:
        return "pdftocairo not found (FileNotFoundError). Install poppler-utils."

    output = proc.stdout or ""
    if proc.returncode != 0 and "pdftocairo" not in output:
        return f"pdftocairo not found (returncode {proc.returncode}). Output:\n{output}"

    lines = [line.strip() for line in output.splitlines() if line.strip()]
    if not lines:
        # An empty banner used to hit ``[][-1]`` and raise IndexError.
        return "pdftocairo is available (no version output)."

    # Poppler prints copyright lines after the version banner, so the last
    # line is not the informative one; look for the version line explicitly.
    for line in lines:
        if "version" in line.lower():
            return line
    return lines[0]


def _maybe_download_model(target_path: Path) -> bool:
    """Download the model named by ``BARZOOKA_MODEL_URL`` to *target_path*.

    The response is streamed into a temporary file created next to
    *target_path* and moved into place only after the transfer finished, so a
    failed or interrupted download never leaves a truncated file at
    *target_path*.

    Args:
        target_path: Destination file. Its parent directory must exist.

    Returns:
        True if the file was downloaded and saved; False if the environment
        variable is unset/blank or the download failed (the failure is
        printed, not raised).
    """
    url = os.getenv("BARZOOKA_MODEL_URL", "").strip()
    if not url:
        return False

    import shutil
    import urllib.request

    # NOTE(review): the downloaded .pkl is presumably unpickled when the model
    # is loaded -- only point BARZOOKA_MODEL_URL at a trusted source.
    target_path = Path(target_path)
    tmp_name = None
    try:
        print(f"Downloading Barzooka model from {url} ...")
        # The timeout bounds each blocking network operation so a stalled
        # server cannot hang start-up forever.
        with urllib.request.urlopen(url, timeout=60) as resp:
            with tempfile.NamedTemporaryFile(
                dir=target_path.parent,
                prefix=target_path.name + ".",
                suffix=".part",
                delete=False,
            ) as tmp:
                tmp_name = tmp.name
                # Stream in chunks instead of holding the whole model in memory.
                shutil.copyfileobj(resp, tmp)
        os.replace(tmp_name, target_path)
        tmp_name = None  # moved into place; nothing left to clean up
        print(f"Saved model to {target_path}")
        return True
    except Exception as e:
        # Best-effort by design: the caller decides what to do without a model.
        print(f"Model download failed: {e}")
        return False
    finally:
        if tmp_name is not None:
            try:
                os.unlink(tmp_name)
            except OSError:
                pass


def _construct_barzooka() -> bz.Barzooka:
    """Build the Barzooka classifier, falling back to a local model file.

    Attempts, in order:
      1. ``bz.Barzooka(cpu=True)`` (package-embedded resource).
      2. ``barzooka.pkl`` next to this file; if it does not exist yet,
         ``_maybe_download_model`` is asked to fetch it first.

    Returns:
        The constructed ``bz.Barzooka`` instance.

    Raises:
        RuntimeError: if no local model file is available after the default
            construction failed (chained to the package error), or if loading
            the local file fails as well (chained to that error).
    """
    # 1) Try package-embedded resource
    try:
        return bz.Barzooka(cpu=True)
    except Exception as exc:
        # Keep a reference: the ``except`` variable is unbound after the block.
        package_error = exc

    # 2) Fallback: local pkl next to app.py
    local_pkl = Path(__file__).with_name("barzooka.pkl")
    if not local_pkl.exists():
        # 2a) Try to download if env var provided
        _maybe_download_model(local_pkl)

    if not local_pkl.exists():
        raise RuntimeError(
            "Barzooka model file not found. Upload 'barzooka.pkl' next to app.py, or set BARZOOKA_MODEL_URL to a downloadable .pkl."
            f"\nPackage error: {package_error}"
        ) from package_error

    try:
        return bz.Barzooka(model_file=str(local_pkl), cpu=True)
    except Exception as e_local:
        raise RuntimeError(
            "Failed to load Barzooka model from both package resources and local 'barzooka.pkl'.\n"
            f"Package error: {package_error}\nLocal file error: {e_local}"
        ) from e_local


# Single model instance used by the prediction and diagnostics functions in
# this module. It is built at import time, so a RuntimeError raised by
# _construct_barzooka() propagates out of the import.
BARZOOKA = _construct_barzooka()


def _predict_single_pdf(pdf_path: str, pagewise: bool = False) -> pd.DataFrame:
    """Run the module-level model on one PDF and tabulate what it returns.

    Args:
        pdf_path: Path handed to ``BARZOOKA.predict_from_file``.
        pagewise: Forwarded to the model. When true, the result is unpacked as
            a pair whose second item is iterated to build one row per entry.

    Returns:
        Page-wise: a frame with a 1-based ``page`` column and a ``classes``
        column holding each entry. Otherwise: a one-row frame built from the
        model result as-is.
    """
    # NOTE(review): "./tmp/" is a fixed relative folder -- presumably scratch
    # space for page images; confirm how barzooka creates/cleans it.
    prediction = BARZOOKA.predict_from_file(pdf_path, tmp_folder="./tmp/", pagewise=pagewise)
    if not pagewise:
        return pd.DataFrame([prediction])

    _images, per_page = prediction
    records = []
    for page_no, page_classes in enumerate(per_page, start=1):
        records.append({"page": page_no, "classes": page_classes})
    return pd.DataFrame(records)


def screen_pdf(pdf_file, pagewise: bool):
    """Screen one uploaded PDF and return the result table.

    Never raises for prediction failures: a missing upload or any exception
    from ``_predict_single_pdf`` is reported as a one-row frame with a single
    ``error`` column.
    """
    if pdf_file is not None:
        try:
            table = _predict_single_pdf(pdf_file, pagewise)
        except Exception as exc:
            table = pd.DataFrame([{"error": str(exc)}])
        return table

    return pd.DataFrame([{"error": "Please upload a PDF."}])


def screen_images(img_files):
    """Screen a batch of uploaded images and return one row per image.

    Each row holds the file's base name under ``image`` and the matching
    entry of ``BARZOOKA.predict_from_img`` under ``classes``. An empty upload
    or any exception is reported as a one-row frame with an ``error`` column.
    """
    if not img_files:
        return pd.DataFrame([{"error": "Please upload at least one image."}])

    try:
        predictions = BARZOOKA.predict_from_img(img_files)
        records = []
        for path, classes in zip(img_files, predictions):
            records.append({"image": os.path.basename(path), "classes": classes})
        table = pd.DataFrame(records)
    except Exception as err:
        table = pd.DataFrame([{"error": str(err)}])
    return table


def screen_zip_of_pdfs(zip_file, pagewise: bool):
    """Screen every PDF inside an uploaded ZIP and export the results as CSV.

    The archive is extracted into a temporary directory, every file whose
    suffix is ``.pdf`` (case-insensitive) is passed to ``_predict_single_pdf``
    in sorted path order, and the per-file frames are concatenated. A PDF that
    fails contributes a row with its ``paper_id`` (file stem) and ``error``
    instead of aborting the batch.

    Args:
        zip_file: Path (or file-like object) accepted by ``zipfile.ZipFile``;
            ``None`` when nothing was uploaded.
        pagewise: Forwarded to ``_predict_single_pdf``; when true a
            ``paper_id`` column is inserted in front of each file's rows.

    Returns:
        ``(frame, csv_path)``. ``csv_path`` is the path of a freshly written
        ``barzooka_results.csv`` or ``None`` when there is nothing to export
        (no upload, unreadable archive, no PDFs) or the CSV could not be
        written. In the first three cases ``frame`` is a one-row error table.
    """
    if zip_file is None:
        return pd.DataFrame([{"error": "Please upload a .zip containing PDFs."}]), None

    frames = []

    with tempfile.TemporaryDirectory() as tdir:
        try:
            with zipfile.ZipFile(zip_file, "r") as zf:
                zf.extractall(tdir)
        except (zipfile.BadZipFile, RuntimeError, OSError) as e:
            # Corrupt or encrypted archives are reported like every other
            # handler error instead of surfacing as an unhandled exception.
            return pd.DataFrame([{"error": f"Could not read the ZIP: {e}"}]), None

        # Match the suffix case-insensitively so "PAPER.PDF" is not skipped.
        pdf_paths = sorted(
            str(p)
            for p in Path(tdir).rglob("*")
            if p.is_file() and p.suffix.lower() == ".pdf"
        )
        if not pdf_paths:
            return pd.DataFrame([{"error": "No PDFs found in the ZIP."}]), None

        for pdf in pdf_paths:
            try:
                df = _predict_single_pdf(pdf, pagewise=pagewise)
                if pagewise:
                    df.insert(0, "paper_id", Path(pdf).stem)
            except Exception as e:
                df = pd.DataFrame([{"paper_id": Path(pdf).stem, "error": str(e)}])
            frames.append(df)

    result_df = pd.concat(frames, ignore_index=True)

    # Write each export into its own directory: a fixed path in the working
    # directory would be overwritten by concurrent requests. The directory has
    # to outlive this call so the file can be served; it is not removed here.
    try:
        out_dir = Path(tempfile.mkdtemp(prefix="barzooka_results_"))
        out_csv = out_dir / "barzooka_results.csv"
        result_df.to_csv(out_csv, index=False)
    except OSError as e:
        print(f"Could not write results CSV: {e}")
        return result_df, None

    return result_df, str(out_csv)


def diagnostics():
    """Return a one-row table describing the runtime environment.

    Columns: ``pdftocairo`` (the string from ``_check_pdftocairo``),
    ``classes`` (the model's ``class_names`` attribute, or an empty list if it
    has none) and ``model_loaded`` (always ``True``).
    """
    return pd.DataFrame(
        [
            {
                "pdftocairo": _check_pdftocairo(),
                "classes": getattr(BARZOOKA, "class_names", []),
                "model_loaded": True,
            }
        ]
    )


# UI definition: an intro text, a diagnostics row, then one tab per input mode.
# Every button is wired to one of the handler functions defined in this module.
with gr.Blocks(title="Barzooka: Graph-Type Screening") as demo:
    gr.Markdown(
        """
        # Barzooka
        Screen PDFs or images to detect graph types (bar, bardot, box, dot, violin, hist, pie, flow, text, other).

        **Notes:** PDF screening requires the system utility `pdftocairo` (from Poppler). This Space installs
        `poppler-utils` so conversion should work out-of-the-box.
        """
    )

    # Diagnostics: the button takes no inputs and fills the table from diagnostics().
    with gr.Row():
        diag_btn = gr.Button("Environment diagnostics")
        diag_out = gr.Dataframe(label="Diagnostics", wrap=True)
        diag_btn.click(diagnostics, outputs=diag_out)

    # Single PDF: upload + page-wise toggle -> screen_pdf -> results table.
    with gr.Tab("Single PDF"):
        pdf_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
        pagewise_chk = gr.Checkbox(label="Return page-wise predictions", value=False)
        pdf_btn = gr.Button("Screen PDF")
        pdf_out = gr.Dataframe(label="Results", wrap=True)
        pdf_btn.click(fn=screen_pdf, inputs=[pdf_input, pagewise_chk], outputs=pdf_out)

    # Batch: screen_zip_of_pdfs returns two values, mapped to the preview table
    # and the downloadable CSV file, in that order.
    with gr.Tab("ZIP of PDFs (batch)"):
        zip_input = gr.File(label="Upload a ZIP containing PDFs", file_types=[".zip"])
        pw_chk = gr.Checkbox(label="Return page-wise predictions", value=False)
        zip_btn = gr.Button("Screen ZIP")
        zip_out = gr.Dataframe(label="Results (preview)", wrap=True)
        zip_csv = gr.File(label="Download all results (CSV)")
        zip_btn.click(fn=screen_zip_of_pdfs, inputs=[zip_input, pw_chk], outputs=[zip_out, zip_csv])

    # Images: multi-file upload -> screen_images -> results table.
    with gr.Tab("Images"):
        imgs_input = gr.Files(label="Upload images", file_types=[".jpg", ".jpeg", ".png"])
        imgs_btn = gr.Button("Screen Images")
        imgs_out = gr.Dataframe(label="Results", wrap=True)
        imgs_btn.click(fn=screen_images, inputs=imgs_input, outputs=imgs_out)

# Configure the queue once, at import time, so it applies however the app is started.
demo.queue(api_open=True)


# In Spaces, you don't need to call launch(); for local testing:
if __name__ == "__main__":
    # The queue is already configured above; repeating queue() here was redundant.
    demo.launch()