File size: 6,941 Bytes
9f213d8 9356144 9f213d8 9356144 9f213d8 9356144 9f213d8 9356144 9f213d8 9356144 9f213d8 341dd67 9f213d8 341dd67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# app.py
import os
import io
import sys
import zipfile
import tempfile
import subprocess
from pathlib import Path
import gradio as gr
import pandas as pd
import barzooka as bz
def _check_pdftocairo() -> str:
try:
out = subprocess.run(["pdftocairo", "-v"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
if out.returncode == 0 or "pdftocairo" in out.stdout:
return out.stdout.strip().splitlines()[-1]
return f"pdftocairo not found (returncode {out.returncode}). Output:\n{out.stdout}"
except FileNotFoundError:
return "pdftocairo not found (FileNotFoundError). Install poppler-utils."
def _maybe_download_model(target_path: Path) -> bool:
"""If BARZOOKA_MODEL_URL is set, try to download the model to target_path."""
url = os.getenv("BARZOOKA_MODEL_URL", "").strip()
if not url:
return False
try:
import urllib.request
print(f"Downloading Barzooka model from {url} ...")
with urllib.request.urlopen(url) as resp:
data = resp.read()
target_path.write_bytes(data)
print(f"Saved model to {target_path}")
return True
except Exception as e:
print(f"Model download failed: {e}")
return False
def _construct_barzooka() -> bz.Barzooka:
    """Build the Barzooka classifier, trying progressively weaker fallbacks.

    Order of attempts:
      1. ``bz.Barzooka(cpu=True)`` with the package's default model.
      2. A ``barzooka.pkl`` file located next to this module; if it is
         absent, ``_maybe_download_model`` is given a chance to fetch it.

    Returns:
        A ready-to-use ``bz.Barzooka`` instance running on CPU.

    Raises:
        RuntimeError: If the default construction fails and no usable local
            model file is available. The underlying failure is attached as
            the exception's ``__cause__``.
    """
    # 1) Try package-embedded resource
    try:
        return bz.Barzooka(cpu=True)
    except Exception as e_default:
        # Keep a reference: the `as` name is unbound once the except block ends.
        package_error = e_default

    # 2) Fallback: local pkl next to app.py
    local_pkl = Path(__file__).with_name("barzooka.pkl")
    if not local_pkl.exists():
        # 2a) Try to download if env var provided
        _maybe_download_model(local_pkl)

    if not local_pkl.exists():
        # Chain the package failure so the traceback shows why the fallback was needed.
        raise RuntimeError(
            "Barzooka model file not found. Upload 'barzooka.pkl' next to app.py, or set BARZOOKA_MODEL_URL to a downloadable .pkl."
        ) from package_error

    try:
        return bz.Barzooka(model_file=str(local_pkl), cpu=True)
    except Exception as e_local:
        raise RuntimeError(
            "Failed to load Barzooka model from both package resources and local 'barzooka.pkl'.\n"
            f"Package error: {package_error}\nLocal file error: {e_local}"
        ) from e_local
# Build the classifier once at import time; the request handlers below all
# use this single module-level instance. A RuntimeError raised by
# _construct_barzooka() propagates from here, i.e. while the module is loading.
BARZOOKA = _construct_barzooka()
def _predict_single_pdf(pdf_path: str, pagewise: bool = False) -> pd.DataFrame:
    """Run the shared Barzooka model on one PDF and tabulate the outcome.

    Args:
        pdf_path: Location of the PDF handed to ``predict_from_file``.
        pagewise: When true, produce one row per page instead of one row
            for the whole document.

    Returns:
        Page-wise mode: a frame with a 1-based ``page`` column and a
        ``classes`` column. Otherwise: a single-row frame built directly
        from the model's return value.
    """
    prediction = BARZOOKA.predict_from_file(pdf_path, tmp_folder="./tmp/", pagewise=pagewise)

    if not pagewise:
        return pd.DataFrame([prediction])

    # In page-wise mode the result is unpacked as a pair; only the second
    # element (per-page predictions) is used here.
    _images, per_page = prediction
    return pd.DataFrame(
        [
            {"page": number, "classes": labels}
            for number, labels in enumerate(per_page, start=1)
        ]
    )
def screen_pdf(pdf_file, pagewise: bool):
    """Gradio handler for the single-PDF tab.

    Args:
        pdf_file: The uploaded PDF as delivered by the file input, or None
            when nothing was uploaded.
        pagewise: Whether to request per-page predictions.

    Returns:
        The prediction table from ``_predict_single_pdf``; on a missing
        upload or any exception, a one-row table with an ``error`` column.
    """
    if pdf_file is None:
        message = "Please upload a PDF."
    else:
        try:
            return _predict_single_pdf(pdf_file, pagewise)
        except Exception as exc:
            # Show the failure in the results table instead of raising.
            message = str(exc)
    return pd.DataFrame([{"error": message}])
def screen_images(img_files):
    """Gradio handler for the images tab.

    Args:
        img_files: Sequence of uploaded image paths; None or empty when
            nothing was uploaded.

    Returns:
        A table with one row per image (``image`` = file base name,
        ``classes`` = the model's prediction for it); on a missing upload
        or any exception, a one-row table with an ``error`` column.
    """
    if not img_files:
        message = "Please upload at least one image."
    else:
        try:
            predicted = BARZOOKA.predict_from_img(img_files)
            return pd.DataFrame(
                [
                    {"image": os.path.basename(path), "classes": labels}
                    for path, labels in zip(img_files, predicted)
                ]
            )
        except Exception as exc:
            # Show the failure in the results table instead of raising.
            message = str(exc)
    return pd.DataFrame([{"error": message}])
def screen_zip_of_pdfs(zip_file, pagewise: bool):
    """Gradio handler for the batch tab: screen every PDF inside a ZIP.

    The archive is unpacked into a throwaway directory, each PDF is run
    through ``_predict_single_pdf``, and the per-file tables are stacked.
    A failure on one PDF becomes an ``error`` row for that file rather than
    aborting the batch.

    Args:
        zip_file: The uploaded archive (path or file object accepted by
            ``zipfile.ZipFile``), or None when nothing was uploaded.
        pagewise: Whether to request per-page predictions; in that mode a
            ``paper_id`` column (the PDF's file stem) is prepended.

    Returns:
        A ``(table, csv_path)`` pair. ``csv_path`` is the location of a CSV
        copy of the table, or None when there is nothing to download or the
        CSV could not be written.
    """
    if zip_file is None:
        return pd.DataFrame([{"error": "Please upload a .zip containing PDFs."}]), None

    frames = []
    with tempfile.TemporaryDirectory() as tdir:
        root = Path(tdir)
        try:
            with zipfile.ZipFile(zip_file, "r") as zf:
                zf.extractall(tdir)
        except (zipfile.BadZipFile, RuntimeError, OSError) as e:
            # Corrupt, encrypted or unreadable archive: report it in the table
            # like the other handlers do, instead of raising into the UI.
            return pd.DataFrame([{"error": f"Could not read the ZIP archive: {e}"}]), None

        # Match the extension case-insensitively, and skip the "__MACOSX/._*"
        # resource-fork stubs that macOS adds to archives (they are not PDFs).
        pdf_paths = sorted(
            str(p)
            for p in root.rglob("*")
            if p.is_file()
            and p.suffix.lower() == ".pdf"
            and not p.name.startswith("._")
            and "__MACOSX" not in p.relative_to(root).parts
        )
        if not pdf_paths:
            return pd.DataFrame([{"error": "No PDFs found in the ZIP."}]), None

        for pdf in pdf_paths:
            try:
                df = _predict_single_pdf(pdf, pagewise=pagewise)
                if pagewise:
                    df.insert(0, "paper_id", Path(pdf).stem)
            except Exception as e:
                df = pd.DataFrame([{"paper_id": Path(pdf).stem, "error": str(e)}])
            frames.append(df)

    result_df = pd.concat(frames, ignore_index=True)

    # Write the CSV into a fresh per-request directory. A fixed path in the
    # working directory would let concurrent requests overwrite each other's
    # download; a unique directory keeps the friendly file name without that race.
    try:
        out_dir = Path(tempfile.mkdtemp(prefix="barzooka_"))
        out_csv = out_dir / "barzooka_results.csv"
        result_df.to_csv(out_csv, index=False)
    except OSError:
        # The table is still useful on screen; just offer no download.
        return result_df, None
    return result_df, str(out_csv)
def diagnostics():
    """Collect a one-row environment report for the diagnostics button.

    Returns:
        A single-row table with the ``pdftocairo`` probe result, the
        model's ``class_names`` attribute (empty list if it has none), and
        a constant ``model_loaded`` flag.
    """
    report = {"pdftocairo": _check_pdftocairo()}
    report["classes"] = getattr(BARZOOKA, "class_names", [])
    # This function can only run after the module-level model was built.
    report["model_loaded"] = True
    return pd.DataFrame([report])
# UI definition. Each tab wires one upload widget to one of the handler
# functions defined above and shows the returned table.
# NOTE(review): nesting below is reconstructed from statement order -- confirm
# which widgets belong inside the Row.
with gr.Blocks(title="Barzooka: Graph-Type Screening") as demo:
    gr.Markdown(
        """
# Barzooka
Screen PDFs or images to detect graph types (bar, bardot, box, dot, violin, hist, pie, flow, text, other).
**Notes:** PDF screening requires the system utility `pdftocairo` (from Poppler). This Space installs
`poppler-utils` so conversion should work out-of-the-box.
"""
    )

    # Diagnostics: button + table fed by diagnostics() (takes no inputs).
    with gr.Row():
        diag_btn = gr.Button("Environment diagnostics")
        diag_out = gr.Dataframe(label="Diagnostics", wrap=True)
    diag_btn.click(diagnostics, outputs=diag_out)

    # Tab 1: one PDF -> screen_pdf(pdf, pagewise) -> results table.
    with gr.Tab("Single PDF"):
        pdf_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
        pagewise_chk = gr.Checkbox(label="Return page-wise predictions", value=False)
        pdf_btn = gr.Button("Screen PDF")
        pdf_out = gr.Dataframe(label="Results", wrap=True)
        pdf_btn.click(fn=screen_pdf, inputs=[pdf_input, pagewise_chk], outputs=pdf_out)

    # Tab 2: ZIP of PDFs -> screen_zip_of_pdfs(zip, pagewise) -> (table, CSV file).
    with gr.Tab("ZIP of PDFs (batch)"):
        zip_input = gr.File(label="Upload a ZIP containing PDFs", file_types=[".zip"])
        pw_chk = gr.Checkbox(label="Return page-wise predictions", value=False)
        zip_btn = gr.Button("Screen ZIP")
        zip_out = gr.Dataframe(label="Results (preview)", wrap=True)
        zip_csv = gr.File(label="Download all results (CSV)")
        zip_btn.click(fn=screen_zip_of_pdfs, inputs=[zip_input, pw_chk], outputs=[zip_out, zip_csv])

    # Tab 3: one or more images -> screen_images(files) -> results table.
    with gr.Tab("Images"):
        imgs_input = gr.Files(label="Upload images", file_types=[".jpg", ".jpeg", ".png"])
        imgs_btn = gr.Button("Screen Images")
        imgs_out = gr.Dataframe(label="Results", wrap=True)
        imgs_btn.click(fn=screen_images, inputs=imgs_input, outputs=imgs_out)
# Configure the request queue at import time, so it is set up even when this
# module is imported by a host rather than run as a script.
demo.queue(api_open=True)

# In Spaces, you don't need to call launch(); this guard is for local testing.
# queue() is applied again here and chained straight into launch().
if __name__ == "__main__":
    demo.queue(api_open=True).launch()
|