Spaces:
Sleeping
Sleeping
Software loading logic rewrite
Browse files
Massive rewrite based on working Google Colab code for the three software instances.
- README.md +3 -3
- app.py +161 -55
- packages.txt +1 -0
- requirements.txt +1 -0
README.md
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
title: FormatAnalyser
|
| 3 |
emoji: ποΈ
|
| 4 |
colorFrom: red
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
|
@@ -76,4 +76,4 @@ python app.py
|
|
| 76 |
|
| 77 |
- **JHOVE installer URL** may change with new releases — check [openpreservation.org](https://openpreservation.org/products/jhove/) if install fails
|
| 78 |
- **DROID signature file** is downloaded from The National Archives; the version number in `app.py` may need updating over time
|
| 79 |
-
- On Hugging Face Spaces the first analysis run triggers tool installation; subsequent runs in the same session skip this step
|
|
|
|
| 2 |
title: FormatAnalyser
|
| 3 |
emoji: ποΈ
|
| 4 |
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: "4.44.0"
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
|
|
|
| 76 |
|
| 77 |
- **JHOVE installer URL** may change with new releases — check [openpreservation.org](https://openpreservation.org/products/jhove/) if install fails
|
| 78 |
- **DROID signature file** is downloaded from The National Archives; the version number in `app.py` may need updating over time
|
| 79 |
+
- On Hugging Face Spaces the first analysis run triggers tool installation; subsequent runs in the same session skip this step
|
app.py
CHANGED
|
@@ -14,6 +14,10 @@ import shutil
|
|
| 14 |
import tempfile
|
| 15 |
import threading
|
| 16 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
import gradio as gr
|
| 19 |
import pandas as pd
|
|
@@ -53,50 +57,109 @@ def setup_tools():
|
|
| 53 |
log.append("β Siegfried already on PATH")
|
| 54 |
else:
|
| 55 |
log.append("β¬ Installing Siegfriedβ¦")
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
run_cmd(
|
| 58 |
"wget -q https://github.com/richardlehane/siegfried/releases/download/"
|
| 59 |
-
"v1.11.
|
| 60 |
)
|
| 61 |
-
run_cmd("
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
run_cmd("sf -update 2>&1 || true")
|
|
|
|
| 64 |
if shutil.which("sf"):
|
| 65 |
TOOL_STATE["siegfried"]["ready"] = True
|
| 66 |
-
log.append(
|
|
|
|
|
|
|
| 67 |
else:
|
| 68 |
log.append("β Siegfried install failed")
|
| 69 |
|
| 70 |
# ββ DROID βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 71 |
-
DROID_VERSION = "6.
|
| 72 |
-
jar_candidates = [
|
| 73 |
-
f"/opt/droid/droid-command-line-{DROID_VERSION}.jar",
|
| 74 |
-
"/opt/droid/droid-command-line.jar",
|
| 75 |
-
]
|
| 76 |
sig_path = "/root/.droid6/signature_files/DROID_SignatureFile_V118.xml"
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
if jar_path and os.path.exists(jar_path):
|
| 85 |
TOOL_STATE["droid"]["ready"] = True
|
| 86 |
-
TOOL_STATE["droid"]["jar"] =
|
| 87 |
-
log.append(f"β DROID already
|
| 88 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
log.append("β¬ Installing DROIDβ¦")
|
| 90 |
-
run_cmd("apt-get install -qq -y default-jre unzip wget 2>/dev/null || true")
|
|
|
|
| 91 |
DROID_URL = (
|
| 92 |
f"https://github.com/digital-preservation/droid/releases/download/"
|
| 93 |
-
f"
|
| 94 |
)
|
| 95 |
run_cmd(f"wget -q {DROID_URL} -O /tmp/droid.zip")
|
| 96 |
-
run_cmd("unzip -q /tmp/droid.zip -d /
|
| 97 |
-
|
|
|
|
| 98 |
if stdout:
|
| 99 |
jar_path = stdout
|
|
|
|
|
|
|
|
|
|
| 100 |
TOOL_STATE["droid"]["ready"] = True
|
| 101 |
TOOL_STATE["droid"]["jar"] = jar_path
|
| 102 |
log.append(f"β DROID installed: {jar_path}")
|
|
@@ -117,36 +180,67 @@ def setup_tools():
|
|
| 117 |
log.append("β DROID signature file missing β results may be limited")
|
| 118 |
|
| 119 |
# ββ JHOVE ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
stdout, _, _ = run_cmd(
|
| 123 |
"find /opt/jhove -name 'jhove' -not -name '*.jar' 2>/dev/null | head -1"
|
| 124 |
)
|
| 125 |
-
if stdout:
|
| 126 |
-
|
| 127 |
|
| 128 |
-
if
|
| 129 |
-
TOOL_STATE["jhove"]["ready"] = True
|
| 130 |
-
TOOL_STATE["jhove"]["path"] = jhove_sh
|
| 131 |
-
log.append(f"β JHOVE already at {jhove_sh}")
|
| 132 |
-
else:
|
| 133 |
log.append("β¬ Installing JHOVE (may take ~2 min)β¦")
|
| 134 |
-
|
| 135 |
-
run_cmd(
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
)
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
stdout, _, _ = run_cmd(
|
| 142 |
-
"find /opt/jhove -name 'jhove' -not -name '*.jar' 2>/dev/null | head -1"
|
| 143 |
-
)
|
| 144 |
-
if stdout:
|
| 145 |
-
jhove_sh = stdout
|
| 146 |
-
if os.path.exists(jhove_sh):
|
| 147 |
-
TOOL_STATE["jhove"]["ready"] = True
|
| 148 |
-
TOOL_STATE["jhove"]["path"] = jhove_sh
|
| 149 |
-
log.append(f"β JHOVE installed: {jhove_sh}")
|
| 150 |
else:
|
| 151 |
log.append("β JHOVE install failed")
|
| 152 |
|
|
@@ -193,14 +287,26 @@ def run_droid(filepath: str) -> dict:
|
|
| 193 |
with tempfile.TemporaryDirectory() as tmp:
|
| 194 |
profile = os.path.join(tmp, "profile.droid")
|
| 195 |
csv_out = os.path.join(tmp, "export.csv")
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
if os.path.exists(csv_out):
|
| 205 |
with open(csv_out) as f:
|
| 206 |
rows = list(csv.DictReader(f))
|
|
|
|
| 14 |
import tempfile
|
| 15 |
import threading
|
| 16 |
from pathlib import Path
|
| 17 |
+
try:
|
| 18 |
+
import requests
|
| 19 |
+
except ImportError:
|
| 20 |
+
requests = None
|
| 21 |
|
| 22 |
import gradio as gr
|
| 23 |
import pandas as pd
|
|
|
|
| 57 |
log.append("β Siegfried already on PATH")
|
| 58 |
else:
|
| 59 |
log.append("β¬ Installing Siegfriedβ¦")
|
| 60 |
+
# try to fetch via apt/wget as before, but installation may not permit
|
| 61 |
+
# writing to /usr/local/bin on some environments (e.g. Colab), so we
|
| 62 |
+
# also support a user-space fallback where the binary is kept in a
|
| 63 |
+
# local directory and added to PATH.
|
| 64 |
+
run_cmd("apt-get install -qq -y wget unzip 2>/dev/null || true")
|
| 65 |
run_cmd(
|
| 66 |
"wget -q https://github.com/richardlehane/siegfried/releases/download/"
|
| 67 |
+
"v1.11.4/siegfried_1-11-4_linux64.zip -O /tmp/sf.zip"
|
| 68 |
)
|
| 69 |
+
run_cmd("unzip -q /tmp/sf.zip -d /tmp/sf")
|
| 70 |
+
sf_bin = "/tmp/sf/sf"
|
| 71 |
+
# if the download/unzip above failed (no wget or unzip), try Python
|
| 72 |
+
if not os.path.exists(sf_bin):
|
| 73 |
+
# pure-Python fallback inspired by Colab snippet
|
| 74 |
+
siegfried_url = (
|
| 75 |
+
"https://github.com/richardlehane/siegfried/"
|
| 76 |
+
"releases/download/v1.11.4/siegfried_1-11-4_linux64.zip"
|
| 77 |
+
)
|
| 78 |
+
local_filename = os.path.join("/tmp", siegfried_url.split("/")[-1])
|
| 79 |
+
try:
|
| 80 |
+
import requests, zipfile
|
| 81 |
+
|
| 82 |
+
with requests.get(siegfried_url, stream=True) as r:
|
| 83 |
+
r.raise_for_status()
|
| 84 |
+
with open(local_filename, "wb") as f:
|
| 85 |
+
for chunk in r.iter_content(chunk_size=8192):
|
| 86 |
+
f.write(chunk)
|
| 87 |
+
os.makedirs("/tmp/sf", exist_ok=True)
|
| 88 |
+
with zipfile.ZipFile(local_filename, "r") as zip_ref:
|
| 89 |
+
zip_ref.extractall(path="/tmp/sf")
|
| 90 |
+
except Exception:
|
| 91 |
+
pass
|
| 92 |
+
# ensure executable
|
| 93 |
+
if os.path.exists(sf_bin):
|
| 94 |
+
run_cmd(f"chmod +x {sf_bin}")
|
| 95 |
+
# attempt to move into system path, but don't fail if we can't
|
| 96 |
+
moved = False
|
| 97 |
+
try:
|
| 98 |
+
run_cmd(f"mv {sf_bin} /usr/local/bin/sf")
|
| 99 |
+
run_cmd("chmod +x /usr/local/bin/sf")
|
| 100 |
+
moved = True
|
| 101 |
+
except Exception:
|
| 102 |
+
# ignore permission issues
|
| 103 |
+
pass
|
| 104 |
+
|
| 105 |
+
# always make sure the directory containing the binary is in PATH
|
| 106 |
+
os.environ["PATH"] += os.pathsep + os.path.dirname(sf_bin)
|
| 107 |
+
|
| 108 |
+
# update signatures using whatever sf binary is found in PATH
|
| 109 |
run_cmd("sf -update 2>&1 || true")
|
| 110 |
+
|
| 111 |
if shutil.which("sf"):
|
| 112 |
TOOL_STATE["siegfried"]["ready"] = True
|
| 113 |
+
log.append(
|
| 114 |
+
"β Siegfried installed" + (" to /usr/local/bin" if moved else " (user path)")
|
| 115 |
+
)
|
| 116 |
else:
|
| 117 |
log.append("β Siegfried install failed")
|
| 118 |
|
| 119 |
# ββ DROID βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 120 |
+
DROID_VERSION = "6.9.12"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
sig_path = "/root/.droid6/signature_files/DROID_SignatureFile_V118.xml"
|
| 122 |
|
| 123 |
+
# check for either jar or helper script on PATH
|
| 124 |
+
jar_path = None
|
| 125 |
+
script_path = shutil.which("droid") or shutil.which("droid.sh")
|
| 126 |
+
if script_path:
|
| 127 |
+
# we can rely on the script to locate the jar itself later
|
|
|
|
|
|
|
| 128 |
TOOL_STATE["droid"]["ready"] = True
|
| 129 |
+
TOOL_STATE["droid"]["jar"] = script_path
|
| 130 |
+
log.append(f"β DROID script already on PATH ({script_path})")
|
| 131 |
else:
|
| 132 |
+
jar_candidates = [
|
| 133 |
+
f"/opt/droid/droid-command-line-{DROID_VERSION}.jar",
|
| 134 |
+
"/opt/droid/droid-command-line.jar",
|
| 135 |
+
]
|
| 136 |
+
jar_path = next((p for p in jar_candidates if os.path.exists(p)), None)
|
| 137 |
+
if jar_path is None:
|
| 138 |
+
stdout, _, _ = run_cmd("find /opt/droid -name '*.jar' 2>/dev/null | head -1")
|
| 139 |
+
if stdout:
|
| 140 |
+
jar_path = stdout
|
| 141 |
+
if jar_path and os.path.exists(jar_path):
|
| 142 |
+
TOOL_STATE["droid"]["ready"] = True
|
| 143 |
+
TOOL_STATE["droid"]["jar"] = jar_path
|
| 144 |
+
log.append(f"β DROID already at {jar_path}")
|
| 145 |
+
# if we still aren't ready, attempt installation/fallback
|
| 146 |
+
if not TOOL_STATE["droid"]["ready"]:
|
| 147 |
log.append("β¬ Installing DROIDβ¦")
|
| 148 |
+
run_cmd("apt-get install -qq -y openjdk-21-jre-headless default-jre unzip wget 2>/dev/null || true")
|
| 149 |
+
# primary mechanism: download the binary bundle and extract locally
|
| 150 |
DROID_URL = (
|
| 151 |
f"https://github.com/digital-preservation/droid/releases/download/"
|
| 152 |
+
f"{DROID_VERSION}/droid-binary-{DROID_VERSION}-bin.zip"
|
| 153 |
)
|
| 154 |
run_cmd(f"wget -q {DROID_URL} -O /tmp/droid.zip")
|
| 155 |
+
run_cmd("mkdir -p /tmp/droid && unzip -q /tmp/droid.zip -d /tmp/droid")
|
| 156 |
+
# look for jar in extracted tree
|
| 157 |
+
stdout, _, _ = run_cmd("find /tmp/droid -name '*.jar' 2>/dev/null | head -1")
|
| 158 |
if stdout:
|
| 159 |
jar_path = stdout
|
| 160 |
+
# also add the containing directory to PATH so droid.sh can execute
|
| 161 |
+
dirname = os.path.dirname(jar_path)
|
| 162 |
+
os.environ["PATH"] += os.pathsep + dirname
|
| 163 |
TOOL_STATE["droid"]["ready"] = True
|
| 164 |
TOOL_STATE["droid"]["jar"] = jar_path
|
| 165 |
log.append(f"β DROID installed: {jar_path}")
|
|
|
|
| 180 |
log.append("β DROID signature file missing β results may be limited")
|
| 181 |
|
| 182 |
# ββ JHOVE ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 183 |
+
# target installation directory (fallback to /opt if possible)
|
| 184 |
+
jhove_dir = "/opt/jhove"
|
| 185 |
+
jhove_bin = os.path.join(jhove_dir, "jhove")
|
| 186 |
+
|
| 187 |
+
# helper that sets state when we have a working binary
|
| 188 |
+
def set_jhove(path):
|
| 189 |
+
TOOL_STATE["jhove"]["ready"] = True
|
| 190 |
+
TOOL_STATE["jhove"]["path"] = path
|
| 191 |
+
log.append(f"β JHOVE ready at {path}")
|
| 192 |
+
# make sure environment is aware of it
|
| 193 |
+
os.environ["PATH"] += os.pathsep + os.path.dirname(path)
|
| 194 |
+
os.environ["JHOVE_HOME"] = os.path.dirname(path)
|
| 195 |
+
|
| 196 |
+
if os.path.exists(jhove_bin):
|
| 197 |
+
set_jhove(jhove_bin)
|
| 198 |
+
else:
|
| 199 |
+
# search anywhere under /opt/jhove in case installer wrote elsewhere
|
| 200 |
stdout, _, _ = run_cmd(
|
| 201 |
"find /opt/jhove -name 'jhove' -not -name '*.jar' 2>/dev/null | head -1"
|
| 202 |
)
|
| 203 |
+
if stdout and os.path.exists(stdout):
|
| 204 |
+
set_jhove(stdout)
|
| 205 |
|
| 206 |
+
if not TOOL_STATE["jhove"]["ready"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
log.append("β¬ Installing JHOVE (may take ~2 min)β¦")
|
| 208 |
+
# Ensure Java 21 is available
|
| 209 |
+
run_cmd("apt-get install -qq -y openjdk-21-jre-headless wget 2>/dev/null || true")
|
| 210 |
+
# download installer jar
|
| 211 |
+
run_cmd("wget -q -O /tmp/jhove-latest.jar https://software.openpreservation.org/rel/jhove-latest.jar")
|
| 212 |
+
|
| 213 |
+
# create an automated installation xml and run it
|
| 214 |
+
config_xml = "/tmp/jhove-auto.xml"
|
| 215 |
+
install_target = jhove_dir
|
| 216 |
+
xml_content = f"""<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>
|
| 217 |
+
<AutomatedInstallation langpack=\"eng\">
|
| 218 |
+
<com.izforge.izpack.panels.htmlhello.HTMLHelloPanel id=\"welcome\"/>
|
| 219 |
+
<com.izforge.izpack.panels.target.TargetPanel id=\"install_dir\">
|
| 220 |
+
<installpath>{install_target}</installpath>
|
| 221 |
+
</com.izforge.izpack.panels.target.TargetPanel>
|
| 222 |
+
<com.izforge.izpack.panels.packs.PacksPanel id=\"sdk_pack_select\">
|
| 223 |
+
<pack index=\"0\" name=\"JHOVE Application\" selected=\"true\"/>
|
| 224 |
+
<pack index=\"1\" name=\"JHOVE Shell Scripts\" selected=\"true\"/>
|
| 225 |
+
<pack index=\"2\" name=\"JHOVE External Modules\" selected=\"true\"/>
|
| 226 |
+
</com.izforge.izpack.panels.packs.PacksPanel>
|
| 227 |
+
<com.izforge.izpack.panels.install.InstallPanel id=\"install\"/>
|
| 228 |
+
<com.izforge.izpack.panels.finish.FinishPanel id=\"finish\"/>
|
| 229 |
+
</AutomatedInstallation>
|
| 230 |
+
"""
|
| 231 |
+
try:
|
| 232 |
+
with open(config_xml, "w") as f:
|
| 233 |
+
f.write(xml_content)
|
| 234 |
+
run_cmd(f"java -jar /tmp/jhove-latest.jar {config_xml}")
|
| 235 |
+
except Exception:
|
| 236 |
+
pass
|
| 237 |
+
|
| 238 |
+
# look for the binary again in the installation directory
|
| 239 |
+
stdout, _, _ = run_cmd(
|
| 240 |
+
f"find {install_target} -name 'jhove' -not -name '*.jar' 2>/dev/null | head -1"
|
| 241 |
)
|
| 242 |
+
if stdout and os.path.exists(stdout):
|
| 243 |
+
set_jhove(stdout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
else:
|
| 245 |
log.append("β JHOVE install failed")
|
| 246 |
|
|
|
|
| 287 |
with tempfile.TemporaryDirectory() as tmp:
|
| 288 |
profile = os.path.join(tmp, "profile.droid")
|
| 289 |
csv_out = os.path.join(tmp, "export.csv")
|
| 290 |
+
# if jar points to a shell script, invoke it directly; otherwise
|
| 291 |
+
# assume it's a jar and run with java -jar.
|
| 292 |
+
if jar.endswith(".sh") or jar.endswith("droid"):
|
| 293 |
+
subprocess.run(
|
| 294 |
+
[jar, "-a", filepath, "-Ns", sig, "-p", profile],
|
| 295 |
+
capture_output=True, text=True, timeout=120
|
| 296 |
+
)
|
| 297 |
+
export = subprocess.run(
|
| 298 |
+
[jar, "-p", profile, "-e", csv_out],
|
| 299 |
+
capture_output=True, text=True, timeout=60
|
| 300 |
+
)
|
| 301 |
+
else:
|
| 302 |
+
subprocess.run(
|
| 303 |
+
["java", "-jar", jar, "-a", filepath, "-Ns", sig, "-p", profile],
|
| 304 |
+
capture_output=True, text=True, timeout=120
|
| 305 |
+
)
|
| 306 |
+
export = subprocess.run(
|
| 307 |
+
["java", "-jar", jar, "-p", profile, "-e", csv_out],
|
| 308 |
+
capture_output=True, text=True, timeout=60
|
| 309 |
+
)
|
| 310 |
if os.path.exists(csv_out):
|
| 311 |
with open(csv_out) as f:
|
| 312 |
rows = list(csv.DictReader(f))
|
packages.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
default-jre
|
| 2 |
wget
|
| 3 |
unzip
|
|
|
|
| 1 |
+
openjdk-21-jre-headless
|
| 2 |
default-jre
|
| 3 |
wget
|
| 4 |
unzip
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
gradio>=4.0.0
|
| 2 |
pandas>=2.0.0
|
|
|
|
|
|
| 1 |
gradio>=4.0.0
|
| 2 |
pandas>=2.0.0
|
| 3 |
+
requests>=2.0.0
|