aaron1141 committed on
Commit
c34be21
·
1 Parent(s): 4f53326

feat: bundle extracted images + JSONL into vqa_output.zip for download

Browse files
Files changed (1) hide show
  1. app.py +40 -8
app.py CHANGED
@@ -5,6 +5,7 @@ import json
5
  import shutil
6
  import tempfile
7
  import traceback
 
8
 
9
  import gradio as gr
10
 
@@ -45,7 +46,7 @@ T = {
45
  sec_output="📤 输出",
46
  status_label="运行状态",
47
  status_ph="点击「开始提取」后进度显示在这里(运行需数分钟,请耐心等待)…",
48
- output_label="下载结果(raw_vqa.jsonl)",
49
  preview_label="结果预览",
50
  ),
51
  "en": dict(
@@ -78,7 +79,7 @@ T = {
78
  sec_output="📤 Output",
79
  status_label="Status",
80
  status_ph="Click 'Start Extraction' to begin (may take several minutes)…",
81
- output_label="Download Result (raw_vqa.jsonl)",
82
  preview_label="Result Preview",
83
  ),
84
  }
@@ -275,22 +276,53 @@ def run_vqa_extraction(
275
  max_step = max(int(re.findall(r"vqa_step(\d+)\.jsonl", f)[0]) for f in step_files)
276
  max_step_file = os.path.join(cache_dir, f"vqa_step{max_step}.jsonl")
277
 
278
- result_file = os.path.join(workspace, "raw_vqa.jsonl")
279
- count = 0
280
- with open(max_step_file) as f_in, open(result_file, "w") as f_out:
 
 
 
 
 
281
  for line in f_in:
282
  data = json.loads(line)
283
  qa_item = data.get("vqa_pair")
284
  if not qa_item:
285
  continue
286
- out = {"name": data.get("name", task_name), **qa_item}
 
287
  if not out.get("solution"):
288
  out["solution"] = out.get("answer", "")
289
  f_out.write(json.dumps(out, ensure_ascii=False) + "\n")
290
  count += 1
291
 
292
- done = f"✅ Done! Extracted {count} QA pairs." if lang == "en" else f"✅ 完成!共提取 {count} 条 QA 对。"
293
- yield result_file, done, _render_preview(result_file, lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  except Exception:
296
  yield None, f"❌ Unexpected error:\n{traceback.format_exc()}", ""
 
5
  import shutil
6
  import tempfile
7
  import traceback
8
+ import zipfile
9
 
10
  import gradio as gr
11
 
 
46
  sec_output="📤 输出",
47
  status_label="运行状态",
48
  status_ph="点击「开始提取」后进度显示在这里(运行需数分钟,请耐心等待)…",
49
+ output_label="下载结果(vqa_output.zip,含 JSONL + 图片)",
50
  preview_label="结果预览",
51
  ),
52
  "en": dict(
 
79
  sec_output="📤 Output",
80
  status_label="Status",
81
  status_ph="Click 'Start Extraction' to begin (may take several minutes)…",
82
+ output_label="Download Result (vqa_output.zip — JSONL + images)",
83
  preview_label="Result Preview",
84
  ),
85
  }
 
276
  max_step = max(int(re.findall(r"vqa_step(\d+)\.jsonl", f)[0]) for f in step_files)
277
  max_step_file = os.path.join(cache_dir, f"vqa_step{max_step}.jsonl")
278
 
279
+ # ── Collect QA pairs & copy per-task image directories ────────────────
280
+ output_dir = os.path.join(workspace, "output")
281
+ os.makedirs(output_dir, exist_ok=True)
282
+ jsonl_path = os.path.join(output_dir, "raw_vqa.jsonl")
283
+ count = 0
284
+ image_dirs_found = 0
285
+
286
+ with open(max_step_file) as f_in, open(jsonl_path, "w") as f_out:
287
  for line in f_in:
288
  data = json.loads(line)
289
  qa_item = data.get("vqa_pair")
290
  if not qa_item:
291
  continue
292
+ name = data.get("name", task_name)
293
+ out = {"name": name, **qa_item, "image_basedir": "."}
294
  if not out.get("solution"):
295
  out["solution"] = out.get("answer", "")
296
  f_out.write(json.dumps(out, ensure_ascii=False) + "\n")
297
  count += 1
298
 
299
+ # Copy cache/{name}/ → output/{name}/ (contains vqa_images/)
300
+ src_task_dir = os.path.join(cache_dir, name)
301
+ dst_task_dir = os.path.join(output_dir, name)
302
+ if os.path.isdir(src_task_dir) and not os.path.exists(dst_task_dir):
303
+ shutil.copytree(src_task_dir, dst_task_dir)
304
+ image_dirs_found += 1
305
+
306
+ # ── Pack into a zip so images + JSONL download together ──────────────
307
+ zip_path = os.path.join(workspace, "vqa_output.zip")
308
+ with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
309
+ for root, dirs, files in os.walk(output_dir):
310
+ for fname in files:
311
+ full = os.path.join(root, fname)
312
+ arcname = os.path.relpath(full, output_dir)
313
+ zf.write(full, arcname)
314
+
315
+ img_note = (
316
+ f" ({image_dirs_found} image folder(s) bundled)"
317
+ if lang == "en" else
318
+ f"(含 {image_dirs_found} 个图片文件夹)"
319
+ )
320
+ done = (
321
+ f"✅ Done! Extracted {count} QA pairs{img_note}. Download the zip to get images + JSONL."
322
+ if lang == "en" else
323
+ f"✅ 完成!共提取 {count} 条 QA 对{img_note}。下载 zip 可获得 JSONL 和图片。"
324
+ )
325
+ yield zip_path, done, _render_preview(jsonl_path, lang)
326
 
327
  except Exception:
328
  yield None, f"❌ Unexpected error:\n{traceback.format_exc()}", ""