Spaces:

OpenDCAI
/

DataFlow-VQA

Running

App Files Community

aaron1141 committed on Apr 1

Commit

c34be21

1 Parent(s): 4f53326

feat: bundle extracted images + JSONL into vqa_output.zip for download

Browse files

Files changed (1) hide show

app.py +40 -8

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import json
 import shutil
 import tempfile
 import traceback
 import gradio as gr
@@ -45,7 +46,7 @@ T = {
         sec_output="📤 输出",
         status_label="运行状态",
         status_ph="点击「开始提取」后进度显示在这里（运行需数分钟，请耐心等待）…",
-        output_label="下载结果（raw_vqa.jsonl）",
         preview_label="结果预览",
     ),
     "en": dict(
@@ -78,7 +79,7 @@ T = {
         sec_output="📤 Output",
         status_label="Status",
         status_ph="Click 'Start Extraction' to begin (may take several minutes)…",
-        output_label="Download Result (raw_vqa.jsonl)",
         preview_label="Result Preview",
     ),
 }
@@ -275,22 +276,53 @@ def run_vqa_extraction(
         max_step = max(int(re.findall(r"vqa_step(\d+)\.jsonl", f)[0]) for f in step_files)
         max_step_file = os.path.join(cache_dir, f"vqa_step{max_step}.jsonl")
-        result_file = os.path.join(workspace, "raw_vqa.jsonl")
-        count = 0
-        with open(max_step_file) as f_in, open(result_file, "w") as f_out:
             for line in f_in:
                 data    = json.loads(line)
                 qa_item = data.get("vqa_pair")
                 if not qa_item:
                     continue
-                out = {"name": data.get("name", task_name), **qa_item}
                 if not out.get("solution"):
                     out["solution"] = out.get("answer", "")
                 f_out.write(json.dumps(out, ensure_ascii=False) + "\n")
                 count += 1
-        done = f"✅ Done! Extracted {count} QA pairs." if lang == "en" else f"✅ 完成！共提取 {count} 条 QA 对。"
-        yield result_file, done, _render_preview(result_file, lang)
     except Exception:
         yield None, f"❌ Unexpected error:\n{traceback.format_exc()}", ""

 import shutil
 import tempfile
 import traceback
+import zipfile
 import gradio as gr
         sec_output="📤 输出",
         status_label="运行状态",
         status_ph="点击「开始提取」后进度显示在这里（运行需数分钟，请耐心等待）…",
+        output_label="下载结果（vqa_output.zip，含 JSONL + 图片）",
         preview_label="结果预览",
     ),
     "en": dict(
         sec_output="📤 Output",
         status_label="Status",
         status_ph="Click 'Start Extraction' to begin (may take several minutes)…",
+        output_label="Download Result (vqa_output.zip — JSONL + images)",
         preview_label="Result Preview",
     ),
 }
         max_step = max(int(re.findall(r"vqa_step(\d+)\.jsonl", f)[0]) for f in step_files)
         max_step_file = os.path.join(cache_dir, f"vqa_step{max_step}.jsonl")
+        # ── Collect QA pairs & copy per-task image directories ────────────────
+        output_dir  = os.path.join(workspace, "output")
+        os.makedirs(output_dir, exist_ok=True)
+        jsonl_path  = os.path.join(output_dir, "raw_vqa.jsonl")
+        count       = 0
+        image_dirs_found = 0
+        with open(max_step_file) as f_in, open(jsonl_path, "w") as f_out:
             for line in f_in:
                 data    = json.loads(line)
                 qa_item = data.get("vqa_pair")
                 if not qa_item:
                     continue
+                name = data.get("name", task_name)
+                out  = {"name": name, **qa_item, "image_basedir": "."}
                 if not out.get("solution"):
                     out["solution"] = out.get("answer", "")
                 f_out.write(json.dumps(out, ensure_ascii=False) + "\n")
                 count += 1
+                # Copy cache/{name}/ → output/{name}/  (contains vqa_images/)
+                src_task_dir = os.path.join(cache_dir, name)
+                dst_task_dir = os.path.join(output_dir, name)
+                if os.path.isdir(src_task_dir) and not os.path.exists(dst_task_dir):
+                    shutil.copytree(src_task_dir, dst_task_dir)
+                    image_dirs_found += 1
+        # ── Pack into a zip so images + JSONL download together ──────────────
+        zip_path = os.path.join(workspace, "vqa_output.zip")
+        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
+            for root, dirs, files in os.walk(output_dir):
+                for fname in files:
+                    full = os.path.join(root, fname)
+                    arcname = os.path.relpath(full, output_dir)
+                    zf.write(full, arcname)
+        img_note = (
+            f" ({image_dirs_found} image folder(s) bundled)"
+            if lang == "en" else
+            f"（含 {image_dirs_found} 个图片文件夹）"
+        )
+        done = (
+            f"✅ Done! Extracted {count} QA pairs{img_note}. Download the zip to get images + JSONL."
+            if lang == "en" else
+            f"✅ 完成！共提取 {count} 条 QA 对{img_note}。下载 zip 可获得 JSONL 和图片。"
+        )
+        yield zip_path, done, _render_preview(jsonl_path, lang)
     except Exception:
         yield None, f"❌ Unexpected error:\n{traceback.format_exc()}", ""