Spaces:

InternScience
/

ResearchHarness

Running

App Files Files Community

black-yt committed on May 13

Commit

1fefce0

1 Parent(s): 208afb8

Collect Space trajectories to dataset PRs

Browse files

Files changed (5) hide show

.env.example +36 -26
README.md +18 -0
app.py +15 -0
frontend/local_server.py +222 -0
requirements.txt +1 -0

.env.example CHANGED Viewed

@@ -1,29 +1,39 @@
 # Required
-API_KEY="your_openai_compatible_key"                     # API key for your OpenAI-compatible LLM provider.
-API_BASE="https://your-openai-compatible-endpoint/v1"    # Base URL for the OpenAI-compatible chat-completions endpoint.
-MODEL_NAME="gpt-5.5"                                     # Main model used by the agent and WebFetch summarization.
-SERPER_KEY="your_serper_key"                             # https://serper.dev/
-JINA_KEY="your_jina_key"                                 # https://jina.ai/
-MINERU_TOKEN="your_mineru_token"                         # https://mineru.net/
 # Optional
-WORKSPACE_ROOT="./workspace"                             # Default local workspace root when --workspace-root is not provided.
-MAX_LLM_CALL_PER_RUN=100                                 # Maximum chat-completions calls allowed in one agent run.
-MAX_AGENT_ROUNDS=100                                     # Maximum ReAct loop rounds before forced termination.
-MAX_AGENT_RUNTIME_SECONDS=9000                           # Maximum wall-clock runtime per agent run.
-LLM_TIMEOUT_SECONDS=600                                  # Timeout for each chat-completions request.
-LLM_MAX_OUTPUT_TOKENS=10000                              # Maximum output tokens requested from the main model.
-MAX_INPUT_TOKENS=320000                                  # Maximum input-token budget used for runtime token accounting.
-LLM_MAX_RETRIES=10                                       # Maximum retries for transient LLM API failures.
-TEMPERATURE=0.6                                          # Main model sampling temperature.
-TOP_P=0.95                                               # Main model nucleus-sampling top_p.
-PRESENCE_PENALTY=1.1                                     # Main model presence penalty when supported by the provider.
-AUTO_COMPACT_TRIGGER_TOKENS="128k"                       # Context size threshold that triggers automatic memory compaction.
-IMAGE_PART_TOKEN_ESTIMATE=1536                           # Token estimate used for each runtime image_url content part.
-LLM_IMAGE_MAX_EDGE=1568                                  # Maximum image edge length sent to multimodal LLMs.
-LLM_IMAGE_MAX_BYTES=524288                               # Maximum compressed image payload size sent to multimodal LLMs.
-LLM_IMAGE_JPEG_QUALITY=85                                # Initial JPEG quality for runtime image compression.
-DEBUG_AGENT=false                                        # Print verbose agent-loop debug logs.
-DEBUG_SEARCH=false                                       # Print verbose WebSearch debug logs.
-DEBUG_SCHOLAR=false                                      # Print verbose ScholarSearch debug logs.
-DEBUG_VISIT=false                                        # Print verbose WebFetch debug logs.

 # Required
+API_KEY="your_openai_compatible_key"                      # API key for your OpenAI-compatible LLM provider.
+API_BASE="https://your-openai-compatible-endpoint/v1"     # Base URL for the OpenAI-compatible chat-completions endpoint.
+MODEL_NAME="gpt-5.5"                                      # Main model used by the agent and WebFetch summarization.
+SERPER_KEY="your_serper_key"                              # https://serper.dev/
+JINA_KEY="your_jina_key"                                  # https://jina.ai/
+MINERU_TOKEN="your_mineru_token"                          # https://mineru.net/
+HF_TOKEN="your_huggingface_token"                         # Hugging Face token with dataset write access when collection is enabled.
 # Optional
+WORKSPACE_ROOT="./workspace"                              # Default local workspace root when --workspace-root is not provided.
+MAX_LLM_CALL_PER_RUN=100                                  # Maximum chat-completions calls allowed in one agent run.
+MAX_AGENT_ROUNDS=100                                      # Maximum ReAct loop rounds before forced termination.
+MAX_AGENT_RUNTIME_SECONDS=9000                            # Maximum wall-clock runtime per agent run.
+LLM_TIMEOUT_SECONDS=600                                   # Timeout for each chat-completions request.
+LLM_MAX_OUTPUT_TOKENS=10000                               # Maximum output tokens requested from the main model.
+MAX_INPUT_TOKENS=320000                                   # Maximum input-token budget used for runtime token accounting.
+LLM_MAX_RETRIES=10                                        # Maximum retries for transient LLM API failures.
+TEMPERATURE=0.6                                           # Main model sampling temperature.
+TOP_P=0.95                                                # Main model nucleus-sampling top_p.
+PRESENCE_PENALTY=1.1                                      # Main model presence penalty when supported by the provider.
+AUTO_COMPACT_TRIGGER_TOKENS="128k"                        # Context size threshold that triggers automatic memory compaction.
+IMAGE_PART_TOKEN_ESTIMATE=1536                            # Token estimate used for each runtime image_url content part.
+LLM_IMAGE_MAX_EDGE=1568                                   # Maximum image edge length sent to multimodal LLMs.
+LLM_IMAGE_MAX_BYTES=524288                                # Maximum compressed image payload size sent to multimodal LLMs.
+LLM_IMAGE_JPEG_QUALITY=85                                 # Initial JPEG quality for runtime image compression.
+DEBUG_AGENT=false                                         # Print verbose agent-loop debug logs.
+DEBUG_SEARCH=false                                        # Print verbose WebSearch debug logs.
+DEBUG_SCHOLAR=false                                       # Print verbose ScholarSearch debug logs.
+DEBUG_VISIT=false                                         # Print verbose WebFetch debug logs.
+RH_SPACE_RUNS_DIR="/tmp/researchharness_space/runs"       # Parent directory for temporary per-chat runs in hosted mode.
+RH_SPACE_RETENTION_SECONDS=21600                          # Delete inactive hosted runs older than this many seconds.
+RH_SPACE_MAX_RUNS=40                                      # Keep at most this many inactive hosted runs.
+RH_SPACE_CLEANUP_INTERVAL_SECONDS=900                     # Background cleanup interval for hosted runs.
+RH_COLLECTION_ENABLED=true                                # Automatically collect hosted run traces after each completed run.
+RH_COLLECTION_DATASET_REPO="CoCoOne/ResearchHarness-Data" # Hugging Face dataset repo receiving trace PRs.
+RH_COLLECTION_BATCH_SIZE=5                                # Create one dataset PR after this many collected runs.
+RH_COLLECTION_MAX_BUNDLE_BYTES=20971520                   # Drop any single trace bundle larger than this many bytes.
+RH_ROLE_PROMPT_FILES=""                                   # Optional role prompt files separated by os.pathsep.

README.md CHANGED Viewed

@@ -19,6 +19,7 @@ It reuses the ResearchHarness tool-calling runtime and keeps the hosted mode int
 - Each new chat gets an isolated temporary runtime directory.
 - Uploaded images are saved under that chat workspace and also passed to the model when supported.
 - Agent traces and session state are stored beside the temporary workspace.
 - Old workspaces and traces are cleaned periodically so the Space does not grow without bound.
 ## Required Secrets
@@ -33,6 +34,7 @@ Configure these as Hugging Face Space secrets before starting the app:
 | `SERPER_KEY` | WebSearch / ScholarSearch key from <https://serper.dev/>. |
 | `JINA_KEY` | WebFetch key from <https://jina.ai/>. |
 | `MINERU_TOKEN` | ReadPDF key from <https://mineru.net/>. |
 ## Optional Runtime Variables
@@ -42,6 +44,10 @@ Configure these as Hugging Face Space secrets before starting the app:
 | `RH_SPACE_RETENTION_SECONDS` | `21600` | Delete inactive runs older than this many seconds. |
 | `RH_SPACE_MAX_RUNS` | `40` | Keep at most this many inactive runs. |
 | `RH_SPACE_CLEANUP_INTERVAL_SECONDS` | `900` | Background cleanup interval. |
 | `RH_ROLE_PROMPT_FILES` | empty | Optional `os.pathsep`-separated role prompt files inside the Space image. |
 | `PORT` | `7860` | Port used by Hugging Face Docker Spaces. |
@@ -57,6 +63,18 @@ Configure these as Hugging Face Space secrets before starting the app:
 The frontend only exposes the chat UI. The workspace path is managed by the server so hosted users cannot browse or select server folders.
 ## Local Smoke Test
 ```bash

 - Each new chat gets an isolated temporary runtime directory.
 - Uploaded images are saved under that chat workspace and also passed to the model when supported.
 - Agent traces and session state are stored beside the temporary workspace.
+- Completed runs are automatically packaged for trajectory collection.
 - Old workspaces and traces are cleaned periodically so the Space does not grow without bound.
 ## Required Secrets
 | `SERPER_KEY` | WebSearch / ScholarSearch key from <https://serper.dev/>. |
 | `JINA_KEY` | WebFetch key from <https://jina.ai/>. |
 | `MINERU_TOKEN` | ReadPDF key from <https://mineru.net/>. |
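+| `HF_TOKEN` | Hugging Face token with write access to the dataset repo set by `RH_COLLECTION_DATASET_REPO` (default `CoCoOne/ResearchHarness-Data`). Needed only while trajectory collection is enabled; without it, collection is skipped with a logged warning. |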
 ## Optional Runtime Variables
 | `RH_SPACE_RETENTION_SECONDS` | `21600` | Delete inactive runs older than this many seconds. |
 | `RH_SPACE_MAX_RUNS` | `40` | Keep at most this many inactive runs. |
 | `RH_SPACE_CLEANUP_INTERVAL_SECONDS` | `900` | Background cleanup interval. |
+| `RH_COLLECTION_ENABLED` | `true` | Automatically collect completed hosted runs. |
+| `RH_COLLECTION_DATASET_REPO` | `CoCoOne/ResearchHarness-Data` | Dataset repo that receives trajectory PRs. |
+| `RH_COLLECTION_BATCH_SIZE` | `5` | Create one dataset PR after this many collected runs. |
+| `RH_COLLECTION_MAX_BUNDLE_BYTES` | `20971520` | Drop a single run bundle if it exceeds this byte limit. |
 | `RH_ROLE_PROMPT_FILES` | empty | Optional `os.pathsep`-separated role prompt files inside the Space image. |
 | `PORT` | `7860` | Port used by Hugging Face Docker Spaces. |
 The frontend only exposes the chat UI. The workspace path is managed by the server so hosted users cannot browse or select server folders.
+## Trajectory Collection
+Hosted mode automatically collects completed runs without exposing extra UI to users:
+- Each completed run is zipped from `agent_workspace/` and `agent_trace/`.
+- A `manifest.json` is included inside the zip, and a sidecar `.json` file is kept beside the pending zip.
+- If a single bundle is larger than `RH_COLLECTION_MAX_BUNDLE_BYTES` (`20971520` bytes, i.e. 20 MiB, by default), it is dropped immediately.
+- Once `RH_COLLECTION_BATCH_SIZE` pending bundles exist, the Space creates a pull request in the configured Hugging Face dataset repo.
+- After the dataset PR is created successfully, those local pending bundles are deleted.
+- If upload fails, pending bundles are retained and `last_upload_error.json` is written under the local collection directory.
+- No redaction is applied in this core hosted collector; keep the dataset private unless you intentionally want to publish the collected traces.
 ## Local Smoke Test
 ```bash

app.py CHANGED Viewed

@@ -21,6 +21,17 @@ def _int_env(name: str, default: int) -> int:
         raise ValueError(f"{name} must be an integer, got {raw!r}") from exc
 def _role_prompt_files() -> list[str]:
     raw = os.getenv("RH_ROLE_PROMPT_FILES", "").strip()
     if not raw:
@@ -37,6 +48,10 @@ def configure_space() -> None:
         cleanup_retention_seconds=_int_env("RH_SPACE_RETENTION_SECONDS", 6 * 60 * 60),
         cleanup_max_runs=_int_env("RH_SPACE_MAX_RUNS", 40),
         cleanup_interval_seconds=_int_env("RH_SPACE_CLEANUP_INTERVAL_SECONDS", 15 * 60),
     )

         raise ValueError(f"{name} must be an integer, got {raw!r}") from exc
+def _bool_env(name: str, default: bool) -> bool:
+    """Parse the environment variable *name* as a boolean flag.
+
+    The value is stripped and lower-cased before matching. An unset or blank
+    variable yields *default*; any text outside the accepted spellings raises
+    ``ValueError``.
+    """
+    text = os.getenv(name, "").strip().lower()
+    if text == "":
+        return default
+    truthy = ("1", "true", "yes", "on")
+    falsy = ("0", "false", "no", "off")
+    if text not in truthy and text not in falsy:
+        raise ValueError(f"{name} must be a boolean, got {text!r}")
+    return text in truthy
 def _role_prompt_files() -> list[str]:
     raw = os.getenv("RH_ROLE_PROMPT_FILES", "").strip()
     if not raw:
         cleanup_retention_seconds=_int_env("RH_SPACE_RETENTION_SECONDS", 6 * 60 * 60),
         cleanup_max_runs=_int_env("RH_SPACE_MAX_RUNS", 40),
         cleanup_interval_seconds=_int_env("RH_SPACE_CLEANUP_INTERVAL_SECONDS", 15 * 60),
+        collection_enabled=_bool_env("RH_COLLECTION_ENABLED", True),
+        collection_dataset_repo=os.getenv("RH_COLLECTION_DATASET_REPO", "CoCoOne/ResearchHarness-Data"),
+        collection_batch_size=_int_env("RH_COLLECTION_BATCH_SIZE", 5),
+        collection_max_bundle_bytes=_int_env("RH_COLLECTION_MAX_BUNDLE_BYTES", 20 * 1024 * 1024),
     )

frontend/local_server.py CHANGED Viewed

@@ -3,12 +3,14 @@ from __future__ import annotations
 import asyncio
 import base64
 import datetime as _dt
 import os
 import re
 import shutil
 import threading
 import time
 import traceback
 from pathlib import Path
 from typing import Any
 from uuid import uuid4
@@ -40,9 +42,15 @@ FRONTEND_MANAGED_RUNS_DIR: str | None = None
 FRONTEND_CLEANUP_RETENTION_SECONDS = 6 * 60 * 60
 FRONTEND_CLEANUP_MAX_RUNS = 40
 FRONTEND_CLEANUP_INTERVAL_SECONDS = 15 * 60
 _CLEANUP_THREAD_STARTED = False
 _ACTIVE_MANAGED_RUNS: set[str] = set()
 _ACTIVE_MANAGED_RUNS_LOCK = threading.Lock()
 app = FastAPI(title="ResearchHarness Local UI")
 app.mount("/static", StaticFiles(directory=STATIC_DIR), name="frontend-static")
@@ -56,10 +64,24 @@ def configure_frontend(
     cleanup_retention_seconds: int | None = None,
     cleanup_max_runs: int | None = None,
     cleanup_interval_seconds: int | None = None,
 ) -> None:
     global FRONTEND_ROLE_PROMPT, FRONTEND_TRACE_DIR, FRONTEND_MANAGED_RUNS_DIR
     global FRONTEND_CLEANUP_RETENTION_SECONDS, FRONTEND_CLEANUP_MAX_RUNS, FRONTEND_CLEANUP_INTERVAL_SECONDS
     FRONTEND_ROLE_PROMPT = str(role_prompt or "").strip()
     if trace_dir:
         path = Path(trace_dir).expanduser()
         if path.exists() and not path.is_dir():
@@ -81,6 +103,7 @@ def configure_frontend(
             FRONTEND_CLEANUP_MAX_RUNS = max(1, int(cleanup_max_runs))
         if cleanup_interval_seconds is not None:
             FRONTEND_CLEANUP_INTERVAL_SECONDS = max(60, int(cleanup_interval_seconds))
         cleanup_managed_runs_once()
         _start_managed_cleanup_thread()
     else:
@@ -237,6 +260,201 @@ def _start_managed_cleanup_thread() -> None:
     _CLEANUP_THREAD_STARTED = True
 class FrontendInteractiveAgent(MultiTurnReactAgent):
     def __init__(self, *, bridge: FrontendRunBridge, **kwargs: Any):
         super().__init__(**kwargs)
@@ -327,6 +545,7 @@ def _run_agent_thread(
     initial_content_parts: list[dict[str, Any]],
     trace_dir: str | None = None,
     prior_messages: list[dict[str, Any]] | None = None,
 ) -> None:
     try:
         load_dotenv(PROJECT_ROOT / ".env")
@@ -356,6 +575,8 @@ def _run_agent_thread(
         )
         bridge.conversation_messages = result.get("messages", [])
         bridge.conversation_workspace_root = str(workspace_root)
         bridge.send(
             {
                 "type": "run_finished",
@@ -546,6 +767,7 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
                         "initial_content_parts": image_parts,
                         "trace_dir": effective_trace_dir,
                         "prior_messages": prior_messages,
                     },
                     daemon=True,
                 )

 import asyncio
 import base64
 import datetime as _dt
+import json
 import os
 import re
 import shutil
 import threading
 import time
 import traceback
+import zipfile
 from pathlib import Path
 from typing import Any
 from uuid import uuid4
 FRONTEND_CLEANUP_RETENTION_SECONDS = 6 * 60 * 60
 FRONTEND_CLEANUP_MAX_RUNS = 40
 FRONTEND_CLEANUP_INTERVAL_SECONDS = 15 * 60
+FRONTEND_COLLECTION_ENABLED = True
+FRONTEND_COLLECTION_DATASET_REPO = "CoCoOne/ResearchHarness-Data"
+FRONTEND_COLLECTION_BATCH_SIZE = 5
+FRONTEND_COLLECTION_MAX_BUNDLE_BYTES = 20 * 1024 * 1024
 _CLEANUP_THREAD_STARTED = False
 _ACTIVE_MANAGED_RUNS: set[str] = set()
 _ACTIVE_MANAGED_RUNS_LOCK = threading.Lock()
+_COLLECTION_LOCK = threading.Lock()
+_COLLECTION_CONFIG_WARNED: set[str] = set()
 app = FastAPI(title="ResearchHarness Local UI")
 app.mount("/static", StaticFiles(directory=STATIC_DIR), name="frontend-static")
     cleanup_retention_seconds: int | None = None,
     cleanup_max_runs: int | None = None,
     cleanup_interval_seconds: int | None = None,
+    collection_enabled: bool | None = None,
+    collection_dataset_repo: str | None = None,
+    collection_batch_size: int | None = None,
+    collection_max_bundle_bytes: int | None = None,
 ) -> None:
     global FRONTEND_ROLE_PROMPT, FRONTEND_TRACE_DIR, FRONTEND_MANAGED_RUNS_DIR
     global FRONTEND_CLEANUP_RETENTION_SECONDS, FRONTEND_CLEANUP_MAX_RUNS, FRONTEND_CLEANUP_INTERVAL_SECONDS
+    global FRONTEND_COLLECTION_ENABLED, FRONTEND_COLLECTION_DATASET_REPO
+    global FRONTEND_COLLECTION_BATCH_SIZE, FRONTEND_COLLECTION_MAX_BUNDLE_BYTES
     FRONTEND_ROLE_PROMPT = str(role_prompt or "").strip()
+    if collection_enabled is not None:
+        FRONTEND_COLLECTION_ENABLED = bool(collection_enabled)
+    if collection_dataset_repo is not None:
+        FRONTEND_COLLECTION_DATASET_REPO = str(collection_dataset_repo or "").strip()
+    if collection_batch_size is not None:
+        FRONTEND_COLLECTION_BATCH_SIZE = max(1, int(collection_batch_size))
+    if collection_max_bundle_bytes is not None:
+        FRONTEND_COLLECTION_MAX_BUNDLE_BYTES = max(1, int(collection_max_bundle_bytes))
     if trace_dir:
         path = Path(trace_dir).expanduser()
         if path.exists() and not path.is_dir():
             FRONTEND_CLEANUP_MAX_RUNS = max(1, int(cleanup_max_runs))
         if cleanup_interval_seconds is not None:
             FRONTEND_CLEANUP_INTERVAL_SECONDS = max(60, int(cleanup_interval_seconds))
+        _collection_root()
         cleanup_managed_runs_once()
         _start_managed_cleanup_thread()
     else:
     _CLEANUP_THREAD_STARTED = True
+def _collection_root() -> Path | None:
+    """Return the ``_collection`` directory under the managed runs root.
+
+    The ``pending`` subdirectory is created on every call. ``None`` is
+    returned when ``_managed_runs_root()`` reports no root.
+    """
+    runs_root = _managed_runs_root()
+    if runs_root is not None:
+        target = runs_root / "_collection"
+        target.joinpath("pending").mkdir(parents=True, exist_ok=True)
+        return target
+    return None
+def _collection_token() -> str:
+    """Return the first non-blank Hugging Face token found in the environment.
+
+    ``HF_TOKEN`` is checked first, then ``HUGGINGFACE_HUB_TOKEN``, then
+    ``HUGGING_FACE_HUB_TOKEN``. An empty string means none of them is set.
+    """
+    variables = ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "HUGGING_FACE_HUB_TOKEN")
+    candidates = (os.getenv(variable, "").strip() for variable in variables)
+    return next((token for token in candidates if token), "")
+def _warn_collection_once(key: str, message: str) -> None:
+    """Print *message* the first time *key* is seen; later calls are silent."""
+    if key not in _COLLECTION_CONFIG_WARNED:
+        # Remember the key before printing so the warning is emitted only once.
+        _COLLECTION_CONFIG_WARNED.add(key)
+        print(f"[ResearchHarness Space collection] {message}", flush=True)
+def _collection_ready() -> bool:
+    """Report whether trajectory collection can run with the current settings.
+
+    Collection must be enabled, a dataset repo and a token must be configured,
+    and a collection root must be available. A missing repo or token is
+    reported once through ``_warn_collection_once``.
+    """
+    if not FRONTEND_COLLECTION_ENABLED:
+        return False
+    problem: tuple[str, str] | None = None
+    if not FRONTEND_COLLECTION_DATASET_REPO:
+        problem = ("missing_repo", "disabled because RH_COLLECTION_DATASET_REPO is empty.")
+    elif not _collection_token():
+        problem = ("missing_token", "disabled because HF_TOKEN is not configured.")
+    if problem is not None:
+        _warn_collection_once(*problem)
+        return False
+    return _collection_root() is not None
+class _CollectionBundleTooLarge(RuntimeError):
+    """Raised while building a bundle once it exceeds the configured byte limit."""
+def _iter_collection_files(run_root: Path) -> list[tuple[Path, str]]:
+    """List the files of a run that belong in its collection bundle.
+
+    Walks ``agent_trace`` and then ``agent_workspace`` under *run_root* and
+    returns ``(path, archive_name)`` pairs in sorted order per directory.
+    Symlinks and non-regular files are left out; a missing directory
+    contributes nothing.
+    """
+    collected: list[tuple[Path, str]] = []
+    for section in ("agent_trace", "agent_workspace"):
+        section_dir = run_root / section
+        if not section_dir.is_dir():
+            continue
+        collected.extend(
+            (entry, str(Path(section) / entry.relative_to(section_dir)))
+            for entry in sorted(section_dir.rglob("*"))
+            if entry.is_file() and not entry.is_symlink()
+        )
+    return collected
+def _write_collection_bundle(run_root: Path, result: dict[str, Any]) -> Path | None:
+    """Zip one finished run into the pending collection directory.
+
+    The archive holds the files returned by ``_iter_collection_files`` plus a
+    ``manifest.json``; a sidecar ``<bundle_id>.json`` with the same metadata and
+    the final archive size is written next to it.
+
+    Args:
+        run_root: Directory of the finished run; its name becomes the run id.
+        result: Agent result mapping; ``result_text`` and ``termination`` are
+            copied into the manifest as strings.
+
+    Returns:
+        The path of the queued ``.zip``, or ``None`` when no collection root is
+        available, the bundle exceeds ``FRONTEND_COLLECTION_MAX_BUNDLE_BYTES``,
+        or building it failed. Failures are logged, never raised.
+    """
+    collection_root = _collection_root()
+    if collection_root is None:
+        return None
+    pending_dir = collection_root / "pending"
+    # One timezone-aware timestamp feeds both the bundle id and the manifest.
+    now = _dt.datetime.now(_dt.timezone.utc)
+    bundle_id = f"{run_root.name}_{now.strftime('%Y%m%dT%H%M%SZ')}_{uuid4().hex[:8]}"
+    zip_path = pending_dir / f"{bundle_id}.zip"
+    meta_path = pending_dir / f"{bundle_id}.json"
+    # Build under a name that does not match "*.zip" so a concurrent flush can
+    # never pick up a half-written archive; it is renamed once complete.
+    partial_path = pending_dir / f"{bundle_id}.zip.part"
+    limit = FRONTEND_COLLECTION_MAX_BUNDLE_BYTES
+    files = _iter_collection_files(run_root)
+    skipped: list[dict[str, str]] = []
+    manifest = {
+        "bundle_id": bundle_id,
+        "run_id": run_root.name,
+        "created_at_utc": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "source": "ResearchHarness HuggingFace Space",
+        "max_bundle_bytes": limit,
+        "file_count": len(files),
+        "result_text": str(result.get("result_text", "")),
+        "termination": str(result.get("termination", "")),
+    }
+    try:
+        with zipfile.ZipFile(partial_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+            for path, arcname in files:
+                try:
+                    archive.write(path, arcname)
+                except OSError as exc:
+                    skipped.append({"path": str(path), "error": str(exc)})
+                    continue
+                # Early abort only: the on-disk size lags buffered writes and
+                # excludes the central directory until the archive is closed.
+                if partial_path.stat().st_size > limit:
+                    raise _CollectionBundleTooLarge
+            manifest["skipped_files"] = skipped
+            archive.writestr("manifest.json", json.dumps(safe_jsonable(manifest), ensure_ascii=False, indent=2))
+        # Authoritative check on the closed archive.
+        bundle_bytes = partial_path.stat().st_size
+        if bundle_bytes > limit:
+            raise _CollectionBundleTooLarge
+        meta = dict(manifest)
+        meta["bundle_bytes"] = bundle_bytes
+        meta_path.write_text(json.dumps(safe_jsonable(meta), ensure_ascii=False, indent=2), encoding="utf-8")
+        # Publish the zip last so it only becomes visible with its sidecar in place.
+        partial_path.replace(zip_path)
+    except _CollectionBundleTooLarge:
+        for leftover in (partial_path, zip_path, meta_path):
+            leftover.unlink(missing_ok=True)
+        print(
+            f"[ResearchHarness Space collection] dropped oversized bundle for {run_root.name}; "
+            f"limit={limit} bytes",
+            flush=True,
+        )
+        return None
+    except Exception:
+        # Best effort: a collection failure must not fail the finished run.
+        for leftover in (partial_path, zip_path, meta_path):
+            leftover.unlink(missing_ok=True)
+        print("[ResearchHarness Space collection] failed to create bundle", flush=True)
+        traceback.print_exc()
+        return None
+    print(f"[ResearchHarness Space collection] queued bundle {zip_path.name}", flush=True)
+    return zip_path
+def _record_collection_upload_error(collection_root: Path, error: str) -> None:
+    """Write ``last_upload_error.json`` under *collection_root*.
+
+    The file records *error* together with the current UTC time formatted as
+    ``YYYY-MM-DDTHH:MM:SSZ``. An existing file is overwritten.
+    """
+    # Timezone-aware clock; datetime.utcnow() is deprecated since Python 3.12.
+    stamp = _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    payload = {
+        "created_at_utc": stamp,
+        "error": error,
+    }
+    (collection_root / "last_upload_error.json").write_text(
+        json.dumps(payload, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+def _create_dataset_pr_for_bundles(bundle_paths: list[Path]) -> str:
+    """Upload the given bundles to the dataset repo as one pull request.
+
+    Each bundle, and its ``.json`` sidecar when present, is added under
+    ``batches/<batch_id>/`` in ``FRONTEND_COLLECTION_DATASET_REPO``.
+
+    Returns:
+        The PR URL if the commit info exposes one, otherwise the commit URL,
+        otherwise the string form of the commit info.
+
+    Raises:
+        Whatever ``huggingface_hub`` raises; the caller handles upload errors.
+    """
+    from huggingface_hub import CommitOperationAdd, HfApi
+    # Timezone-aware clock; datetime.utcnow() is deprecated since Python 3.12.
+    stamp = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    batch_id = f"batch_{stamp}_{uuid4().hex[:8]}"
+    operations = []
+    for bundle_path in bundle_paths:
+        operations.append(
+            CommitOperationAdd(
+                path_in_repo=f"batches/{batch_id}/{bundle_path.name}",
+                path_or_fileobj=str(bundle_path),
+            )
+        )
+        sidecar = bundle_path.with_suffix(".json")
+        if sidecar.exists():
+            operations.append(
+                CommitOperationAdd(
+                    path_in_repo=f"batches/{batch_id}/{sidecar.name}",
+                    path_or_fileobj=str(sidecar),
+                )
+            )
+    info = HfApi(token=_collection_token()).create_commit(
+        repo_id=FRONTEND_COLLECTION_DATASET_REPO,
+        repo_type="dataset",
+        operations=operations,
+        commit_message=f"Add ResearchHarness traces {batch_id}",
+        commit_description="Automatically collected ResearchHarness Space trajectories.",
+        create_pr=True,
+    )
+    return str(getattr(info, "pr_url", "") or getattr(info, "commit_url", "") or info)
+def _flush_collection_batches() -> None:
+    """Upload pending bundles in batches of ``FRONTEND_COLLECTION_BATCH_SIZE``.
+
+    While at least one full batch of ``*.zip`` files is pending, the oldest
+    batch (by modification time) is sent as one dataset PR and then deleted
+    locally along with its sidecars. On an upload failure the bundles are kept,
+    the error is recorded, and the function returns. The whole loop runs under
+    ``_COLLECTION_LOCK``.
+    """
+    if not _collection_ready():
+        return
+    collection_root = _collection_root()
+    if collection_root is None:
+        return
+
+    def _mtime(path: Path) -> float:
+        # A bundle can disappear between glob() and stat(); sort it last
+        # instead of letting the flush thread die.
+        try:
+            return path.stat().st_mtime
+        except OSError:
+            return float("inf")
+
+    with _COLLECTION_LOCK:
+        pending_dir = collection_root / "pending"
+        while True:
+            bundles = [
+                path
+                for path in sorted(pending_dir.glob("*.zip"), key=_mtime)
+                if path.is_file()
+            ]
+            if len(bundles) < FRONTEND_COLLECTION_BATCH_SIZE:
+                return
+            selected = bundles[:FRONTEND_COLLECTION_BATCH_SIZE]
+            try:
+                pr_url = _create_dataset_pr_for_bundles(selected)
+            except Exception as exc:
+                print("[ResearchHarness Space collection] failed to create dataset PR", flush=True)
+                traceback.print_exc()
+                try:
+                    _record_collection_upload_error(collection_root, str(exc))
+                except OSError:
+                    # Recording is best effort; the failure is already logged.
+                    traceback.print_exc()
+                return
+            for bundle_path in selected:
+                bundle_path.unlink(missing_ok=True)
+                bundle_path.with_suffix(".json").unlink(missing_ok=True)
+            (collection_root / "last_upload_error.json").unlink(missing_ok=True)
+            print(
+                f"[ResearchHarness Space collection] created dataset PR for {len(selected)} bundles: {pr_url}",
+                flush=True,
+            )
+def _collect_finished_managed_run(run_root_text: str, result: dict[str, Any]) -> None:
+    """Bundle a finished managed run and start a background upload flush.
+
+    Does nothing when collection is not ready, *run_root_text* is empty or not
+    an existing directory, or the bundle could not be written.
+    """
+    if not (_collection_ready() and run_root_text):
+        return
+    run_dir = Path(run_root_text)
+    if not run_dir.is_dir():
+        return
+    if _write_collection_bundle(run_dir, result) is None:
+        return
+    # The flush runs on a daemon thread rather than in the calling thread.
+    uploader = threading.Thread(target=_flush_collection_batches, daemon=True)
+    uploader.start()
 class FrontendInteractiveAgent(MultiTurnReactAgent):
     def __init__(self, *, bridge: FrontendRunBridge, **kwargs: Any):
         super().__init__(**kwargs)
     initial_content_parts: list[dict[str, Any]],
     trace_dir: str | None = None,
     prior_messages: list[dict[str, Any]] | None = None,
+    managed_run_root: str = "",
 ) -> None:
     try:
         load_dotenv(PROJECT_ROOT / ".env")
         )
         bridge.conversation_messages = result.get("messages", [])
         bridge.conversation_workspace_root = str(workspace_root)
+        if managed_run_root:
+            _collect_finished_managed_run(managed_run_root, result)
         bridge.send(
             {
                 "type": "run_finished",
                         "initial_content_parts": image_parts,
                         "trace_dir": effective_trace_dir,
                         "prior_messages": prior_messages,
+                        "managed_run_root": bridge.managed_run_root,
                     },
                     daemon=True,
                 )

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 fastapi==0.115.6
 json5==0.14.0
 openai==2.3.0
 Pillow==11.3.0

 fastapi==0.115.6
+huggingface_hub==1.2.3
 json5==0.14.0
 openai==2.3.0
 Pillow==11.3.0