Spaces:

Sachin21112004
/

DreamFlow-AI

Running

App Files Files Community

Sachin21112004 committed on Nov 11, 2025

Commit

b3303ff

verified ·

1 Parent(s): 4404e67

Upload 2 files

Browse files

Files changed (2) hide show

dataset_utils.py +186 -0
fine_tune_logger.py +119 -0

dataset_utils.py ADDED Viewed

	@@ -0,0 +1,186 @@

+# dataset_utils.py
+# NEW FILE
+# This utility manages all read/write operations to your persistent HF Dataset.
+# Both counselor.py and your training scripts will use this.
+import json
+import time
+import os
+import glob
+import logging
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+from huggingface_hub import HfApi, hf_hub_download
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# --- CONFIGURATION ---
+# Dataset repo on the Hub that stores the examples and run logs.
+# Set the HF_DATASET_REPO_ID environment variable to point at your own
+# "<username>/<dataset>" repo; the literal below is only the fallback.
+DATASET_REPO_ID = os.environ.get("HF_DATASET_REPO_ID", "Sachin21112004/DreamFlow-AI-Data")
+# File names used inside the dataset repo.
+EXAMPLES_FILENAME = "fine_tune_examples.jsonl"
+LOGS_FILENAME = "fine_tune_logs.jsonl"
+# Local working copies, relative to the current working directory.
+LOCAL_EXAMPLES_PATH = Path(".") / EXAMPLES_FILENAME
+LOCAL_LOGS_PATH = Path(".") / LOGS_FILENAME
+# ---------------------
+def _get_hf_token():
+    """Return the HF_WRITE_TOKEN environment variable, or None when it is unset."""
+    hf_write_token = os.getenv("HF_WRITE_TOKEN")
+    return hf_write_token
+def _download_from_hub(filename: str, local_path: Path) -> bool:
+    """
+    Downloads `filename` from the dataset repo and places it at `local_path`.
+    Returns True on success and False on any failure. Failures are logged at
+    INFO level because a missing file is expected before the first upload.
+    """
+    import shutil  # local import: only this helper needs it
+    token = _get_hf_token()
+    try:
+        # `force_filename` is deliberately not passed.
+        # NOTE(review): it is a deprecated hf_hub_download argument; depending on
+        # the installed huggingface_hub version it is ignored or rejected, and a
+        # rejection would be reported below as "file may not exist yet".
+        downloaded = Path(
+            hf_hub_download(
+                repo_id=DATASET_REPO_ID,
+                filename=filename,
+                repo_type="dataset",
+                local_dir=str(local_path.parent),
+                token=token,
+            )
+        )
+        # Honour the requested destination: copy only when the download did
+        # not already land exactly at `local_path`.
+        if downloaded.resolve() != local_path.resolve():
+            shutil.copyfile(downloaded, local_path)
+        return True
+    except Exception as e:
+        # This is common if the file doesn't exist yet
+        logger.info(f"Could not download {filename} from Hub (may not exist yet): {e}")
+        return False
+def _upload_to_hub(local_path: Path, path_in_repo: str):
+    """
+    Push one local file into the dataset repo at `path_in_repo`.
+    Any exception raised by the upload is logged at ERROR level and swallowed,
+    so callers get no signal about failure.
+    """
+    write_token = _get_hf_token()
+    try:
+        upload_kwargs = {
+            "path_or_fileobj": local_path,
+            "path_in_repo": path_in_repo,
+            "repo_id": DATASET_REPO_ID,
+            "repo_type": "dataset",
+            "token": write_token,
+        }
+        HfApi().upload_file(**upload_kwargs)
+    except Exception as exc:
+        logger.error(f"Failed to upload {local_path} to Hub: {exc}")
+# --- API for Training Examples (fine_tune_examples.jsonl) ---
+def persist_fine_tune_example(text: str, label: str) -> None:
+    """
+    Appends a single training example and uploads to the HF Dataset.
+    The current remote file is downloaded first, because the upload replaces
+    the whole file: without the refresh, a fresh or wiped working directory
+    would overwrite every stored example with just the new one.
+    Errors are logged and swallowed; this function does not raise.
+    """
+    try:
+        # 1. Refresh the local copy (same pattern as append_fine_tune_log).
+        #    A False result is fine here: the file may not exist yet.
+        _download_from_hub(EXAMPLES_FILENAME, LOCAL_EXAMPLES_PATH)
+        # 2. Append the new line to the *local* file
+        line = json.dumps({"text": text, "label": label}, ensure_ascii=False)
+        with open(LOCAL_EXAMPLES_PATH, "a", encoding="utf-8") as f:
+            f.write(line + "\n")
+        # 3. Upload the *entire* file back to the dataset repo
+        _upload_to_hub(LOCAL_EXAMPLES_PATH, EXAMPLES_FILENAME)
+    except Exception as e:
+        # A lost training example should be visible in normal logs, not only at DEBUG.
+        logger.error(f"Failed to persist fine-tune example: {e}")
+def load_fine_tune_examples() -> List[Dict[str, str]]:
+    """
+    Downloads the latest examples file from the HF Dataset and loads it.
+    Returns one parsed JSON object per non-blank line, in file order.
+    Malformed lines are skipped with a warning, so a single bad line does not
+    discard the whole dataset. Returns [] when the download fails or the
+    local file cannot be read.
+    """
+    # 1. Download the latest file
+    if not _download_from_hub(EXAMPLES_FILENAME, LOCAL_EXAMPLES_PATH):
+        return []  # Download failed, return empty list
+    # 2. Load from the file you just downloaded
+    if not LOCAL_EXAMPLES_PATH.exists():
+        return []
+    examples: List[Dict[str, str]] = []
+    try:
+        with open(LOCAL_EXAMPLES_PATH, "r", encoding="utf-8") as f:
+            for line_no, raw in enumerate(f, start=1):
+                raw = raw.strip()
+                if not raw:
+                    continue
+                try:
+                    examples.append(json.loads(raw))
+                except json.JSONDecodeError as e:
+                    logger.warning(f"Skipping malformed line {line_no} in {LOCAL_EXAMPLES_PATH}: {e}")
+    except (OSError, UnicodeError) as e:
+        logger.error(f"Failed to read local examples file {LOCAL_EXAMPLES_PATH}: {e}")
+        return []
+    return examples
+def clear_fine_tune_examples(archive: bool = True):
+    """
+    Removes the examples file from the dataset repo after training.
+    With archive=True (default) the file is first copied to
+    archive/examples/fine_tune_examples.<unix-timestamp>.jsonl; the copy and
+    the removal are submitted as one commit. With archive=False the file is
+    simply deleted. Local copies are removed only after the Hub operation
+    succeeds. Failures are logged and swallowed (non-fatal).
+    """
+    api = HfApi()
+    token = _get_hf_token()
+    try:
+        if archive:
+            # Local import from a package this module already depends on.
+            from huggingface_hub import CommitOperationAdd, CommitOperationDelete
+            ts = int(time.time())
+            archive_path = f"archive/examples/fine_tune_examples.{ts}.jsonl"
+            # NOTE(review): HfApi does not appear to offer a rename_file method,
+            # so the move is expressed as add-under-new-path + delete-old-path
+            # in a single commit. Confirm against the installed huggingface_hub.
+            current_copy = hf_hub_download(
+                repo_id=DATASET_REPO_ID,
+                filename=EXAMPLES_FILENAME,
+                repo_type="dataset",
+                token=token,
+            )
+            api.create_commit(
+                repo_id=DATASET_REPO_ID,
+                repo_type="dataset",
+                operations=[
+                    CommitOperationAdd(path_in_repo=archive_path, path_or_fileobj=current_copy),
+                    CommitOperationDelete(path_in_repo=EXAMPLES_FILENAME),
+                ],
+                commit_message=f"Archive {EXAMPLES_FILENAME} to {archive_path}",
+                token=token,
+            )
+            logger.info("Archived examples file in dataset repo.")
+        else:
+            api.delete_file(
+                path_in_repo=EXAMPLES_FILENAME,
+                repo_id=DATASET_REPO_ID,
+                repo_type="dataset",
+                token=token
+            )
+            logger.info("Deleted examples file from dataset repo.")
+    except Exception as e:
+        # WARNING rather than DEBUG: if this fails, the same examples stay in place.
+        logger.warning(f"Failed to clear/archive examples in Hub (non-fatal): {e}")
+        return
+    # Delete all local copies
+    for stale in glob.glob(f"./{EXAMPLES_FILENAME}*"):
+        try:
+            os.remove(stale)
+        except OSError as e:
+            logger.debug(f"Could not remove local copy {stale}: {e}")
+# --- API for Run Logs (fine_tune_logs.jsonl) ---
+def append_fine_tune_log(entry: Dict[str, Any]) -> None:
+    """
+    Add one JSON-serialised log entry to the log file in the HF Dataset.
+    The flow is download, append locally, upload the whole file. Any
+    exception is logged at DEBUG level and swallowed.
+    """
+    try:
+        # Refresh the local copy; the result is ignored on purpose
+        # (the file may not exist on the Hub yet).
+        _download_from_hub(LOGS_FILENAME, LOCAL_LOGS_PATH)
+        serialized = json.dumps(entry, ensure_ascii=False)
+        with LOCAL_LOGS_PATH.open("a", encoding="utf-8") as log_file:
+            log_file.write(f"{serialized}\n")
+        # Push the complete file back to the dataset repo.
+        _upload_to_hub(LOCAL_LOGS_PATH, LOGS_FILENAME)
+    except Exception as exc:
+        logger.debug(f"Failed to persist fine-tune log: {exc}")
+def load_fine_tune_logs(limit: Optional[int] = None) -> List[Dict[str, Any]]:
+    """
+    Downloads the latest log file from the HF Dataset and loads it.
+    Entries are returned in file order, i.e. oldest first, because entries
+    are appended to the end of the file. When `limit` is a positive integer
+    only the last `limit` entries (the most recent runs) are kept; None or 0
+    returns everything. Malformed lines are skipped. Returns [] when the
+    download fails; a read error returns whatever was parsed so far.
+    """
+    # 1. Download the latest file
+    if not _download_from_hub(LOGS_FILENAME, LOCAL_LOGS_PATH):
+        return []  # Download failed, return empty list
+    # 2. Load from the file
+    out: List[Dict[str, Any]] = []
+    try:
+        if not LOCAL_LOGS_PATH.exists():
+            return []
+        with open(LOCAL_LOGS_PATH, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    out.append(json.loads(line))
+                except json.JSONDecodeError:
+                    continue
+    except Exception as e:
+        logger.error(f"Failed to read local logs file {LOCAL_LOGS_PATH}: {e}")
+    # 3. Keep the newest entries. Stopping after the first `limit` lines would
+    #    return the oldest runs instead.
+    if limit is not None and limit > 0:
+        out = out[-limit:]
+    return out

fine_tune_logger.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# fine_tune_logger.py
+# MODIFIED
+# This file now imports from dataset_utils to provide a consistent logging API
+# while ensuring logs are persisted to your HF Dataset.
+import json
+import time
+import uuid
+from typing import Dict, Any, Iterable, List, Optional
+import hashlib
+# Import the new persistent logging functions
+import dataset_utils
+# --- Helpers (unchanged from your original) ---
+def _short_hash(s: str, length: int = 10) -> str:
+    """
+    Return the first `length` hex characters of the SHA-256 of `s` (UTF-8).
+    Any failure, such as a non-string `s`, yields an empty string.
+    """
+    try:
+        payload = s.encode("utf-8")
+        full_digest = hashlib.sha256(payload).hexdigest()
+        return full_digest[:length]
+    except Exception:
+        return ""
+def _truncate_text(s: Optional[str], max_len: int = 200) -> Optional[str]:
+    """
+    Strip `s` and shorten it to at most `max_len` characters.
+    None is passed through. Text that fits is returned stripped and unchanged.
+    Longer text is cut to `max_len - 3` characters plus "...". When `max_len`
+    is below 3 there is no room for the ellipsis, so the text is hard-cut to
+    `max_len` characters (an empty string for zero or negative values).
+    """
+    if s is None:
+        return None
+    s = s.strip()
+    if len(s) <= max_len:
+        return s
+    if max_len < 3:
+        # s[:max_len - 3] would be a negative slice here and could return
+        # text longer than max_len.
+        return s[:max(max_len, 0)]
+    return s[:max_len - 3] + "..."
+def _sanitize_examples(examples: Iterable[str], sample_count: int = 5, max_snippet_len: int = 160) -> Dict[str, Any]:
+    """
+    Summarise `examples` without keeping the full texts.
+    Returns a dict with the total number of examples, truncated snippets of
+    the first `sample_count` examples, and a short hash of each of those
+    examples. None entries are treated as empty strings.
+    """
+    materialized = list(examples)
+    head = ["" if item is None else item for item in materialized[:sample_count]]
+    return {
+        "total": len(materialized),
+        "sample_snippets": [_truncate_text(text, max_snippet_len) for text in head],
+        "sample_hashes": [_short_hash(text) for text in head],
+    }
+# --- Public API (MODIFIED) ---
+def append_fine_tune_log(
+    model_dir: str,
+    label_map: Dict[str, int],
+    examples: Iterable[str],
+    label_counts: Dict[str, int],
+    train_args: Dict[str, Any],
+    metrics: Dict[str, Any],
+    pushed_to_hub: bool = False,
+    hub_repo: Optional[str] = None,
+    commit_sha: Optional[str] = None,
+    created_by: Optional[str] = None,
+    extra: Optional[Dict[str, Any]] = None
+) -> Dict[str, Any]:
+    """
+    Build a structured record for one fine-tuning run and hand it to
+    dataset_utils.append_fine_tune_log for persistence.
+    The record gets a fresh UUID4 run id and the current Unix time. Instead
+    of the raw examples it stores their count plus truncated snippets and
+    short hashes of the first five. The record is returned even when
+    persisting raises; in that case the error is printed to the console.
+    """
+    run_id = str(uuid.uuid4())
+    ts = int(time.time())
+    digest = _sanitize_examples(examples, sample_count=5, max_snippet_len=200)
+    record = dict(
+        run_id=run_id,
+        timestamp_utc=ts,
+        model_dir=str(model_dir),
+        label_map=label_map,
+        label_counts=label_counts,
+        total_examples=digest["total"],
+        examples_sample_snippets=digest["sample_snippets"],
+        examples_sample_hashes=digest["sample_hashes"],
+        train_args=train_args,
+        metrics=metrics,
+        pushed_to_hub=bool(pushed_to_hub),
+        hub_repo=hub_repo,
+        commit_sha=commit_sha,
+        created_by=created_by,
+        extra=extra or {},
+    )
+    # Persist via the shared dataset utility; a failure must not break the caller.
+    try:
+        dataset_utils.append_fine_tune_log(record)
+    except Exception as err:
+        print(f"CRITICAL: Failed to write log to HF Dataset: {err}")
+    return record
+def load_fine_tune_logs(limit: Optional[int] = None) -> List[Dict[str, Any]]:
+    """
+    Load run logs through dataset_utils.load_fine_tune_logs.
+    This is a thin pass-through: `limit` is forwarded unchanged, and the
+    ordering and truncation of the result are whatever that helper implements.
+    """
+    persisted_logs = dataset_utils.load_fine_tune_logs(limit=limit)
+    return persisted_logs
+def summarize_logs(max_runs: int = 10) -> Dict[str, Any]:
+    """
+    Return a compact summary of up to `max_runs` logged runs.
+    The result has "runs" (a list of reduced per-run dicts) and "total_runs"
+    (the number of runs in this summary, not in the whole log).
+    """
+    summary_fields = (
+        "run_id",
+        "timestamp_utc",
+        "model_dir",
+        "total_examples",
+        "label_counts",
+        "metrics",
+        "pushed_to_hub",
+        "hub_repo",
+    )
+    loaded = load_fine_tune_logs(limit=max_runs)
+    # The loader's order is reversed here, intending newest-first output.
+    # NOTE(review): that holds only if the loader returns oldest-first; confirm.
+    runs = [
+        {field: record.get(field) for field in summary_fields}
+        for record in reversed(loaded)
+    ]
+    return {"runs": runs, "total_runs": len(loaded)}