Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- README.md +10 -0
- app.py +153 -6
- requirements.txt +1 -0
README.md
CHANGED
|
@@ -23,6 +23,16 @@ Set these Space variables in **Settings → Variables and secrets**:
|
|
| 23 |
|
| 24 |
The app will download the checkpoint from your model repo and load it at runtime.
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
## Local Dev
|
| 27 |
|
| 28 |
```bash
|
|
|
|
| 23 |
|
| 24 |
The app will download the checkpoint from your model repo and load it at runtime.
|
| 25 |
|
| 26 |
+
### Optional MLflow Tracking in Space
|
| 27 |
+
|
| 28 |
+
You can enable lightweight MLflow event logging for inference + task runs.
|
| 29 |
+
Set these optional variables in **Settings → Variables and secrets**:
|
| 30 |
+
|
| 31 |
+
- `MLFLOW_TRACKING_URI` (example: `file:/tmp/mlruns` or your remote tracking server URI)
|
| 32 |
+
- `MLFLOW_EXPERIMENT_NAME` (example: `hf-space-sanskrit-d3pm`)
|
| 33 |
+
|
| 34 |
+
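If not set, the Space still runs normally: when the `mlflow` package is installed, events are logged using the defaults (`file:/tmp/mlruns`, experiment `hf-space-sanskrit-d3pm`); when it is not installed, MLflow logging is skipped.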
|
| 35 |
+
|
| 36 |
## Local Dev
|
| 37 |
|
| 38 |
```bash
|
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import sys
|
|
| 6 |
import shutil
|
| 7 |
import threading
|
| 8 |
import uuid
|
|
|
|
| 9 |
from datetime import datetime
|
| 10 |
from pathlib import Path
|
| 11 |
|
|
@@ -31,6 +32,45 @@ DEFAULT_ANALYSIS_OUT = "analysis_outputs/T4"
|
|
| 31 |
os.makedirs(RESULTS_DIR, exist_ok=True)
|
| 32 |
_BG_JOBS = {}
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
HF_DEFAULT_MODEL_REPO = os.environ.get("HF_DEFAULT_MODEL_REPO", "bhsinghgrid/DevaFlow")
|
| 35 |
HF_DEFAULT_MODEL_FILE = os.environ.get("HF_DEFAULT_MODEL_FILE", "best_model.pt")
|
| 36 |
|
|
@@ -259,6 +299,7 @@ def generate_from_ui(
|
|
| 259 |
if not input_text.strip():
|
| 260 |
raise gr.Error("Enter input text first.")
|
| 261 |
|
|
|
|
| 262 |
cfg = copy.deepcopy(model_bundle["cfg"])
|
| 263 |
cfg["inference"]["temperature"] = float(temperature)
|
| 264 |
cfg["inference"]["top_k"] = int(top_k)
|
|
@@ -303,6 +344,30 @@ def generate_from_ui(
|
|
| 303 |
"clean_output": bool(clean_output),
|
| 304 |
}
|
| 305 |
log_path = save_generation(model_bundle["experiment"], record)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
status = f"Inference done. Saved: `{log_path}`"
|
| 307 |
return output_text, status, record
|
| 308 |
|
|
@@ -521,6 +586,7 @@ def _bg_worker(job_id: str, model_bundle, output_dir: str, input_text: str, task
|
|
| 521 |
tasks = ["1", "2", "3", "4", "5"]
|
| 522 |
failures = 0
|
| 523 |
logs = []
|
|
|
|
| 524 |
_BG_JOBS[job_id].update({"state": "running", "progress": 0, "failures": 0, "updated": datetime.now().isoformat()})
|
| 525 |
for idx, task in enumerate(tasks, start=1):
|
| 526 |
_BG_JOBS[job_id].update(
|
|
@@ -545,6 +611,22 @@ def _bg_worker(job_id: str, model_bundle, output_dir: str, input_text: str, task
|
|
| 545 |
"updated": datetime.now().isoformat(),
|
| 546 |
}
|
| 547 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
if failures:
|
| 549 |
_bundle_task_outputs(model_bundle, output_dir)
|
| 550 |
_BG_JOBS[job_id].update(
|
|
@@ -557,6 +639,20 @@ def _bg_worker(job_id: str, model_bundle, output_dir: str, input_text: str, task
|
|
| 557 |
"updated": datetime.now().isoformat(),
|
| 558 |
}
|
| 559 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
|
| 561 |
|
| 562 |
def start_run_all_background(model_bundle, output_dir, input_text, task4_phase):
|
|
@@ -606,7 +702,9 @@ def run_single_task_and_refresh(model_bundle, task, output_dir, input_text, task
|
|
| 606 |
def run_single_task(model_bundle, task, output_dir, input_text, task4_phase):
|
| 607 |
if not model_bundle:
|
| 608 |
raise gr.Error("Load a model first.")
|
|
|
|
| 609 |
code, log, used_bundled = _run_analysis_cmd(task, model_bundle["ckpt_path"], output_dir, input_text, task4_phase)
|
|
|
|
| 610 |
if code != 0:
|
| 611 |
_bundle_task_outputs(model_bundle, output_dir)
|
| 612 |
log = f"{log}\n\n--- Live task analysis ---\n{_live_task_analysis(model_bundle, task, input_text)}"
|
|
@@ -618,6 +716,21 @@ def run_single_task(model_bundle, task, output_dir, input_text, task4_phase):
|
|
| 618 |
status = f"Task {task} loaded from bundled analysis outputs + live analysis."
|
| 619 |
else:
|
| 620 |
status = f"Task {task} completed (exit={code})."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
return status, log
|
| 622 |
|
| 623 |
|
|
@@ -692,6 +805,40 @@ def refresh_task_outputs(output_dir):
|
|
| 692 |
)
|
| 693 |
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
CUSTOM_CSS = """
|
| 696 |
:root {
|
| 697 |
--bg1: #f5fbff;
|
|
@@ -886,7 +1033,7 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
|
|
| 886 |
)
|
| 887 |
|
| 888 |
run_single_btn.click(
|
| 889 |
-
fn=
|
| 890 |
inputs=[model_state, task_choice, analysis_output_dir, analysis_input, task4_phase],
|
| 891 |
outputs=[
|
| 892 |
task_run_status,
|
|
@@ -904,12 +1051,12 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
|
|
| 904 |
],
|
| 905 |
)
|
| 906 |
run_all_btn.click(
|
| 907 |
-
fn=
|
| 908 |
-
inputs=[model_state, analysis_output_dir, analysis_input, task4_phase],
|
| 909 |
outputs=[task_run_status, task_run_log, bg_job_state],
|
| 910 |
)
|
| 911 |
track_bg_btn.click(
|
| 912 |
-
fn=
|
| 913 |
inputs=[bg_job_state, analysis_output_dir],
|
| 914 |
outputs=[
|
| 915 |
task_run_status,
|
|
@@ -927,7 +1074,7 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
|
|
| 927 |
],
|
| 928 |
)
|
| 929 |
refresh_outputs_btn.click(
|
| 930 |
-
fn=
|
| 931 |
inputs=[analysis_output_dir],
|
| 932 |
outputs=[
|
| 933 |
task1_box,
|
|
@@ -947,7 +1094,7 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
|
|
| 947 |
outputs=[model_state, load_status, model_info, num_steps, analysis_output_dir],
|
| 948 |
)
|
| 949 |
demo.load(
|
| 950 |
-
fn=
|
| 951 |
inputs=[analysis_output_dir],
|
| 952 |
outputs=[
|
| 953 |
task1_box,
|
|
|
|
| 6 |
import shutil
|
| 7 |
import threading
|
| 8 |
import uuid
|
| 9 |
+
import time
|
| 10 |
from datetime import datetime
|
| 11 |
from pathlib import Path
|
| 12 |
|
|
|
|
| 32 |
os.makedirs(RESULTS_DIR, exist_ok=True)
|
| 33 |
_BG_JOBS = {}
|
| 34 |
|
| 35 |
+
try:
|
| 36 |
+
import mlflow
|
| 37 |
+
except Exception:
|
| 38 |
+
mlflow = None
|
| 39 |
+
|
| 40 |
+
_MLFLOW_READY = False
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _setup_mlflow_once():
    """Configure the MLflow tracking URI and experiment at most once.

    Reads ``MLFLOW_TRACKING_URI`` (default ``file:/tmp/mlruns``) and
    ``MLFLOW_EXPERIMENT_NAME`` (default ``hf-space-sanskrit-d3pm``) from the
    environment and sets the module-level ``_MLFLOW_READY`` flag to True on
    success.

    The function is a no-op when the optional ``mlflow`` import failed, when
    setup already succeeded, or when a previous attempt already failed. A
    failed attempt is remembered so that a misconfigured or unreachable
    backend is not re-contacted on every logged event; the failure is logged
    once instead of being silently discarded. It never raises.
    """
    import logging

    global _MLFLOW_READY
    if _MLFLOW_READY or mlflow is None:
        return
    # The failure marker lives on the function object so this block does not
    # depend on any additional module-level state.
    if getattr(_setup_mlflow_once, "_failed", False):
        return
    try:
        tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "file:/tmp/mlruns")
        experiment = os.environ.get("MLFLOW_EXPERIMENT_NAME", "hf-space-sanskrit-d3pm")
        mlflow.set_tracking_uri(tracking_uri)
        mlflow.set_experiment(experiment)
    except Exception:
        # Tracking is best-effort: disable it for this process, but leave a
        # trace in the logs so the misconfiguration can be diagnosed.
        _setup_mlflow_once._failed = True
        _MLFLOW_READY = False
        logging.getLogger(__name__).warning(
            "MLflow setup failed; tracking is disabled for this process.",
            exc_info=True,
        )
    else:
        _MLFLOW_READY = True
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _mlflow_event(run_name: str, params: dict | None = None, metrics: dict | None = None, tags: dict | None = None):
    """Record one self-contained MLflow run describing a single event.

    Args:
        run_name: Name given to the MLflow run.
        params: Optional parameters; values that are not ``int``, ``float``,
            ``str`` or ``bool`` are converted with ``str`` before logging.
        metrics: Optional metrics; every value is converted with ``float``.
        tags: Optional tags; every value is converted with ``str``.

    Returns:
        None. The call does nothing when MLflow is unavailable or its setup
        did not succeed.

    Logging is best-effort: any exception raised while recording the run is
    caught so the calling request is never interrupted, but it is reported
    through the ``logging`` module rather than dropped without a trace.
    """
    import logging

    _setup_mlflow_once()
    if not _MLFLOW_READY or mlflow is None:
        return
    try:
        with mlflow.start_run(run_name=run_name, nested=False):
            if tags:
                mlflow.set_tags({k: str(v) for k, v in tags.items()})
            if params:
                mlflow.log_params({k: (v if isinstance(v, (int, float, str, bool)) else str(v)) for k, v in params.items()})
            if metrics:
                mlflow.log_metrics({k: float(v) for k, v in metrics.items()})
    except Exception:
        # Never let tracking break the caller, but make the failure visible.
        logging.getLogger(__name__).warning(
            "MLflow logging failed for run %r; event dropped.",
            run_name,
            exc_info=True,
        )
|
| 73 |
+
|
| 74 |
HF_DEFAULT_MODEL_REPO = os.environ.get("HF_DEFAULT_MODEL_REPO", "bhsinghgrid/DevaFlow")
|
| 75 |
HF_DEFAULT_MODEL_FILE = os.environ.get("HF_DEFAULT_MODEL_FILE", "best_model.pt")
|
| 76 |
|
|
|
|
| 299 |
if not input_text.strip():
|
| 300 |
raise gr.Error("Enter input text first.")
|
| 301 |
|
| 302 |
+
t0 = time.perf_counter()
|
| 303 |
cfg = copy.deepcopy(model_bundle["cfg"])
|
| 304 |
cfg["inference"]["temperature"] = float(temperature)
|
| 305 |
cfg["inference"]["top_k"] = int(top_k)
|
|
|
|
| 344 |
"clean_output": bool(clean_output),
|
| 345 |
}
|
| 346 |
log_path = save_generation(model_bundle["experiment"], record)
|
| 347 |
+
latency_ms = (time.perf_counter() - t0) * 1000.0
|
| 348 |
+
toks = [t for t in output_text.split() if t]
|
| 349 |
+
uniq = len(set(toks)) / max(1, len(toks))
|
| 350 |
+
_mlflow_event(
|
| 351 |
+
run_name="space_inference",
|
| 352 |
+
params={
|
| 353 |
+
"experiment": model_bundle["experiment"],
|
| 354 |
+
"checkpoint": model_bundle["ckpt_path"],
|
| 355 |
+
"temperature": float(temperature),
|
| 356 |
+
"top_k": int(top_k),
|
| 357 |
+
"repetition_penalty": float(repetition_penalty),
|
| 358 |
+
"diversity_penalty": float(diversity_penalty),
|
| 359 |
+
"num_steps": int(num_steps),
|
| 360 |
+
"clean_output": bool(clean_output),
|
| 361 |
+
},
|
| 362 |
+
metrics={
|
| 363 |
+
"latency_ms": latency_ms,
|
| 364 |
+
"input_char_len": len(input_text.strip()),
|
| 365 |
+
"output_char_len": len(output_text),
|
| 366 |
+
"output_token_len": len(toks),
|
| 367 |
+
"output_unique_ratio": uniq,
|
| 368 |
+
},
|
| 369 |
+
tags={"source": "hf_space"},
|
| 370 |
+
)
|
| 371 |
status = f"Inference done. Saved: `{log_path}`"
|
| 372 |
return output_text, status, record
|
| 373 |
|
|
|
|
| 586 |
tasks = ["1", "2", "3", "4", "5"]
|
| 587 |
failures = 0
|
| 588 |
logs = []
|
| 589 |
+
run_start = time.perf_counter()
|
| 590 |
_BG_JOBS[job_id].update({"state": "running", "progress": 0, "failures": 0, "updated": datetime.now().isoformat()})
|
| 591 |
for idx, task in enumerate(tasks, start=1):
|
| 592 |
_BG_JOBS[job_id].update(
|
|
|
|
| 611 |
"updated": datetime.now().isoformat(),
|
| 612 |
}
|
| 613 |
)
|
| 614 |
+
_mlflow_event(
|
| 615 |
+
run_name=f"space_bg_task_{task}",
|
| 616 |
+
params={
|
| 617 |
+
"job_id": job_id,
|
| 618 |
+
"task": task,
|
| 619 |
+
"task4_phase": str(task4_phase),
|
| 620 |
+
"experiment": model_bundle.get("experiment", ""),
|
| 621 |
+
},
|
| 622 |
+
metrics={
|
| 623 |
+
"exit_code": float(code),
|
| 624 |
+
"used_bundled": 1.0 if used_bundled else 0.0,
|
| 625 |
+
"failures_so_far": float(failures),
|
| 626 |
+
"progress_pct": float(_BG_JOBS[job_id]["progress"]),
|
| 627 |
+
},
|
| 628 |
+
tags={"source": "hf_space", "mode": "background"},
|
| 629 |
+
)
|
| 630 |
if failures:
|
| 631 |
_bundle_task_outputs(model_bundle, output_dir)
|
| 632 |
_BG_JOBS[job_id].update(
|
|
|
|
| 639 |
"updated": datetime.now().isoformat(),
|
| 640 |
}
|
| 641 |
)
|
| 642 |
+
_mlflow_event(
|
| 643 |
+
run_name="space_bg_run",
|
| 644 |
+
params={
|
| 645 |
+
"job_id": job_id,
|
| 646 |
+
"task4_phase": str(task4_phase),
|
| 647 |
+
"experiment": model_bundle.get("experiment", ""),
|
| 648 |
+
"output_dir": str(output_dir),
|
| 649 |
+
},
|
| 650 |
+
metrics={
|
| 651 |
+
"failures": float(failures),
|
| 652 |
+
"elapsed_s": (time.perf_counter() - run_start),
|
| 653 |
+
},
|
| 654 |
+
tags={"source": "hf_space", "mode": "background_summary"},
|
| 655 |
+
)
|
| 656 |
|
| 657 |
|
| 658 |
def start_run_all_background(model_bundle, output_dir, input_text, task4_phase):
|
|
|
|
| 702 |
def run_single_task(model_bundle, task, output_dir, input_text, task4_phase):
|
| 703 |
if not model_bundle:
|
| 704 |
raise gr.Error("Load a model first.")
|
| 705 |
+
t0 = time.perf_counter()
|
| 706 |
code, log, used_bundled = _run_analysis_cmd(task, model_bundle["ckpt_path"], output_dir, input_text, task4_phase)
|
| 707 |
+
elapsed = (time.perf_counter() - t0) * 1000.0
|
| 708 |
if code != 0:
|
| 709 |
_bundle_task_outputs(model_bundle, output_dir)
|
| 710 |
log = f"{log}\n\n--- Live task analysis ---\n{_live_task_analysis(model_bundle, task, input_text)}"
|
|
|
|
| 716 |
status = f"Task {task} loaded from bundled analysis outputs + live analysis."
|
| 717 |
else:
|
| 718 |
status = f"Task {task} completed (exit={code})."
|
| 719 |
+
_mlflow_event(
|
| 720 |
+
run_name=f"space_task_{task}",
|
| 721 |
+
params={
|
| 722 |
+
"task": str(task),
|
| 723 |
+
"task4_phase": str(task4_phase),
|
| 724 |
+
"output_dir": str(output_dir),
|
| 725 |
+
"experiment": model_bundle.get("experiment", ""),
|
| 726 |
+
},
|
| 727 |
+
metrics={
|
| 728 |
+
"exit_code": float(code),
|
| 729 |
+
"elapsed_ms": elapsed,
|
| 730 |
+
"used_bundled": 1.0 if used_bundled else 0.0,
|
| 731 |
+
},
|
| 732 |
+
tags={"source": "hf_space", "mode": "single_task"},
|
| 733 |
+
)
|
| 734 |
return status, log
|
| 735 |
|
| 736 |
|
|
|
|
| 805 |
)
|
| 806 |
|
| 807 |
|
| 808 |
+
def _safe_refresh_task_outputs(output_dir):
    """Call ``refresh_task_outputs`` and convert any exception into a result.

    On success the value returned by ``refresh_task_outputs(output_dir)`` is
    passed through unchanged. On failure a fixed ten-element tuple is returned
    in which positions 0, 1, 6 and 8 hold the text ``"Refresh error: ..."``
    and every other position is ``None``.
    """
    try:
        refreshed = refresh_task_outputs(output_dir)
    except Exception as exc:
        message = f"Refresh error: {exc}"
        # NOTE(review): the ten slots presumably line up with the UI output
        # components wired to this callback - confirm against the outputs lists.
        return (message, message) + (None,) * 4 + (message, None, message, None)
    return refreshed
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
def _safe_start_run_all_background(model_bundle, output_dir, input_text, task4_phase, current_job_id):
    """Start the background run, reporting failures as text instead of raising.

    Returns the ``(status, log, job_id)`` triple produced by
    ``start_run_all_background``. If that call (or unpacking its result)
    raises, the error message is returned as both status and log, and
    ``current_job_id`` is returned unchanged in the third position.
    """
    try:
        started = start_run_all_background(model_bundle, output_dir, input_text, task4_phase)
        status, log, job_id = started
    except Exception as exc:
        failure = f"Background start failed: {exc}"
        return failure, failure, current_job_id
    return status, log, job_id
|
| 822 |
+
|
| 823 |
+
|
| 824 |
+
def _safe_poll_run_all_background(job_id, output_dir):
    """Poll the background job, falling back to a plain refresh on error.

    On success the value of ``poll_run_all_background(job_id, output_dir)`` is
    returned as-is. If it raises, the result is ``"Track error: ..."`` twice,
    followed by the items returned by ``_safe_refresh_task_outputs``.
    """
    try:
        polled = poll_run_all_background(job_id, output_dir)
    except Exception as exc:
        message = f"Track error: {exc}"
        refreshed = _safe_refresh_task_outputs(output_dir)
        return (message, message) + tuple(refreshed)
    return polled
|
| 831 |
+
|
| 832 |
+
|
| 833 |
+
def _safe_run_single_task_and_refresh(model_bundle, task, output_dir, input_text, task4_phase):
    """Run one task and refresh outputs, reporting failures as text.

    On success the value of ``run_single_task_and_refresh`` is returned
    unchanged. If it raises, the result is ``"Task <task> failed: ..."``
    twice, followed by the items returned by ``_safe_refresh_task_outputs``.
    """
    try:
        outcome = run_single_task_and_refresh(model_bundle, task, output_dir, input_text, task4_phase)
    except Exception as exc:
        message = f"Task {task} failed: {exc}"
        refreshed = _safe_refresh_task_outputs(output_dir)
        return (message, message) + tuple(refreshed)
    return outcome
|
| 840 |
+
|
| 841 |
+
|
| 842 |
CUSTOM_CSS = """
|
| 843 |
:root {
|
| 844 |
--bg1: #f5fbff;
|
|
|
|
| 1033 |
)
|
| 1034 |
|
| 1035 |
run_single_btn.click(
|
| 1036 |
+
fn=_safe_run_single_task_and_refresh,
|
| 1037 |
inputs=[model_state, task_choice, analysis_output_dir, analysis_input, task4_phase],
|
| 1038 |
outputs=[
|
| 1039 |
task_run_status,
|
|
|
|
| 1051 |
],
|
| 1052 |
)
|
| 1053 |
run_all_btn.click(
|
| 1054 |
+
fn=_safe_start_run_all_background,
|
| 1055 |
+
inputs=[model_state, analysis_output_dir, analysis_input, task4_phase, bg_job_state],
|
| 1056 |
outputs=[task_run_status, task_run_log, bg_job_state],
|
| 1057 |
)
|
| 1058 |
track_bg_btn.click(
|
| 1059 |
+
fn=_safe_poll_run_all_background,
|
| 1060 |
inputs=[bg_job_state, analysis_output_dir],
|
| 1061 |
outputs=[
|
| 1062 |
task_run_status,
|
|
|
|
| 1074 |
],
|
| 1075 |
)
|
| 1076 |
refresh_outputs_btn.click(
|
| 1077 |
+
fn=_safe_refresh_task_outputs,
|
| 1078 |
inputs=[analysis_output_dir],
|
| 1079 |
outputs=[
|
| 1080 |
task1_box,
|
|
|
|
| 1094 |
outputs=[model_state, load_status, model_info, num_steps, analysis_output_dir],
|
| 1095 |
)
|
| 1096 |
demo.load(
|
| 1097 |
+
fn=_safe_refresh_task_outputs,
|
| 1098 |
inputs=[analysis_output_dir],
|
| 1099 |
outputs=[
|
| 1100 |
task1_box,
|
requirements.txt
CHANGED
|
@@ -4,3 +4,4 @@ numpy>=1.24
|
|
| 4 |
tqdm>=4.66
|
| 5 |
huggingface_hub>=0.30,<1.0
|
| 6 |
tokenizers>=0.15
|
|
|
|
|
|
| 4 |
tqdm>=4.66
|
| 5 |
huggingface_hub>=0.30,<1.0
|
| 6 |
tokenizers>=0.15
|
| 7 |
+
mlflow-skinny>=2.16.0
|