bhsinghgrid committed on
Commit
2fdbfb0
·
verified ·
1 Parent(s): a3ec6c4

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +10 -0
  2. app.py +153 -6
  3. requirements.txt +1 -0
README.md CHANGED
@@ -23,6 +23,16 @@ Set these Space variables in **Settings → Variables and secrets**:
23
 
24
  The app will download checkpoint from your model repo and load it at runtime.
25
 
 
 
 
 
 
 
 
 
 
 
26
  ## Local Dev
27
 
28
  ```bash
 
23
 
24
  The app will download checkpoint from your model repo and load it at runtime.
25
 
26
+ ### Optional MLflow Tracking in Space
27
+
28
+ You can enable lightweight MLflow event logging for inference + task runs.
29
+ Set these optional variables in **Settings → Variables and secrets**:
30
+
31
+ - `MLFLOW_TRACKING_URI` (example: `file:/tmp/mlruns` or your remote tracking server URI)
32
+ - `MLFLOW_EXPERIMENT_NAME` (example: `hf-space-sanskrit-d3pm`)
33
+
34
+ If not set, the Space runs normally without MLflow.
35
+
36
  ## Local Dev
37
 
38
  ```bash
app.py CHANGED
@@ -6,6 +6,7 @@ import sys
6
  import shutil
7
  import threading
8
  import uuid
 
9
  from datetime import datetime
10
  from pathlib import Path
11
 
@@ -31,6 +32,45 @@ DEFAULT_ANALYSIS_OUT = "analysis_outputs/T4"
31
  os.makedirs(RESULTS_DIR, exist_ok=True)
32
  _BG_JOBS = {}
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  HF_DEFAULT_MODEL_REPO = os.environ.get("HF_DEFAULT_MODEL_REPO", "bhsinghgrid/DevaFlow")
35
  HF_DEFAULT_MODEL_FILE = os.environ.get("HF_DEFAULT_MODEL_FILE", "best_model.pt")
36
 
@@ -259,6 +299,7 @@ def generate_from_ui(
259
  if not input_text.strip():
260
  raise gr.Error("Enter input text first.")
261
 
 
262
  cfg = copy.deepcopy(model_bundle["cfg"])
263
  cfg["inference"]["temperature"] = float(temperature)
264
  cfg["inference"]["top_k"] = int(top_k)
@@ -303,6 +344,30 @@ def generate_from_ui(
303
  "clean_output": bool(clean_output),
304
  }
305
  log_path = save_generation(model_bundle["experiment"], record)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  status = f"Inference done. Saved: `{log_path}`"
307
  return output_text, status, record
308
 
@@ -521,6 +586,7 @@ def _bg_worker(job_id: str, model_bundle, output_dir: str, input_text: str, task
521
  tasks = ["1", "2", "3", "4", "5"]
522
  failures = 0
523
  logs = []
 
524
  _BG_JOBS[job_id].update({"state": "running", "progress": 0, "failures": 0, "updated": datetime.now().isoformat()})
525
  for idx, task in enumerate(tasks, start=1):
526
  _BG_JOBS[job_id].update(
@@ -545,6 +611,22 @@ def _bg_worker(job_id: str, model_bundle, output_dir: str, input_text: str, task
545
  "updated": datetime.now().isoformat(),
546
  }
547
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  if failures:
549
  _bundle_task_outputs(model_bundle, output_dir)
550
  _BG_JOBS[job_id].update(
@@ -557,6 +639,20 @@ def _bg_worker(job_id: str, model_bundle, output_dir: str, input_text: str, task
557
  "updated": datetime.now().isoformat(),
558
  }
559
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
 
561
 
562
  def start_run_all_background(model_bundle, output_dir, input_text, task4_phase):
@@ -606,7 +702,9 @@ def run_single_task_and_refresh(model_bundle, task, output_dir, input_text, task
606
  def run_single_task(model_bundle, task, output_dir, input_text, task4_phase):
607
  if not model_bundle:
608
  raise gr.Error("Load a model first.")
 
609
  code, log, used_bundled = _run_analysis_cmd(task, model_bundle["ckpt_path"], output_dir, input_text, task4_phase)
 
610
  if code != 0:
611
  _bundle_task_outputs(model_bundle, output_dir)
612
  log = f"{log}\n\n--- Live task analysis ---\n{_live_task_analysis(model_bundle, task, input_text)}"
@@ -618,6 +716,21 @@ def run_single_task(model_bundle, task, output_dir, input_text, task4_phase):
618
  status = f"Task {task} loaded from bundled analysis outputs + live analysis."
619
  else:
620
  status = f"Task {task} completed (exit={code})."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  return status, log
622
 
623
 
@@ -692,6 +805,40 @@ def refresh_task_outputs(output_dir):
692
  )
693
 
694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
  CUSTOM_CSS = """
696
  :root {
697
  --bg1: #f5fbff;
@@ -886,7 +1033,7 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
886
  )
887
 
888
  run_single_btn.click(
889
- fn=run_single_task_and_refresh,
890
  inputs=[model_state, task_choice, analysis_output_dir, analysis_input, task4_phase],
891
  outputs=[
892
  task_run_status,
@@ -904,12 +1051,12 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
904
  ],
905
  )
906
  run_all_btn.click(
907
- fn=start_run_all_background,
908
- inputs=[model_state, analysis_output_dir, analysis_input, task4_phase],
909
  outputs=[task_run_status, task_run_log, bg_job_state],
910
  )
911
  track_bg_btn.click(
912
- fn=poll_run_all_background,
913
  inputs=[bg_job_state, analysis_output_dir],
914
  outputs=[
915
  task_run_status,
@@ -927,7 +1074,7 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
927
  ],
928
  )
929
  refresh_outputs_btn.click(
930
- fn=refresh_task_outputs,
931
  inputs=[analysis_output_dir],
932
  outputs=[
933
  task1_box,
@@ -947,7 +1094,7 @@ with gr.Blocks(title="Sanskrit Diffusion Client Demo", css=CUSTOM_CSS) as demo:
947
  outputs=[model_state, load_status, model_info, num_steps, analysis_output_dir],
948
  )
949
  demo.load(
950
- fn=refresh_task_outputs,
951
  inputs=[analysis_output_dir],
952
  outputs=[
953
  task1_box,
 
6
  import shutil
7
  import threading
8
  import uuid
9
+ import time
10
  from datetime import datetime
11
  from pathlib import Path
12
 
 
32
  os.makedirs(RESULTS_DIR, exist_ok=True)
33
  _BG_JOBS = {}
34
 
35
# Optional MLflow tracking: the Space must keep working when mlflow is not
# installed, so an import failure is tolerated and recorded as None.
try:
    import mlflow
except Exception:
    mlflow = None

# True once the tracking URI and experiment have been configured successfully.
_MLFLOW_READY = False
# True once setup has been *tried*, success or not.  A failing/unreachable
# tracking server should cost one attempt, not one per logged event.
_MLFLOW_ATTEMPTED = False
# _setup_mlflow_once is reached from Gradio request threads and from the
# background worker thread, so the one-time setup is guarded by a lock.
_MLFLOW_LOCK = threading.Lock()


def _setup_mlflow_once():
    """Configure the MLflow tracking URI and experiment exactly once.

    Reads ``MLFLOW_TRACKING_URI`` (default ``file:/tmp/mlruns``) and
    ``MLFLOW_EXPERIMENT_NAME`` (default ``hf-space-sanskrit-d3pm``) from
    the environment.  On any failure MLflow stays disabled
    (``_MLFLOW_READY`` remains False) and no further attempts are made,
    so a broken tracking setup cannot add latency to every inference.
    """
    global _MLFLOW_READY, _MLFLOW_ATTEMPTED
    if mlflow is None or _MLFLOW_ATTEMPTED:
        return
    with _MLFLOW_LOCK:
        if _MLFLOW_ATTEMPTED:
            # Another thread completed (or failed) setup while we waited.
            return
        _MLFLOW_ATTEMPTED = True
        try:
            mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "file:/tmp/mlruns"))
            mlflow.set_experiment(os.environ.get("MLFLOW_EXPERIMENT_NAME", "hf-space-sanskrit-d3pm"))
            _MLFLOW_READY = True
        except Exception:
            # Best-effort telemetry: never let a tracking problem break the app.
            _MLFLOW_READY = False
58
+
59
def _mlflow_event(run_name: str, params: dict | None = None, metrics: dict | None = None, tags: dict | None = None):
    """Record one app event as a short-lived MLflow run.

    Silently does nothing when MLflow is unavailable or misconfigured:
    telemetry is strictly best-effort and must never disturb the UI.
    Tags are stringified, params are passed through for plain scalar
    types (otherwise stringified), and metrics are coerced to float.
    """
    _setup_mlflow_once()
    if mlflow is None or not _MLFLOW_READY:
        return
    try:
        with mlflow.start_run(run_name=run_name, nested=False):
            if tags:
                mlflow.set_tags({key: str(val) for key, val in tags.items()})
            if params:
                safe_params = {}
                for key, val in params.items():
                    safe_params[key] = val if isinstance(val, (int, float, str, bool)) else str(val)
                mlflow.log_params(safe_params)
            if metrics:
                mlflow.log_metrics({key: float(val) for key, val in metrics.items()})
    except Exception:
        # Swallow every tracking error on purpose (best-effort logging).
        pass
73
+
74
  HF_DEFAULT_MODEL_REPO = os.environ.get("HF_DEFAULT_MODEL_REPO", "bhsinghgrid/DevaFlow")
75
  HF_DEFAULT_MODEL_FILE = os.environ.get("HF_DEFAULT_MODEL_FILE", "best_model.pt")
76
 
 
299
  if not input_text.strip():
300
  raise gr.Error("Enter input text first.")
301
 
302
+ t0 = time.perf_counter()
303
  cfg = copy.deepcopy(model_bundle["cfg"])
304
  cfg["inference"]["temperature"] = float(temperature)
305
  cfg["inference"]["top_k"] = int(top_k)
 
344
  "clean_output": bool(clean_output),
345
  }
346
  log_path = save_generation(model_bundle["experiment"], record)
347
+ latency_ms = (time.perf_counter() - t0) * 1000.0
348
+ toks = [t for t in output_text.split() if t]
349
+ uniq = len(set(toks)) / max(1, len(toks))
350
+ _mlflow_event(
351
+ run_name="space_inference",
352
+ params={
353
+ "experiment": model_bundle["experiment"],
354
+ "checkpoint": model_bundle["ckpt_path"],
355
+ "temperature": float(temperature),
356
+ "top_k": int(top_k),
357
+ "repetition_penalty": float(repetition_penalty),
358
+ "diversity_penalty": float(diversity_penalty),
359
+ "num_steps": int(num_steps),
360
+ "clean_output": bool(clean_output),
361
+ },
362
+ metrics={
363
+ "latency_ms": latency_ms,
364
+ "input_char_len": len(input_text.strip()),
365
+ "output_char_len": len(output_text),
366
+ "output_token_len": len(toks),
367
+ "output_unique_ratio": uniq,
368
+ },
369
+ tags={"source": "hf_space"},
370
+ )
371
  status = f"Inference done. Saved: `{log_path}`"
372
  return output_text, status, record
373
 
 
586
  tasks = ["1", "2", "3", "4", "5"]
587
  failures = 0
588
  logs = []
589
+ run_start = time.perf_counter()
590
  _BG_JOBS[job_id].update({"state": "running", "progress": 0, "failures": 0, "updated": datetime.now().isoformat()})
591
  for idx, task in enumerate(tasks, start=1):
592
  _BG_JOBS[job_id].update(
 
611
  "updated": datetime.now().isoformat(),
612
  }
613
  )
614
+ _mlflow_event(
615
+ run_name=f"space_bg_task_{task}",
616
+ params={
617
+ "job_id": job_id,
618
+ "task": task,
619
+ "task4_phase": str(task4_phase),
620
+ "experiment": model_bundle.get("experiment", ""),
621
+ },
622
+ metrics={
623
+ "exit_code": float(code),
624
+ "used_bundled": 1.0 if used_bundled else 0.0,
625
+ "failures_so_far": float(failures),
626
+ "progress_pct": float(_BG_JOBS[job_id]["progress"]),
627
+ },
628
+ tags={"source": "hf_space", "mode": "background"},
629
+ )
630
  if failures:
631
  _bundle_task_outputs(model_bundle, output_dir)
632
  _BG_JOBS[job_id].update(
 
639
  "updated": datetime.now().isoformat(),
640
  }
641
  )
642
+ _mlflow_event(
643
+ run_name="space_bg_run",
644
+ params={
645
+ "job_id": job_id,
646
+ "task4_phase": str(task4_phase),
647
+ "experiment": model_bundle.get("experiment", ""),
648
+ "output_dir": str(output_dir),
649
+ },
650
+ metrics={
651
+ "failures": float(failures),
652
+ "elapsed_s": (time.perf_counter() - run_start),
653
+ },
654
+ tags={"source": "hf_space", "mode": "background_summary"},
655
+ )
656
 
657
 
658
  def start_run_all_background(model_bundle, output_dir, input_text, task4_phase):
 
702
  def run_single_task(model_bundle, task, output_dir, input_text, task4_phase):
703
  if not model_bundle:
704
  raise gr.Error("Load a model first.")
705
+ t0 = time.perf_counter()
706
  code, log, used_bundled = _run_analysis_cmd(task, model_bundle["ckpt_path"], output_dir, input_text, task4_phase)
707
+ elapsed = (time.perf_counter() - t0) * 1000.0
708
  if code != 0:
709
  _bundle_task_outputs(model_bundle, output_dir)
710
  log = f"{log}\n\n--- Live task analysis ---\n{_live_task_analysis(model_bundle, task, input_text)}"
 
716
  status = f"Task {task} loaded from bundled analysis outputs + live analysis."
717
  else:
718
  status = f"Task {task} completed (exit={code})."
719
+ _mlflow_event(
720
+ run_name=f"space_task_{task}",
721
+ params={
722
+ "task": str(task),
723
+ "task4_phase": str(task4_phase),
724
+ "output_dir": str(output_dir),
725
+ "experiment": model_bundle.get("experiment", ""),
726
+ },
727
+ metrics={
728
+ "exit_code": float(code),
729
+ "elapsed_ms": elapsed,
730
+ "used_bundled": 1.0 if used_bundled else 0.0,
731
+ },
732
+ tags={"source": "hf_space", "mode": "single_task"},
733
+ )
734
  return status, log
735
 
736
 
 
805
  )
806
 
807
 
808
def _safe_refresh_task_outputs(output_dir):
    """Refresh the task-output widgets, degrading gracefully on failure.

    Returns whatever ``refresh_task_outputs`` returns.  If it raises,
    returns a placeholder 10-tuple whose text slots carry the error
    message so the UI shows what went wrong instead of crashing.
    """
    try:
        return refresh_task_outputs(output_dir)
    except Exception as exc:
        message = f"Refresh error: {exc}"
        # Positions mirror the outputs wiring: text components receive the
        # error text, the remaining components receive None.
        return (message, message, None, None, None, None, message, None, message, None)
814
+
815
+
816
def _safe_start_run_all_background(model_bundle, output_dir, input_text, task4_phase, current_job_id):
    """Start the run-all background job without letting errors escape.

    On success returns ``(status, log, new_job_id)``.  On failure the
    error text fills both status and log, and the previous job id is
    kept so the tracker keeps pointing at the last real job.
    """
    try:
        status, log, job_id = start_run_all_background(model_bundle, output_dir, input_text, task4_phase)
    except Exception as exc:
        failure = f"Background start failed: {exc}"
        return failure, failure, current_job_id
    return status, log, job_id
822
+
823
+
824
def _safe_poll_run_all_background(job_id, output_dir):
    """Poll the background job; on error fall back to a plain refresh.

    Returns the poll result unchanged, or
    ``(error, error, *refreshed outputs)`` when polling itself raises.
    """
    try:
        return poll_run_all_background(job_id, output_dir)
    except Exception as exc:
        message = f"Track error: {exc}"
        fallback = _safe_refresh_task_outputs(output_dir)
        return (message, message) + tuple(fallback)
831
+
832
+
833
def _safe_run_single_task_and_refresh(model_bundle, task, output_dir, input_text, task4_phase):
    """Run one analysis task and refresh outputs, never raising to the UI.

    NOTE(review): the broad catch also absorbs ``gr.Error`` (e.g. the
    "Load a model first." validation), turning Gradio's modal error into
    plain status text — confirm that is intended.
    """
    try:
        return run_single_task_and_refresh(model_bundle, task, output_dir, input_text, task4_phase)
    except Exception as exc:
        message = f"Task {task} failed: {exc}"
        fallback = _safe_refresh_task_outputs(output_dir)
        return (message, message) + tuple(fallback)
840
+
841
+
842
  CUSTOM_CSS = """
843
  :root {
844
  --bg1: #f5fbff;
 
1033
  )
1034
 
1035
  run_single_btn.click(
1036
+ fn=_safe_run_single_task_and_refresh,
1037
  inputs=[model_state, task_choice, analysis_output_dir, analysis_input, task4_phase],
1038
  outputs=[
1039
  task_run_status,
 
1051
  ],
1052
  )
1053
  run_all_btn.click(
1054
+ fn=_safe_start_run_all_background,
1055
+ inputs=[model_state, analysis_output_dir, analysis_input, task4_phase, bg_job_state],
1056
  outputs=[task_run_status, task_run_log, bg_job_state],
1057
  )
1058
  track_bg_btn.click(
1059
+ fn=_safe_poll_run_all_background,
1060
  inputs=[bg_job_state, analysis_output_dir],
1061
  outputs=[
1062
  task_run_status,
 
1074
  ],
1075
  )
1076
  refresh_outputs_btn.click(
1077
+ fn=_safe_refresh_task_outputs,
1078
  inputs=[analysis_output_dir],
1079
  outputs=[
1080
  task1_box,
 
1094
  outputs=[model_state, load_status, model_info, num_steps, analysis_output_dir],
1095
  )
1096
  demo.load(
1097
+ fn=_safe_refresh_task_outputs,
1098
  inputs=[analysis_output_dir],
1099
  outputs=[
1100
  task1_box,
requirements.txt CHANGED
@@ -4,3 +4,4 @@ numpy>=1.24
4
  tqdm>=4.66
5
  huggingface_hub>=0.30,<1.0
6
  tokenizers>=0.15
 
 
4
  tqdm>=4.66
5
  huggingface_hub>=0.30,<1.0
6
  tokenizers>=0.15
7
+ mlflow-skinny>=2.16.0