Miyu Horiuchi committed on
Commit
3b4d471
·
1 Parent(s): a4202fc

Deploy hybrid catalog predictions

Browse files
Files changed (4) hide show
  1. .gitignore +23 -0
  2. Dockerfile +1 -0
  3. README.md +14 -0
  4. artifacts/hybrid_predictions.parquet +3 -0
.gitignore CHANGED
@@ -38,6 +38,7 @@ artifacts/*
38
  !artifacts/phase_c.log
39
  !artifacts/phase_e.log
40
  !artifacts/uncultured_predictions.parquet
 
41
  !artifacts/embedding_results.json
42
  !artifacts/v1_vs_v2_comparison.md
43
  !artifacts/train_v2.log
@@ -46,6 +47,23 @@ artifacts/*
46
  !artifacts/score_uncultured_media.log
47
  !artifacts/train_combined.log
48
  !artifacts/combined_results.json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # Trained recommender models — needed for scripts/recommend.py to work after clone
51
  !models/
@@ -67,6 +85,11 @@ notebooks/scratch/
67
  *.swp
68
  .DS_Store
69
 
 
 
 
 
 
70
  # Agent / tool state
71
  .claude/
72
  .letta/
 
38
  !artifacts/phase_c.log
39
  !artifacts/phase_e.log
40
  !artifacts/uncultured_predictions.parquet
41
+ !artifacts/hybrid_predictions.parquet
42
  !artifacts/embedding_results.json
43
  !artifacts/v1_vs_v2_comparison.md
44
  !artifacts/train_v2.log
 
47
  !artifacts/score_uncultured_media.log
48
  !artifacts/train_combined.log
49
  !artifacts/combined_results.json
50
+ !artifacts/baseline_results_pre_pme.json
51
+ !artifacts/retrain_with_pme.log
52
+ !artifacts/materialize_pme.log
53
+ !artifacts/extract_seqs_smoke.log
54
+ !artifacts/extract_seqs_full.log
55
+ !artifacts/lora_smoke.log
56
+ !artifacts/lora_smoke2.log
57
+ !artifacts/lora_fold0_real.log
58
+ !artifacts/lora/
59
+ artifacts/lora/*
60
+ !artifacts/lora/fold0_results_smoke.json
61
+ !artifacts/lora/fold0_results.json
62
+ !artifacts/lora/lambda_fold0_1ep_20260517T033023Z.log
63
+ !artifacts/lora/fold0_results_oxygen.json
64
+ !artifacts/lora/lambda_fold0_oxygen_1ep_20260517T103524Z.log
65
+ !artifacts/lora_vs_baseline.md
66
+ !artifacts/lora_oxygen_vs_all_task.md
67
 
68
  # Trained recommender models — needed for scripts/recommend.py to work after clone
69
  !models/
 
85
  *.swp
86
  .DS_Store
87
 
88
+ # Cerebrium build artifacts (HMM data files are duplicated from data/ for the image build)
89
+ cerebrium/*/kofam_relevant.hmm
90
+ cerebrium/*/ko_thresholds.tsv
91
+ cerebrium/*/markers.hmm
92
+
93
  # Agent / tool state
94
  .claude/
95
  .letta/
Dockerfile CHANGED
@@ -47,6 +47,7 @@ COPY --chown=user:user src/ ./src/
47
  COPY --chown=user:user scripts/recommend.py ./scripts/recommend.py
48
  COPY --chown=user:user models/ ./models/
49
  COPY --chown=user:user artifacts/uncultured_predictions.parquet ./artifacts/uncultured_predictions.parquet
 
50
  COPY --chown=user:user data/media_metadata.parquet ./data/media_metadata.parquet
51
  COPY --chown=user:user data/media_recipes.parquet ./data/media_recipes.parquet
52
  COPY --chown=user:user pyproject.toml README.md ./
 
47
  COPY --chown=user:user scripts/recommend.py ./scripts/recommend.py
48
  COPY --chown=user:user models/ ./models/
49
  COPY --chown=user:user artifacts/uncultured_predictions.parquet ./artifacts/uncultured_predictions.parquet
50
+ COPY --chown=user:user artifacts/hybrid_predictions.parquet ./artifacts/hybrid_predictions.parquet
51
  COPY --chown=user:user data/media_metadata.parquet ./data/media_metadata.parquet
52
  COPY --chown=user:user data/media_recipes.parquet ./data/media_recipes.parquet
53
  COPY --chown=user:user pyproject.toml README.md ./
README.md CHANGED
@@ -143,6 +143,20 @@ PYTHONPATH=src uv run --python 3.11 --extra dev --extra embeddings python script
143
  --marker-sequences data/marker_sequences.jsonl \
144
  --device mps \
145
  --output artifacts/hybrid_predictions.parquet
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  ```
147
 
148
  For overnight runs, `scripts/run_train_and_eval.sh` chains the core pipeline. The HMM,
 
143
  --marker-sequences data/marker_sequences.jsonl \
144
  --device mps \
145
  --output artifacts/hybrid_predictions.parquet
146
+
147
+ # Chunked uncultured-catalog run; keeps tabular values and replaces oxygen with LoRA.
148
+ PYTHONPATH=src uv run --python 3.11 --extra dev --extra embeddings python scripts/39_predict_hybrid.py \
149
+ --features artifacts/uncultured_predictions.parquet \
150
+ --marker-sequences data/uncultured_marker_sequences.jsonl \
151
+ --join left \
152
+ --reuse-existing-tabular \
153
+ --device mps \
154
+ --batch-size 2 \
155
+ --chunk-size 250 \
156
+ --chunk-output-dir artifacts/hybrid_chunks \
157
+ --resume-chunks \
158
+ --progress-every 25 \
159
+ --output artifacts/hybrid_predictions.parquet
160
  ```
161
 
162
  For overnight runs, `scripts/run_train_and_eval.sh` chains the core pipeline. The HMM,
artifacts/hybrid_predictions.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd96774ad62b8166daec2db080c93a3f5b7e7757be7bf290c8868dd7bd67def
3
+ size 327854