James-Edmunds committed on
Commit
aad53ab
·
verified ·
1 Parent(s): e366acd

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. src/generator/generator.py +20 -15
src/generator/generator.py CHANGED
@@ -106,23 +106,28 @@ class LyricGenerator:
106
  """Download and setup embeddings from HuggingFace dataset"""
107
  print("\n=== Setting up embeddings from HuggingFace dataset ===")
108
  try:
109
- # Create data directory if it doesn't exist
110
- data_dir = Path("/data")
111
- data_dir.mkdir(exist_ok=True)
112
-
113
- # First find the snapshot directory
114
- snapshot_pattern = "**/datasets--*--*/snapshots/*/chroma"
115
- snapshots = list(data_dir.glob(snapshot_pattern))
116
- if not snapshots:
117
- raise RuntimeError("No snapshot directories found")
118
-
119
- # Use most recent snapshot
120
- chosen_path = max(snapshots, key=lambda p: p.stat().st_mtime)
121
- print(f"Using snapshot directory: {chosen_path}")
122
-
123
  # Set the chroma directory
124
- self.chroma_dir = chosen_path
125
  print(f"Chroma directory set to: {self.chroma_dir}")
 
 
 
 
 
126
 
127
  except Exception as e:
128
  print(f"\n=== Error in _setup_embeddings_from_hf ===")
 
106
  """Download and setup embeddings from HuggingFace dataset"""
107
  print("\n=== Setting up embeddings from HuggingFace dataset ===")
108
  try:
109
+ # Force fresh download of the dataset to ensure latest HNSW index
110
+ print("Downloading latest dataset snapshot...")
111
+ snapshot_path = snapshot_download(
112
+ repo_id=Settings.HF_DATASET,
113
+ repo_type="dataset",
114
+ token=Settings.HF_TOKEN,
115
+ cache_dir="/data",
116
+ )
117
+ chroma_path = Path(snapshot_path) / "chroma"
118
+ print(f"Downloaded snapshot to: {snapshot_path}")
119
+
120
+ if not chroma_path.exists():
121
+ raise RuntimeError(f"chroma/ not found in snapshot at {chroma_path}")
122
+
123
  # Set the chroma directory
124
+ self.chroma_dir = chroma_path
125
  print(f"Chroma directory set to: {self.chroma_dir}")
126
+
127
+ # Log index files for debugging
128
+ for f in sorted(chroma_path.rglob("*")):
129
+ if f.is_file():
130
+ print(f" {f.name}: {f.stat().st_size / (1024*1024):.1f} MB")
131
 
132
  except Exception as e:
133
  print(f"\n=== Error in _setup_embeddings_from_hf ===")