Upload folder using huggingface_hub
Browse files
- src/generator/generator.py +20 -15
src/generator/generator.py
CHANGED
|
@@ -106,23 +106,28 @@ class LyricGenerator:
|
|
| 106 |
"""Download and setup embeddings from HuggingFace dataset"""
|
| 107 |
print("\n=== Setting up embeddings from HuggingFace dataset ===")
|
| 108 |
try:
|
| 109 |
-
#
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
# Set the chroma directory
|
| 124 |
-
self.chroma_dir =
|
| 125 |
print(f"Chroma directory set to: {self.chroma_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
except Exception as e:
|
| 128 |
print(f"\n=== Error in _setup_embeddings_from_hf ===")
|
|
|
|
| 106 |
"""Download and setup embeddings from HuggingFace dataset"""
|
| 107 |
print("\n=== Setting up embeddings from HuggingFace dataset ===")
|
| 108 |
try:
|
| 109 |
+
# Force fresh download of the dataset to ensure latest HNSW index
|
| 110 |
+
print("Downloading latest dataset snapshot...")
|
| 111 |
+
snapshot_path = snapshot_download(
|
| 112 |
+
repo_id=Settings.HF_DATASET,
|
| 113 |
+
repo_type="dataset",
|
| 114 |
+
token=Settings.HF_TOKEN,
|
| 115 |
+
cache_dir="/data",
|
| 116 |
+
)
|
| 117 |
+
chroma_path = Path(snapshot_path) / "chroma"
|
| 118 |
+
print(f"Downloaded snapshot to: {snapshot_path}")
|
| 119 |
+
|
| 120 |
+
if not chroma_path.exists():
|
| 121 |
+
raise RuntimeError(f"chroma/ not found in snapshot at {chroma_path}")
|
| 122 |
+
|
| 123 |
# Set the chroma directory
|
| 124 |
+
self.chroma_dir = chroma_path
|
| 125 |
print(f"Chroma directory set to: {self.chroma_dir}")
|
| 126 |
+
|
| 127 |
+
# Log index files for debugging
|
| 128 |
+
for f in sorted(chroma_path.rglob("*")):
|
| 129 |
+
if f.is_file():
|
| 130 |
+
print(f" {f.name}: {f.stat().st_size / (1024*1024):.1f} MB")
|
| 131 |
|
| 132 |
except Exception as e:
|
| 133 |
print(f"\n=== Error in _setup_embeddings_from_hf ===")
|