James Edmunds commited on
Commit
686446c
·
1 Parent(s): 442b10a

Working local. Added new fixes for HF emebeddings

Browse files
app.py CHANGED
@@ -16,43 +16,47 @@ def initialize_generator():
16
  """Initialize the generator with proper error handling"""
17
  try:
18
  print("\n=== Initializing Generator ===")
19
- # Check for HuggingFace environment requirements
20
  if Settings.is_huggingface():
21
  print("Running in HuggingFace environment")
22
  print("Checking environment requirements...")
23
 
 
 
 
 
 
 
 
 
 
24
  if not Settings.HF_TOKEN:
25
- error_msg = "HuggingFace token not found. Please set HF_TOKEN in Space secrets."
26
  print(f"Error: {error_msg}")
27
  st.error(error_msg)
28
  return None
29
- else:
30
- print("HF_TOKEN found in environment")
31
 
32
  # Ensure persistent storage directory exists
33
  storage_path = Path("/data/processed/embeddings")
34
- print(f"Setting up persistent storage at: {storage_path}")
35
  storage_path.mkdir(parents=True, exist_ok=True)
36
- print(f"Storage directory created/verified")
37
 
38
  if storage_path.exists():
39
- print(f"Storage directory contents: {list(storage_path.glob('**/*'))}")
40
- else:
41
- print("Running in local environment")
42
-
43
- # Debugging: Check if OpenAI API Key is set
44
- Settings.debug_openai_key()
45
 
46
  # Initialize generator
47
  print("\nInitializing LyricGenerator...")
48
- st.info("Loading embeddings...")
49
  generator = LyricGenerator()
50
- st.success("Embeddings loaded successfully!")
51
  return generator
52
 
53
  except Exception as e:
54
- error_msg = f"Failed to initialize generator: {str(e)}"
55
- print(f"\nError during initialization: {error_msg}")
56
  print(f"Error type: {type(e).__name__}")
57
  st.error(error_msg)
58
  return None
 
def initialize_generator():
    """Create the LyricGenerator, logging environment state along the way.

    Returns the generator instance, or None (after surfacing the error via
    Streamlit) when initialization cannot proceed.
    """

    def _dump_files(root: Path, header: str) -> None:
        # Print every file under *root* with its size in KB.
        print(header)
        for entry in root.rglob("*"):
            if entry.is_file():
                print(f"- {entry.relative_to(root)} "
                      f"({entry.stat().st_size / 1024:.1f} KB)")

    try:
        print("\n=== Initializing Generator ===")

        if Settings.is_huggingface():
            print("Running in HuggingFace environment")
            print("Checking environment requirements...")

            # Debug aid: show what is currently in persistent storage.
            data_dir = Path("/data")
            if data_dir.exists():
                _dump_files(data_dir, "\nContents of /data directory:")

            # Without a token we cannot reach the private dataset: bail out.
            if not Settings.HF_TOKEN:
                error_msg = "HuggingFace token not found."
                print(f"Error: {error_msg}")
                st.error(error_msg)
                return None

            # Make sure the persistent embeddings directory exists.
            storage_path = Path("/data/processed/embeddings")
            print(f"\nSetting up storage at: {storage_path}")
            storage_path.mkdir(parents=True, exist_ok=True)

            if storage_path.exists():
                _dump_files(storage_path, "Storage directory contents:")

        print("\nInitializing LyricGenerator...")
        generator = LyricGenerator()
        st.success("Generator initialized successfully!")
        return generator

    except Exception as e:
        error_msg = f"Initialization failed: {str(e)}"
        print(f"\nError: {error_msg}")
        print(f"Error type: {type(e).__name__}")
        st.error(error_msg)
        return None
config/settings.py CHANGED
@@ -40,14 +40,41 @@ class Settings:
40
  def get_embeddings_path(cls) -> Path:
41
  """Get the base embeddings path"""
42
  if cls.is_huggingface():
43
- # HuggingFace: Use absolute path in persistent storage
44
- return Path("/data/processed/embeddings")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  # Local: Use project-relative path
46
- return cls.BASE_DIR / "data" / "processed" / "embeddings"
 
 
47
 
48
  @classmethod
49
  def get_chroma_path(cls) -> Path:
50
  """Get the Chroma DB path"""
 
 
 
 
51
  return cls.get_embeddings_path() / "chroma"
52
 
53
  @classmethod
@@ -60,9 +87,10 @@ class Settings:
60
  @classmethod
61
  def get_chroma_settings(cls) -> dict:
62
  """Get ChromaDB settings"""
 
63
  return {
64
  "anonymized_telemetry": False,
65
- "persist_directory": str(cls.get_chroma_path()),
66
  "collection_name": cls.CHROMA_COLLECTION_NAME
67
  }
68
 
 
40
  def get_embeddings_path(cls) -> Path:
41
  """Get the base embeddings path"""
42
  if cls.is_huggingface():
43
+ # In HuggingFace, first check the dataset cache
44
+ data_dir = Path("/data")
45
+ print(f"\nSearching for embeddings in: {data_dir}")
46
+
47
+ # Look for the most recent snapshot directory containing chroma
48
+ snapshot_pattern = "**/datasets--*--*/snapshots/*/chroma"
49
+ print(f"Using search pattern: {snapshot_pattern}")
50
+
51
+ snapshots = list(data_dir.glob(snapshot_pattern))
52
+ print(f"Found {len(snapshots)} potential snapshot directories:")
53
+ for snap in snapshots:
54
+ print(f"- {snap} (Modified: {snap.stat().st_mtime})")
55
+
56
+ if snapshots:
57
+ chosen_path = max(snapshots, key=lambda p: p.stat().st_mtime)
58
+ print(f"Selected most recent: {chosen_path}")
59
+ return chosen_path
60
+
61
+ print("No snapshots found, using fallback location")
62
+ fallback_path = data_dir / "processed/embeddings"
63
+ print(f"Fallback path: {fallback_path}")
64
+ return fallback_path
65
+
66
  # Local: Use project-relative path
67
+ embeddings_path = cls.BASE_DIR / "data" / "processed" / "embeddings"
68
+ print(f"Local embeddings path: {embeddings_path}")
69
+ return embeddings_path
70
 
71
  @classmethod
72
  def get_chroma_path(cls) -> Path:
73
  """Get the Chroma DB path"""
74
+ if cls.is_huggingface():
75
+ # In HuggingFace, the chroma path is the embeddings path itself
76
+ return cls.get_embeddings_path()
77
+ # Local: Use subdirectory
78
  return cls.get_embeddings_path() / "chroma"
79
 
80
  @classmethod
 
87
  @classmethod
88
  def get_chroma_settings(cls) -> dict:
89
  """Get ChromaDB settings"""
90
+ chroma_path = cls.get_chroma_path()
91
  return {
92
  "anonymized_telemetry": False,
93
+ "persist_directory": str(chroma_path),
94
  "collection_name": cls.CHROMA_COLLECTION_NAME
95
  }
96
 
scripts/browse_hf_data.py CHANGED
@@ -1,6 +1,9 @@
1
  import os
2
  import streamlit as st
3
  from pathlib import Path
 
 
 
4
 
5
  def list_files_in_directory(directory):
6
  """List all files in the given directory and its subdirectories."""
@@ -10,9 +13,27 @@ def list_files_in_directory(directory):
10
  files.append(os.path.join(root, filename))
11
  return files
12
 
 
13
  def main():
14
  st.title("Embeddings File Browser")
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Directory to browse
17
  directory = "/data" # Persistent storage directory
18
  st.write(f"Browsing directory: {directory}")
@@ -27,5 +48,6 @@ def main():
27
  else:
28
  st.write("No files found in the directory.")
29
 
 
30
  if __name__ == "__main__":
31
  main()
 
1
  import os
2
  import streamlit as st
3
  from pathlib import Path
4
+ from datasets import load_dataset
5
+ from dotenv import load_dotenv
6
+
7
 
8
  def list_files_in_directory(directory):
9
  """List all files in the given directory and its subdirectories."""
 
13
  files.append(os.path.join(root, filename))
14
  return files
15
 
16
+
17
  def main():
18
  st.title("Embeddings File Browser")
19
 
20
+ # Load environment variables
21
+ load_dotenv()
22
+
23
+ # Retrieve the Hugging Face token
24
+ hf_token = os.getenv("HF_TOKEN")
25
+ if not hf_token:
26
+ st.error("HF_TOKEN not found in environment variables.")
27
+ return
28
+
29
+ # Load the dataset using the token
30
+ try:
31
+ dataset = load_dataset("SongLift/LyrGen2_DB", use_auth_token=hf_token)
32
+ st.write("Dataset loaded successfully.")
33
+ except Exception as e:
34
+ st.error(f"Error loading dataset: {str(e)}")
35
+ return
36
+
37
  # Directory to browse
38
  directory = "/data" # Persistent storage directory
39
  st.write(f"Browsing directory: {directory}")
 
48
  else:
49
  st.write("No files found in the directory.")
50
 
51
+
52
  if __name__ == "__main__":
53
  main()
scripts/check_chroma_settings.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import sys
from pathlib import Path

# Make the project root importable when this file is run as a script.
sys.path.append(str(Path(__file__).parent.parent))

from config.settings import Settings


def main():
    """Print Chroma settings and related paths"""
    divider = "-" * 50

    print("\nChroma Settings:")
    print(divider)
    for key, value in Settings.get_chroma_settings().items():
        print(f"{key}: {value}")

    print("\nRelated Paths:")
    print(divider)
    print(f"Base Dir: {Settings.BASE_DIR}")
    print(f"Embeddings Path: {Settings.get_embeddings_path()}")
    print(f"Chroma Path: {Settings.get_chroma_path()}")
    print(f"Is HuggingFace: {Settings.is_huggingface()}")


if __name__ == "__main__":
    main()
scripts/check_hf_token.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import sys
from pathlib import Path

# Make the project root importable when this file is run as a script.
sys.path.append(str(Path(__file__).parent.parent))

from config.settings import Settings
from huggingface_hub import HfApi


def check_dataset_files():
    """Check files available in the HuggingFace dataset"""
    print(f"\nChecking dataset: {Settings.HF_DATASET}")
    print(f"Using token: {'Present' if Settings.HF_TOKEN else 'Missing'}")

    api = HfApi(token=Settings.HF_TOKEN)
    try:
        files = api.list_repo_files(Settings.HF_DATASET, repo_type="dataset")
    except Exception as e:
        print(f"\nError accessing dataset: {type(e).__name__}")
        print(f"Error details: {str(e)}")
    else:
        print("\nFiles in dataset:")
        for f in files:
            print(f"- {f}")


if __name__ == "__main__":
    check_dataset_files()
scripts/display_version.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
"""List the files of the project's HuggingFace dataset.

Debug script: prints every file in Settings.HF_DATASET, or the error that
prevented listing them.
"""
import sys
from pathlib import Path

# Make the project root importable so config.settings resolves when this
# script is run directly. (Previously `Settings` was referenced without any
# import at all, so the script raised NameError on every run.)
sys.path.append(str(Path(__file__).parent.parent))

from config.settings import Settings
from huggingface_hub import HfApi

api = HfApi(token=Settings.HF_TOKEN)
try:
    # repo_type="dataset" is required: without it the Hub resolves the id
    # as a *model* repo, matching the usage in scripts/check_hf_token.py.
    files = api.list_repo_files(Settings.HF_DATASET, repo_type="dataset")
    print("Files in dataset:")
    for f in files:
        print(f"- {f}")
except Exception as e:
    print(f"Error: {e}")
scripts/test_download_hf_dataset.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import sys
from pathlib import Path
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv
import tempfile
import shutil  # NOTE(review): imported but never used in this script

# Add project root to Python path
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))

from config.settings import Settings

def download_chroma_files():
    """Download Chroma files directly using hf_hub_download.

    Smoke-test script: pulls the chroma database plus one collection's index
    files from the HF dataset into a throwaway temp directory and prints what
    it got. All errors are printed, never raised to the caller.
    """
    try:
        load_dotenv()

        print(f"\nAttempting to download files from: {Settings.HF_DATASET}")

        # Create a temporary directory for downloads
        # (removed automatically when the `with` block exits)
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            print(f"\nUsing temporary directory: {temp_path}")

            # Files to download
            # NOTE(review): the UUID below is a hard-coded collection
            # directory name — assumes the dataset still contains exactly
            # this collection; confirm against the current dataset layout.
            files_to_download = [
                "chroma/chroma.sqlite3",
                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/data_level0.bin",
                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/header.bin",
                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/index_metadata.pickle",
                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/length.bin",
                "chroma/fade0013-ed4b-4928-b81b-7435145156dc/link_lists.bin"
            ]

            for file_path in files_to_download:
                try:
                    print(f"\nDownloading: {file_path}")
                    local_path = hf_hub_download(
                        repo_id=Settings.HF_DATASET,
                        filename=file_path,
                        repo_type="dataset",
                        token=Settings.HF_TOKEN,
                        cache_dir=temp_path
                    )
                    print(f"Downloaded to: {local_path}")

                    # Get file size
                    size_mb = Path(local_path).stat().st_size / (1024 * 1024)
                    print(f"File size: {size_mb:.2f} MB")

                except Exception as e:
                    # Per-file failures are reported but do not stop the loop.
                    print(f"Error downloading {file_path}: {str(e)}")

            print("\nFinal directory structure:")
            def print_dir_tree(path: Path, level: int = 0):
                # Recursive pretty-printer for the downloaded cache tree.
                indent = " " * level
                print(f"{indent}{path.name}/")
                for item in path.iterdir():
                    if item.is_file():
                        size_mb = item.stat().st_size / (1024 * 1024)
                        print(f"{indent} {item.name} ({size_mb:.2f} MB)")
                    else:
                        print_dir_tree(item, level + 1)

            print_dir_tree(temp_path)

    except Exception as e:
        print(f"\nTop-level error: {type(e).__name__}")
        print(f"Error details: {str(e)}")

if __name__ == "__main__":
    download_chroma_files()
src/generator/generator.py CHANGED
@@ -5,10 +5,11 @@ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
5
  from langchain_chroma import Chroma
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.prompts import PromptTemplate
8
- from huggingface_hub import snapshot_download
9
  from config.settings import Settings
10
  from tenacity import retry, stop_after_attempt, wait_exponential
11
  from datasets import load_dataset
 
12
 
13
 
14
  class LyricGenerator:
@@ -48,20 +49,119 @@ class LyricGenerator:
48
  """Download and setup embeddings from HuggingFace dataset"""
49
  print("\n=== Setting up embeddings from HuggingFace dataset ===")
50
  try:
51
- # Load the latest version of the dataset into the desired directory
52
- dataset = load_dataset("SongLift/LyrGen2_DB",
53
- split='train', cache_dir="/data")
54
- print("Dataset loaded successfully into cache directory.")
55
-
56
- # Verify the contents of the cache directory
57
- self._list_cache_directory("/data")
58
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  except Exception as e:
60
  print(f"\n=== Error in _setup_embeddings_from_hf ===")
61
  print(f"Error type: {type(e).__name__}")
62
  print(f"Error message: {str(e)}")
63
- raise RuntimeError(
64
- f"Failed to setup embeddings from HuggingFace: {str(e)}")
65
 
66
  def _list_cache_directory(self, cache_dir_path: str) -> None:
67
  """List the contents of the cache directory"""
@@ -82,41 +182,29 @@ class LyricGenerator:
82
  if Settings.is_huggingface():
83
  print("HuggingFace environment detected, setting up embeddings...")
84
  self._setup_embeddings_from_hf()
85
-
86
- # Dynamically determine the correct chroma directory
87
- chroma_dir = self._find_chroma_directory("/data")
88
- if chroma_dir is None:
89
- raise RuntimeError("Chroma directory not found in any expected location.")
90
-
91
  else:
92
  print("Local environment detected")
93
  print(f"Base directory: {Settings.BASE_DIR}")
94
- chroma_dir = Path("/data/processed/embeddings/chroma") # Local environment path
95
-
96
- print(f"Checking Chroma directory: {chroma_dir}")
97
- print(f"Absolute path: {chroma_dir.absolute()}")
 
98
 
99
- if not chroma_dir.exists():
 
 
 
 
 
100
  print(
101
- f"Parent directory exists: {self.embeddings_dir.exists()}")
102
- if self.embeddings_dir.exists():
103
- print(
104
- f"Parent directory contents: {list(self.embeddings_dir.glob('**/*'))}")
105
- raise RuntimeError(
106
- f"Chroma directory not found at {chroma_dir}")
107
-
108
- sqlite_file = chroma_dir / "chroma.sqlite3"
109
- print(f"Checking SQLite file: {sqlite_file}")
110
- if not sqlite_file.exists():
111
- print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
112
- raise RuntimeError(
113
- f"Chroma database not found at {sqlite_file}")
114
- print(
115
- f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
116
 
117
  # Load vector store using environment-aware settings
118
  print("Initializing Chroma with settings:")
119
  chroma_settings = Settings.get_chroma_settings()
 
 
120
  self.vector_store = Chroma(
121
  persist_directory=chroma_settings["persist_directory"],
122
  embedding_function=self.embeddings,
@@ -137,7 +225,7 @@ class LyricGenerator:
137
  # Additional debugging for empty collection
138
  print("\nDebug Information:")
139
  print(f"Chroma directory structure:")
140
- for item in chroma_dir.glob('**/*'):
141
  print(f" {item}")
142
  if item.is_file():
143
  print(
@@ -406,3 +494,39 @@ class LyricGenerator:
406
  except Exception as e:
407
  print(f"Error in generate_lyrics: {str(e)}")
408
  raise RuntimeError(f"Failed to generate lyrics: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from langchain_chroma import Chroma
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.prompts import PromptTemplate
8
+ from huggingface_hub import snapshot_download, hf_hub_download
9
  from config.settings import Settings
10
  from tenacity import retry, stop_after_attempt, wait_exponential
11
  from datasets import load_dataset
12
+ import sqlite3
13
 
14
 
15
  class LyricGenerator:
 
49
  """Download and setup embeddings from HuggingFace dataset"""
50
  print("\n=== Setting up embeddings from HuggingFace dataset ===")
51
  try:
52
+ # Create data directory if it doesn't exist
53
+ data_dir = Path("/data")
54
+ data_dir.mkdir(exist_ok=True)
55
+
56
+ # First download just the chroma.sqlite3 file
57
+ print(f"Downloading main database file...")
58
+ main_db = hf_hub_download(
59
+ repo_id=Settings.HF_DATASET,
60
+ filename="chroma/chroma.sqlite3",
61
+ repo_type="dataset",
62
+ token=Settings.HF_TOKEN,
63
+ cache_dir=data_dir
64
+ )
65
+ print(f"Main database downloaded to: {main_db}")
66
+
67
+ # Find the collection directory by looking at the parent directory
68
+ chroma_dir = Path(main_db).parent
69
+
70
+ # Debug: Print SQLite database contents
71
+ print("\nExamining Chroma database...")
72
+ try:
73
+ conn = sqlite3.connect(main_db)
74
+ cursor = conn.cursor()
75
+
76
+ # List tables
77
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
78
+ tables = cursor.fetchall()
79
+ print("Tables in database:", [t[0] for t in tables])
80
+
81
+ # Get collection info
82
+ cursor.execute("SELECT name, directory FROM collections;")
83
+ collections = cursor.fetchall()
84
+ print("\nCollections found:")
85
+ for name, directory in collections:
86
+ print(f"- Name: {name}, Directory: {directory}")
87
+
88
+ conn.close()
89
+ except Exception as e:
90
+ print(f"Warning: Could not read SQLite database: {e}")
91
+
92
+ # Find all UUID-style directories
93
+ collection_dirs = list(chroma_dir.glob("*-*-*-*-*"))
94
+
95
+ if not collection_dirs:
96
+ raise RuntimeError("No collection directory found in Chroma folder")
97
+
98
+ if len(collection_dirs) > 1:
99
+ print(f"\nWarning: Multiple collection directories found: {[d.name for d in collection_dirs]}")
100
+ print("Using the first one found.")
101
+
102
+ collection_dir = collection_dirs[0]
103
+ collection_name = collection_dir.name
104
+ print(f"\nUsing collection directory: {collection_name}")
105
+
106
+ # Now define the files we need using the found collection name
107
+ files_to_download = [
108
+ "chroma/chroma.sqlite3",
109
+ f"chroma/{collection_name}/data_level0.bin",
110
+ f"chroma/{collection_name}/header.bin",
111
+ f"chroma/{collection_name}/index_metadata.pickle",
112
+ f"chroma/{collection_name}/length.bin",
113
+ f"chroma/{collection_name}/link_lists.bin"
114
+ ]
115
+
116
+ print(f"Downloading files to: {data_dir}")
117
+ for file_path in files_to_download[1:]: # Skip the first file as we already have it
118
+ try:
119
+ print(f"\nDownloading: {file_path}")
120
+ local_path = hf_hub_download(
121
+ repo_id=Settings.HF_DATASET,
122
+ filename=file_path,
123
+ repo_type="dataset",
124
+ token=Settings.HF_TOKEN,
125
+ cache_dir=data_dir
126
+ )
127
+ print(f"Downloaded to: {local_path}")
128
+
129
+ # Get file size
130
+ size_mb = Path(local_path).stat().st_size / (1024 * 1024)
131
+ print(f"File size: {size_mb:.2f} MB")
132
+
133
+ except Exception as e:
134
+ print(f"Error downloading {file_path}: {str(e)}")
135
+ raise RuntimeError(f"Failed to download {file_path}: {str(e)}")
136
+
137
+ # Set the chroma directory
138
+ self.chroma_dir = chroma_dir
139
+ print(f"Using Chroma directory: {self.chroma_dir}")
140
+
141
+ # Verify all required files are present
142
+ required_files = {
143
+ "chroma.sqlite3",
144
+ f"{collection_name}/data_level0.bin",
145
+ f"{collection_name}/header.bin",
146
+ f"{collection_name}/index_metadata.pickle",
147
+ f"{collection_name}/length.bin",
148
+ f"{collection_name}/link_lists.bin"
149
+ }
150
+
151
+ found_files = {p.relative_to(self.chroma_dir).as_posix()
152
+ for p in self.chroma_dir.glob("**/*") if p.is_file()}
153
+
154
+ missing_files = required_files - found_files
155
+ if missing_files:
156
+ raise RuntimeError(f"Missing required files: {missing_files}")
157
+
158
+ print("All required files downloaded and verified successfully")
159
+
160
  except Exception as e:
161
  print(f"\n=== Error in _setup_embeddings_from_hf ===")
162
  print(f"Error type: {type(e).__name__}")
163
  print(f"Error message: {str(e)}")
164
+ raise RuntimeError(f"Failed to setup embeddings from HuggingFace: {str(e)}")
 
165
 
166
  def _list_cache_directory(self, cache_dir_path: str) -> None:
167
  """List the contents of the cache directory"""
 
182
  if Settings.is_huggingface():
183
  print("HuggingFace environment detected, setting up embeddings...")
184
  self._setup_embeddings_from_hf()
 
 
 
 
 
 
185
  else:
186
  print("Local environment detected")
187
  print(f"Base directory: {Settings.BASE_DIR}")
188
+
189
+ # Verify local paths
190
+ if not self.chroma_dir.exists():
191
+ raise RuntimeError(
192
+ f"Chroma directory not found at {self.chroma_dir}")
193
 
194
+ sqlite_file = self.chroma_dir / "chroma.sqlite3"
195
+ print(f"Checking SQLite file: {sqlite_file}")
196
+ if not sqlite_file.exists():
197
+ print(f"Directory contents: {list(self.chroma_dir.glob('**/*'))}")
198
+ raise RuntimeError(
199
+ f"Chroma database not found at {sqlite_file}")
200
  print(
201
+ f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  # Load vector store using environment-aware settings
204
  print("Initializing Chroma with settings:")
205
  chroma_settings = Settings.get_chroma_settings()
206
+ print(f"Using persist directory: {chroma_settings['persist_directory']}")
207
+
208
  self.vector_store = Chroma(
209
  persist_directory=chroma_settings["persist_directory"],
210
  embedding_function=self.embeddings,
 
225
  # Additional debugging for empty collection
226
  print("\nDebug Information:")
227
  print(f"Chroma directory structure:")
228
+ for item in self.chroma_dir.glob('**/*'):
229
  print(f" {item}")
230
  if item.is_file():
231
  print(
 
494
  except Exception as e:
495
  print(f"Error in generate_lyrics: {str(e)}")
496
  raise RuntimeError(f"Failed to generate lyrics: {str(e)}")
497
+
498
+ def _examine_sqlite_db(self, db_path: Path) -> None:
499
+ """Examine the contents of the SQLite database"""
500
+ try:
501
+ print(f"\nExamining SQLite database at: {db_path}")
502
+ conn = sqlite3.connect(db_path)
503
+ cursor = conn.cursor()
504
+
505
+ # List all tables
506
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
507
+ tables = cursor.fetchall()
508
+ print("\nTables in database:")
509
+ for table in tables:
510
+ print(f"- {table[0]}")
511
+ # Get column info for each table
512
+ cursor.execute(f"PRAGMA table_info({table[0]})")
513
+ columns = cursor.fetchall()
514
+ for col in columns:
515
+ print(f" - {col[1]} ({col[2]})")
516
+
517
+ # Get collection info
518
+ print("\nCollections:")
519
+ cursor.execute("SELECT name, directory FROM collections;")
520
+ collections = cursor.fetchall()
521
+ for name, directory in collections:
522
+ print(f"- Name: {name}")
523
+ print(f" Directory: {directory}")
524
+ # Get count of embeddings
525
+ cursor.execute("SELECT COUNT(*) FROM embeddings WHERE collection_id = (SELECT id FROM collections WHERE name = ?)", (name,))
526
+ count = cursor.fetchone()[0]
527
+ print(f" Embeddings count: {count}")
528
+
529
+ conn.close()
530
+
531
+ except Exception as e:
532
+ print(f"Warning: Could not fully examine SQLite database: {e}")