James Edmunds committed on
Commit ·
0152ed4
1
Parent(s): b6d877d
Working locally, trying to fix embeddings on HF
Browse files
- config/settings.py +5 -1
- docs/TROUBLESHOOTING.md +44 -0
- scripts/process_lyrics.py +3 -1
- scripts/test_embeddings.py +63 -0
- src/generator/generator.py +60 -48
config/settings.py
CHANGED
|
@@ -28,6 +28,9 @@ class Settings:
|
|
| 28 |
EMBEDDING_MODEL = "text-embedding-ada-002"
|
| 29 |
LLM_MODEL = "gpt-4"
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
@classmethod
|
| 32 |
def is_huggingface(cls) -> bool:
|
| 33 |
"""Check if running in HuggingFace environment"""
|
|
@@ -59,5 +62,6 @@ class Settings:
|
|
| 59 |
"""Get ChromaDB settings"""
|
| 60 |
return {
|
| 61 |
"anonymized_telemetry": False,
|
| 62 |
-
"persist_directory": str(cls.get_chroma_path())
|
|
|
|
| 63 |
}
|
|
|
|
| 28 |
EMBEDDING_MODEL = "text-embedding-ada-002"
|
| 29 |
LLM_MODEL = "gpt-4"
|
| 30 |
|
| 31 |
+
# ChromaDB Settings
|
| 32 |
+
CHROMA_COLLECTION_NAME = "langchain"
|
| 33 |
+
|
| 34 |
@classmethod
|
| 35 |
def is_huggingface(cls) -> bool:
|
| 36 |
"""Check if running in HuggingFace environment"""
|
|
|
|
| 62 |
"""Get ChromaDB settings"""
|
| 63 |
return {
|
| 64 |
"anonymized_telemetry": False,
|
| 65 |
+
"persist_directory": str(cls.get_chroma_path()),
|
| 66 |
+
"collection_name": cls.CHROMA_COLLECTION_NAME
|
| 67 |
}
|
docs/TROUBLESHOOTING.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Troubleshooting Guide
|
| 2 |
+
|
| 3 |
+
## Embeddings Issues
|
| 4 |
+
|
| 5 |
+
### Empty Chroma Collection (0 Documents)
|
| 6 |
+
**Symptom:**
|
| 7 |
+
- ChromaDB shows 0 documents despite large files being present
|
| 8 |
+
- SQLite database shows records (e.g., embeddings: 233998 records)
|
| 9 |
+
- Files exist and have expected sizes:
|
| 10 |
+
- chroma.sqlite3 (~576 MB)
|
| 11 |
+
- data_level0.bin (~1.3 GB)
|
| 12 |
+
|
| 13 |
+
**Cause:**
|
| 14 |
+
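Collection name mismatch between processing and loading. `scripts/process_lyrics.py` built the store without passing a `collection_name`, so the documents were written to LangChain's default collection ("langchain"), while the generator opened a collection named "lyrics". As a result the store contains two collections: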
|
| 15 |
+
- "langchain" (contains the data)
|
| 16 |
+
- "lyrics" (empty)
|
| 17 |
+
|
| 18 |
+
**Solution:**
|
| 19 |
+
Always use "langchain" as the collection name in all operations:
|
| 20 |
+
```python
|
| 21 |
+
vector_store = Chroma(
|
| 22 |
+
persist_directory=str(chroma_dir),
|
| 23 |
+
embedding_function=embeddings,
|
| 24 |
+
collection_name="langchain" # Must be "langchain"
|
| 25 |
+
)
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
**Verification:**
|
| 29 |
+
Run the test script to check collections:
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
python scripts/test_embeddings.py
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
Expected output:
|
| 36 |
+
```
|
| 37 |
+
Collection names: [Collection(name=langchain), Collection(name=lyrics)]
|
| 38 |
+
Collection count: 233998 # For langchain collection
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
**Files to Check:**
|
| 42 |
+
1. config/settings.py: CHROMA_COLLECTION_NAME
|
| 43 |
+
2. src/generator/generator.py: vector_store initialization
|
| 44 |
+
3. scripts/process_lyrics.py: Chroma.from_documents() call
|
scripts/process_lyrics.py
CHANGED
|
@@ -35,6 +35,7 @@ class LyricsProcessor:
|
|
| 35 |
self.output_dir = Path(output_dir)
|
| 36 |
self.batch_size = batch_size
|
| 37 |
self.embeddings = OpenAIEmbeddings()
|
|
|
|
| 38 |
|
| 39 |
# Configure text splitter for lyrics
|
| 40 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -116,7 +117,8 @@ class LyricsProcessor:
|
|
| 116 |
vector_store = Chroma.from_documents(
|
| 117 |
documents=batch,
|
| 118 |
embedding=self.embeddings,
|
| 119 |
-
persist_directory=str(self.output_dir / "chroma")
|
|
|
|
| 120 |
)
|
| 121 |
else:
|
| 122 |
# Add subsequent batches
|
|
|
|
| 35 |
self.output_dir = Path(output_dir)
|
| 36 |
self.batch_size = batch_size
|
| 37 |
self.embeddings = OpenAIEmbeddings()
|
| 38 |
+
self.collection_name = "langchain"
|
| 39 |
|
| 40 |
# Configure text splitter for lyrics
|
| 41 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 117 |
vector_store = Chroma.from_documents(
|
| 118 |
documents=batch,
|
| 119 |
embedding=self.embeddings,
|
| 120 |
+
persist_directory=str(self.output_dir / "chroma"),
|
| 121 |
+
collection_name=self.collection_name
|
| 122 |
)
|
| 123 |
else:
|
| 124 |
# Add subsequent batches
|
scripts/test_embeddings.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from langchain_openai import OpenAIEmbeddings
|
| 3 |
+
from langchain_chroma import Chroma
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
import os
|
| 6 |
+
import sqlite3
|
| 7 |
+
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_load_embeddings():
    """Print diagnostics for the persisted Chroma embeddings store.

    The store is expected under ``<cwd>/data/processed/embeddings/chroma``.
    Two independent checks are run and their outcome is printed:

    1. ``chroma.sqlite3`` is opened directly and every table is listed
       together with its row count.
    2. The same directory is opened through the LangChain ``Chroma`` wrapper
       and the available collections, plus the size and a sample of the
       "langchain" collection, are printed.

    Failures are printed instead of raised so that a failing check does not
    hide the result of the other one.
    """
    # Single place for the collection name used by both Chroma calls below.
    collection_name = "langchain"

    print("=== Testing Embeddings Load ===")

    base_dir = Path.cwd()
    chroma_dir = base_dir / "data" / "processed" / "embeddings" / "chroma"

    # Test SQLite connection directly
    print("\nTesting SQLite database:")
    conn = None
    try:
        # Open read-only: a plain connect() silently creates an empty
        # chroma.sqlite3 when the file is missing, and a diagnostic must
        # never modify the store it is inspecting.
        db_uri = (chroma_dir / "chroma.sqlite3").as_uri() + "?mode=ro"
        conn = sqlite3.connect(db_uri, uri=True)
        cursor = conn.cursor()

        # Check tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        print(f"Found tables: {tables}")

        # Try to count records
        for (table_name,) in tables:
            # Quote the identifier so names containing spaces, quotes or
            # keywords do not break (or alter) the statement.
            quoted = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"SELECT COUNT(*) FROM {quoted};")
            count = cursor.fetchone()[0]
            print(f"Table {table_name}: {count} records")
    except Exception as e:
        # Diagnostic boundary: report and continue with the Chroma check.
        print(f"SQLite Error: {str(e)}")
    finally:
        if conn is not None:
            conn.close()

    # Now try ChromaDB with langchain collection
    print("\nTesting ChromaDB load:")
    try:
        embeddings = OpenAIEmbeddings(
            openai_api_key=os.getenv("OPENAI_API_KEY")
        )

        db = Chroma(
            persist_directory=str(chroma_dir),
            embedding_function=embeddings,
            collection_name=collection_name,
        )
        print("\nChroma instance created")
        print(f"Collection names: {db._client.list_collections()}")

        # Try to get collection details
        collection = db._client.get_collection(collection_name)
        print(f"\nCollection count: {collection.count()}")
        print(f"Collection peek: {collection.peek()}")

    except Exception as e:
        # Diagnostic boundary: report the failure instead of crashing.
        print(f"\nChroma Error: {str(e)}")
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
|
| 63 |
+
test_load_embeddings()
|
src/generator/generator.py
CHANGED
|
@@ -5,7 +5,7 @@ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
|
| 5 |
from langchain_chroma import Chroma
|
| 6 |
from langchain.chains import ConversationalRetrievalChain
|
| 7 |
from langchain.prompts import PromptTemplate
|
| 8 |
-
from huggingface_hub import snapshot_download
|
| 9 |
from config.settings import Settings
|
| 10 |
|
| 11 |
|
|
@@ -14,25 +14,25 @@ class LyricGenerator:
|
|
| 14 |
"""Initialize the generator with embeddings"""
|
| 15 |
print("Initializing LyricGenerator...")
|
| 16 |
print(f"Deployment mode: {Settings.DEPLOYMENT_MODE}")
|
| 17 |
-
|
| 18 |
# Ensure paths exist (if local)
|
| 19 |
Settings.ensure_embedding_paths()
|
| 20 |
-
|
| 21 |
# Get and log paths
|
| 22 |
self.embeddings_dir = Settings.get_embeddings_path()
|
| 23 |
self.chroma_dir = Settings.get_chroma_path()
|
| 24 |
print(f"Embeddings directory: {self.embeddings_dir}")
|
| 25 |
print(f"Chroma directory: {self.chroma_dir}")
|
| 26 |
-
|
| 27 |
# Initialize OpenAI embeddings
|
| 28 |
print("Setting up OpenAI embeddings...")
|
| 29 |
self.embeddings = OpenAIEmbeddings(
|
| 30 |
openai_api_key=Settings.OPENAI_API_KEY
|
| 31 |
)
|
| 32 |
-
|
| 33 |
self.vector_store = None
|
| 34 |
self.qa_chain = None
|
| 35 |
-
|
| 36 |
# Load embeddings
|
| 37 |
self._load_embeddings()
|
| 38 |
|
|
@@ -45,13 +45,13 @@ class LyricGenerator:
|
|
| 45 |
print(f"Target Chroma directory: {chroma_dir}")
|
| 46 |
print(f"Creating parent directory: {chroma_dir.parent}")
|
| 47 |
chroma_dir.parent.mkdir(parents=True, exist_ok=True)
|
| 48 |
-
|
| 49 |
# Check if embeddings already exist in persistent storage
|
| 50 |
if not chroma_dir.exists() or not (chroma_dir / "chroma.sqlite3").exists():
|
| 51 |
print("\nDownloading embeddings from HuggingFace dataset...")
|
| 52 |
print(f"Dataset repo: {Settings.HF_DATASET}")
|
| 53 |
print(f"Using temp directory: /tmp/embeddings")
|
| 54 |
-
|
| 55 |
# Download the entire chroma directory from the dataset
|
| 56 |
try:
|
| 57 |
temp_dir = snapshot_download(
|
|
@@ -64,18 +64,20 @@ class LyricGenerator:
|
|
| 64 |
except Exception as e:
|
| 65 |
print(f"Error during snapshot_download: {str(e)}")
|
| 66 |
raise
|
| 67 |
-
|
| 68 |
temp_chroma = Path(temp_dir) / "chroma"
|
| 69 |
print(f"Looking for Chroma in temp dir at: {temp_chroma}")
|
| 70 |
-
|
| 71 |
if not temp_chroma.exists():
|
| 72 |
-
print(
|
|
|
|
| 73 |
raise RuntimeError(
|
| 74 |
f"Chroma directory not found in dataset at {temp_chroma}"
|
| 75 |
)
|
| 76 |
-
|
| 77 |
-
print(
|
| 78 |
-
|
|
|
|
| 79 |
# Copy the downloaded chroma directory to persistent storage
|
| 80 |
print(f"\nCopying embeddings to persistent storage...")
|
| 81 |
if chroma_dir.exists():
|
|
@@ -83,9 +85,11 @@ class LyricGenerator:
|
|
| 83 |
shutil.rmtree(chroma_dir)
|
| 84 |
print(f"Copying from {temp_chroma} to {chroma_dir}")
|
| 85 |
shutil.copytree(temp_chroma, chroma_dir)
|
| 86 |
-
print(
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
# Clean up temporary directory
|
| 90 |
print("\nCleaning up temporary directory...")
|
| 91 |
shutil.rmtree("/tmp/embeddings")
|
|
@@ -93,7 +97,7 @@ class LyricGenerator:
|
|
| 93 |
else:
|
| 94 |
print("Embeddings already exist in persistent storage")
|
| 95 |
print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
|
| 96 |
-
|
| 97 |
except Exception as e:
|
| 98 |
print(f"\n=== Error in _setup_embeddings_from_hf ===")
|
| 99 |
print(f"Error type: {type(e).__name__}")
|
|
@@ -101,7 +105,8 @@ class LyricGenerator:
|
|
| 101 |
print(f"Current directory structure:")
|
| 102 |
try:
|
| 103 |
print(f"Parent dir exists: {chroma_dir.parent.exists()}")
|
| 104 |
-
print(
|
|
|
|
| 105 |
except Exception as dir_error:
|
| 106 |
print(f"Error checking directories: {str(dir_error)}")
|
| 107 |
raise RuntimeError(
|
|
@@ -112,7 +117,7 @@ class LyricGenerator:
|
|
| 112 |
"""Load existing embeddings based on environment"""
|
| 113 |
try:
|
| 114 |
print("\n=== Loading Embeddings ===")
|
| 115 |
-
|
| 116 |
# If in HuggingFace environment, ensure embeddings are set up
|
| 117 |
if Settings.is_huggingface():
|
| 118 |
print("HuggingFace environment detected, setting up embeddings...")
|
|
@@ -120,67 +125,71 @@ class LyricGenerator:
|
|
| 120 |
else:
|
| 121 |
print("Local environment detected")
|
| 122 |
print(f"Base directory: {Settings.BASE_DIR}")
|
| 123 |
-
|
| 124 |
print(f"\nLoading vector store from: {self.embeddings_dir}")
|
| 125 |
# Check Chroma directory structure
|
| 126 |
chroma_dir = self.embeddings_dir / "chroma"
|
| 127 |
print(f"Checking Chroma directory: {chroma_dir}")
|
| 128 |
print(f"Absolute path: {chroma_dir.absolute()}")
|
| 129 |
-
|
| 130 |
if not chroma_dir.exists():
|
| 131 |
-
print(
|
|
|
|
| 132 |
if self.embeddings_dir.exists():
|
| 133 |
-
print(
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
sqlite_file = chroma_dir / "chroma.sqlite3"
|
| 137 |
print(f"Checking SQLite file: {sqlite_file}")
|
| 138 |
if not sqlite_file.exists():
|
| 139 |
print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
|
| 140 |
-
raise RuntimeError(
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
| 143 |
# Load vector store using environment-aware settings
|
| 144 |
print("Initializing Chroma with settings:")
|
| 145 |
-
|
| 146 |
-
print(f" collection_name: lyrics")
|
| 147 |
-
|
| 148 |
self.vector_store = Chroma(
|
| 149 |
-
persist_directory=
|
| 150 |
embedding_function=self.embeddings,
|
| 151 |
-
collection_name="
|
| 152 |
)
|
| 153 |
-
|
| 154 |
# Verify collection has documents
|
| 155 |
collection = self.vector_store._collection
|
| 156 |
count = collection.count()
|
| 157 |
print(f"Collection contains {count} documents")
|
| 158 |
-
|
| 159 |
if count == 0:
|
| 160 |
print("Collection is empty, checking details...")
|
| 161 |
# Try to peek at the collection data
|
| 162 |
peek = collection.peek()
|
| 163 |
print(f"Collection peek: {peek}")
|
| 164 |
-
|
| 165 |
# Additional debugging for empty collection
|
| 166 |
print("\nDebug Information:")
|
| 167 |
print(f"Chroma directory structure:")
|
| 168 |
for item in chroma_dir.glob('**/*'):
|
| 169 |
print(f" {item}")
|
| 170 |
if item.is_file():
|
| 171 |
-
print(
|
| 172 |
-
|
|
|
|
| 173 |
raise RuntimeError(
|
| 174 |
"Chroma DB is empty. Please ensure embeddings "
|
| 175 |
"were properly generated and uploaded."
|
| 176 |
)
|
| 177 |
else:
|
| 178 |
print("Successfully loaded embeddings")
|
| 179 |
-
|
| 180 |
except Exception as e:
|
| 181 |
print(f"Error loading embeddings: {str(e)}")
|
| 182 |
raise RuntimeError(f"Failed to load embeddings: {str(e)}")
|
| 183 |
-
|
| 184 |
# Setup QA chain
|
| 185 |
self._setup_qa_chain()
|
| 186 |
|
|
@@ -317,25 +326,27 @@ class LyricGenerator:
|
|
| 317 |
try:
|
| 318 |
print("Starting lyrics generation process...")
|
| 319 |
print(f"Using OpenAI model: {Settings.LLM_MODEL}")
|
| 320 |
-
|
| 321 |
# Get source documents with scores first
|
| 322 |
print("Searching for similar documents...")
|
| 323 |
try:
|
| 324 |
# Test embeddings function first
|
| 325 |
print("Testing embeddings function...")
|
| 326 |
test_embedding = self.embeddings.embed_query("test")
|
| 327 |
-
print(
|
| 328 |
-
|
|
|
|
| 329 |
# Now try similarity search
|
| 330 |
docs_and_scores = self.vector_store.similarity_search_with_score(
|
| 331 |
prompt,
|
| 332 |
k=20
|
| 333 |
)
|
| 334 |
print(f"Found {len(docs_and_scores)} similar documents")
|
| 335 |
-
|
| 336 |
if not docs_and_scores:
|
| 337 |
-
print(
|
| 338 |
-
|
|
|
|
| 339 |
except Exception as e:
|
| 340 |
print(f"Error during similarity search: {str(e)}")
|
| 341 |
raise RuntimeError(
|
|
@@ -353,7 +364,8 @@ class LyricGenerator:
|
|
| 353 |
'artist': doc.metadata['artist'],
|
| 354 |
'song': doc.metadata['song_title'],
|
| 355 |
'similarity': similarity,
|
| 356 |
-
|
|
|
|
| 357 |
})
|
| 358 |
|
| 359 |
try:
|
|
|
|
| 5 |
from langchain_chroma import Chroma
|
| 6 |
from langchain.chains import ConversationalRetrievalChain
|
| 7 |
from langchain.prompts import PromptTemplate
|
| 8 |
+
from huggingface_hub import snapshot_download
|
| 9 |
from config.settings import Settings
|
| 10 |
|
| 11 |
|
|
|
|
| 14 |
"""Initialize the generator with embeddings"""
|
| 15 |
print("Initializing LyricGenerator...")
|
| 16 |
print(f"Deployment mode: {Settings.DEPLOYMENT_MODE}")
|
| 17 |
+
|
| 18 |
# Ensure paths exist (if local)
|
| 19 |
Settings.ensure_embedding_paths()
|
| 20 |
+
|
| 21 |
# Get and log paths
|
| 22 |
self.embeddings_dir = Settings.get_embeddings_path()
|
| 23 |
self.chroma_dir = Settings.get_chroma_path()
|
| 24 |
print(f"Embeddings directory: {self.embeddings_dir}")
|
| 25 |
print(f"Chroma directory: {self.chroma_dir}")
|
| 26 |
+
|
| 27 |
# Initialize OpenAI embeddings
|
| 28 |
print("Setting up OpenAI embeddings...")
|
| 29 |
self.embeddings = OpenAIEmbeddings(
|
| 30 |
openai_api_key=Settings.OPENAI_API_KEY
|
| 31 |
)
|
| 32 |
+
|
| 33 |
self.vector_store = None
|
| 34 |
self.qa_chain = None
|
| 35 |
+
|
| 36 |
# Load embeddings
|
| 37 |
self._load_embeddings()
|
| 38 |
|
|
|
|
| 45 |
print(f"Target Chroma directory: {chroma_dir}")
|
| 46 |
print(f"Creating parent directory: {chroma_dir.parent}")
|
| 47 |
chroma_dir.parent.mkdir(parents=True, exist_ok=True)
|
| 48 |
+
|
| 49 |
# Check if embeddings already exist in persistent storage
|
| 50 |
if not chroma_dir.exists() or not (chroma_dir / "chroma.sqlite3").exists():
|
| 51 |
print("\nDownloading embeddings from HuggingFace dataset...")
|
| 52 |
print(f"Dataset repo: {Settings.HF_DATASET}")
|
| 53 |
print(f"Using temp directory: /tmp/embeddings")
|
| 54 |
+
|
| 55 |
# Download the entire chroma directory from the dataset
|
| 56 |
try:
|
| 57 |
temp_dir = snapshot_download(
|
|
|
|
| 64 |
except Exception as e:
|
| 65 |
print(f"Error during snapshot_download: {str(e)}")
|
| 66 |
raise
|
| 67 |
+
|
| 68 |
temp_chroma = Path(temp_dir) / "chroma"
|
| 69 |
print(f"Looking for Chroma in temp dir at: {temp_chroma}")
|
| 70 |
+
|
| 71 |
if not temp_chroma.exists():
|
| 72 |
+
print(
|
| 73 |
+
f"Contents of temp_dir: {list(Path(temp_dir).glob('**/*'))}")
|
| 74 |
raise RuntimeError(
|
| 75 |
f"Chroma directory not found in dataset at {temp_chroma}"
|
| 76 |
)
|
| 77 |
+
|
| 78 |
+
print(
|
| 79 |
+
f"Found Chroma directory in download. Contents: {list(temp_chroma.glob('**/*'))}")
|
| 80 |
+
|
| 81 |
# Copy the downloaded chroma directory to persistent storage
|
| 82 |
print(f"\nCopying embeddings to persistent storage...")
|
| 83 |
if chroma_dir.exists():
|
|
|
|
| 85 |
shutil.rmtree(chroma_dir)
|
| 86 |
print(f"Copying from {temp_chroma} to {chroma_dir}")
|
| 87 |
shutil.copytree(temp_chroma, chroma_dir)
|
| 88 |
+
print(
|
| 89 |
+
f"Embeddings copied to persistent storage at {chroma_dir}")
|
| 90 |
+
print(
|
| 91 |
+
f"Persistent storage contents: {list(chroma_dir.glob('**/*'))}")
|
| 92 |
+
|
| 93 |
# Clean up temporary directory
|
| 94 |
print("\nCleaning up temporary directory...")
|
| 95 |
shutil.rmtree("/tmp/embeddings")
|
|
|
|
| 97 |
else:
|
| 98 |
print("Embeddings already exist in persistent storage")
|
| 99 |
print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
|
| 100 |
+
|
| 101 |
except Exception as e:
|
| 102 |
print(f"\n=== Error in _setup_embeddings_from_hf ===")
|
| 103 |
print(f"Error type: {type(e).__name__}")
|
|
|
|
| 105 |
print(f"Current directory structure:")
|
| 106 |
try:
|
| 107 |
print(f"Parent dir exists: {chroma_dir.parent.exists()}")
|
| 108 |
+
print(
|
| 109 |
+
f"Parent dir contents: {list(chroma_dir.parent.glob('**/*'))}")
|
| 110 |
except Exception as dir_error:
|
| 111 |
print(f"Error checking directories: {str(dir_error)}")
|
| 112 |
raise RuntimeError(
|
|
|
|
| 117 |
"""Load existing embeddings based on environment"""
|
| 118 |
try:
|
| 119 |
print("\n=== Loading Embeddings ===")
|
| 120 |
+
|
| 121 |
# If in HuggingFace environment, ensure embeddings are set up
|
| 122 |
if Settings.is_huggingface():
|
| 123 |
print("HuggingFace environment detected, setting up embeddings...")
|
|
|
|
| 125 |
else:
|
| 126 |
print("Local environment detected")
|
| 127 |
print(f"Base directory: {Settings.BASE_DIR}")
|
| 128 |
+
|
| 129 |
print(f"\nLoading vector store from: {self.embeddings_dir}")
|
| 130 |
# Check Chroma directory structure
|
| 131 |
chroma_dir = self.embeddings_dir / "chroma"
|
| 132 |
print(f"Checking Chroma directory: {chroma_dir}")
|
| 133 |
print(f"Absolute path: {chroma_dir.absolute()}")
|
| 134 |
+
|
| 135 |
if not chroma_dir.exists():
|
| 136 |
+
print(
|
| 137 |
+
f"Parent directory exists: {self.embeddings_dir.exists()}")
|
| 138 |
if self.embeddings_dir.exists():
|
| 139 |
+
print(
|
| 140 |
+
f"Parent directory contents: {list(self.embeddings_dir.glob('**/*'))}")
|
| 141 |
+
raise RuntimeError(
|
| 142 |
+
f"Chroma directory not found at {chroma_dir}")
|
| 143 |
+
|
| 144 |
sqlite_file = chroma_dir / "chroma.sqlite3"
|
| 145 |
print(f"Checking SQLite file: {sqlite_file}")
|
| 146 |
if not sqlite_file.exists():
|
| 147 |
print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
|
| 148 |
+
raise RuntimeError(
|
| 149 |
+
f"Chroma database not found at {sqlite_file}")
|
| 150 |
+
print(
|
| 151 |
+
f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
|
| 152 |
+
|
| 153 |
# Load vector store using environment-aware settings
|
| 154 |
print("Initializing Chroma with settings:")
|
| 155 |
+
chroma_settings = Settings.get_chroma_settings()
|
|
|
|
|
|
|
| 156 |
self.vector_store = Chroma(
|
| 157 |
+
persist_directory=chroma_settings["persist_directory"],
|
| 158 |
embedding_function=self.embeddings,
|
| 159 |
+
collection_name=chroma_settings["collection_name"]
|
| 160 |
)
|
| 161 |
+
|
| 162 |
# Verify collection has documents
|
| 163 |
collection = self.vector_store._collection
|
| 164 |
count = collection.count()
|
| 165 |
print(f"Collection contains {count} documents")
|
| 166 |
+
|
| 167 |
if count == 0:
|
| 168 |
print("Collection is empty, checking details...")
|
| 169 |
# Try to peek at the collection data
|
| 170 |
peek = collection.peek()
|
| 171 |
print(f"Collection peek: {peek}")
|
| 172 |
+
|
| 173 |
# Additional debugging for empty collection
|
| 174 |
print("\nDebug Information:")
|
| 175 |
print(f"Chroma directory structure:")
|
| 176 |
for item in chroma_dir.glob('**/*'):
|
| 177 |
print(f" {item}")
|
| 178 |
if item.is_file():
|
| 179 |
+
print(
|
| 180 |
+
f" Size: {item.stat().st_size / (1024*1024):.2f} MB")
|
| 181 |
+
|
| 182 |
raise RuntimeError(
|
| 183 |
"Chroma DB is empty. Please ensure embeddings "
|
| 184 |
"were properly generated and uploaded."
|
| 185 |
)
|
| 186 |
else:
|
| 187 |
print("Successfully loaded embeddings")
|
| 188 |
+
|
| 189 |
except Exception as e:
|
| 190 |
print(f"Error loading embeddings: {str(e)}")
|
| 191 |
raise RuntimeError(f"Failed to load embeddings: {str(e)}")
|
| 192 |
+
|
| 193 |
# Setup QA chain
|
| 194 |
self._setup_qa_chain()
|
| 195 |
|
|
|
|
| 326 |
try:
|
| 327 |
print("Starting lyrics generation process...")
|
| 328 |
print(f"Using OpenAI model: {Settings.LLM_MODEL}")
|
| 329 |
+
|
| 330 |
# Get source documents with scores first
|
| 331 |
print("Searching for similar documents...")
|
| 332 |
try:
|
| 333 |
# Test embeddings function first
|
| 334 |
print("Testing embeddings function...")
|
| 335 |
test_embedding = self.embeddings.embed_query("test")
|
| 336 |
+
print(
|
| 337 |
+
f"Embeddings function working (vector size: {len(test_embedding)})")
|
| 338 |
+
|
| 339 |
# Now try similarity search
|
| 340 |
docs_and_scores = self.vector_store.similarity_search_with_score(
|
| 341 |
prompt,
|
| 342 |
k=20
|
| 343 |
)
|
| 344 |
print(f"Found {len(docs_and_scores)} similar documents")
|
| 345 |
+
|
| 346 |
if not docs_and_scores:
|
| 347 |
+
print(
|
| 348 |
+
"Warning: No similar documents found. This may affect generation quality.")
|
| 349 |
+
|
| 350 |
except Exception as e:
|
| 351 |
print(f"Error during similarity search: {str(e)}")
|
| 352 |
raise RuntimeError(
|
|
|
|
| 364 |
'artist': doc.metadata['artist'],
|
| 365 |
'song': doc.metadata['song_title'],
|
| 366 |
'similarity': similarity,
|
| 367 |
+
# First 200 chars
|
| 368 |
+
'content': doc.page_content[:200] + "..."
|
| 369 |
})
|
| 370 |
|
| 371 |
try:
|