James Edmunds committed on
Commit
2fb2290
·
1 Parent(s): 5006ce5

Updated to load HF dataset into HF Space on run

Browse files
Files changed (3) hide show
  1. app.py +31 -1
  2. config/settings.py +5 -2
  3. src/generator/generator.py +89 -2
app.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import streamlit as st
4
  from src.generator.generator import LyricGenerator
5
  from config.settings import Settings
 
6
 
7
  # Set SQLite path for local development
8
  if not Settings.is_huggingface():
@@ -11,14 +12,43 @@ if not Settings.is_huggingface():
11
  def initialize_generator():
12
  """Initialize the generator with proper error handling"""
13
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Initialize generator
 
15
  st.info("Loading embeddings...")
16
  generator = LyricGenerator()
17
  st.success("Embeddings loaded successfully!")
18
  return generator
19
 
20
  except Exception as e:
21
- st.error(f"Failed to initialize generator: {str(e)}")
 
 
 
22
  return None
23
 
24
  def main():
 
3
  import streamlit as st
4
  from src.generator.generator import LyricGenerator
5
  from config.settings import Settings
6
+ from pathlib import Path
7
 
8
  # Set SQLite path for local development
9
  if not Settings.is_huggingface():
 
12
  def initialize_generator():
13
  """Initialize the generator with proper error handling"""
14
  try:
15
+ print("\n=== Initializing Generator ===")
16
+ # Check for HuggingFace environment requirements
17
+ if Settings.is_huggingface():
18
+ print("Running in HuggingFace environment")
19
+ print("Checking environment requirements...")
20
+
21
+ if not Settings.HF_TOKEN:
22
+ error_msg = "HuggingFace token not found. Please set HF_TOKEN in Space secrets."
23
+ print(f"Error: {error_msg}")
24
+ st.error(error_msg)
25
+ return None
26
+ else:
27
+ print("HF_TOKEN found in environment")
28
+
29
+ # Ensure persistent storage directory exists
30
+ storage_path = Path("/data/processed/embeddings")
31
+ print(f"Setting up persistent storage at: {storage_path}")
32
+ storage_path.mkdir(parents=True, exist_ok=True)
33
+ print(f"Storage directory created/verified")
34
+
35
+ if storage_path.exists():
36
+ print(f"Storage directory contents: {list(storage_path.glob('**/*'))}")
37
+ else:
38
+ print("Running in local environment")
39
+
40
  # Initialize generator
41
+ print("\nInitializing LyricGenerator...")
42
  st.info("Loading embeddings...")
43
  generator = LyricGenerator()
44
  st.success("Embeddings loaded successfully!")
45
  return generator
46
 
47
  except Exception as e:
48
+ error_msg = f"Failed to initialize generator: {str(e)}"
49
+ print(f"\nError during initialization: {error_msg}")
50
+ print(f"Error type: {type(e).__name__}")
51
+ st.error(error_msg)
52
  return None
53
 
54
  def main():
config/settings.py CHANGED
@@ -34,6 +34,7 @@ class Settings:
34
 
35
  # HuggingFace Settings
36
  HF_SPACE = "SongLift/LyrGen2"
 
37
 
38
  @classmethod
39
  def is_huggingface(cls) -> bool:
@@ -43,5 +44,7 @@ class Settings:
43
  @classmethod
44
  def get_embeddings_path(cls) -> Path:
45
  """Get appropriate embeddings path based on deployment mode"""
46
- # Use same structure in both environments
47
- return Path("/data/processed/embeddings")
 
 
 
34
 
35
  # HuggingFace Settings
36
  HF_SPACE = "SongLift/LyrGen2"
37
+ HF_DATASET = "SongLift/LyrGen2_DB" # Updated dataset repo name
38
 
39
  @classmethod
40
  def is_huggingface(cls) -> bool:
 
44
  @classmethod
45
  def get_embeddings_path(cls) -> Path:
46
  """Get appropriate embeddings path based on deployment mode"""
47
+ if cls.is_huggingface():
48
+ # Use persistent storage in HF Spaces
49
+ return Path("/data/processed/embeddings")
50
+ return cls.EMBEDDINGS_DIR
src/generator/generator.py CHANGED
@@ -1,10 +1,11 @@
1
  from typing import Dict, List, Optional
2
  from pathlib import Path
 
3
  from langchain_openai import OpenAIEmbeddings, ChatOpenAI
4
  from langchain_chroma import Chroma
5
  from langchain.chains import ConversationalRetrievalChain
6
  from langchain.prompts import PromptTemplate
7
- from huggingface_hub import snapshot_download
8
  from config.settings import Settings
9
 
10
 
@@ -27,19 +28,105 @@ class LyricGenerator:
27
  # Load embeddings
28
  self._load_embeddings()
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def _load_embeddings(self) -> None:
31
  """Load existing embeddings based on environment"""
32
  try:
33
- print(f"Loading vector store from: {self.embeddings_dir}")
 
 
 
 
 
 
 
 
 
34
  # Check Chroma directory structure
35
  chroma_dir = self.embeddings_dir / "chroma"
36
  print(f"Checking Chroma directory: {chroma_dir}")
 
37
  if not chroma_dir.exists():
 
 
 
38
  raise RuntimeError(f"Chroma directory not found at {chroma_dir}")
39
 
40
  sqlite_file = chroma_dir / "chroma.sqlite3"
41
  print(f"Checking SQLite file: {sqlite_file}")
42
  if not sqlite_file.exists():
 
43
  raise RuntimeError(f"Chroma database not found at {sqlite_file}")
44
  print(f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
45
 
 
1
  from typing import Dict, List, Optional
2
  from pathlib import Path
3
+ import shutil
4
  from langchain_openai import OpenAIEmbeddings, ChatOpenAI
5
  from langchain_chroma import Chroma
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.prompts import PromptTemplate
8
+ from huggingface_hub import snapshot_download, hf_hub_download
9
  from config.settings import Settings
10
 
11
 
 
28
  # Load embeddings
29
  self._load_embeddings()
30
 
31
+ def _setup_embeddings_from_hf(self) -> None:
32
+ """Download and setup embeddings from HuggingFace dataset"""
33
+ print("\n=== Setting up embeddings from HuggingFace dataset ===")
34
+ try:
35
+ # Create necessary directories
36
+ chroma_dir = self.embeddings_dir / "chroma"
37
+ print(f"Target Chroma directory: {chroma_dir}")
38
+ print(f"Creating parent directory: {chroma_dir.parent}")
39
+ chroma_dir.parent.mkdir(parents=True, exist_ok=True)
40
+
41
+ # Check if embeddings already exist in persistent storage
42
+ if not chroma_dir.exists() or not (chroma_dir / "chroma.sqlite3").exists():
43
+ print("\nDownloading embeddings from HuggingFace dataset...")
44
+ print(f"Dataset repo: {Settings.HF_DATASET}")
45
+ print(f"Using temp directory: /tmp/embeddings")
46
+
47
+ # Download the entire chroma directory from the dataset
48
+ try:
49
+ temp_dir = snapshot_download(
50
+ repo_id=Settings.HF_DATASET,
51
+ repo_type="dataset",
52
+ token=Settings.HF_TOKEN,
53
+ local_dir="/tmp/embeddings"
54
+ )
55
+ print(f"Download completed to: {temp_dir}")
56
+ except Exception as e:
57
+ print(f"Error during snapshot_download: {str(e)}")
58
+ raise
59
+
60
+ temp_chroma = Path(temp_dir) / "chroma"
61
+ print(f"Looking for Chroma in temp dir at: {temp_chroma}")
62
+
63
+ if not temp_chroma.exists():
64
+ print(f"Contents of temp_dir: {list(Path(temp_dir).glob('**/*'))}")
65
+ raise RuntimeError(
66
+ f"Chroma directory not found in dataset at {temp_chroma}"
67
+ )
68
+
69
+ print(f"Found Chroma directory in download. Contents: {list(temp_chroma.glob('**/*'))}")
70
+
71
+ # Copy the downloaded chroma directory to persistent storage
72
+ print(f"\nCopying embeddings to persistent storage...")
73
+ if chroma_dir.exists():
74
+ print(f"Removing existing directory: {chroma_dir}")
75
+ shutil.rmtree(chroma_dir)
76
+ print(f"Copying from {temp_chroma} to {chroma_dir}")
77
+ shutil.copytree(temp_chroma, chroma_dir)
78
+ print(f"Embeddings copied to persistent storage at {chroma_dir}")
79
+ print(f"Persistent storage contents: {list(chroma_dir.glob('**/*'))}")
80
+
81
+ # Clean up temporary directory
82
+ print("\nCleaning up temporary directory...")
83
+ shutil.rmtree("/tmp/embeddings")
84
+ print("Cleanup complete")
85
+ else:
86
+ print("Embeddings already exist in persistent storage")
87
+ print(f"Existing contents: {list(chroma_dir.glob('**/*'))}")
88
+
89
+ except Exception as e:
90
+ print(f"\n=== Error in _setup_embeddings_from_hf ===")
91
+ print(f"Error type: {type(e).__name__}")
92
+ print(f"Error message: {str(e)}")
93
+ print(f"Current directory structure:")
94
+ try:
95
+ print(f"Parent dir exists: {chroma_dir.parent.exists()}")
96
+ print(f"Parent dir contents: {list(chroma_dir.parent.glob('**/*'))}")
97
+ except Exception as dir_error:
98
+ print(f"Error checking directories: {str(dir_error)}")
99
+ raise RuntimeError(
100
+ f"Failed to setup embeddings from HuggingFace: {str(e)}"
101
+ )
102
+
103
  def _load_embeddings(self) -> None:
104
  """Load existing embeddings based on environment"""
105
  try:
106
+ print("\n=== Loading Embeddings ===")
107
+
108
+ # If in HuggingFace environment, ensure embeddings are set up
109
+ if Settings.is_huggingface():
110
+ print("HuggingFace environment detected, setting up embeddings...")
111
+ self._setup_embeddings_from_hf()
112
+ else:
113
+ print("Local environment detected")
114
+
115
+ print(f"\nLoading vector store from: {self.embeddings_dir}")
116
  # Check Chroma directory structure
117
  chroma_dir = self.embeddings_dir / "chroma"
118
  print(f"Checking Chroma directory: {chroma_dir}")
119
+
120
  if not chroma_dir.exists():
121
+ print(f"Parent directory exists: {chroma_dir.parent.exists()}")
122
+ if chroma_dir.parent.exists():
123
+ print(f"Parent directory contents: {list(chroma_dir.parent.glob('**/*'))}")
124
  raise RuntimeError(f"Chroma directory not found at {chroma_dir}")
125
 
126
  sqlite_file = chroma_dir / "chroma.sqlite3"
127
  print(f"Checking SQLite file: {sqlite_file}")
128
  if not sqlite_file.exists():
129
+ print(f"Directory contents: {list(chroma_dir.glob('**/*'))}")
130
  raise RuntimeError(f"Chroma database not found at {sqlite_file}")
131
  print(f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
132