fguryel committed on
Commit
09acea5
·
1 Parent(s): 95d9c92

Fix HF Spaces initialization - improve ChromaDB connection and collection handling

Browse files
Files changed (1) hide show
  1. app.py +74 -27
app.py CHANGED
@@ -67,29 +67,63 @@ class RAGChatbot:
67
  Initialize ChromaDB client and embedding model for retrieval.
68
  """
69
  try:
70
- # Check if we're in Hugging Face Spaces environment
71
- if os.path.exists('chroma.sqlite3'):
72
- # We're likely in HF Spaces - use current directory
73
- self.db_path = '.'
74
-
75
- # Initialize ChromaDB client
76
- self.chroma_client = chromadb.PersistentClient(
77
- path=self.db_path,
78
- settings=Settings(anonymized_telemetry=False)
79
- )
 
80
 
81
- # Get or create collection
82
  try:
83
- self.collection = self.chroma_client.get_collection(
84
- name=self.collection_name
 
 
 
 
 
85
  )
86
- except Exception:
87
- # If collection doesn't exist, try to recreate it from chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  if os.path.exists('chunks.json'):
89
- st.warning("Database collection not found. Rebuilding from chunks...")
 
 
90
  self._rebuild_collection_from_chunks()
91
  else:
92
- raise Exception("Neither database collection nor chunks.json found. Please build the database first.")
 
 
93
 
94
  # Load embedding model (same as used for building the database)
95
  self.embedding_model = SentenceTransformer(self.embedding_model_name)
@@ -110,18 +144,31 @@ class RAGChatbot:
110
  This is useful for Hugging Face Spaces deployment.
111
  """
112
  try:
113
- st.info("🔄 Rebuilding database collection from chunks...")
 
 
114
 
115
- # Load chunks
116
- with open('chunks.json', 'r', encoding='utf-8') as f:
 
 
 
 
117
  chunks = json.load(f)
118
 
119
- # Create collection
120
- try:
121
- self.chroma_client.delete_collection(name=self.collection_name)
122
- except:
123
- pass # Collection might not exist
124
 
 
 
 
 
 
 
 
 
 
 
 
125
  self.collection = self.chroma_client.create_collection(
126
  name=self.collection_name,
127
  metadata={"description": "Scikit-learn documentation embeddings"}
@@ -617,5 +664,5 @@ def main():
617
  """)
618
 
619
 
620
- if __name__ == "__main__":
621
- main()
 
67
  Initialize ChromaDB client and embedding model for retrieval.
68
  """
69
  try:
70
+ # Detect environment and set appropriate database path
71
+ current_dir = os.getcwd()
72
+
73
+ # Check for database files in different locations
74
+ if os.path.exists(os.path.join(current_dir, 'chroma.sqlite3')):
75
+ self.db_path = current_dir
76
+ logger.info(f"Using database in current directory: {current_dir}")
77
+ elif os.path.exists(os.path.join(self.db_path, 'chroma.sqlite3')):
78
+ logger.info(f"Using database in specified path: {self.db_path}")
79
+ else:
80
+ logger.warning("No database file found, will attempt to rebuild from chunks")
81
 
82
+ # Initialize ChromaDB client with error handling
83
  try:
84
+ self.chroma_client = chromadb.PersistentClient(
85
+ path=self.db_path,
86
+ settings=Settings(
87
+ anonymized_telemetry=False,
88
+ allow_reset=True,
89
+ is_persistent=True
90
+ )
91
  )
92
+ logger.info(f"ChromaDB client initialized at: {self.db_path}")
93
+ except Exception as client_error:
94
+ logger.error(f"ChromaDB client initialization failed: {client_error}")
95
+ # Try with default settings
96
+ self.chroma_client = chromadb.PersistentClient(path=self.db_path)
97
+
98
+ # Get or create collection with robust error handling
99
+ collection_found = False
100
+ try:
101
+ # First, list all collections to see what's available
102
+ collections = self.chroma_client.list_collections()
103
+ collection_names = [col.name for col in collections]
104
+ logger.info(f"Available collections: {collection_names}")
105
+
106
+ if self.collection_name in collection_names:
107
+ self.collection = self.chroma_client.get_collection(name=self.collection_name)
108
+ collection_found = True
109
+ logger.info(f"Successfully loaded collection: {self.collection_name}")
110
+ else:
111
+ logger.warning(f"Collection '{self.collection_name}' not found in {collection_names}")
112
+
113
+ except Exception as col_error:
114
+ logger.error(f"Error accessing collections: {col_error}")
115
+
116
+ # If collection not found, rebuild from chunks
117
+ if not collection_found:
118
  if os.path.exists('chunks.json'):
119
+ logger.info("Attempting to rebuild collection from chunks.json")
120
+ if 'streamlit' in sys.modules:
121
+ st.warning("🔄 Database collection not found. Rebuilding from chunks...")
122
  self._rebuild_collection_from_chunks()
123
  else:
124
+ error_msg = f"Collection '{self.collection_name}' not found and no chunks.json available for rebuilding"
125
+ logger.error(error_msg)
126
+ raise Exception(error_msg)
127
 
128
  # Load embedding model (same as used for building the database)
129
  self.embedding_model = SentenceTransformer(self.embedding_model_name)
 
144
  This is useful for Hugging Face Spaces deployment.
145
  """
146
  try:
147
+ logger.info("Starting collection rebuild from chunks.json")
148
+ if 'streamlit' in sys.modules:
149
+ st.info("🔄 Rebuilding database collection from chunks...")
150
 
151
+ # Load chunks with error handling
152
+ chunks_path = 'chunks.json'
153
+ if not os.path.exists(chunks_path):
154
+ raise FileNotFoundError(f"chunks.json not found at {chunks_path}")
155
+
156
+ with open(chunks_path, 'r', encoding='utf-8') as f:
157
  chunks = json.load(f)
158
 
159
+ logger.info(f"Loaded {len(chunks)} chunks from {chunks_path}")
 
 
 
 
160
 
161
+ # Safely create collection
162
+ try:
163
+ # Try to delete existing collection first
164
+ existing_collections = [col.name for col in self.chroma_client.list_collections()]
165
+ if self.collection_name in existing_collections:
166
+ logger.info(f"Deleting existing collection: {self.collection_name}")
167
+ self.chroma_client.delete_collection(name=self.collection_name)
168
+ except Exception as del_error:
169
+ logger.warning(f"Could not delete existing collection: {del_error}")
170
+
171
+ # Create new collection
172
  self.collection = self.chroma_client.create_collection(
173
  name=self.collection_name,
174
  metadata={"description": "Scikit-learn documentation embeddings"}
 
664
  """)
665
 
666
 
667
+ # This ensures the app runs properly on Hugging Face Spaces
668
+ main()