daniel-was-taken committed on
Commit
cf1fb02
·
1 Parent(s): 1bbaa78

Change memory constraints

Browse files
Files changed (2) hide show
  1. compose.yml +5 -0
  2. populate_db.py +65 -52
compose.yml CHANGED
@@ -80,6 +80,11 @@ services:
80
  - OAUTH_GOOGLE_CLIENT_ID=${OAUTH_GOOGLE_CLIENT_ID}
81
  - OAUTH_GOOGLE_CLIENT_SECRET=${OAUTH_GOOGLE_CLIENT_SECRET}
82
 
 
 
 
 
 
83
  depends_on:
84
  - standalone
85
 
 
80
  - OAUTH_GOOGLE_CLIENT_ID=${OAUTH_GOOGLE_CLIENT_ID}
81
  - OAUTH_GOOGLE_CLIENT_SECRET=${OAUTH_GOOGLE_CLIENT_SECRET}
82
 
83
+ # Memory constraints for 4GB DigitalOcean droplet
84
+ # Allocate 1.5GB to app, leaving room for Milvus and system
85
+ mem_limit: 1536m
86
+ memswap_limit: 1536m
87
+
88
  depends_on:
89
  - standalone
90
 
populate_db.py CHANGED
@@ -120,62 +120,75 @@ def main():
120
 
121
  docs = unstructured_document_loader()
122
 
123
- # Prepare texts for batch processing
124
- texts_to_embed = []
125
- doc_data = []
126
-
127
- print(f"Preparing {len(docs)} documents for batch processing...")
128
-
129
- for i, doc in enumerate(docs):
130
- # Check text length and truncate if necessary
131
- text_content = doc.page_content
132
- if len(text_content) > 65000: # Leave some buffer below 64KB limit
133
- text_content = text_content[:65000]
134
- print(f"Document {i+1} truncated from {len(doc.page_content)} to {len(text_content)} characters")
135
-
136
- texts_to_embed.append(text_content)
137
- doc_data.append({
138
- "id": i,
139
- "text": text_content,
140
- "metadata": doc.metadata if doc.metadata else {}
141
- })
142
-
143
- # Print progress every 500 documents
144
- if (i + 1) % 500 == 0:
145
- print(f"Prepared {i + 1}/{len(docs)} documents")
146
-
147
- # Process embeddings in batches
148
- all_embeddings = process_embeddings_in_batches(texts_to_embed, batch_size=25) # Smaller batch size for better reliability
149
-
150
- # Prepare data for insertion
151
- data_to_insert = []
152
-
153
- print(f"Preparing {len(doc_data)} documents for Milvus insertion...")
154
-
155
- for i, (doc_info, embedding) in enumerate(zip(doc_data, all_embeddings)):
156
- data_entry = {
157
- "id": doc_info["id"],
158
- "vector": embedding,
159
- "text": doc_info["text"],
160
- "metadata": doc_info["metadata"]
161
- }
162
- data_to_insert.append(data_entry)
163
-
164
- # Print progress every 500 documents
165
- if (i + 1) % 500 == 0:
166
- print(f"Prepared {i + 1}/{len(doc_data)} entries for insertion")
167
 
168
- print(f"Inserting {len(data_to_insert)} documents into Milvus...")
 
169
 
170
- # Insert data into Milvus
171
- insert_result = milvus_client.insert(
172
- collection_name=collection_name,
173
- data=data_to_insert
174
- )
175
 
176
- print(f"Successfully inserted {insert_result['insert_count']} documents")
177
- print(f"Primary keys: {insert_result['ids'][:10]}...") # Show first 10 IDs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
 
179
  return docs
180
 
181
  def unstructured_document_loader():
 
120
 
121
  docs = unstructured_document_loader()
122
 
123
+ # Process documents in small chunks to avoid memory issues on 4GB droplet
124
+ chunk_size = 100 # Very conservative chunk size for 4GB memory
125
+ total_docs = len(docs)
126
+ total_chunks = (total_docs + chunk_size - 1) // chunk_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
+ print(f"🔧 Memory-efficient processing: {total_docs} documents in {total_chunks} chunks of {chunk_size}")
129
+ print("📊 This approach prevents OOM kills on your 4GB DigitalOcean droplet")
130
 
131
+ total_inserted = 0
 
 
 
 
132
 
133
+ for chunk_idx in range(0, total_docs, chunk_size):
134
+ chunk_end = min(chunk_idx + chunk_size, total_docs)
135
+ chunk_num = chunk_idx // chunk_size + 1
136
+
137
+ print(f"\n{'='*40}")
138
+ print(f"CHUNK {chunk_num}/{total_chunks} | Docs {chunk_idx + 1}-{chunk_end}")
139
+ print(f"{'='*40}")
140
+
141
+ # Get current chunk of documents
142
+ current_chunk = docs[chunk_idx:chunk_end]
143
+
144
+ # Process this chunk
145
+ texts_to_embed = []
146
+ doc_data = []
147
+
148
+ for i, doc in enumerate(current_chunk):
149
+ text_content = doc.page_content
150
+ if len(text_content) > 65000:
151
+ text_content = text_content[:65000]
152
+ print(f"📄 Doc {chunk_idx + i + 1} truncated: {len(doc.page_content)} → {len(text_content)} chars")
153
+
154
+ texts_to_embed.append(text_content)
155
+ doc_data.append({
156
+ "id": chunk_idx + i,
157
+ "text": text_content,
158
+ "metadata": doc.metadata if doc.metadata else {}
159
+ })
160
+
161
+ # Generate embeddings with small batch size
162
+ print(f"🚀 Generating embeddings for {len(texts_to_embed)} documents...")
163
+ all_embeddings = process_embeddings_in_batches(texts_to_embed, batch_size=5) # Very small batches
164
+
165
+ # Prepare and insert data
166
+ data_to_insert = []
167
+ for doc_info, embedding in zip(doc_data, all_embeddings):
168
+ data_to_insert.append({
169
+ "id": doc_info["id"],
170
+ "vector": embedding,
171
+ "text": doc_info["text"],
172
+ "metadata": doc_info["metadata"]
173
+ })
174
+
175
+ # Insert to Milvus
176
+ insert_result = milvus_client.insert(collection_name=collection_name, data=data_to_insert)
177
+ chunk_inserted = insert_result['insert_count']
178
+ total_inserted += chunk_inserted
179
+
180
+ print(f"✅ Chunk {chunk_num} complete: {chunk_inserted} docs inserted")
181
+ print(f"📈 Overall progress: {total_inserted}/{total_docs} ({(total_inserted/total_docs)*100:.1f}%)")
182
+
183
+ # Critical: Free memory before next chunk
184
+ del texts_to_embed, doc_data, all_embeddings, data_to_insert, current_chunk
185
+
186
+ # Brief pause between chunks
187
+ if chunk_num < total_chunks:
188
+ print("⏱️ Memory cleanup pause (2s)...")
189
+ time.sleep(2)
190
 
191
+ print(f"\n🎉 SUCCESS! All {total_inserted} documents processed and inserted!")
192
  return docs
193
 
194
  def unstructured_document_loader():