Spaces:

Bellok
/

warbler-cda

Sleeping

App Files Files Community

Bellok committed on Nov 20, 2025

Commit

a0dbf73

1 Parent(s): bfcb0d4

`Refactored background ingestion tracking and function`

Browse files

Files changed (1) hide show

app.py +79 -77

app.py CHANGED Viewed

@@ -13,6 +13,85 @@ import spaces
 from pathlib import Path
 from typing import List, Tuple, Optional, Dict
 SAMPLE_DOCS = [
     {
         "id": "wisdom_1",
@@ -123,83 +202,6 @@ class PackManager:
 pack_manager = PackManager()
-# Global variables for background ingestion tracking
-ingestion_status = {
-    "running": False,
-    "total_docs": 0,
-    "processed": 0,
-    "failed": 0,
-    "start_time": None,
-    "eta": 0,
-    "rate": 0,
-}
-def background_ingest_packs(api, pack_docs, pack_manager):
-    """Background function to ingest packs without blocking app startup"""
-    global ingestion_status
-    ingestion_status["running"] = True
-    ingestion_status["total_docs"] = len(pack_docs)
-    ingestion_status["processed"] = 0
-    ingestion_status["failed"] = 0
-    ingestion_status["start_time"] = time.time()
-    print(f"[INFO] Ingesting {len(pack_docs)} documents from Warbler packs...")
-    total_docs = len(pack_docs)
-    processed = 0
-    failed = 0
-    start_time = time.time()
-    batch_size = 1000
-    # Process in batches to avoid memory issues and provide progress
-    for batch_start in range(0, total_docs, batch_size):
-        batch_end = min(batch_start + batch_size, total_docs)
-        batch = pack_docs[batch_start:batch_end]
-        batch_processed = 0
-        batch_failed = 0
-        for doc in batch:
-            success = api.add_document(doc["id"], doc["content"], doc["metadata"])
-            if not success:
-                batch_failed += 1
-                failed += 1
-                if failed <= 5:  # Log first few failures
-                    print(f"[WARN] Failed to add document {doc['id']}")
-            batch_processed += 1
-            processed += 1
-        # Update global status
-        ingestion_status["processed"] = processed
-        ingestion_status["failed"] = failed
-        # Progress update after each batch
-        elapsed = time.time() - start_time
-        rate = processed / elapsed if elapsed > 0 else 0
-        eta = (total_docs - processed) / rate if rate > 0 else 0
-        ingestion_status["rate"] = rate
-        ingestion_status["eta"] = eta
-        print(f"[PROGRESS] {processed}/{total_docs} documents ingested "
-              f"({processed/total_docs*100:.1f}%) - "
-              f"{rate:.1f} docs/sec - ETA: {eta/60:.1f} min")
-        # Force garbage collection after large batches to free memory
-        if processed % 10000 == 0:
-            import gc
-            gc.collect()
-    packs_loaded = processed
-    pack_manager.mark_packs_ingested(1, packs_loaded)
-    total_time = time.time() - start_time
-    print(f"[OK] Loaded {packs_loaded} documents from Warbler packs "
-          f"({failed} failed) in {total_time:.1f} seconds")
-    # Mark ingestion complete
-    ingestion_status["running"] = False
 try:
     from warbler_cda import (

 from pathlib import Path
 from typing import List, Tuple, Optional, Dict
+# Global variables for background ingestion tracking
+ingestion_status = {
+    "running": False,
+    "total_docs": 0,
+    "processed": 0,
+    "failed": 0,
+    "start_time": None,
+    "eta": 0,
+    "rate": 0,
+}
+def background_ingest_packs(api, pack_docs, pack_manager):
+    """Background function to ingest packs without blocking app startup"""
+    global ingestion_status
+    ingestion_status["running"] = True
+    ingestion_status["total_docs"] = len(pack_docs)
+    ingestion_status["processed"] = 0
+    ingestion_status["failed"] = 0
+    ingestion_status["start_time"] = time.time()
+    print(f"[INFO] Ingesting {len(pack_docs)} documents from Warbler packs...")
+    total_docs = len(pack_docs)
+    processed = 0
+    failed = 0
+    start_time = time.time()
+    batch_size = 1000
+    # Process in batches to avoid memory issues and provide progress
+    for batch_start in range(0, total_docs, batch_size):
+        batch_end = min(batch_start + batch_size, total_docs)
+        batch = pack_docs[batch_start:batch_end]
+        batch_processed = 0
+        batch_failed = 0
+        for doc in batch:
+            success = api.add_document(doc["id"], doc["content"], doc["metadata"])
+            if not success:
+                batch_failed += 1
+                failed += 1
+                if failed <= 5:  # Log first few failures
+                    print(f"[WARN] Failed to add document {doc['id']}")
+            batch_processed += 1
+            processed += 1
+        # Update global status
+        ingestion_status["processed"] = processed
+        ingestion_status["failed"] = failed
+        # Progress update after each batch
+        elapsed = time.time() - start_time
+        rate = processed / elapsed if elapsed > 0 else 0
+        eta = (total_docs - processed) / rate if rate > 0 else 0
+        ingestion_status["rate"] = rate
+        ingestion_status["eta"] = eta
+        print(f"[PROGRESS] {processed}/{total_docs} documents ingested "
+              f"({processed/total_docs*100:.1f}%) - "
+              f"{rate:.1f} docs/sec - ETA: {eta/60:.1f} min")
+        # Force garbage collection after large batches to free memory
+        if processed % 10000 == 0:
+            import gc
+            gc.collect()
+    packs_loaded = processed
+    pack_manager.mark_packs_ingested(1, packs_loaded)
+    total_time = time.time() - start_time
+    print(f"[OK] Loaded {packs_loaded} documents from Warbler packs "
+          f"({failed} failed) in {total_time:.1f} seconds")
+    # Mark ingestion complete
+    ingestion_status["running"] = False
 SAMPLE_DOCS = [
     {
         "id": "wisdom_1",
 pack_manager = PackManager()
 try:
     from warbler_cda import (