# Streamlit front-end for a detached embedding worker.
#
# The page offers a single button that starts a background thread. The thread
# downloads a dataset from the Hugging Face Hub, embeds every row whose
# `status` column equals 'pending', and pushes the updated dataset back to the
# Hub at regular checkpoints. Progress is reported through the logging module
# (stdout), not through the Streamlit page.
import gc
import logging
import os
import sys
import threading

import pandas as pd
import streamlit as st
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer

# --- 1. ENTERPRISE LOGGING SETUP ---
# Send log records to stdout so they appear in the Hugging Face System Logs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# --- 2. CONFIGURATION ---
st.set_page_config(page_title="Background Vectorizer", layout="wide")
st.title("⚙️ AI Vectorization Engine (Detached Worker)")
st.write("This process runs independently of your browser. You can safely close the window once started.")

hf_token = os.environ.get("HF_TOKEN")
dataset_repo = "lolhaha002/islamic-vectors"
# Presence of this file means "a worker is running"; it is checked by both the
# worker (to refuse a second start) and the UI (to hide the start button).
LOCK_FILE = "worker_is_running.lock"


# --- 3. THE DETACHED BACKGROUND WORKER ---
def _try_acquire_lock() -> bool:
    """Atomically create LOCK_FILE; return False if it already exists."""
    try:
        # O_EXCL makes "check and create" a single step, so two threads
        # started by a double click cannot both believe they own the lock.
        fd = os.open(LOCK_FILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        return False
    with os.fdopen(fd, "w") as lock:
        lock.write("running")
    return True


def _release_lock() -> None:
    """Remove LOCK_FILE so a later run can start; a missing file is fine."""
    try:
        os.remove(LOCK_FILE)
    except FileNotFoundError:
        # Deliberately ignored: the goal (no lock file) is already met.
        pass


def _push_checkpoint(df: pd.DataFrame, processed: int) -> None:
    """Upload the whole dataframe to the Hub and log the checkpoint."""
    logger.info("💾 Checkpoint Reached! Pushing %s rows to Hugging Face Hub...", processed)
    Dataset.from_pandas(df).push_to_hub(dataset_repo, token=hf_token)
    # Collect before claiming the RAM was flushed; the temporary Dataset
    # built above is no longer referenced at this point.
    gc.collect()
    logger.info("✅ Checkpoint successfully saved to Hub! RAM flushed.")


def background_vectorization_task(*, batch_size: int = 50, save_interval: int = 500) -> None:
    """Embed every 'pending' row of the Hub dataset and push the results back.

    Intended to run as the target of a ``threading.Thread``. All outcomes,
    including failures, are reported through ``logger``; nothing is raised to
    the caller and nothing is returned.

    Args:
        batch_size: Number of rows passed to ``model.encode`` per call.
        save_interval: Minimum number of newly embedded rows between two
            pushes to the Hub. A final push always happens once every
            pending row has been processed.

    Side effects:
        Creates LOCK_FILE for the duration of the run and removes it on exit.
        Reads ``dataset_repo`` from the Hub and overwrites it on every
        checkpoint.

    NOTE(review): the lock is only removed by the ``finally`` clause below. If
    the process is killed outright the file is left behind and the UI will
    keep reporting a running worker until the file is deleted.
    """
    if not _try_acquire_lock():
        logger.info("Worker start aborted: A thread is already running.")
        return

    try:
        logger.info("🚀 --- BACKGROUND VECTORIZATION INITIATED ---")

        # Fetch the dataset first: if nothing is pending there is no reason
        # to pay for loading the embedding model.
        logger.info("Fetching dataset from %s...", dataset_repo)
        ds = load_dataset(dataset_repo, split="train", token=hf_token)
        df = ds.to_pandas()

        pending_indices = df.index[df['status'] == 'pending'].tolist()
        pending_count = len(pending_indices)
        if pending_count == 0:
            logger.info("🎉 SUCCESS: No pending rows found. Dataset is fully vectorized!")
            return

        logger.info("Loading BAAI/bge-m3 embedding model into memory...")
        model = SentenceTransformer("BAAI/bge-m3")

        logger.info(
            "📊 Found %s rows awaiting vectorization. Starting micro-batches...",
            pending_count,
        )

        processed_in_this_run = 0
        rows_at_last_save = 0

        for start in range(0, pending_count, batch_size):
            batch_idx = pending_indices[start:start + batch_size]
            batch_texts = df.loc[batch_idx, 'combined_text'].tolist()

            embeddings = model.encode(batch_texts, show_progress_bar=False)

            # Each cell receives a plain Python list, so the vectors are
            # written one row at a time rather than as a 2-D block.
            for row_idx, vector in zip(batch_idx, embeddings):
                df.at[row_idx, 'vector_embedding'] = vector.tolist()
            df.loc[batch_idx, 'status'] = 'completed'

            processed_in_this_run += len(batch_idx)
            logger.info(
                "⏳ Progress: %s / %s chunks embedded.",
                processed_in_this_run,
                pending_count,
            )

            # Count rows since the last push instead of testing
            # `processed % save_interval == 0`: the modulo form silently
            # skips checkpoints whenever batch_size does not divide
            # save_interval.
            finished = processed_in_this_run == pending_count
            due = processed_in_this_run - rows_at_last_save >= save_interval
            if due or finished:
                _push_checkpoint(df, processed_in_this_run)
                rows_at_last_save = processed_in_this_run

        logger.info("🎉 --- VECTORIZATION FULLY COMPLETE ---")

    except Exception as e:
        # Top-level boundary of the worker thread: log with traceback so the
        # failure can be diagnosed from the Space logs, then shut down.
        logger.exception("❌ CRITICAL ERROR IN WORKER THREAD: %s", e)
    finally:
        # Clean up the lock file so the worker can be started again later.
        _release_lock()
        logger.info("🛑 Worker thread shut down.")


# --- 4. STREAMLIT UI ---
if not hf_token:
    st.error("CRITICAL ERROR: HF_TOKEN missing in Space Secrets.")
    st.stop()

if os.path.exists(LOCK_FILE):
    st.warning("🔄 **A worker is currently running in the background.**")
    st.info("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon (it looks like a little terminal box) to watch the live progress.")
elif st.button("🚀 Start Detached Worker", type="primary"):
    # Spawn the background thread; it is not joined, so the script run ends
    # while the worker keeps going.
    worker = threading.Thread(
        target=background_vectorization_task,
        name="vectorization-worker",
    )
    worker.start()

    st.success("✅ **Background worker successfully launched!**")
    st.info("You can now close this tab, refresh the page, or turn off your computer. The Hugging Face server will continue processing.")
    st.write("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon to watch the engine running in real-time.")