Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| from datasets import load_dataset, Dataset | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
| import gc | |
| import logging | |
| import threading | |
| import sys | |
# --- 1. ENTERPRISE LOGGING SETUP ---
# Send every record to stdout (the original author's note: this is what makes
# the output show up in the Hugging Face System Logs).
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[_stdout_handler],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the worker and the UI code below.
logger = logging.getLogger(__name__)
# --- 2. CONFIGURATION ---
# Page setup and the static header text shown on every run of the script.
st.set_page_config(layout="wide", page_title="Background Vectorizer")
st.title("βοΈ AI Vectorization Engine (Detached Worker)")
st.write("This process runs independently of your browser. You can safely close the window once started.")

# Hub access token (read from the environment; None when unset), the dataset
# repository that is read and written, and the path of the file whose
# existence marks a running worker.
hf_token = os.getenv("HF_TOKEN")
dataset_repo = "lolhaha002/islamic-vectors"
LOCK_FILE = "worker_is_running.lock"
# --- 3. THE DETACHED BACKGROUND WORKER ---
def background_vectorization_task():
    """Embed all rows marked ``pending`` and push the dataset back to the Hub.

    Intended to run on a background thread. The function:

    1. Claims ``LOCK_FILE``; if the file already exists another worker is
       assumed to be running and the call returns immediately.
    2. Loads the ``BAAI/bge-m3`` model and the ``train`` split of
       ``dataset_repo``.
    3. Encodes ``combined_text`` for every row whose ``status`` equals
       ``'pending'``, batch by batch, storing each vector in
       ``vector_embedding`` and setting ``status`` to ``'completed'``.
    4. Pushes the whole dataset to the Hub once at least ``save_interval``
       rows have been embedded since the previous push, and again after the
       final batch.

    Returns:
        None. Exceptions raised while working are logged with their
        traceback and are not re-raised. Once the lock has been claimed it
        is always removed on exit.
    """
    # Claim the lock atomically: O_CREAT | O_EXCL fails if the file already
    # exists, so "check" and "create" are a single step and two
    # near-simultaneous starts cannot both proceed.
    try:
        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666)
    except FileExistsError:
        logger.info("Worker start aborted: A thread is already running.")
        return

    try:
        # Written inside the try so that a failed write still releases the lock.
        with os.fdopen(lock_fd, "w") as lock_file:
            lock_file.write("running")

        logger.info("π --- BACKGROUND VECTORIZATION INITIATED ---")

        # Load Model
        logger.info("Loading BAAI/bge-m3 embedding model into memory...")
        model = SentenceTransformer("BAAI/bge-m3")

        # Load Dataset
        logger.info(f"Fetching dataset from {dataset_repo}...")
        ds = load_dataset(dataset_repo, split="train", token=hf_token)
        df = ds.to_pandas()

        # Find Pending Rows
        pending_mask = df['status'] == 'pending'
        pending_indices = df[pending_mask].index.tolist()
        pending_count = len(pending_indices)
        if pending_count == 0:
            logger.info("π SUCCESS: No pending rows found. Dataset is fully vectorized!")
            return

        logger.info(f"π Found {pending_count} rows awaiting vectorization. Starting micro-batches...")
        batch_size = 50
        save_interval = 500  # Push to Hugging Face every 500 rows
        processed_in_this_run = 0
        rows_at_last_checkpoint = 0

        for start in range(0, pending_count, batch_size):
            batch_idx = pending_indices[start:start + batch_size]
            batch_texts = df.loc[batch_idx, 'combined_text'].tolist()

            # Generate vectors
            embeddings = model.encode(batch_texts, show_progress_bar=False)

            # Update dataframe
            for row_idx, embedding in zip(batch_idx, embeddings):
                df.at[row_idx, 'vector_embedding'] = embedding.tolist()
                df.at[row_idx, 'status'] = 'completed'

            processed_in_this_run += len(batch_idx)
            logger.info(f"β³ Progress: {processed_in_this_run} / {pending_count} chunks embedded.")

            # Save Checkpoint. Counting rows since the last push (instead of
            # testing `processed % save_interval == 0`) keeps the cadence
            # correct even if batch_size stops dividing save_interval.
            rows_since_checkpoint = processed_in_this_run - rows_at_last_checkpoint
            if rows_since_checkpoint >= save_interval or processed_in_this_run == pending_count:
                logger.info(f"πΎ Checkpoint Reached! Pushing {processed_in_this_run} rows to Hugging Face Hub...")
                updated_ds = Dataset.from_pandas(df)
                updated_ds.push_to_hub(dataset_repo, token=hf_token)
                rows_at_last_checkpoint = processed_in_this_run
                logger.info("β Checkpoint successfully saved to Hub! RAM flushed.")
                # Drop the reference first; otherwise the collector cannot
                # reclaim the dataset copy that was just pushed.
                del updated_ds
                gc.collect()

        logger.info("π --- VECTORIZATION FULLY COMPLETE ---")
    except Exception as e:
        # Top-level boundary of the worker thread: record the full traceback
        # instead of only the message, then fall through to cleanup.
        logger.exception(f"β CRITICAL ERROR IN WORKER THREAD: {e}")
    finally:
        # Clean up lock file so it can be run again in the future
        try:
            os.remove(LOCK_FILE)
        except FileNotFoundError:
            # Already gone (e.g. removed by hand); nothing left to release.
            pass
        logger.info("π Worker thread shut down.")
# --- 4. STREAMLIT UI ---
# No token: show the error and end this script run before anything else renders.
if not hf_token:
    st.error("CRITICAL ERROR: HF_TOKEN missing in Space Secrets.")
    st.stop()

# The lock file's presence is the only signal the UI has that a worker exists.
worker_active = os.path.exists(LOCK_FILE)
if worker_active:
    st.warning("π **A worker is currently running in the background.**")
    st.info("π Look at the top right of your Hugging Face Space page and click the **Logs** icon (it looks like a little terminal box) to watch the live progress.")
elif st.button("π Start Detached Worker", type="primary"):
    # Button pressed: run the worker on its own thread.
    threading.Thread(target=background_vectorization_task).start()
    st.success("β **Background worker successfully launched!**")
    st.info("You can now close this tab, refresh the page, or turn off your computer. The Hugging Face server will continue processing.")
    st.write("π Look at the top right of your Hugging Face Space page and click the **Logs** icon to watch the engine running in real-time.")