# Page header captured with this file -- Spaces: Sleeping; File size: 4,853 Bytes
import streamlit as st
import pandas as pd
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer
import os
import gc
import logging
import threading
import sys
# --- 1. ENTERPRISE LOGGING SETUP ---
# Route INFO-and-above records to stdout so they appear in the Hugging Face
# System Logs. Each record is rendered as "<time> - <level> - <message>".
logging.basicConfig(
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by the worker and the UI code in this file.
logger = logging.getLogger(__name__)
# --- 2. CONFIGURATION ---
st.set_page_config(page_title="Background Vectorizer", layout="wide")
# The leading glyph is a gear emoji; it had been stored as mis-decoded bytes.
st.title("⚙️ AI Vectorization Engine (Detached Worker)")
st.write("This process runs independently of your browser. You can safely close the window once started.")

# Hub access token taken from the environment; None when HF_TOKEN is unset.
hf_token = os.environ.get("HF_TOKEN")
# Hub dataset repository the worker loads from and pushes back to.
dataset_repo = "lolhaha002/islamic-vectors"
# Marker file: its existence means a worker thread is currently running.
LOCK_FILE = "worker_is_running.lock"
# --- 3. THE DETACHED BACKGROUND WORKER ---
def _try_acquire_worker_lock():
    """Atomically create LOCK_FILE and report whether this caller created it.

    O_CREAT | O_EXCL turns "check that the file is absent" and "create it"
    into one operation, so two workers started by a double click cannot both
    believe they own the lock.

    Returns:
        True if this call created the lock file, False if it already existed.
    """
    try:
        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        return False
    with os.fdopen(lock_fd, "w") as lock_handle:
        lock_handle.write("running")
    return True


def background_vectorization_task():
    """Embed every row whose status is 'pending' and push the result to the Hub.

    Steps:
      1. Take the lock file; return immediately if another worker holds it.
      2. Load the BAAI/bge-m3 model and the ``dataset_repo`` train split.
      3. Encode ``combined_text`` for pending rows in batches of 50, writing
         the vector to ``vector_embedding`` and setting ``status`` to
         'completed'.
      4. Push the whole dataset back to the Hub after every 500 processed
         rows and once more when the last pending row is done.

    Any exception is logged with its traceback and swallowed (there is no
    caller to propagate to when this runs as a thread target). The lock file
    is removed on every exit path that acquired it.

    Returns:
        None.
    """
    # Prevent multiple workers from running simultaneously if the button is
    # clicked twice. This return sits outside the try/finally on purpose: a
    # worker that never owned the lock must not delete it.
    if not _try_acquire_worker_lock():
        logger.info("Worker start aborted: A thread is already running.")
        return

    try:
        logger.info("🚀 --- BACKGROUND VECTORIZATION INITIATED ---")

        # Load Model
        logger.info("Loading BAAI/bge-m3 embedding model into memory...")
        model = SentenceTransformer("BAAI/bge-m3")

        # Load Dataset
        logger.info("Fetching dataset from %s...", dataset_repo)
        ds = load_dataset(dataset_repo, split="train", token=hf_token)
        df = ds.to_pandas()

        # Find Pending Rows
        pending_indices = df.index[df["status"] == "pending"].tolist()
        pending_count = len(pending_indices)
        if pending_count == 0:
            logger.info("🎉 SUCCESS: No pending rows found. Dataset is fully vectorized!")
            return

        logger.info(
            "🔍 Found %d rows awaiting vectorization. Starting micro-batches...",
            pending_count,
        )

        batch_size = 50
        save_interval = 500  # Push to Hugging Face every 500 rows
        processed_in_this_run = 0
        # Counted separately so checkpoints still fire when save_interval is
        # not an exact multiple of batch_size.
        rows_since_checkpoint = 0

        for start in range(0, pending_count, batch_size):
            batch_idx = pending_indices[start:start + batch_size]
            batch_texts = df.loc[batch_idx, "combined_text"].tolist()

            # Generate vectors
            embeddings = model.encode(batch_texts, show_progress_bar=False)

            # Update dataframe: one embedding per row, in batch order.
            for row_idx, vector in zip(batch_idx, embeddings):
                df.at[row_idx, "vector_embedding"] = vector.tolist()
                df.at[row_idx, "status"] = "completed"

            processed_in_this_run += len(batch_idx)
            rows_since_checkpoint += len(batch_idx)
            logger.info(
                "⏳ Progress: %d / %d chunks embedded.",
                processed_in_this_run,
                pending_count,
            )

            # Save Checkpoint
            finished = processed_in_this_run == pending_count
            if rows_since_checkpoint >= save_interval or finished:
                logger.info(
                    "💾 Checkpoint Reached! Pushing %d rows to Hugging Face Hub...",
                    processed_in_this_run,
                )
                updated_ds = Dataset.from_pandas(df)
                updated_ds.push_to_hub(dataset_repo, token=hf_token)
                rows_since_checkpoint = 0
                # Drop the temporary copy before collecting so the log line
                # below is accurate.
                del updated_ds
                gc.collect()
                logger.info("✅ Checkpoint successfully saved to Hub! RAM flushed.")

        logger.info("🏁 --- VECTORIZATION FULLY COMPLETE ---")
    except Exception as e:
        # Top-level boundary of the worker: record the full traceback rather
        # than only the message, since nothing else will surface it.
        logger.exception("❌ CRITICAL ERROR IN WORKER THREAD: %s", e)
    finally:
        # Clean up lock file so it can be run again in the future
        try:
            os.remove(LOCK_FILE)
        except FileNotFoundError:
            pass
        logger.info("🛑 Worker thread shut down.")
# --- 4. STREAMLIT UI ---
# Without a token the worker cannot reach the Hub, so show the error and end
# this script run before any controls are rendered.
if not hf_token:
    st.error("CRITICAL ERROR: HF_TOKEN missing in Space Secrets.")
    st.stop()

if os.path.exists(LOCK_FILE):
    # The lock file exists: report the running worker instead of offering
    # the start button.
    st.warning("🔄 **A worker is currently running in the background.**")
    st.info("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon (it looks like a little terminal box) to watch the live progress.")
elif st.button("🚀 Start Detached Worker", type="primary"):
    # Spawn the background thread
    thread = threading.Thread(target=background_vectorization_task)
    thread.start()
    st.success("✅ **Background worker successfully launched!**")
    st.info("You can now close this tab, refresh the page, or turn off your computer. The Hugging Face server will continue processing.")
    st.write("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon to watch the engine running in real-time.")