islamic-vectorizer / src /streamlit_app.py
lolhaha002's picture
Update src/streamlit_app.py
34fb22d verified
import streamlit as st
import pandas as pd
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer
import os
import gc
import logging
import threading
import sys
# --- 1. ENTERPRISE LOGGING SETUP ---
# Route root-logger output to stdout at INFO level so that messages emitted
# by the worker show up in the process's standard output stream.
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the worker and the UI code in this file.
logger = logging.getLogger(__name__)
# --- 2. CONFIGURATION ---
# Page chrome: configured first, then the title and the short explanation.
st.set_page_config(layout="wide", page_title="Background Vectorizer")
st.title("βš™οΈ AI Vectorization Engine (Detached Worker)")
st.write("This process runs independently of your browser. You can safely close the window once started.")

# Hub access token taken from the environment; None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")
# Dataset repository that is both read from and pushed back to.
dataset_repo = "lolhaha002/islamic-vectors"
# Marker file whose presence signals that a worker is currently running.
LOCK_FILE = "worker_is_running.lock"
# --- 3. THE DETACHED BACKGROUND WORKER ---
def background_vectorization_task():
    """Embed every 'pending' row of the Hub dataset and push the results back.

    Designed to be used as a ``threading.Thread`` target. The steps are:

    1. Atomically create ``LOCK_FILE``; if it already exists, log the fact
       and return without doing any work.
    2. Load the ``train`` split of ``dataset_repo`` into a DataFrame and
       collect the rows whose ``status`` column equals ``'pending'``.
    3. Encode the ``combined_text`` of those rows in batches with the
       ``BAAI/bge-m3`` model, writing each vector to ``vector_embedding``
       and flipping ``status`` to ``'completed'``.
    4. Push the whole DataFrame back to the Hub every ``save_interval`` rows
       and once more after the final batch.

    Any exception raised while working is logged with its traceback and is
    not re-raised. The lock file is removed on every exit path that acquired
    it.

    Returns:
        None
    """
    # O_CREAT | O_EXCL performs "check and create" as one atomic operation,
    # so two near-simultaneous starts cannot both acquire the lock (a separate
    # os.path.exists() check followed by open() leaves a race window).
    # NOTE(review): a lock left behind by a killed process is not cleaned up
    # by anything visible in this file - confirm how stale locks are handled.
    try:
        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        logger.info("Worker start aborted: A thread is already running.")
        return

    try:
        with os.fdopen(lock_fd, "w") as lock_handle:
            lock_handle.write("running")

        logger.info("πŸš€ --- BACKGROUND VECTORIZATION INITIATED ---")

        # Load Dataset
        logger.info(f"Fetching dataset from {dataset_repo}...")
        ds = load_dataset(dataset_repo, split="train", token=hf_token)
        df = ds.to_pandas()

        # Find Pending Rows
        pending_mask = df['status'] == 'pending'
        pending_indices = df[pending_mask].index.tolist()
        pending_count = len(pending_indices)

        if pending_count == 0:
            logger.info("πŸŽ‰ SUCCESS: No pending rows found. Dataset is fully vectorized!")
            return

        # Load Model - only once we know there is something to embed, so an
        # already-complete dataset never pays for loading the model.
        logger.info("Loading BAAI/bge-m3 embedding model into memory...")
        model = SentenceTransformer("BAAI/bge-m3")

        logger.info(f"πŸ“Š Found {pending_count} rows awaiting vectorization. Starting micro-batches...")

        batch_size = 50
        save_interval = 500  # Push to Hugging Face every 500 rows
        processed_in_this_run = 0
        # Counted separately so checkpoints still trigger if save_interval is
        # ever changed to a value that is not a multiple of batch_size.
        rows_since_checkpoint = 0

        for i in range(0, pending_count, batch_size):
            batch_idx = pending_indices[i : i + batch_size]
            batch_texts = df.loc[batch_idx, 'combined_text'].tolist()

            # Generate vectors
            embeddings = model.encode(batch_texts, show_progress_bar=False)

            # Update dataframe
            for row_idx, embedding in zip(batch_idx, embeddings):
                df.at[row_idx, 'vector_embedding'] = embedding.tolist()
                df.at[row_idx, 'status'] = 'completed'

            processed_in_this_run += len(batch_idx)
            rows_since_checkpoint += len(batch_idx)
            logger.info(f"⏳ Progress: {processed_in_this_run} / {pending_count} chunks embedded.")

            # Save Checkpoint (periodically, and always after the last batch)
            is_final_batch = processed_in_this_run == pending_count
            if rows_since_checkpoint >= save_interval or is_final_batch:
                logger.info(f"πŸ’Ύ Checkpoint Reached! Pushing {processed_in_this_run} rows to Hugging Face Hub...")
                updated_ds = Dataset.from_pandas(df)
                updated_ds.push_to_hub(dataset_repo, token=hf_token)
                rows_since_checkpoint = 0
                # Drop the temporary Dataset copy before reporting the flush.
                del updated_ds
                gc.collect()
                logger.info("βœ… Checkpoint successfully saved to Hub! RAM flushed.")

        logger.info("πŸŽ‰ --- VECTORIZATION FULLY COMPLETE ---")

    except Exception as e:
        # Top-level boundary of the worker thread: record the full traceback
        # instead of only the message, since nothing else will report it.
        logger.exception(f"❌ CRITICAL ERROR IN WORKER THREAD: {e}")

    finally:
        # Clean up lock file so it can be run again in the future
        try:
            os.remove(LOCK_FILE)
        except FileNotFoundError:
            pass
        logger.info("πŸ›‘ Worker thread shut down.")
# --- 4. STREAMLIT UI ---
# Without a Hub token nothing below can work: show the error and halt the run.
if not hf_token:
    st.error("CRITICAL ERROR: HF_TOKEN missing in Space Secrets.")
    st.stop()

if not os.path.exists(LOCK_FILE):
    # No lock file present: offer the launch button.
    if st.button("πŸš€ Start Detached Worker", type="primary"):
        # Spawn the background thread; it is not joined, so the script run
        # finishes while the worker keeps going.
        threading.Thread(target=background_vectorization_task).start()

        st.success("βœ… **Background worker successfully launched!**")
        st.info("You can now close this tab, refresh the page, or turn off your computer. The Hugging Face server will continue processing.")
        st.write("πŸ‘‰ Look at the top right of your Hugging Face Space page and click the **Logs** icon to watch the engine running in real-time.")
else:
    # Lock file present: a worker is assumed to be active, so only point the
    # user at the logs instead of offering another launch.
    st.warning("πŸ”„ **A worker is currently running in the background.**")
    st.info("πŸ‘‰ Look at the top right of your Hugging Face Space page and click the **Logs** icon (it looks like a little terminal box) to watch the live progress.")