Spaces:

lolhaha002
/

islamic-vectorizer

Sleeping

App Files Files Community

islamic-vectorizer / src /streamlit_app.py

lolhaha002

Update src/streamlit_app.py

34fb22d verified 3 months ago

raw

history blame contribute delete

4.85 kB

	import streamlit as st
	import pandas as pd
	from datasets import load_dataset, Dataset
	from sentence_transformers import SentenceTransformer
	import os
	import gc
	import logging
	import threading
	import sys

	# --- 1. ENTERPRISE LOGGING SETUP ---
	# Send log records to stdout so they show up in the Hugging Face System Logs.
	_stdout_handler = logging.StreamHandler(sys.stdout)
	logging.basicConfig(
	    handlers=[_stdout_handler],
	    format="%(asctime)s - %(levelname)s - %(message)s",
	    level=logging.INFO,
	)
	# Module-level logger shared by the worker and the UI code below.
	logger = logging.getLogger(__name__)

	# --- 2. CONFIGURATION ---
	# Page chrome; set_page_config is kept as the first Streamlit call in the script.
	st.set_page_config(layout="wide", page_title="Background Vectorizer")
	st.title("⚙️ AI Vectorization Engine (Detached Worker)")
	st.write("This process runs independently of your browser. You can safely close the window once started.")

	# Hub access token read from the environment (None when the variable is unset).
	hf_token = os.getenv("HF_TOKEN")
	# Dataset repository that the worker loads from and pushes back to.
	dataset_repo = "lolhaha002/islamic-vectors"
	# Marker file whose presence signals that a worker is in progress.
	LOCK_FILE = "worker_is_running.lock"

	# --- 3. THE DETACHED BACKGROUND WORKER ---
	def background_vectorization_task() -> None:
	    """Embed all 'pending' dataset rows and checkpoint the results to the Hub.

	    Written as a thread target: it takes no arguments and returns nothing.
	    Errors raised while processing are logged with their traceback rather
	    than propagated.

	    Steps:
	      1. Claim ``LOCK_FILE``; return immediately if it already exists.
	      2. Load the ``BAAI/bge-m3`` model and the ``train`` split of
	         ``dataset_repo``.
	      3. Encode ``combined_text`` for every row whose ``status`` is
	         ``'pending'`` in batches, storing each vector in
	         ``vector_embedding`` and setting ``status`` to ``'completed'``.
	      4. Push the whole dataset back to the Hub every ``save_interval``
	         processed rows and once more after the final batch.
	      5. Remove ``LOCK_FILE`` on the way out, whether or not the run
	         succeeded.
	    """
	    # O_CREAT | O_EXCL makes "check and create" one atomic step, so two
	    # near-simultaneous starts cannot both get past this guard (a separate
	    # exists() check followed by open() leaves a window between the two).
	    try:
	        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
	    except FileExistsError:
	        logger.info("Worker start aborted: A thread is already running.")
	        return

	    try:
	        # Inside the try so the lock is still released if this write fails.
	        with os.fdopen(lock_fd, "w") as f:
	            f.write("running")

	        logger.info("🚀 --- BACKGROUND VECTORIZATION INITIATED ---")

	        # Load Model
	        logger.info("Loading BAAI/bge-m3 embedding model into memory...")
	        model = SentenceTransformer("BAAI/bge-m3")

	        # Load Dataset
	        logger.info(f"Fetching dataset from {dataset_repo}...")
	        ds = load_dataset(dataset_repo, split="train", token=hf_token)
	        df = ds.to_pandas()

	        # Find Pending Rows
	        pending_indices = df.index[df['status'] == 'pending'].tolist()
	        pending_count = len(pending_indices)

	        if not pending_indices:
	            logger.info("🎉 SUCCESS: No pending rows found. Dataset is fully vectorized!")
	            return

	        logger.info(f"📊 Found {pending_count} rows awaiting vectorization. Starting micro-batches...")

	        batch_size = 50
	        save_interval = 500  # Push to Hugging Face every 500 rows
	        processed_in_this_run = 0
	        # Counted separately so checkpoints still fire if batch_size ever
	        # stops dividing save_interval evenly.
	        rows_since_checkpoint = 0

	        for start in range(0, pending_count, batch_size):
	            batch_idx = pending_indices[start:start + batch_size]
	            batch_texts = df.loc[batch_idx, 'combined_text'].tolist()

	            # Generate vectors
	            embeddings = model.encode(batch_texts, show_progress_bar=False)

	            # Update dataframe (one cell at a time: each value is a list)
	            for row_idx, vector in zip(batch_idx, embeddings):
	                df.at[row_idx, 'vector_embedding'] = vector.tolist()
	                df.at[row_idx, 'status'] = 'completed'

	            processed_in_this_run += len(batch_idx)
	            rows_since_checkpoint += len(batch_idx)
	            logger.info(f"⏳ Progress: {processed_in_this_run} / {pending_count} chunks embedded.")

	            # Save Checkpoint
	            is_last_batch = processed_in_this_run == pending_count
	            if rows_since_checkpoint >= save_interval or is_last_batch:
	                logger.info(f"💾 Checkpoint Reached! Pushing {processed_in_this_run} rows to Hugging Face Hub...")
	                updated_ds = Dataset.from_pandas(df)
	                updated_ds.push_to_hub(dataset_repo, token=hf_token)
	                rows_since_checkpoint = 0
	                # Drop the snapshot and collect before reporting, so the
	                # "RAM flushed" message is logged after the fact.
	                del updated_ds
	                gc.collect()
	                logger.info("✅ Checkpoint successfully saved to Hub! RAM flushed.")

	        logger.info("🎉 --- VECTORIZATION FULLY COMPLETE ---")

	    except Exception as e:
	        # Top-level boundary of the worker: log with traceback, do not re-raise.
	        logger.exception(f"❌ CRITICAL ERROR IN WORKER THREAD: {e}")
	    finally:
	        # Clean up lock file so it can be run again in the future
	        try:
	            os.remove(LOCK_FILE)
	        except FileNotFoundError:
	            # Already gone (e.g. removed by hand); nothing left to release.
	            pass
	        logger.info("🛑 Worker thread shut down.")


	# --- 4. STREAMLIT UI ---
	# No token: show the error and halt this script run.
	if not hf_token:
	    st.error("CRITICAL ERROR: HF_TOKEN missing in Space Secrets.")
	    st.stop()

	# The lock file on disk is the only signal the UI has that a worker is active.
	worker_active = os.path.exists(LOCK_FILE)

	if worker_active:
	    st.warning("🔄 A worker is currently running in the background.")
	    st.info("👉 Look at the top right of your Hugging Face Space page and click the Logs icon (it looks like a little terminal box) to watch the live progress.")
	elif st.button("🚀 Start Detached Worker", type="primary"):
	    # Hand the job to a separate thread so it is not tied to this script run.
	    threading.Thread(target=background_vectorization_task).start()

	    st.success("✅ Background worker successfully launched!")
	    st.info("You can now close this tab, refresh the page, or turn off your computer. The Hugging Face server will continue processing.")
	    st.write("👉 Look at the top right of your Hugging Face Space page and click the Logs icon to watch the engine running in real-time.")