import streamlit as st
import pandas as pd
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer
import os
import gc
import logging
import threading
import sys

# --- 1. ENTERPRISE LOGGING SETUP ---
# Send every log record to stdout so it is visible in the Hugging Face System Logs.
_log_format = '%(asctime)s - %(levelname)s - %(message)s'
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[_stdout_handler],
    format=_log_format,
    level=logging.INFO,
)
# Module-level logger shared by the worker function below.
logger = logging.getLogger(__name__)

# --- 2. CONFIGURATION ---
# Page chrome: the page config call is issued before any other Streamlit call.
st.set_page_config(layout="wide", page_title="Background Vectorizer")
st.title("⚙️ AI Vectorization Engine (Detached Worker)")
st.write("This process runs independently of your browser. You can safely close the window once started.")

# Marker file whose presence means a worker thread is active.
LOCK_FILE = "worker_is_running.lock"
# Hub dataset that is read from and pushed back to.
dataset_repo = "lolhaha002/islamic-vectors"
# Access token read from the environment; None when the variable is unset.
hf_token = os.getenv("HF_TOKEN")

# --- 3. THE DETACHED BACKGROUND WORKER ---
def background_vectorization_task():
    """Embed every row marked 'pending' and push the results to the Hub.

    Intended to run in a background thread started from the Streamlit UI.
    A lock file (``LOCK_FILE``) guarantees that only one worker runs at a
    time: it is created atomically on entry and removed on exit, whether the
    run succeeds or fails.

    Steps: load the BAAI/bge-m3 model, load the ``train`` split of
    ``dataset_repo`` into a DataFrame, encode ``combined_text`` for the rows
    whose ``status`` is ``'pending'`` in micro-batches, store each vector in
    ``vector_embedding``, mark the row ``'completed'``, and push the whole
    dataset back to the Hub at regular checkpoints.

    Returns:
        None. Progress and failures are reported through ``logger`` only;
        exceptions raised during the run are logged with their traceback and
        not re-raised.
    """
    # Exclusive-create mode ("x") makes "check and create" one atomic step, so
    # two threads spawned by a double click cannot both acquire the lock.
    try:
        with open(LOCK_FILE, "x") as f:
            f.write("running")
    except FileExistsError:
        logger.info("Worker start aborted: A thread is already running.")
        return

    try:
        logger.info("🚀 --- BACKGROUND VECTORIZATION INITIATED ---")

        # Load Model
        logger.info("Loading BAAI/bge-m3 embedding model into memory...")
        model = SentenceTransformer("BAAI/bge-m3")

        # Load Dataset
        logger.info(f"Fetching dataset from {dataset_repo}...")
        ds = load_dataset(dataset_repo, split="train", token=hf_token)
        df = ds.to_pandas()

        # Find Pending Rows
        pending_mask = df['status'] == 'pending'
        pending_indices = df[pending_mask].index.tolist()
        pending_count = len(pending_indices)

        if pending_count == 0:
            logger.info("🎉 SUCCESS: No pending rows found. Dataset is fully vectorized!")
            return

        logger.info(f"📊 Found {pending_count} rows awaiting vectorization. Starting micro-batches...")

        batch_size = 50
        save_interval = 500  # Push to Hugging Face roughly every 500 rows
        processed_in_this_run = 0
        # Counted separately so checkpoints still fire if save_interval is
        # ever changed to a value that is not a multiple of batch_size.
        rows_since_checkpoint = 0

        for i in range(0, pending_count, batch_size):
            batch_idx = pending_indices[i : i + batch_size]
            batch_texts = df.loc[batch_idx, 'combined_text'].tolist()

            # Generate vectors (one embedding per input text, in order)
            embeddings = model.encode(batch_texts, show_progress_bar=False)

            # Update dataframe
            for row_idx, embedding in zip(batch_idx, embeddings):
                df.at[row_idx, 'vector_embedding'] = embedding.tolist()
                df.at[row_idx, 'status'] = 'completed'

            processed_in_this_run += len(batch_idx)
            rows_since_checkpoint += len(batch_idx)
            logger.info(f"⏳ Progress: {processed_in_this_run} / {pending_count} chunks embedded.")

            # Save Checkpoint: every save_interval rows, and always after the
            # final batch so no completed work is left unpushed.
            is_last_batch = processed_in_this_run == pending_count
            if rows_since_checkpoint >= save_interval or is_last_batch:
                logger.info(f"💾 Checkpoint Reached! Pushing {processed_in_this_run} rows to Hugging Face Hub...")
                updated_ds = Dataset.from_pandas(df)
                updated_ds.push_to_hub(dataset_repo, token=hf_token)
                logger.info("✅ Checkpoint successfully saved to Hub! RAM flushed.")
                rows_since_checkpoint = 0
                gc.collect()

        logger.info("🎉 --- VECTORIZATION FULLY COMPLETE ---")

    except Exception as e:
        # Top-level boundary of the worker thread: log with traceback so the
        # failure can be diagnosed from the logs, then let the thread end.
        logger.exception("❌ CRITICAL ERROR IN WORKER THREAD: %s", e)
    finally:
        # Clean up lock file so it can be run again in the future
        try:
            os.remove(LOCK_FILE)
        except FileNotFoundError:
            pass  # Already gone (e.g. removed manually); nothing to clean up.
        logger.info("🛑 Worker thread shut down.")


# --- 4. STREAMLIT UI ---
# Without a token nothing below can work, so halt the script run here.
if not hf_token:
    st.error("CRITICAL ERROR: HF_TOKEN missing in Space Secrets.")
    st.stop()

# The lock file's presence is the only signal the UI has that a worker is active.
worker_active = os.path.exists(LOCK_FILE)

if worker_active:
    st.warning("🔄 **A worker is currently running in the background.**")
    st.info("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon (it looks like a little terminal box) to watch the live progress.")
elif st.button("🚀 Start Detached Worker", type="primary"):
    # Launch the vectorization job on its own thread.
    threading.Thread(target=background_vectorization_task).start()

    st.success("✅ **Background worker successfully launched!**")
    st.info("You can now close this tab, refresh the page, or turn off your computer. The Hugging Face server will continue processing.")
    st.write("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon to watch the engine running in real-time.")