File size: 4,853 Bytes
37c4f1b
34fb22d
df697e5
 
 
 
34fb22d
 
 
df697e5
34fb22d
 
 
 
 
 
 
 
 
 
df697e5
34fb22d
 
df697e5
 
 
34fb22d
df697e5
34fb22d
 
 
 
 
 
 
 
 
 
df697e5
34fb22d
df697e5
34fb22d
 
 
df697e5
34fb22d
 
 
 
df697e5
34fb22d
 
 
 
df697e5
34fb22d
 
 
df697e5
34fb22d
df697e5
34fb22d
 
 
 
 
 
 
 
 
 
df697e5
34fb22d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37c4f1b
34fb22d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
import pandas as pd
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer
import os
import gc
import logging
import threading
import sys

# --- 1. ENTERPRISE LOGGING SETUP ---
# Route INFO-and-above records to stdout so they appear in the Hugging Face System Logs.
# Passing `stream=` makes basicConfig build the same single StreamHandler on sys.stdout.
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the worker function below
logger = logging.getLogger(__name__)

# --- 2. CONFIGURATION ---
# Page chrome: browser-tab title, wide layout, and the static header text.
st.set_page_config(page_title="Background Vectorizer", layout="wide")
# The gear emoji was stored as mis-decoded UTF-8 bytes; restored to the real character.
st.title("⚙️ AI Vectorization Engine (Detached Worker)")
st.write("This process runs independently of your browser. You can safely close the window once started.")

# Hub access token read from the environment; None when HF_TOKEN is not set
hf_token = os.getenv("HF_TOKEN")
# Hub dataset repository that is both loaded from and pushed back to
dataset_repo = "lolhaha002/islamic-vectors"
# Marker file whose existence signals that a worker is already running
LOCK_FILE = "worker_is_running.lock"

# --- 3. THE DETACHED BACKGROUND WORKER ---
def background_vectorization_task():
    """Embed every 'pending' row of the dataset and push the results to the Hub.

    Loads the BAAI/bge-m3 model and the ``train`` split of ``dataset_repo``,
    encodes ``combined_text`` in batches for each row whose ``status`` is
    ``'pending'``, stores the vector in ``vector_embedding``, marks the row
    ``'completed'``, and periodically pushes the whole dataset back to
    ``dataset_repo``.

    ``LOCK_FILE`` acts as an "already running" marker: if it already exists the
    call logs a message and returns without doing any work; otherwise the file
    is created here and removed again when the function exits.

    Exceptions raised while vectorizing are logged (with traceback) instead of
    propagating. Returns None.
    """
    # Claim the lock atomically. Mode "x" raises FileExistsError when the file is
    # already present, so two near-simultaneous starts cannot both get past this
    # guard (a separate exists() check followed by open("w") could let both in).
    try:
        with open(LOCK_FILE, "x") as lock_handle:
            lock_handle.write("running")
    except FileExistsError:
        logger.info("Worker start aborted: A thread is already running.")
        return

    try:
        logger.info("🚀 --- BACKGROUND VECTORIZATION INITIATED ---")

        # Load Model
        logger.info("Loading BAAI/bge-m3 embedding model into memory...")
        model = SentenceTransformer("BAAI/bge-m3")

        # Load Dataset
        logger.info(f"Fetching dataset from {dataset_repo}...")
        ds = load_dataset(dataset_repo, split="train", token=hf_token)
        df = ds.to_pandas()

        # Find Pending Rows
        pending_indices = df.index[df['status'] == 'pending'].tolist()
        pending_count = len(pending_indices)

        if pending_count == 0:
            logger.info("🎉 SUCCESS: No pending rows found. Dataset is fully vectorized!")
            return

        logger.info(f"📊 Found {pending_count} rows awaiting vectorization. Starting micro-batches...")

        batch_size = 50
        save_interval = 500  # Push to Hugging Face every 500 rows
        processed_in_this_run = 0
        # Counted separately so checkpoints still fire if batch_size ever stops
        # dividing save_interval evenly (a modulo test would silently skip them)
        rows_since_checkpoint = 0

        for start in range(0, pending_count, batch_size):
            batch_idx = pending_indices[start:start + batch_size]
            batch_texts = df.loc[batch_idx, 'combined_text'].tolist()

            # Generate vectors
            embeddings = model.encode(batch_texts, show_progress_bar=False)

            # Update dataframe
            # NOTE(review): assumes 'vector_embedding' already exists as a column
            # able to hold a list per cell -- confirm against the dataset schema
            for row_idx, embedding in zip(batch_idx, embeddings):
                df.at[row_idx, 'vector_embedding'] = embedding.tolist()
                df.at[row_idx, 'status'] = 'completed'

            processed_in_this_run += len(batch_idx)
            rows_since_checkpoint += len(batch_idx)
            logger.info(f"⏳ Progress: {processed_in_this_run} / {pending_count} chunks embedded.")

            # Save Checkpoint (always after the final batch so nothing is left unpushed)
            is_final_batch = processed_in_this_run == pending_count
            if rows_since_checkpoint >= save_interval or is_final_batch:
                logger.info(f"💾 Checkpoint Reached! Pushing {processed_in_this_run} rows to Hugging Face Hub...")
                updated_ds = Dataset.from_pandas(df)
                updated_ds.push_to_hub(dataset_repo, token=hf_token)
                rows_since_checkpoint = 0
                # Drop the reference first; otherwise the collector cannot reclaim
                # the copy that was just pushed
                del updated_ds
                gc.collect()
                logger.info("✅ Checkpoint successfully saved to Hub! RAM flushed.")

        logger.info("🎉 --- VECTORIZATION FULLY COMPLETE ---")

    except Exception as e:
        # logger.exception records the traceback as well as the message
        logger.exception(f"❌ CRITICAL ERROR IN WORKER THREAD: {e}")
    finally:
        # Clean up lock file so it can be run again in the future
        try:
            os.remove(LOCK_FILE)
        except FileNotFoundError:
            pass
        logger.info("🛑 Worker thread shut down.")


# --- 4. STREAMLIT UI ---
# Emoji in the strings below were stored as mis-decoded UTF-8 bytes; restored here.

# Without a token the worker cannot load or push the dataset, so halt the page here
if not hf_token:
    st.error("CRITICAL ERROR: HF_TOKEN missing in Space Secrets.")
    st.stop()

if os.path.exists(LOCK_FILE):
    # The lock file is present, so a worker has claimed it: show status, hide the button
    st.warning("🔄 **A worker is currently running in the background.**")
    st.info("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon (it looks like a little terminal box) to watch the live progress.")
elif st.button("🚀 Start Detached Worker", type="primary"):
    # Spawn the background thread; it is started and never joined, so this script
    # run finishes without waiting for the worker
    thread = threading.Thread(target=background_vectorization_task)
    thread.start()

    st.success("✅ **Background worker successfully launched!**")
    st.info("You can now close this tab, refresh the page, or turn off your computer. The Hugging Face server will continue processing.")
    st.write("👉 Look at the top right of your Hugging Face Space page and click the **Logs** icon to watch the engine running in real-time.")