# Snapshot metadata: revision 388aa42, file size 4,790 bytes.
"""
Pre-download and initialize embeddings model
Run this during deployment to ensure embeddings are ready
Uses FAISS for local vector storage
"""
import os
import sys
def download_embeddings():
    """Download and smoke-test the embeddings model.

    Puts the current directory on ``sys.path``, lazily imports
    ``get_embeddings`` from ``rag.embeddings``, builds the embeddings object
    (which is expected to fetch the model on first use) and embeds one sample
    query to confirm the object is usable. Progress is printed to stdout.

    Returns:
        bool: True when the embeddings object was created and the sample
        query was embedded without raising. False when anything raised; the
        traceback is printed and the exception is not propagated.
    """
    import traceback

    banner = "=" * 70
    try:
        print(banner)
        print("📦 Downloading HuggingFace Embeddings Model")
        print(banner)

        # Imported lazily so this module can be loaded before the project
        # requirements are installed; '.' makes the local package resolvable.
        sys.path.insert(0, '.')
        from rag.embeddings import get_embeddings

        print(f"\n📁 Cache directory: {os.environ.get('HF_HOME', './hf_cache')}")
        print("🔄 Downloading sentence-transformers/all-MiniLM-L6-v2...")
        print(" (This is ~80MB and will be cached for future use)\n")

        # Creating the embeddings object is what triggers the download.
        embeddings = get_embeddings()

        # One real query proves the model loads and runs, not just that the
        # files are present; the vector itself is not needed.
        test_text = "Government welfare scheme for farmers"
        print("🧪 Testing embeddings...")
        embeddings.embed_query(test_text)

        print("\n✅ Embeddings model downloaded and verified successfully!")
        print(banner)
        return True
    except Exception as e:
        # Top-level boundary for a deployment step: report and signal failure
        # through the return value so the caller decides whether to abort.
        print(f"\n❌ Failed to download embeddings: {e}")
        traceback.print_exc()
        print(banner)
        return False
def build_exam_index_if_needed():
    """Build the exam vector store from local PDFs unless it already exists.

    Checks for ``rag/exam_index/index.faiss`` relative to the current working
    directory. When it is missing and ``data/exams_pdfs`` contains at least
    one ``.pdf`` file, lazily imports ``build_exam_vectorstore`` from
    ``rag.exam_vectorstore`` and calls it. Progress is printed to stdout.

    Returns:
        bool: True when the index file already exists or the build call
        returned without raising. False when the PDF directory is missing,
        holds no ``.pdf`` files, or any exception was raised (the traceback
        is printed and the exception is not propagated).
    """
    import traceback

    index_path = "rag/exam_index/index.faiss"
    exam_pdfs_dir = "data/exams_pdfs"
    try:
        if os.path.exists(index_path):
            print("✅ Exam index already exists")
            return True

        print("\n⚠️ Exam index not found")

        # isdir (not exists): a plain file at this path is reported as a
        # missing directory instead of failing inside os.listdir.
        if not os.path.isdir(exam_pdfs_dir):
            print(f" {exam_pdfs_dir} directory doesn't exist")
            print(" Exam recommendations will use web search only")
            return False

        pdf_files = [f for f in os.listdir(exam_pdfs_dir) if f.endswith('.pdf')]
        if not pdf_files:
            print(f" No PDF files found in {exam_pdfs_dir}")
            print(" Exam recommendations will use web search only")
            return False

        print(f"\n🔨 Building exam index from {len(pdf_files)} PDF(s)...")

        # Imported lazily: only needed when a build is actually required.
        sys.path.insert(0, '.')
        from rag.exam_vectorstore import build_exam_vectorstore

        build_exam_vectorstore()
        print("✅ Exam index built successfully")
        return True
    except Exception as e:
        # Best-effort step: a failed build degrades to web-search-only mode
        # rather than aborting initialization.
        print(f"⚠️ Could not build exam index: {e}")
        traceback.print_exc()
        print(" Exam recommendations will use web search only")
        return False
def verify_indexes():
    """Report which FAISS index files are present on disk.

    Looks for ``rag/scheme_index/index.faiss`` and
    ``rag/exam_index/index.faiss`` relative to the current working directory,
    prints a found/not-found line (plus the file size in MB when found) for
    each, and ends with a one-line summary of which stores are available.

    Returns:
        None. The function only prints; it does not open or load the indexes.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("🔍 Verifying Vector Store Indexes (FAISS)")
    print(banner)

    scheme_path = "rag/scheme_index/index.faiss"
    exam_path = "rag/exam_index/index.faiss"
    scheme_exists = os.path.exists(scheme_path)
    exam_exists = os.path.exists(exam_path)

    # Same report for both indexes; only the label and the path differ.
    for label, path, exists in (
        ("📚 Scheme Index", scheme_path, scheme_exists),
        ("📝 Exam Index", exam_path, exam_exists),
    ):
        status = "✅ Found" if exists else "❌ Not Found"
        print(f"\n{label}: {status}")
        if exists:
            size = os.path.getsize(path) / (1024 * 1024)
            print(f" Size: {size:.2f} MB")

    if not scheme_exists and not exam_exists:
        print("\n⚠️ No vector stores found!")
        print(" Application will use web search only mode")
    elif not scheme_exists:
        print("\n⚠️ Scheme index missing - only web search for schemes")
    elif not exam_exists:
        print("\n⚠️ Exam index missing - only web search for exams")
    else:
        print("\n✅ All vector stores ready!")
    print(banner)
def main():
    """Run the initialization steps in order and return an exit status.

    Downloads and checks the embeddings model, then builds the exam index if
    it is missing, then prints a report of the index files found.

    Returns:
        int: 1 when the embeddings step failed (later steps are skipped),
        0 otherwise.
    """
    print("\n🚀 JanSahayak - Initializing Embeddings and Indexes")
    print("📊 Mode: FAISS (Local Vector Database)\n")

    # Step 1: Download embeddings model
    embeddings_ok = download_embeddings()
    if not embeddings_ok:
        print("\n⚠️ WARNING: Embeddings download failed!")
        print(" Vector stores will not work. Application will use web search only.")
        return 1

    # Step 2: Build exam index if needed. The result is deliberately ignored:
    # a missing exam index only degrades the app, it does not fail the run.
    build_exam_index_if_needed()

    # Step 3: Verify indexes
    verify_indexes()

    print("\n✅ Initialization complete!\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())