File size: 2,954 Bytes
0d96daf
 
 
 
 
89397a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d96daf
 
 
 
 
89397a4
 
59e357e
 
 
 
 
 
 
 
7052598
59e357e
 
 
 
 
 
 
 
 
 
 
 
 
89397a4
 
 
 
 
 
59e357e
0d96daf
 
 
 
59e357e
 
89397a4
59e357e
 
0d96daf
59e357e
0d96daf
 
 
89397a4
 
0d96daf
 
89397a4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import sys
import logging
from pathlib import Path

# Setup logging cho HuggingFace environment
def setup_logging():
    """Configure root logging for the current runtime environment.

    When the ``SPACE_ID`` environment variable holds a non-empty value
    (Hugging Face Spaces), records go to the console only.  Otherwise they
    are additionally written to ``embed_data.log`` (UTF-8), resolved
    against the current working directory.
    """
    running_on_hf = bool(os.getenv("SPACE_ID"))

    # Console output is enabled in every environment.
    sinks = [logging.StreamHandler()]
    if not running_on_hf:
        # Local runs may also persist the log to a file.
        sinks.append(logging.FileHandler("embed_data.log", encoding='utf-8'))

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=sinks,
    )

# Configure logging at import time, before the module logger below is used.
setup_logging()
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

def setup_data(data_dir: str = "data", min_documents: int = 50) -> None:
    """Set up the embedding store on startup, embedding data when needed.

    When the ``SPACE_ID`` environment variable is set (Hugging Face
    Spaces), auto-embedding is skipped: only a basic initialization check
    of the embedding model is run, and a failure there is logged rather
    than raised.

    Otherwise, if the embedding model reports fewer than ``min_documents``
    stored documents, ``embed_all_data(data_dir, force=False)`` is invoked.

    Exceptions are logged with their traceback and are not propagated.

    Args:
        data_dir: Directory passed to ``embed_all_data``.  It must exist
            and be a directory, otherwise an error is logged and nothing
            is embedded.
        min_documents: Document count below which embedding is run.

    Returns:
        None.
    """
    try:
        logger.info("Starting data setup process...")

        # Skip auto-embedding on Hugging Face.
        if os.getenv("SPACE_ID"):
            logger.info("HuggingFace environment detected")
            logger.info("⏭Skipping auto-embedding due to PyTorch meta tensor issues")
            logger.info("Use /api/embed-data endpoint to manually embed data")

            # Only check that basic initialization works; a failure here is
            # reported but deliberately does not abort startup.
            try:
                logger.info("Testing basic model initialization...")
                from core.embedding_model import get_embedding_model
                embedding_model = get_embedding_model()
                count = embedding_model.count()
                logger.info(f"ChromaDB initialized with {count} documents")
                logger.info("Basic initialization successful")
            except Exception as e:
                logger.error(f"Basic initialization failed: {e}")

            return

        # Local environment: run the embedding as usual.
        logger.info("Local environment - proceeding with auto-embedding")

        # Require an actual directory: a regular file with the same name
        # would pass a plain existence check and only fail later.
        if not os.path.isdir(data_dir):
            logger.error(f"Data directory {data_dir} not found!")
            return

        # Imported lazily so the data-directory check above runs first.
        from core.embedding_model import get_embedding_model
        embedding_model = get_embedding_model()
        current_count = embedding_model.count()

        if current_count < min_documents:
            logger.info("Starting embedding process...")
            from scripts.embed_data import embed_all_data
            result = embed_all_data(data_dir, force=False)
            logger.info(f"Embedding completed: {result}")
        else:
            logger.info("⏭Data already embedded")

    except Exception as e:
        # Top-level boundary: record the message together with the
        # traceback instead of letting startup crash.
        logger.exception(f"Error in setup_data: {e}")

# Run the data setup when this file is executed directly as a script.
if __name__ == "__main__":
    setup_data()