Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| import logging | |
| from pathlib import Path | |
# Set up logging for the Hugging Face environment
def setup_logging():
    """Configure root logging for Hugging Face Spaces or a local run.

    When the ``SPACE_ID`` environment variable is set (treated here as
    "running on Hugging Face"), records go to the console only.  Otherwise
    records go to the console and are also written to ``embed_data.log``
    (UTF-8) in the current working directory.

    If the log file cannot be opened (for example the working directory is
    read-only), the function falls back to console-only logging and emits a
    warning instead of raising, so that importing this module never fails
    just because the log file is unavailable.

    Returns:
        None.
    """
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    log_file = "embed_data.log"

    # The console handler is used in every environment.
    handlers = [logging.StreamHandler()]

    file_error = None
    if not os.getenv("SPACE_ID"):
        # Local run: additionally persist the log to a file when possible.
        try:
            handlers.append(logging.FileHandler(log_file, encoding='utf-8'))
        except OSError as exc:
            # Remember the failure; it is reported once logging is configured.
            file_error = exc

    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        handlers=handlers,
    )

    if file_error is not None:
        logging.getLogger(__name__).warning(
            "Could not open %s for logging, using console only: %s",
            log_file,
            file_error,
        )
# Configure logging as soon as the module is loaded, before any logger is used.
setup_logging()
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def setup_data():
    """Set up the embedding store on startup and embed data if needed.

    On Hugging Face Spaces (``SPACE_ID`` is set) automatic embedding is
    skipped; only a basic initialisation check is run by obtaining the
    embedding model and logging its ``count()``.  A failure of that check is
    logged and otherwise ignored.

    In a local environment the ``data`` directory must exist.  When the
    embedding model reports fewer than 50 items, ``embed_all_data`` from
    ``scripts.embed_data`` is run over that directory with ``force=False``;
    otherwise embedding is skipped.

    Exceptions raised during setup are caught and logged together with their
    traceback rather than propagated.

    Returns:
        None.
    """
    data_dir = "data"
    # Below this count the store is treated as not yet embedded.
    min_documents = 50

    try:
        logger.info("Starting data setup process...")

        # Skip automatic embedding on Hugging Face.
        if os.getenv("SPACE_ID"):
            logger.info("HuggingFace environment detected")
            logger.info("⏭Skipping auto-embedding due to PyTorch meta tensor issues")
            logger.info("Use /api/embed-data endpoint to manually embed data")

            # Only verify that the model / store can be initialised.
            try:
                logger.info("Testing basic model initialization...")
                from core.embedding_model import get_embedding_model
                embedding_model = get_embedding_model()
                count = embedding_model.count()
                logger.info(f"ChromaDB initialized with {count} documents")
                logger.info("Basic initialization successful")
            except Exception as e:
                # Best-effort check: report the failure but do not abort startup.
                logger.error(f"Basic initialization failed: {e}")
            return

        # Local environment: run the embedding as usual.
        logger.info("Local environment - proceeding with auto-embedding")

        # The data location must be a directory; a plain file named "data"
        # is not acceptable input for the embedding step.
        if not os.path.isdir(data_dir):
            logger.error(f"Data directory {data_dir} not found!")
            return

        # Imported lazily so the module can be loaded without these packages.
        from core.embedding_model import get_embedding_model
        embedding_model = get_embedding_model()
        current_count = embedding_model.count()

        if current_count < min_documents:
            logger.info("Starting embedding process...")
            from scripts.embed_data import embed_all_data
            result = embed_all_data(data_dir, force=False)
            logger.info(f"Embedding completed: {result}")
        else:
            logger.info("⏭Data already embedded")
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback.
        logger.exception("Error in setup_data: %s", e)
# Run the data setup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    setup_data()