Spaces:
Runtime error
Runtime error
fix
Browse files- .gitignore +6 -1
- README.md +1 -1
- app.py +10 -24
- config/database_configs.py +0 -0
- config/model_configs.py +5 -8
- config/settings.py +10 -10
- core/data_processing/audio_processor.py +2 -21
- core/data_processing/image_processor.py +5 -55
- core/data_processing/text_processor.py +1 -21
- core/data_processing/video_processor.py +0 -118
- core/embeddings/audio_embedding_model.py +1 -41
- core/embeddings/image_embedding_model.py +53 -52
- core/embeddings/text_embedding_model.py +1 -21
- core/retrieval/retriever.py +2 -57
- core/retrieval/vector_db_manager.py +6 -58
- {scripts → ingestions}/ingestion.py +105 -104
- main.py +114 -0
- scripts/ingest_data.py +0 -203
- utils/logger.py +14 -14
.gitignore
CHANGED
|
@@ -204,4 +204,9 @@ cython_debug/
|
|
| 204 |
# Marimo
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
-
__marimo__/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
# Marimo
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
# main function
|
| 210 |
+
video_processor.py
|
| 211 |
+
ingestions/ingest_data.py
|
| 212 |
+
test_config_log.py
|
README.md
CHANGED
|
@@ -5,7 +5,7 @@ colorFrom: purple
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.39.0
|
| 8 |
-
app_file:
|
| 9 |
pinned: false
|
| 10 |
short_description: My small project while preparing for AIC
|
| 11 |
---
|
|
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.39.0
|
| 8 |
+
app_file: main.py
|
| 9 |
pinned: false
|
| 10 |
short_description: My small project while preparing for AIC
|
| 11 |
---
|
app.py
CHANGED
|
@@ -1,32 +1,23 @@
|
|
| 1 |
# app/main.py
|
| 2 |
import gradio as gr
|
| 3 |
import os
|
| 4 |
-
import sys
|
| 5 |
-
import shutil
|
| 6 |
import zipfile
|
| 7 |
-
from typing import List, Dict, Any
|
| 8 |
-
from pathlib import Path
|
| 9 |
-
|
| 10 |
-
# Thêm thư mục gốc của dự án vào Python Path để có thể import các module
|
| 11 |
-
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
| 12 |
-
if project_root not in sys.path:
|
| 13 |
-
sys.path.insert(0, project_root)
|
| 14 |
|
|
|
|
| 15 |
from utils.logger import logger
|
| 16 |
from config.settings import settings
|
| 17 |
from qdrant_client import QdrantClient
|
| 18 |
from core.retrieval.retriever import Retriever
|
| 19 |
-
from
|
| 20 |
|
| 21 |
-
# ---
|
| 22 |
logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
|
| 23 |
try:
|
| 24 |
-
#
|
| 25 |
qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
|
| 26 |
shared_qdrant_client = QdrantClient(path=qdrant_db_path)
|
| 27 |
logger.info("Shared Qdrant client initialized.")
|
| 28 |
|
| 29 |
-
# Khởi tạo các dịch vụ, chia sẻ client
|
| 30 |
ingestion_service = IngestionService(client=shared_qdrant_client)
|
| 31 |
retriever_instance = Retriever(client=shared_qdrant_client)
|
| 32 |
|
|
@@ -35,12 +26,7 @@ except Exception as e:
|
|
| 35 |
logger.error(f"Failed to initialize global services: {e}")
|
| 36 |
raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
|
| 37 |
|
| 38 |
-
|
| 39 |
-
# ---- HÀM XỬ LÝ CHO TAB UPLOAD ----
|
| 40 |
def upload_handler(zip_path: str, progress=gr.Progress()):
|
| 41 |
-
"""
|
| 42 |
-
Hàm này xử lý việc upload file và thư mục với progress bar.
|
| 43 |
-
"""
|
| 44 |
progress(0, desc="🚀 Starting upload process...")
|
| 45 |
|
| 46 |
if not zip_path:
|
|
@@ -63,7 +49,7 @@ def upload_handler(zip_path: str, progress=gr.Progress()):
|
|
| 63 |
|
| 64 |
logger.info(f"Handling upload of {len(settings.RAW_DATA_DIR)} items (files/folders)...")
|
| 65 |
|
| 66 |
-
# ---
|
| 67 |
path = Path(settings.RAW_DATA_DIR)
|
| 68 |
all_temp_file_paths = list(path.rglob("*"))
|
| 69 |
all_temp_file_paths = [str(p) for p in all_temp_file_paths if os.path.isfile(p)]
|
|
@@ -81,11 +67,11 @@ def upload_handler(zip_path: str, progress=gr.Progress()):
|
|
| 81 |
if not files_to_ingest:
|
| 82 |
return "No valid files were moved for ingestion."
|
| 83 |
|
| 84 |
-
#
|
| 85 |
try:
|
| 86 |
progress(0.4, desc="🔄 Starting file ingestion...")
|
| 87 |
# Gọi hàm ingestion với progress callback
|
| 88 |
-
ingestion_service.ingest_files_with_progress(files_to_ingest)
|
| 89 |
|
| 90 |
success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
|
| 91 |
logger.success(success_message)
|
|
@@ -268,6 +254,6 @@ def create_and_run_app():
|
|
| 268 |
return demo
|
| 269 |
|
| 270 |
# --- 4. Chạy ứng dụng ---
|
| 271 |
-
logger.info("Launching Gradio interface...")
|
| 272 |
-
demo = create_and_run_app()
|
| 273 |
-
demo.launch()
|
|
|
|
| 1 |
# app/main.py
|
| 2 |
import gradio as gr
|
| 3 |
import os
|
|
|
|
|
|
|
| 4 |
import zipfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
from pathlib import Path
|
| 7 |
from utils.logger import logger
|
| 8 |
from config.settings import settings
|
| 9 |
from qdrant_client import QdrantClient
|
| 10 |
from core.retrieval.retriever import Retriever
|
| 11 |
+
from ingestions.ingestion import IngestionService
|
| 12 |
|
| 13 |
+
# --- Initialize global services ---
|
| 14 |
logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
|
| 15 |
try:
|
| 16 |
+
# Create ONE QdrantClient only for sharing
|
| 17 |
qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
|
| 18 |
shared_qdrant_client = QdrantClient(path=qdrant_db_path)
|
| 19 |
logger.info("Shared Qdrant client initialized.")
|
| 20 |
|
|
|
|
| 21 |
ingestion_service = IngestionService(client=shared_qdrant_client)
|
| 22 |
retriever_instance = Retriever(client=shared_qdrant_client)
|
| 23 |
|
|
|
|
| 26 |
logger.error(f"Failed to initialize global services: {e}")
|
| 27 |
raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
|
| 28 |
|
|
|
|
|
|
|
| 29 |
def upload_handler(zip_path: str, progress=gr.Progress()):
|
|
|
|
|
|
|
|
|
|
| 30 |
progress(0, desc="🚀 Starting upload process...")
|
| 31 |
|
| 32 |
if not zip_path:
|
|
|
|
| 49 |
|
| 50 |
logger.info(f"Handling upload of {len(settings.RAW_DATA_DIR)} items (files/folders)...")
|
| 51 |
|
| 52 |
+
# --- Retrieve all file path from input ---
|
| 53 |
path = Path(settings.RAW_DATA_DIR)
|
| 54 |
all_temp_file_paths = list(path.rglob("*"))
|
| 55 |
all_temp_file_paths = [str(p) for p in all_temp_file_paths if os.path.isfile(p)]
|
|
|
|
| 67 |
if not files_to_ingest:
|
| 68 |
return "No valid files were moved for ingestion."
|
| 69 |
|
| 70 |
+
# Start ingesting data
|
| 71 |
try:
|
| 72 |
progress(0.4, desc="🔄 Starting file ingestion...")
|
| 73 |
# Gọi hàm ingestion với progress callback
|
| 74 |
+
ingestion_service.ingest_files_with_progress(files_to_ingest, progress)
|
| 75 |
|
| 76 |
success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
|
| 77 |
logger.success(success_message)
|
|
|
|
| 254 |
return demo
|
| 255 |
|
| 256 |
# --- 4. Chạy ứng dụng ---
|
| 257 |
+
# logger.info("Launching Gradio interface...")
|
| 258 |
+
# demo = create_and_run_app()
|
| 259 |
+
# demo.launch()
|
config/database_configs.py
DELETED
|
File without changes
|
config/model_configs.py
CHANGED
|
@@ -1,17 +1,14 @@
|
|
| 1 |
# config/model_configs.py
|
| 2 |
|
| 3 |
# Embedding Models
|
| 4 |
-
TEXT_EMBEDDING_MODEL: str = "sentence-transformers/
|
| 5 |
-
IMAGE_EMBEDDING_MODEL: str = "
|
| 6 |
-
AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"
|
| 7 |
|
| 8 |
# Generator Model (LLM/LMM)
|
| 9 |
-
GENERATOR_MODEL_NAME: str = "gpt-4o"
|
| 10 |
GENERATOR_MODEL_MAX_TOKENS: int = 4096
|
| 11 |
GENERATOR_MODEL_TEMPERATURE: float = 0.7
|
| 12 |
|
| 13 |
# Reranker Model
|
| 14 |
-
RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
| 15 |
-
|
| 16 |
-
# Automatic Speech Recognition (ASR) Model (Ví dụ với Whisper của Hugging Face)
|
| 17 |
-
ASR_MODEL: str = "openai/whisper-tiny" # Có thể dùng "base", "small", "medium" tùy tài nguyên GPU
|
|
|
|
| 1 |
# config/model_configs.py
|
| 2 |
|
| 3 |
# Embedding Models
|
| 4 |
+
TEXT_EMBEDDING_MODEL: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
| 5 |
+
IMAGE_EMBEDDING_MODEL: str = "google/vit-base-patch16-224-in21k"
|
| 6 |
+
AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"
|
| 7 |
|
| 8 |
# Generator Model (LLM/LMM)
|
| 9 |
+
GENERATOR_MODEL_NAME: str = "gpt-4o"
|
| 10 |
GENERATOR_MODEL_MAX_TOKENS: int = 4096
|
| 11 |
GENERATOR_MODEL_TEMPERATURE: float = 0.7
|
| 12 |
|
| 13 |
# Reranker Model
|
| 14 |
+
RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
|
|
|
|
|
|
|
config/settings.py
CHANGED
|
@@ -6,25 +6,25 @@ from dotenv import load_dotenv
|
|
| 6 |
load_dotenv()
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
|
|
|
|
|
|
| 9 |
APP_NAME: str = "Multimedia RAG Assistant"
|
| 10 |
APP_VERSION: str = "0.1.0"
|
| 11 |
ENVIRONMENT: str = "development"
|
| 12 |
|
| 13 |
-
DATA_DIR: str = "data"
|
| 14 |
-
RAW_DATA_DIR: str = os.path.join(
|
| 15 |
-
PROCESSED_DATA_DIR: str = os.path.join(
|
| 16 |
-
CHUNKS_DIR: str = os.path.join(
|
| 17 |
-
METADATA_DIR: str = os.path.join(
|
| 18 |
-
EMBEDDINGS_DIR: str = os.path.join(
|
| 19 |
|
| 20 |
API_HOST: str = "0.0.0.0"
|
| 21 |
API_PORT: int = 8000
|
| 22 |
|
| 23 |
-
|
| 24 |
-
# Đây là nơi bạn sẽ thêm các API key hoặc model IDs sau này
|
| 25 |
-
HUGGINGFACE_API_KEY: Optional[str] = None # Ví dụ: Nếu dùng Hugging Face models
|
| 26 |
|
| 27 |
-
|
| 28 |
LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
| 29 |
|
| 30 |
model_config = SettingsConfigDict(
|
|
|
|
| 6 |
load_dotenv()
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
| 9 |
+
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 10 |
+
|
| 11 |
APP_NAME: str = "Multimedia RAG Assistant"
|
| 12 |
APP_VERSION: str = "0.1.0"
|
| 13 |
ENVIRONMENT: str = "development"
|
| 14 |
|
| 15 |
+
DATA_DIR: str = os.path.join(BASE_DIR, "data")
|
| 16 |
+
RAW_DATA_DIR: str = os.path.join(DATA_DIR, "raw")
|
| 17 |
+
PROCESSED_DATA_DIR: str = os.path.join(DATA_DIR, "processed")
|
| 18 |
+
CHUNKS_DIR: str = os.path.join(DATA_DIR, "processed", "chunks")
|
| 19 |
+
METADATA_DIR: str = os.path.join(DATA_DIR, "processed", "metadata")
|
| 20 |
+
EMBEDDINGS_DIR: str = os.path.join(DATA_DIR, "processed", "embeddings")
|
| 21 |
|
| 22 |
API_HOST: str = "0.0.0.0"
|
| 23 |
API_PORT: int = 8000
|
| 24 |
|
| 25 |
+
HUGGINGFACE_API_KEY: Optional[str] = None
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
LOG_DIR: str = "logs"
|
| 28 |
LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
| 29 |
|
| 30 |
model_config = SettingsConfigDict(
|
core/data_processing/audio_processor.py
CHANGED
|
@@ -37,7 +37,7 @@ class AudioProcessor:
|
|
| 37 |
segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
|
| 38 |
chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
|
| 39 |
|
| 40 |
-
#
|
| 41 |
segment.export(chunk_file_path, format="wav")
|
| 42 |
|
| 43 |
metadata = {
|
|
@@ -45,8 +45,6 @@ class AudioProcessor:
|
|
| 45 |
"type": "audio",
|
| 46 |
"chunk_id": segment_id,
|
| 47 |
"chunk_data_path": chunk_file_path,
|
| 48 |
-
# "start_time_ms": int(segment.start_time),
|
| 49 |
-
# "end_time_ms": int(segment.end_time),
|
| 50 |
"duration_ms": len(segment)
|
| 51 |
}
|
| 52 |
chunks.append({
|
|
@@ -60,21 +58,4 @@ class AudioProcessor:
|
|
| 60 |
return []
|
| 61 |
except Exception as e:
|
| 62 |
logger.error(f"Error processing audio file {file_path}: {e}")
|
| 63 |
-
return []
|
| 64 |
-
|
| 65 |
-
# Ví dụ sử dụng (giữ nguyên để kiểm tra)
|
| 66 |
-
if __name__ == "__main__":
|
| 67 |
-
sample_audio_path = os.path.join(settings.RAW_DATA_DIR, "audios", "sample_audio.wav")
|
| 68 |
-
if not os.path.exists(sample_audio_path):
|
| 69 |
-
print(f"ERROR: Sample audio not found at {sample_audio_path}. Please create it first.")
|
| 70 |
-
print("Make sure you have ffmpeg installed and available in your PATH for pydub to work.")
|
| 71 |
-
else:
|
| 72 |
-
processor = AudioProcessor()
|
| 73 |
-
audio_chunks = processor.process(sample_audio_path)
|
| 74 |
-
|
| 75 |
-
for i, chunk in enumerate(audio_chunks):
|
| 76 |
-
print(f"\n--- Audio Chunk {i+1} ---")
|
| 77 |
-
print(f"Type: {chunk['metadata']['type']}")
|
| 78 |
-
print(f"Content (path): {chunk['content']}")
|
| 79 |
-
print(f"Metadata: {chunk['metadata']}")
|
| 80 |
-
# Bạn có thể thử mở file chunk['content'] để nghe
|
|
|
|
| 37 |
segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
|
| 38 |
chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
|
| 39 |
|
| 40 |
+
# Save segments into data/processed/chunks
|
| 41 |
segment.export(chunk_file_path, format="wav")
|
| 42 |
|
| 43 |
metadata = {
|
|
|
|
| 45 |
"type": "audio",
|
| 46 |
"chunk_id": segment_id,
|
| 47 |
"chunk_data_path": chunk_file_path,
|
|
|
|
|
|
|
| 48 |
"duration_ms": len(segment)
|
| 49 |
}
|
| 50 |
chunks.append({
|
|
|
|
| 58 |
return []
|
| 59 |
except Exception as e:
|
| 60 |
logger.error(f"Error processing audio file {file_path}: {e}")
|
| 61 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/data_processing/image_processor.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# core/data_processing/image_processor.py
|
| 2 |
-
from typing import List, Dict, Any
|
| 3 |
import os
|
| 4 |
-
|
|
|
|
| 5 |
from utils.logger import logger
|
| 6 |
|
| 7 |
class ImageProcessor:
|
|
@@ -16,14 +16,7 @@ class ImageProcessor:
|
|
| 16 |
logger.error(f"Image file not found: {file_path}")
|
| 17 |
return []
|
| 18 |
|
| 19 |
-
|
| 20 |
-
width, height = img.size
|
| 21 |
-
img_format = img.format
|
| 22 |
-
|
| 23 |
-
file_size = os.path.getsize(file_path)
|
| 24 |
-
|
| 25 |
-
# Tạo một ID duy nhất cho chunk này
|
| 26 |
-
# Lấy tên file không bao gồm phần mở rộng
|
| 27 |
base_name = os.path.basename(file_path)
|
| 28 |
chunk_id = f"{os.path.splitext(base_name)[0]}_image_chunk"
|
| 29 |
|
|
@@ -31,59 +24,16 @@ class ImageProcessor:
|
|
| 31 |
"source_id": base_name,
|
| 32 |
"type": "image",
|
| 33 |
"chunk_id": chunk_id,
|
| 34 |
-
"chunk_data_path": file_path
|
| 35 |
-
"image_width": width,
|
| 36 |
-
"image_height": height,
|
| 37 |
-
"image_format": img_format,
|
| 38 |
-
"file_size_bytes": file_size
|
| 39 |
}
|
| 40 |
|
| 41 |
-
# Tạo chunk
|
| 42 |
-
# Content sẽ là đường dẫn đến file, giống như audio/video segments
|
| 43 |
chunk = {
|
| 44 |
"content": file_path,
|
| 45 |
"metadata": metadata
|
| 46 |
}
|
| 47 |
|
| 48 |
-
# Trả về một danh sách chứa một chunk duy nhất
|
| 49 |
return [chunk]
|
| 50 |
|
| 51 |
except Exception as e:
|
| 52 |
logger.error(f"Error processing image file {file_path}: {e}")
|
| 53 |
-
return []
|
| 54 |
-
|
| 55 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 56 |
-
if __name__ == "__main__":
|
| 57 |
-
from config.settings import settings
|
| 58 |
-
import os
|
| 59 |
-
|
| 60 |
-
# Tạo một ảnh dummy để kiểm tra
|
| 61 |
-
dummy_image_dir = os.path.join(settings.RAW_DATA_DIR, "images")
|
| 62 |
-
os.makedirs(dummy_image_dir, exist_ok=True)
|
| 63 |
-
dummy_image_path = os.path.join(dummy_image_dir, "test_image.jpg")
|
| 64 |
-
|
| 65 |
-
try:
|
| 66 |
-
# Tạo một ảnh mẫu màu xanh
|
| 67 |
-
dummy_img = Image.new('RGB', (100, 150), color = 'blue')
|
| 68 |
-
dummy_img.save(dummy_image_path)
|
| 69 |
-
print(f"Created a dummy image for testing at: {dummy_image_path}")
|
| 70 |
-
|
| 71 |
-
# Khởi tạo processor và xử lý ảnh
|
| 72 |
-
processor = ImageProcessor()
|
| 73 |
-
image_chunks = processor.process(dummy_image_path)
|
| 74 |
-
|
| 75 |
-
if image_chunks:
|
| 76 |
-
print("\n--- Image Chunk Processed ---")
|
| 77 |
-
chunk = image_chunks[0]
|
| 78 |
-
print(f"Content (path): {chunk['content']}")
|
| 79 |
-
print("Metadata:")
|
| 80 |
-
for key, value in chunk['metadata'].items():
|
| 81 |
-
print(f" - {key}: {value}")
|
| 82 |
-
else:
|
| 83 |
-
print("Failed to process the dummy image.")
|
| 84 |
-
|
| 85 |
-
finally:
|
| 86 |
-
# Dọn dẹp ảnh dummy
|
| 87 |
-
if os.path.exists(dummy_image_path):
|
| 88 |
-
os.remove(dummy_image_path)
|
| 89 |
-
print(f"Cleaned up dummy image: {dummy_image_path}")
|
|
|
|
| 1 |
# core/data_processing/image_processor.py
|
|
|
|
| 2 |
import os
|
| 3 |
+
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
from utils.logger import logger
|
| 6 |
|
| 7 |
class ImageProcessor:
|
|
|
|
| 16 |
logger.error(f"Image file not found: {file_path}")
|
| 17 |
return []
|
| 18 |
|
| 19 |
+
# create id by filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
base_name = os.path.basename(file_path)
|
| 21 |
chunk_id = f"{os.path.splitext(base_name)[0]}_image_chunk"
|
| 22 |
|
|
|
|
| 24 |
"source_id": base_name,
|
| 25 |
"type": "image",
|
| 26 |
"chunk_id": chunk_id,
|
| 27 |
+
"chunk_data_path": file_path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
|
|
|
|
|
|
|
| 30 |
chunk = {
|
| 31 |
"content": file_path,
|
| 32 |
"metadata": metadata
|
| 33 |
}
|
| 34 |
|
|
|
|
| 35 |
return [chunk]
|
| 36 |
|
| 37 |
except Exception as e:
|
| 38 |
logger.error(f"Error processing image file {file_path}: {e}")
|
| 39 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/data_processing/text_processor.py
CHANGED
|
@@ -26,14 +26,11 @@ class TextProcessor:
|
|
| 26 |
|
| 27 |
chunks = []
|
| 28 |
for i, chunk_content in enumerate(split_texts):
|
| 29 |
-
start_char_idx = text.find(chunk_content) # find start index of each chunk_content
|
| 30 |
chunk_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_text_{i}"
|
| 31 |
metadata = {
|
| 32 |
"source_id": os.path.basename(file_path),
|
| 33 |
"type": "text",
|
| 34 |
"chunk_id": chunk_id,
|
| 35 |
-
"start_char_index": start_char_idx, # Vị trí ký tự bắt đầu
|
| 36 |
-
"end_char_index": start_char_idx + len(chunk_content), # Vị trí ký tự kết thúc
|
| 37 |
"content_length": len(chunk_content)
|
| 38 |
}
|
| 39 |
chunks.append({
|
|
@@ -44,21 +41,4 @@ class TextProcessor:
|
|
| 44 |
return chunks
|
| 45 |
except Exception as e:
|
| 46 |
logger.error(f"Error processing text document {file_path}: {e}")
|
| 47 |
-
return []
|
| 48 |
-
|
| 49 |
-
# Ví dụ sử dụng (giữ nguyên để kiểm tra)
|
| 50 |
-
if __name__ == "__main__":
|
| 51 |
-
from config.settings import settings
|
| 52 |
-
import os
|
| 53 |
-
|
| 54 |
-
sample_doc_path = os.path.join(settings.RAW_DATA_DIR, "documents", "sample_document.txt")
|
| 55 |
-
if not os.path.exists(sample_doc_path):
|
| 56 |
-
print(f"ERROR: Sample document not found at {sample_doc_path}. Please create it first.")
|
| 57 |
-
else:
|
| 58 |
-
processor = TextProcessor(chunk_size=100, chunk_overlap=20) # Thử kích thước nhỏ hơn để thấy rõ chunk
|
| 59 |
-
text_chunks = processor.process(sample_doc_path)
|
| 60 |
-
|
| 61 |
-
for i, chunk in enumerate(text_chunks): # In tất cả các chunk để kiểm tra
|
| 62 |
-
print(f"\n--- Chunk {i+1} ---")
|
| 63 |
-
print(f"Content: {chunk['content']}") # In toàn bộ nội dung chunk
|
| 64 |
-
print(f"Metadata: {chunk['metadata']}")
|
|
|
|
| 26 |
|
| 27 |
chunks = []
|
| 28 |
for i, chunk_content in enumerate(split_texts):
|
|
|
|
| 29 |
chunk_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_text_{i}"
|
| 30 |
metadata = {
|
| 31 |
"source_id": os.path.basename(file_path),
|
| 32 |
"type": "text",
|
| 33 |
"chunk_id": chunk_id,
|
|
|
|
|
|
|
| 34 |
"content_length": len(chunk_content)
|
| 35 |
}
|
| 36 |
chunks.append({
|
|
|
|
| 41 |
return chunks
|
| 42 |
except Exception as e:
|
| 43 |
logger.error(f"Error processing text document {file_path}: {e}")
|
| 44 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/data_processing/video_processor.py
DELETED
|
@@ -1,118 +0,0 @@
|
|
| 1 |
-
# core/data_processing/video_processor.py
|
| 2 |
-
import os
|
| 3 |
-
import torch
|
| 4 |
-
import cv2
|
| 5 |
-
import numpy as np
|
| 6 |
-
|
| 7 |
-
from typing import List, Dict, Any
|
| 8 |
-
from utils.logger import logger
|
| 9 |
-
from moviepy.editor import VideoFileClip
|
| 10 |
-
from config.settings import settings
|
| 11 |
-
|
| 12 |
-
class VideoProcessor:
|
| 13 |
-
def __init__(self, chunk_duration_sec: int = 10, frames_per_segment: int = 5):
|
| 14 |
-
self.chunk_duration_sec = chunk_duration_sec
|
| 15 |
-
self.frames_per_segment = frames_per_segment
|
| 16 |
-
logger.info(f"VideoProcessor initialized (chunk_duration={chunk_duration_sec}s, frames_per_segment={frames_per_segment}).")
|
| 17 |
-
|
| 18 |
-
def process_video(self, file_path: str) -> List[Dict[str, Any]]:
|
| 19 |
-
try:
|
| 20 |
-
logger.info(f"Processing video file: {file_path}")
|
| 21 |
-
video_clip = VideoFileClip(file_path)
|
| 22 |
-
total_duration = video_clip.duration # Tổng thời lượng video (giây)
|
| 23 |
-
|
| 24 |
-
all_chunks = []
|
| 25 |
-
|
| 26 |
-
# Tạo thư mục con để lưu các frame/ảnh tạm thời
|
| 27 |
-
image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks", os.path.basename(file_path).split('.')[0])
|
| 28 |
-
os.makedirs(image_chunks_dir, exist_ok=True)
|
| 29 |
-
|
| 30 |
-
# Tạo thư mục con để lưu các video segment tạm thời
|
| 31 |
-
video_segments_dir = os.path.join(settings.CHUNKS_DIR, "video/video_segments", os.path.basename(file_path).split('.')[0])
|
| 32 |
-
os.makedirs(video_segments_dir, exist_ok=True)
|
| 33 |
-
|
| 34 |
-
current_time = 0.0
|
| 35 |
-
chunk_idx = 0
|
| 36 |
-
|
| 37 |
-
while current_time < total_duration:
|
| 38 |
-
end_time = min(current_time + self.chunk_duration_sec, total_duration) # end time of each segment
|
| 39 |
-
segment_clip = video_clip.subclip(current_time, end_time)
|
| 40 |
-
|
| 41 |
-
segment_base_name = f"{os.path.basename(file_path).split('.')[0]}_segment_{chunk_idx}"
|
| 42 |
-
|
| 43 |
-
frames_paths = []
|
| 44 |
-
|
| 45 |
-
frame_timestamps = np.linspace(0, segment_clip.duration, self.frames_per_segment + 2)[1:-1]
|
| 46 |
-
|
| 47 |
-
for ts in frame_timestamps:
|
| 48 |
-
frame = segment_clip.get_frame(ts)
|
| 49 |
-
frame_filename = f"{segment_base_name}_frame_{int(ts*1000)}.jpg"
|
| 50 |
-
frame_path = os.path.join(image_chunks_dir, frame_filename)
|
| 51 |
-
cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
|
| 52 |
-
frames_paths.append(frame_path)
|
| 53 |
-
|
| 54 |
-
# Tạo chunk cho các khung hình
|
| 55 |
-
image_chunk_id = f"{segment_base_name}_image"
|
| 56 |
-
all_chunks.append({
|
| 57 |
-
"content": frames_paths, # Danh sách đường dẫn đến các file ảnh
|
| 58 |
-
"metadata": {
|
| 59 |
-
"source_id": os.path.basename(file_path),
|
| 60 |
-
"type": "video_frame", # Loại chunk
|
| 61 |
-
"chunk_id": image_chunk_id,
|
| 62 |
-
"start_time_sec": current_time,
|
| 63 |
-
"end_time_sec": end_time,
|
| 64 |
-
"frame_paths": frames_paths # Lưu lại đường dẫn trong metadata
|
| 65 |
-
}
|
| 66 |
-
})
|
| 67 |
-
|
| 68 |
-
# 2. Lưu đoạn video clip (optional, nhưng hữu ích cho video retrieval)
|
| 69 |
-
video_segment_path = os.path.join(video_segments_dir, f"{segment_base_name}.mp4")
|
| 70 |
-
segment_clip.write_videofile(video_segment_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
|
| 71 |
-
|
| 72 |
-
# Tạo chunk cho video segment
|
| 73 |
-
video_chunk_id = f"{segment_base_name}_video_clip"
|
| 74 |
-
all_chunks.append({
|
| 75 |
-
"content": video_segment_path, # Đường dẫn đến file video clip
|
| 76 |
-
"metadata": {
|
| 77 |
-
"source_id": os.path.basename(file_path),
|
| 78 |
-
"type": "video_segment_clip", # Loại chunk mới: video clip
|
| 79 |
-
"chunk_id": video_chunk_id,
|
| 80 |
-
"start_time_sec": current_time,
|
| 81 |
-
"end_time_sec": end_time,
|
| 82 |
-
"chunk_data_path": video_segment_path # Lưu lại đường dẫn trong metadata
|
| 83 |
-
}
|
| 84 |
-
})
|
| 85 |
-
|
| 86 |
-
current_time = end_time
|
| 87 |
-
chunk_idx += 1
|
| 88 |
-
|
| 89 |
-
video_clip.close() # Đảm bảo giải phóng tài nguyên
|
| 90 |
-
logger.info(f"Generated {len(all_chunks)} chunks (frames & video segments) from video {file_path}")
|
| 91 |
-
return all_chunks
|
| 92 |
-
except FileNotFoundError:
|
| 93 |
-
logger.error(f"Video file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
|
| 94 |
-
return []
|
| 95 |
-
except Exception as e:
|
| 96 |
-
logger.error(f"Error processing video file {file_path}: {e}")
|
| 97 |
-
return []
|
| 98 |
-
|
| 99 |
-
# Ví dụ sử dụng (giữ nguyên để kiểm tra)
|
| 100 |
-
if __name__ == "__main__":
|
| 101 |
-
sample_video_path = os.path.join(settings.RAW_DATA_DIR, "videos", "sample_video.mp4")
|
| 102 |
-
if not os.path.exists(sample_video_path):
|
| 103 |
-
print(f"ERROR: Sample video not found at {sample_video_path}. Please create it first.")
|
| 104 |
-
print("Make sure you have ffmpeg installed and available in your PATH for moviepy to work.")
|
| 105 |
-
else:
|
| 106 |
-
processor = VideoProcessor(chunk_duration_sec=5, frames_per_segment=3)
|
| 107 |
-
video_chunks = processor.process_video(sample_video_path)
|
| 108 |
-
|
| 109 |
-
for i, chunk in enumerate(video_chunks):
|
| 110 |
-
print(f"\n--- Video Chunk {i+1} ---")
|
| 111 |
-
print(f"Type: {chunk['metadata']['type']}")
|
| 112 |
-
if chunk['metadata']['type'] == 'video_frames':
|
| 113 |
-
print(f"Content (paths): {chunk['content']}")
|
| 114 |
-
if chunk['content']:
|
| 115 |
-
print(f"Sample frame: {chunk['content'][0]}")
|
| 116 |
-
elif chunk['metadata']['type'] == 'video_segment_clip':
|
| 117 |
-
print(f"Content (path): {chunk['content']}")
|
| 118 |
-
print(f"Metadata: {chunk['metadata']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/embeddings/audio_embedding_model.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# models/embeddings/audio_embedding_model.py
|
| 2 |
import torch
|
| 3 |
import librosa
|
| 4 |
-
import numpy as np
|
| 5 |
|
| 6 |
from typing import List
|
| 7 |
from transformers import AutoProcessor, AutoModel
|
|
@@ -44,43 +43,4 @@ class AudioEmbeddingModel:
|
|
| 44 |
|
| 45 |
embeddings_list = embeddings.cpu().tolist()
|
| 46 |
logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
|
| 47 |
-
return embeddings_list
|
| 48 |
-
|
| 49 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 50 |
-
if __name__ == "__main__":
|
| 51 |
-
from config.settings import settings
|
| 52 |
-
import os
|
| 53 |
-
|
| 54 |
-
model = AudioEmbeddingModel()
|
| 55 |
-
sample_audio_dir = os.path.join(settings.PROCESSED_DATA_DIR, "audio_segments", "sample_audio") # Giả sử có thư mục audio từ file mẫu
|
| 56 |
-
|
| 57 |
-
# Tạo một audio dummy nếu không có file audio mẫu
|
| 58 |
-
if not os.path.exists(sample_audio_dir) or not os.listdir(sample_audio_dir):
|
| 59 |
-
print(f"Creating a dummy audio for testing at {sample_audio_dir}...")
|
| 60 |
-
os.makedirs(sample_audio_dir, exist_ok=True)
|
| 61 |
-
from pydub import AudioSegment
|
| 62 |
-
dummy_audio = AudioSegment.silent(duration=1000) # 1 giây im lặng
|
| 63 |
-
dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
|
| 64 |
-
dummy_audio.export(dummy_audio_path, format="wav")
|
| 65 |
-
sample_audio_paths = [dummy_audio_path]
|
| 66 |
-
else:
|
| 67 |
-
sample_audio_paths = [os.path.join(sample_audio_dir, f) for f in os.listdir(sample_audio_dir) if f.endswith(('.wav', '.mp3'))]
|
| 68 |
-
if not sample_audio_paths:
|
| 69 |
-
print(f"No audio files found in {sample_audio_dir}. Please ensure sample audio was processed.")
|
| 70 |
-
from pydub import AudioSegment
|
| 71 |
-
dummy_audio = AudioSegment.silent(duration=1000)
|
| 72 |
-
dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
|
| 73 |
-
dummy_audio.export(dummy_audio_path, format="wav")
|
| 74 |
-
sample_audio_paths = [dummy_audio_path]
|
| 75 |
-
|
| 76 |
-
print(f"Using {len(sample_audio_paths)} sample audio clips: {sample_audio_paths[:2]}...")
|
| 77 |
-
embeddings = model.get_embeddings(sample_audio_paths)
|
| 78 |
-
|
| 79 |
-
print(f"Number of embeddings: {len(embeddings)}")
|
| 80 |
-
if embeddings:
|
| 81 |
-
print(f"Dimension of embeddings: {len(embeddings[0])}")
|
| 82 |
-
print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
|
| 83 |
-
if len(embeddings) > 1:
|
| 84 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 85 |
-
sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
|
| 86 |
-
print(f"Similarity between audio 1 and 2: {sim:.4f}")
|
|
|
|
| 1 |
# models/embeddings/audio_embedding_model.py
|
| 2 |
import torch
|
| 3 |
import librosa
|
|
|
|
| 4 |
|
| 5 |
from typing import List
|
| 6 |
from transformers import AutoProcessor, AutoModel
|
|
|
|
| 43 |
|
| 44 |
embeddings_list = embeddings.cpu().tolist()
|
| 45 |
logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
|
| 46 |
+
return embeddings_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/embeddings/image_embedding_model.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
import torch
|
| 2 |
-
|
| 3 |
from typing import List
|
| 4 |
from PIL import Image
|
| 5 |
-
from transformers import
|
| 6 |
from utils.logger import logger
|
| 7 |
from config.model_configs import IMAGE_EMBEDDING_MODEL
|
| 8 |
|
|
@@ -11,70 +10,72 @@ class ImageEmbeddingModel:
|
|
| 11 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 12 |
logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")
|
| 13 |
|
| 14 |
-
self.model =
|
| 15 |
-
self.processor =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
logger.info("Image Embedding Model loaded successfully.")
|
| 17 |
|
| 18 |
def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
|
| 19 |
if not image_paths:
|
|
|
|
| 20 |
return []
|
| 21 |
|
| 22 |
images = []
|
|
|
|
|
|
|
| 23 |
for img_path in image_paths:
|
| 24 |
try:
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
except Exception as e:
|
| 27 |
logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
|
| 28 |
continue
|
| 29 |
|
| 30 |
if not images:
|
|
|
|
| 31 |
return []
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
image_features = self.model.get_image_features(pixel_values=inputs.pixel_values)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
print(f"Dimension of embeddings: {len(embeddings[0])}")
|
| 75 |
-
print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
|
| 76 |
-
# Nếu có đủ ảnh, thử so sánh 2 ảnh đầu
|
| 77 |
-
if len(embeddings) > 1:
|
| 78 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 79 |
-
sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
|
| 80 |
-
print(f"Similarity between image 1 and 2: {sim:.4f}")
|
|
|
|
| 1 |
import torch
|
|
|
|
| 2 |
from typing import List
|
| 3 |
from PIL import Image
|
| 4 |
+
from transformers import ViTImageProcessor, ViTModel
|
| 5 |
from utils.logger import logger
|
| 6 |
from config.model_configs import IMAGE_EMBEDDING_MODEL
|
| 7 |
|
|
|
|
| 10 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 11 |
logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")
|
| 12 |
|
| 13 |
+
self.model = ViTModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(self.device)
|
| 14 |
+
self.processor = ViTImageProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
|
| 15 |
+
|
| 16 |
+
# Set model to evaluation mode
|
| 17 |
+
self.model.eval()
|
| 18 |
+
|
| 19 |
logger.info("Image Embedding Model loaded successfully.")
|
| 20 |
|
| 21 |
def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
|
| 22 |
if not image_paths:
|
| 23 |
+
logger.warning("No image paths provided")
|
| 24 |
return []
|
| 25 |
|
| 26 |
images = []
|
| 27 |
+
valid_paths = []
|
| 28 |
+
|
| 29 |
for img_path in image_paths:
|
| 30 |
try:
|
| 31 |
+
image = Image.open(img_path).convert("RGB")
|
| 32 |
+
images.append(image)
|
| 33 |
+
valid_paths.append(img_path)
|
| 34 |
except Exception as e:
|
| 35 |
logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
|
| 36 |
continue
|
| 37 |
|
| 38 |
if not images:
|
| 39 |
+
logger.warning("No valid images to process")
|
| 40 |
return []
|
| 41 |
|
| 42 |
+
try:
|
| 43 |
+
# Process images
|
| 44 |
+
inputs = self.processor(images=images, return_tensors="pt").to(self.device)
|
|
|
|
| 45 |
|
| 46 |
+
with torch.no_grad():
|
| 47 |
+
# Get model outputs
|
| 48 |
+
outputs = self.model(**inputs)
|
| 49 |
+
|
| 50 |
+
# Extract embeddings from the [CLS] token (first token)
|
| 51 |
+
# Shape: (batch_size, sequence_length, hidden_size)
|
| 52 |
+
last_hidden_states = outputs.last_hidden_state
|
| 53 |
+
|
| 54 |
+
# Take the [CLS] token embedding (index 0)
|
| 55 |
+
# Shape: (batch_size, hidden_size)
|
| 56 |
+
cls_embeddings = last_hidden_states[:, 0, :]
|
| 57 |
+
|
| 58 |
+
# Alternatively, you can use pooler_output if available
|
| 59 |
+
# cls_embeddings = outputs.pooler_output
|
| 60 |
+
|
| 61 |
+
# Normalize embeddings (L2 normalization)
|
| 62 |
+
embeddings = cls_embeddings / cls_embeddings.norm(p=2, dim=-1, keepdim=True)
|
| 63 |
+
|
| 64 |
+
# Convert to list
|
| 65 |
+
embeddings_list = embeddings.cpu().tolist()
|
| 66 |
+
|
| 67 |
+
logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(images)} images.")
|
| 68 |
+
|
| 69 |
+
# Ensure we return the right number of embeddings
|
| 70 |
+
if len(embeddings_list) != len(image_paths):
|
| 71 |
+
logger.warning(f"Mismatch: {len(embeddings_list)} embeddings for {len(image_paths)} input paths")
|
| 72 |
+
# Pad with empty lists if needed
|
| 73 |
+
while len(embeddings_list) < len(image_paths):
|
| 74 |
+
embeddings_list.append([])
|
| 75 |
+
|
| 76 |
+
return embeddings_list
|
| 77 |
+
|
| 78 |
+
except Exception as e:
|
| 79 |
+
logger.error(f"Error generating embeddings: {e}")
|
| 80 |
+
# Return empty embeddings for all input paths
|
| 81 |
+
return [[] for _ in image_paths]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/embeddings/text_embedding_model.py
CHANGED
|
@@ -19,24 +19,4 @@ class TextEmbeddingModel:
|
|
| 19 |
|
| 20 |
embeddings = self.model.encode(texts, convert_to_numpy=True).tolist()
|
| 21 |
logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
|
| 22 |
-
return embeddings
|
| 23 |
-
|
| 24 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 25 |
-
if __name__ == "__main__":
|
| 26 |
-
model = TextEmbeddingModel()
|
| 27 |
-
sample_texts = [
|
| 28 |
-
"This is a test sentence.",
|
| 29 |
-
"Another sentence for embedding.",
|
| 30 |
-
"How about some natural language processing?",
|
| 31 |
-
"Xe hơi màu đỏ đang chạy trên đường phố." # Thử với tiếng Việt
|
| 32 |
-
]
|
| 33 |
-
embeddings = model.get_embeddings(sample_texts)
|
| 34 |
-
|
| 35 |
-
print(f"Number of embeddings: {len(embeddings)}")
|
| 36 |
-
if embeddings:
|
| 37 |
-
print(f"Dimension of embeddings: {len(embeddings[0])}")
|
| 38 |
-
print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
|
| 39 |
-
# Bạn có thể thử tính cosine similarity giữa các embedding ở đây để thấy độ tương đồng
|
| 40 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 41 |
-
sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
|
| 42 |
-
print(f"Similarity between text 1 and 2: {sim:.4f}")
|
|
|
|
| 19 |
|
| 20 |
embeddings = self.model.encode(texts, convert_to_numpy=True).tolist()
|
| 21 |
logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
|
| 22 |
+
return embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/retrieval/retriever.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
# core/retrieval/retriever.py
|
| 2 |
import os
|
| 3 |
|
| 4 |
-
from typing import List, Tuple, Dict, Any, Union
|
| 5 |
from utils.logger import logger
|
| 6 |
from config.settings import settings
|
|
|
|
| 7 |
from qdrant_client import QdrantClient
|
| 8 |
|
| 9 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
|
@@ -93,64 +93,9 @@ class Retriever:
|
|
| 93 |
logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
|
| 94 |
return formatted_results
|
| 95 |
|
| 96 |
-
# def _get_content_from_payload(self, payload: Dict):
|
| 97 |
-
# chunk_type = payload.get("type")
|
| 98 |
-
# if chunk_type == "text":
|
| 99 |
-
# return None # Sẽ cải thiện sau
|
| 100 |
-
# elif chunk_type == 'image' or chunk_type == "audio":
|
| 101 |
-
# return payload.get('chunk_data_path') # Trả về đường dẫn
|
| 102 |
-
# return None
|
| 103 |
-
|
| 104 |
def is_database_empty(self) -> bool:
|
| 105 |
total_vectors = self.text_db_manager.get_total_vectors() \
|
| 106 |
+ self.image_db_manager.get_total_vectors() \
|
| 107 |
+ self.audio_db_manager.get_total_vectors()
|
| 108 |
|
| 109 |
-
return total_vectors == 0
|
| 110 |
-
|
| 111 |
-
if __name__ == "__main__":
|
| 112 |
-
from config.settings import settings
|
| 113 |
-
|
| 114 |
-
logger.info("--- Running Retriever Standalone Test (Qdrant version) ---")
|
| 115 |
-
|
| 116 |
-
# Kiểm tra xem Qdrant đã có dữ liệu chưa
|
| 117 |
-
qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
|
| 118 |
-
if not os.path.exists(qdrant_db_path):
|
| 119 |
-
print("\n\nERROR: Qdrant database not found. Please run 'python scripts/ingest_data.py' first to create the database.\n\n")
|
| 120 |
-
else:
|
| 121 |
-
retriever = Retriever()
|
| 122 |
-
|
| 123 |
-
# --- 1. Thử truy vấn văn bản ---
|
| 124 |
-
print("\n--- Testing Text Retrieval ---")
|
| 125 |
-
text_query = "What is artificial intelligence?"
|
| 126 |
-
text_results = retriever.retrieve(text_query, query_type="text", top_k=3)
|
| 127 |
-
print(f"Query: '{text_query}'")
|
| 128 |
-
for i, result in enumerate(text_results):
|
| 129 |
-
print(f" Result {i+1}:")
|
| 130 |
-
print(f" Score: {result['score']:.4f}")
|
| 131 |
-
print(f" Type: {result['metadata']['type']}")
|
| 132 |
-
print(f" Content Preview: {str(result.get('content'))[:200] if result.get('content') else 'N/A'}...")
|
| 133 |
-
print(f" Source: {result['metadata']['source_id']}")
|
| 134 |
-
|
| 135 |
-
# --- 2. Thử truy vấn hình ảnh ---
|
| 136 |
-
print("\n--- Testing Image Retrieval ---")
|
| 137 |
-
# Lấy một ảnh từ các chunk đã xử lý để làm truy vấn
|
| 138 |
-
image_to_query = None
|
| 139 |
-
image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks")
|
| 140 |
-
if os.path.exists(image_chunks_dir):
|
| 141 |
-
for root, _, files in os.walk(image_chunks_dir):
|
| 142 |
-
if files:
|
| 143 |
-
image_to_query = os.path.join(root, files[0])
|
| 144 |
-
break
|
| 145 |
-
|
| 146 |
-
if image_to_query and os.path.exists(image_to_query):
|
| 147 |
-
print(f"Using image as query: {image_to_query}")
|
| 148 |
-
image_results = retriever.retrieve(image_to_query, query_type="image", top_k=3)
|
| 149 |
-
for i, result in enumerate(image_results):
|
| 150 |
-
print(f" Result {i+1}:")
|
| 151 |
-
print(f" Score: {result['score']:.4f}")
|
| 152 |
-
print(f" Type: {result['metadata']['type']}")
|
| 153 |
-
print(f" Content (Paths): {result['content']}")
|
| 154 |
-
print(f" Source: {result['metadata']['source_id']}")
|
| 155 |
-
else:
|
| 156 |
-
print("Could not find a sample image to test image retrieval.")
|
|
|
|
| 1 |
# core/retrieval/retriever.py
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
from utils.logger import logger
|
| 5 |
from config.settings import settings
|
| 6 |
+
from typing import List, Dict, Any, Union
|
| 7 |
from qdrant_client import QdrantClient
|
| 8 |
|
| 9 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
|
|
|
| 93 |
logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
|
| 94 |
return formatted_results
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def is_database_empty(self) -> bool:
|
| 97 |
total_vectors = self.text_db_manager.get_total_vectors() \
|
| 98 |
+ self.image_db_manager.get_total_vectors() \
|
| 99 |
+ self.audio_db_manager.get_total_vectors()
|
| 100 |
|
| 101 |
+
return total_vectors == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/retrieval/vector_db_manager.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
from
|
|
|
|
| 4 |
from uuid import uuid4
|
| 5 |
|
| 6 |
-
from
|
|
|
|
| 7 |
from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
|
| 8 |
|
| 9 |
-
from utils.logger import logger
|
| 10 |
-
from config.settings import settings
|
| 11 |
-
|
| 12 |
class VectorDBManager:
|
| 13 |
def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
|
| 14 |
logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")
|
|
@@ -110,60 +109,9 @@ class VectorDBManager:
|
|
| 110 |
try:
|
| 111 |
count_result = self.client.count(
|
| 112 |
collection_name=self.collection_name,
|
| 113 |
-
exact=True
|
| 114 |
)
|
| 115 |
return count_result.count
|
| 116 |
except Exception as e:
|
| 117 |
logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
|
| 118 |
-
return 0
|
| 119 |
-
|
| 120 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 121 |
-
if __name__ == "__main__":
|
| 122 |
-
import numpy as np
|
| 123 |
-
|
| 124 |
-
# Các thông số cho collection test
|
| 125 |
-
TEST_COLLECTION_NAME = "my_test_collection"
|
| 126 |
-
DUMMY_DIM = 128
|
| 127 |
-
|
| 128 |
-
# --- Kiểm tra tạo collection ---
|
| 129 |
-
print("\n--- Testing Collection Creation ---")
|
| 130 |
-
db_manager = VectorDBManager(collection_name=TEST_COLLECTION_NAME, embedding_dim=DUMMY_DIM)
|
| 131 |
-
print(f"Total vectors initially: {db_manager.get_total_vectors()}")
|
| 132 |
-
|
| 133 |
-
# --- Kiểm tra thêm vector và payload ---
|
| 134 |
-
print("\n--- Testing Add Vectors ---")
|
| 135 |
-
dummy_embeddings = np.random.rand(10, DUMMY_DIM).tolist()
|
| 136 |
-
dummy_metadatas = [
|
| 137 |
-
{"chunk_id": f"dummy_chunk_{i}", "type": "text" if i < 5 else "image", "source_file": "test.txt"}
|
| 138 |
-
for i in range(10)
|
| 139 |
-
]
|
| 140 |
-
db_manager.add_vectors(dummy_embeddings, dummy_metadatas)
|
| 141 |
-
print(f"Total vectors after adding: {db_manager.get_total_vectors()}")
|
| 142 |
-
|
| 143 |
-
# --- Kiểm tra tìm kiếm ---
|
| 144 |
-
print("\n--- Testing Search ---")
|
| 145 |
-
dummy_query = np.random.rand(DUMMY_DIM).tolist()
|
| 146 |
-
results = db_manager.search_vectors(dummy_query, top_k=3)
|
| 147 |
-
print(f"Top 3 results (no filter):")
|
| 148 |
-
for score, payload in results:
|
| 149 |
-
print(f" Score: {score:.4f}, Payload: {payload}")
|
| 150 |
-
|
| 151 |
-
# --- Kiểm tra tìm kiếm CÓ LỌC (Pre-filtering) ---
|
| 152 |
-
print("\n--- Testing Search with Filter ---")
|
| 153 |
-
filter_condition = models.Filter(
|
| 154 |
-
must=[
|
| 155 |
-
models.FieldCondition(
|
| 156 |
-
key="type", # Lọc theo trường 'type' trong payload
|
| 157 |
-
match=models.MatchValue(value="image"), # Giá trị phải là 'image'
|
| 158 |
-
)
|
| 159 |
-
]
|
| 160 |
-
)
|
| 161 |
-
filtered_results = db_manager.search_vectors(dummy_query, top_k=3, filter_payload=filter_condition)
|
| 162 |
-
print(f"Top 3 results (filtered for type='image'):")
|
| 163 |
-
for score, payload in filtered_results:
|
| 164 |
-
print(f" Score: {score:.4f}, Payload: {payload}")
|
| 165 |
-
|
| 166 |
-
# --- Dọn dẹp collection test ---
|
| 167 |
-
print("\n--- Cleaning up test collection ---")
|
| 168 |
-
db_manager.client.delete_collection(collection_name=TEST_COLLECTION_NAME)
|
| 169 |
-
print(f"Collection '{TEST_COLLECTION_NAME}' deleted.")
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
from utils.logger import logger
|
| 4 |
+
from config.settings import settings
|
| 5 |
from uuid import uuid4
|
| 6 |
|
| 7 |
+
from typing import List, Tuple, Dict, Any
|
| 8 |
+
from qdrant_client import QdrantClient
|
| 9 |
from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
class VectorDBManager:
|
| 12 |
def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
|
| 13 |
logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")
|
|
|
|
| 109 |
try:
|
| 110 |
count_result = self.client.count(
|
| 111 |
collection_name=self.collection_name,
|
| 112 |
+
exact=True
|
| 113 |
)
|
| 114 |
return count_result.count
|
| 115 |
except Exception as e:
|
| 116 |
logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
|
| 117 |
+
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{scripts → ingestions}/ingestion.py
RENAMED
|
@@ -1,123 +1,108 @@
|
|
| 1 |
# core/ingestion/ingestion_service.py
|
| 2 |
import os
|
| 3 |
-
import gradio as gr
|
| 4 |
from typing import List, Optional, Callable
|
| 5 |
-
from tqdm import tqdm
|
| 6 |
|
| 7 |
from utils.logger import logger
|
| 8 |
-
from config.settings import settings
|
| 9 |
from qdrant_client import QdrantClient
|
| 10 |
|
| 11 |
-
# Import các Processor (không thay đổi)
|
| 12 |
from core.data_processing.text_processor import TextProcessor
|
| 13 |
from core.data_processing.audio_processor import AudioProcessor
|
| 14 |
-
# from core.data_processing.video_processor import VideoProcessor
|
| 15 |
from core.data_processing.image_processor import ImageProcessor
|
| 16 |
|
| 17 |
-
# Import các Embedding Model (không thay đổi)
|
| 18 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
| 19 |
from core.embeddings.image_embedding_model import ImageEmbeddingModel
|
| 20 |
from core.embeddings.audio_embedding_model import AudioEmbeddingModel
|
| 21 |
|
| 22 |
-
# Import VectorDBManager phiên bản Qdrant MỚI
|
| 23 |
from core.retrieval.vector_db_manager import VectorDBManager
|
| 24 |
|
| 25 |
class IngestionService:
|
| 26 |
def __init__(self, client: QdrantClient):
|
| 27 |
-
"""
|
| 28 |
-
Khởi tạo IngestionService với một QdrantClient được chia sẻ.
|
| 29 |
-
Phiên bản này không theo dõi trạng thái file.
|
| 30 |
-
"""
|
| 31 |
logger.info("Initializing IngestionService (Stateless)...")
|
| 32 |
|
| 33 |
self.client = client
|
|
|
|
| 34 |
self.text_processor = TextProcessor()
|
| 35 |
self.image_processor = ImageProcessor()
|
| 36 |
self.audio_processor = AudioProcessor()
|
| 37 |
-
|
| 38 |
self.text_embedder = TextEmbeddingModel()
|
| 39 |
self.image_embedder = ImageEmbeddingModel()
|
| 40 |
self.audio_embedder = AudioEmbeddingModel()
|
| 41 |
|
| 42 |
-
|
| 43 |
self.text_db_manager = VectorDBManager(
|
| 44 |
client=self.client,
|
| 45 |
collection_name="text_collection",
|
| 46 |
-
embedding_dim=
|
| 47 |
)
|
| 48 |
|
| 49 |
-
image_embedding_dim =
|
| 50 |
self.image_vector_db_manager = VectorDBManager(
|
| 51 |
client=self.client,
|
| 52 |
collection_name="image_collection",
|
| 53 |
embedding_dim=image_embedding_dim
|
| 54 |
)
|
| 55 |
|
| 56 |
-
audio_embedding_dim =
|
| 57 |
self.audio_vector_db_manager = VectorDBManager(
|
| 58 |
client=self.client,
|
| 59 |
collection_name="audio_collection",
|
| 60 |
embedding_dim=audio_embedding_dim
|
| 61 |
)
|
| 62 |
|
| 63 |
-
# video_frame_embedding_dim = 512
|
| 64 |
-
# video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)
|
| 65 |
-
|
| 66 |
-
|
| 67 |
logger.info("IngestionService initialized successfully.")
|
| 68 |
|
| 69 |
def ingest_files(self, file_paths: List[str]):
|
| 70 |
-
|
| 71 |
-
Xử lý một danh sách các file, tạo embedding, và thêm vào Qdrant.
|
| 72 |
-
Hàm này giả định các file đã được đặt vào đúng thư mục trong 'raw'.
|
| 73 |
-
"""
|
| 74 |
return self.ingest_files_with_progress(file_paths, None)
|
| 75 |
|
| 76 |
def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
|
| 77 |
"""
|
| 78 |
-
|
| 79 |
"""
|
| 80 |
logger.info(f"Starting ingestion for {len(file_paths)} files...")
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
all_chunks_to_process = []
|
| 86 |
|
| 87 |
-
# 1.
|
| 88 |
for i, file_path in enumerate(file_paths):
|
| 89 |
-
base_progress = 0.4 + (i / len(file_paths)) * 0.3 # 40% -> 70%
|
| 90 |
-
file_name = os.path.basename(file_path)
|
| 91 |
-
|
| 92 |
-
if progress_callback:
|
| 93 |
-
progress_callback(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")
|
| 94 |
-
|
| 95 |
-
# Xác định loại dữ liệu dựa trên phần mở rộng file
|
| 96 |
-
file_ext = os.path.splitext(file_path)[1].lower()
|
| 97 |
-
data_type = None
|
| 98 |
try:
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
if file_ext in ['.txt']:
|
| 103 |
-
data_type = 'text'
|
| 104 |
chunks = self.text_processor.process(file_path)
|
| 105 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
|
| 106 |
-
data_type = 'image'
|
| 107 |
chunks = self.image_processor.process(file_path)
|
| 108 |
elif file_ext in ['.wav', '.mp3']:
|
| 109 |
-
data_type = 'audio'
|
| 110 |
chunks = self.audio_processor.process(file_path)
|
| 111 |
-
# elif file_ext in ['.mp4', '.avi', '.mov']:
|
| 112 |
-
# data_type = 'video'
|
| 113 |
-
# chunks = self.video_processor.process_video(file_path)
|
| 114 |
else:
|
| 115 |
logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
|
| 116 |
continue
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
| 120 |
|
|
|
|
| 121 |
all_chunks_to_process.extend(chunks)
|
| 122 |
|
| 123 |
except Exception as e:
|
|
@@ -126,77 +111,89 @@ class IngestionService:
|
|
| 126 |
|
| 127 |
if not all_chunks_to_process:
|
| 128 |
logger.warning("No processable chunks were generated from the provided files.")
|
|
|
|
| 129 |
return
|
| 130 |
|
| 131 |
logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")
|
| 132 |
|
| 133 |
-
|
| 134 |
-
progress_callback(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")
|
| 135 |
|
| 136 |
-
# 2.
|
| 137 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 138 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 139 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 140 |
BATCH_SIZE = 32
|
| 141 |
|
| 142 |
for i, chunk_data in enumerate(all_chunks_to_process):
|
| 143 |
-
# Tính toán progress chi tiết hơn
|
| 144 |
-
base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25 # 70% -> 95%
|
| 145 |
-
|
| 146 |
-
chunk_type = chunk_data['metadata']['type']
|
| 147 |
-
content = chunk_data['content']
|
| 148 |
-
chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')
|
| 149 |
-
|
| 150 |
try:
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
embedding = None
|
| 155 |
if chunk_type == "text":
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
| 161 |
elif chunk_type == "audio":
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
| 167 |
elif chunk_type == "image":
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
#
|
| 175 |
if len(text_embeddings_batch) >= BATCH_SIZE:
|
| 176 |
-
|
| 177 |
-
progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
|
| 178 |
self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
|
| 179 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 180 |
|
| 181 |
if len(audio_embeddings_batch) >= BATCH_SIZE:
|
| 182 |
-
|
| 183 |
-
progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
|
| 184 |
self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
|
| 185 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 186 |
|
| 187 |
if len(image_embeddings_batch) >= BATCH_SIZE:
|
| 188 |
-
|
| 189 |
-
progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
|
| 190 |
self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
|
| 191 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 192 |
|
| 193 |
except Exception as e:
|
| 194 |
-
logger.error(f"Error ingesting chunk {
|
|
|
|
| 195 |
|
| 196 |
-
|
| 197 |
-
progress_callback(0.95, desc="Saving final batches...")
|
| 198 |
|
| 199 |
-
#
|
| 200 |
final_operations = []
|
| 201 |
if text_embeddings_batch:
|
| 202 |
final_operations.append(("text", len(text_embeddings_batch)))
|
|
@@ -205,23 +202,27 @@ class IngestionService:
|
|
| 205 |
if image_embeddings_batch:
|
| 206 |
final_operations.append(("image", len(image_embeddings_batch)))
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")
|
|
|
|
| 1 |
# core/ingestion/ingestion_service.py
|
| 2 |
import os
|
|
|
|
| 3 |
from typing import List, Optional, Callable
|
|
|
|
| 4 |
|
| 5 |
from utils.logger import logger
|
|
|
|
| 6 |
from qdrant_client import QdrantClient
|
| 7 |
|
|
|
|
| 8 |
from core.data_processing.text_processor import TextProcessor
|
| 9 |
from core.data_processing.audio_processor import AudioProcessor
|
|
|
|
| 10 |
from core.data_processing.image_processor import ImageProcessor
|
| 11 |
|
|
|
|
| 12 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
| 13 |
from core.embeddings.image_embedding_model import ImageEmbeddingModel
|
| 14 |
from core.embeddings.audio_embedding_model import AudioEmbeddingModel
|
| 15 |
|
|
|
|
| 16 |
from core.retrieval.vector_db_manager import VectorDBManager
|
| 17 |
|
| 18 |
class IngestionService:
|
| 19 |
def __init__(self, client: QdrantClient):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
logger.info("Initializing IngestionService (Stateless)...")
|
| 21 |
|
| 22 |
self.client = client
|
| 23 |
+
|
| 24 |
self.text_processor = TextProcessor()
|
| 25 |
self.image_processor = ImageProcessor()
|
| 26 |
self.audio_processor = AudioProcessor()
|
| 27 |
+
|
| 28 |
self.text_embedder = TextEmbeddingModel()
|
| 29 |
self.image_embedder = ImageEmbeddingModel()
|
| 30 |
self.audio_embedder = AudioEmbeddingModel()
|
| 31 |
|
| 32 |
+
text_embedding_dim = self.text_embedder.model.get_sentence_embedding_dimension()
|
| 33 |
self.text_db_manager = VectorDBManager(
|
| 34 |
client=self.client,
|
| 35 |
collection_name="text_collection",
|
| 36 |
+
embedding_dim=text_embedding_dim
|
| 37 |
)
|
| 38 |
|
| 39 |
+
image_embedding_dim = self.image_embedder.model.config.hidden_size
|
| 40 |
self.image_vector_db_manager = VectorDBManager(
|
| 41 |
client=self.client,
|
| 42 |
collection_name="image_collection",
|
| 43 |
embedding_dim=image_embedding_dim
|
| 44 |
)
|
| 45 |
|
| 46 |
+
audio_embedding_dim = self.audio_embedder.model.config.projection_dim
|
| 47 |
self.audio_vector_db_manager = VectorDBManager(
|
| 48 |
client=self.client,
|
| 49 |
collection_name="audio_collection",
|
| 50 |
embedding_dim=audio_embedding_dim
|
| 51 |
)
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
logger.info("IngestionService initialized successfully.")
|
| 54 |
|
| 55 |
def ingest_files(self, file_paths: List[str]):
|
| 56 |
+
'''Ingest files without displaying progress bar'''
|
|
|
|
|
|
|
|
|
|
| 57 |
return self.ingest_files_with_progress(file_paths, None)
|
| 58 |
|
| 59 |
def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
|
| 60 |
"""
|
| 61 |
+
Turn on progress bar for tracking
|
| 62 |
"""
|
| 63 |
logger.info(f"Starting ingestion for {len(file_paths)} files...")
|
| 64 |
|
| 65 |
+
# Kiểm tra và xử lý progress_callback an toàn
|
| 66 |
+
def safe_progress(value, desc=""):
|
| 67 |
+
try:
|
| 68 |
+
if progress_callback is not None:
|
| 69 |
+
progress_callback(value, desc=desc)
|
| 70 |
+
except Exception as e:
|
| 71 |
+
logger.warning(f"Progress callback error: {e}")
|
| 72 |
+
|
| 73 |
+
safe_progress(0.4, desc="Starting file processing...")
|
| 74 |
|
| 75 |
all_chunks_to_process = []
|
| 76 |
|
| 77 |
+
# 1. Walk through files to split chunks
|
| 78 |
for i, file_path in enumerate(file_paths):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
try:
|
| 80 |
+
base_progress = 0.4 + (i / len(file_paths)) * 0.3 # 40% -> 70%
|
| 81 |
+
file_name = os.path.basename(file_path)
|
| 82 |
+
|
| 83 |
+
safe_progress(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")
|
| 84 |
+
|
| 85 |
+
file_ext = os.path.splitext(file_path)[1].lower()
|
| 86 |
+
chunks = []
|
| 87 |
+
|
| 88 |
+
safe_progress(base_progress + 0.01, desc=f"Reading {file_name}...")
|
| 89 |
|
| 90 |
if file_ext in ['.txt']:
|
|
|
|
| 91 |
chunks = self.text_processor.process(file_path)
|
| 92 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
|
|
|
|
| 93 |
chunks = self.image_processor.process(file_path)
|
| 94 |
elif file_ext in ['.wav', '.mp3']:
|
|
|
|
| 95 |
chunks = self.audio_processor.process(file_path)
|
|
|
|
|
|
|
|
|
|
| 96 |
else:
|
| 97 |
logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
|
| 98 |
continue
|
| 99 |
|
| 100 |
+
# Kiểm tra chunks có hợp lệ không
|
| 101 |
+
if not chunks or len(chunks) == 0:
|
| 102 |
+
logger.warning(f"No chunks generated from file: {file_path}")
|
| 103 |
+
continue
|
| 104 |
|
| 105 |
+
safe_progress(base_progress + 0.02, desc=f"Generated {len(chunks)} chunks from {file_name}")
|
| 106 |
all_chunks_to_process.extend(chunks)
|
| 107 |
|
| 108 |
except Exception as e:
|
|
|
|
| 111 |
|
| 112 |
if not all_chunks_to_process:
|
| 113 |
logger.warning("No processable chunks were generated from the provided files.")
|
| 114 |
+
safe_progress(1.0, desc="No chunks to process")
|
| 115 |
return
|
| 116 |
|
| 117 |
logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")
|
| 118 |
|
| 119 |
+
safe_progress(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")
|
|
|
|
| 120 |
|
| 121 |
+
# 2. Create embeddings and add to batch
|
| 122 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 123 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 124 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 125 |
BATCH_SIZE = 32
|
| 126 |
|
| 127 |
for i, chunk_data in enumerate(all_chunks_to_process):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
try:
|
| 129 |
+
base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25 # 70% -> 95%
|
| 130 |
+
|
| 131 |
+
# Kiểm tra chunk_data có hợp lệ không
|
| 132 |
+
if not chunk_data or 'metadata' not in chunk_data or 'content' not in chunk_data:
|
| 133 |
+
logger.warning(f"Invalid chunk data at index {i}, skipping...")
|
| 134 |
+
continue
|
| 135 |
+
|
| 136 |
+
chunk_type = chunk_data['metadata'].get('type', 'unknown')
|
| 137 |
+
content = chunk_data['content']
|
| 138 |
+
chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')
|
| 139 |
+
|
| 140 |
+
# Kiểm tra content có hợp lệ không
|
| 141 |
+
if not content:
|
| 142 |
+
logger.warning(f"Empty content for chunk {chunk_id}, skipping...")
|
| 143 |
+
continue
|
| 144 |
+
|
| 145 |
+
safe_progress(base_progress, desc=f"Processing chunk {i+1}/{len(all_chunks_to_process)} ({chunk_type})")
|
| 146 |
|
|
|
|
| 147 |
if chunk_type == "text":
|
| 148 |
+
safe_progress(base_progress + 0.001, desc=f"Creating text embedding for chunk {i+1}")
|
| 149 |
+
embeddings = self.text_embedder.get_embeddings([content])
|
| 150 |
+
if embeddings and len(embeddings) > 0:
|
| 151 |
+
text_embeddings_batch.append(embeddings[0])
|
| 152 |
+
text_metadatas_batch.append(chunk_data)
|
| 153 |
+
else:
|
| 154 |
+
logger.warning(f"Failed to generate text embedding for chunk {chunk_id}")
|
| 155 |
+
|
| 156 |
elif chunk_type == "audio":
|
| 157 |
+
safe_progress(base_progress + 0.001, desc=f"Creating audio embedding for chunk {i+1}")
|
| 158 |
+
embeddings = self.audio_embedder.get_embeddings([content])
|
| 159 |
+
if embeddings and len(embeddings) > 0:
|
| 160 |
+
audio_embeddings_batch.append(embeddings[0])
|
| 161 |
+
audio_metadatas_batch.append(chunk_data)
|
| 162 |
+
else:
|
| 163 |
+
logger.warning(f"Failed to generate audio embedding for chunk {chunk_id}")
|
| 164 |
+
|
| 165 |
elif chunk_type == "image":
|
| 166 |
+
safe_progress(base_progress + 0.001, desc=f"Creating image embedding for chunk {i+1}")
|
| 167 |
+
embeddings = self.image_embedder.get_embeddings([content])
|
| 168 |
+
if embeddings and len(embeddings) > 0:
|
| 169 |
+
image_embeddings_batch.append(embeddings[0])
|
| 170 |
+
image_metadatas_batch.append(chunk_data)
|
| 171 |
+
else:
|
| 172 |
+
logger.warning(f"Failed to generate image embedding for chunk {chunk_id}")
|
| 173 |
|
| 174 |
+
# add batch when reaching BATCH_SIZE
|
| 175 |
if len(text_embeddings_batch) >= BATCH_SIZE:
|
| 176 |
+
safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
|
|
|
|
| 177 |
self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
|
| 178 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 179 |
|
| 180 |
if len(audio_embeddings_batch) >= BATCH_SIZE:
|
| 181 |
+
safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
|
|
|
|
| 182 |
self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
|
| 183 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 184 |
|
| 185 |
if len(image_embeddings_batch) >= BATCH_SIZE:
|
| 186 |
+
safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
|
|
|
|
| 187 |
self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
|
| 188 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 189 |
|
| 190 |
except Exception as e:
|
| 191 |
+
logger.error(f"Error ingesting chunk {i}: {e}")
|
| 192 |
+
continue
|
| 193 |
|
| 194 |
+
safe_progress(0.95, desc="Saving final batches...")
|
|
|
|
| 195 |
|
| 196 |
+
# adding maintaining embeddings
|
| 197 |
final_operations = []
|
| 198 |
if text_embeddings_batch:
|
| 199 |
final_operations.append(("text", len(text_embeddings_batch)))
|
|
|
|
| 202 |
if image_embeddings_batch:
|
| 203 |
final_operations.append(("image", len(image_embeddings_batch)))
|
| 204 |
|
| 205 |
+
# Tránh chia cho 0
|
| 206 |
+
total_operations = len(final_operations)
|
| 207 |
+
if total_operations == 0:
|
| 208 |
+
safe_progress(1.0, desc="No final batches to save")
|
| 209 |
+
else:
|
| 210 |
+
for i, (batch_type, count) in enumerate(final_operations):
|
| 211 |
+
try:
|
| 212 |
+
current_progress = 0.95 + (i / total_operations) * 0.04 # 95% -> 99%
|
| 213 |
+
|
| 214 |
+
if batch_type == "text" and text_embeddings_batch:
|
| 215 |
+
safe_progress(current_progress, desc=f"Saving final {count} text embeddings...")
|
| 216 |
+
self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
|
| 217 |
+
elif batch_type == "audio" and audio_embeddings_batch:
|
| 218 |
+
safe_progress(current_progress, desc=f"Saving final {count} audio embeddings...")
|
| 219 |
+
self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
|
| 220 |
+
elif batch_type == "image" and image_embeddings_batch:
|
| 221 |
+
safe_progress(current_progress, desc=f"Saving final {count} image embeddings...")
|
| 222 |
+
self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
|
| 223 |
+
except Exception as e:
|
| 224 |
+
logger.error(f"Error saving final batch {batch_type}: {e}")
|
| 225 |
+
|
| 226 |
+
safe_progress(1.0, desc=f"✅ Successfully ingested {len(file_paths)} files with {len(all_chunks_to_process)} chunks!")
|
| 227 |
|
| 228 |
logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")
|
main.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# main.py
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import shutil
|
| 5 |
+
import atexit
|
| 6 |
+
import signal
|
| 7 |
+
|
| 8 |
+
# add project folder to sys.path
|
| 9 |
+
project_root = os.path.dirname(os.path.abspath(__file__))
|
| 10 |
+
if project_root not in sys.path:
|
| 11 |
+
sys.path.insert(0, project_root)
|
| 12 |
+
|
| 13 |
+
from config.settings import settings
|
| 14 |
+
from utils.logger import logger
|
| 15 |
+
from app import create_and_run_app
|
| 16 |
+
from app import shared_qdrant_client
|
| 17 |
+
|
| 18 |
+
GLOBAL_QDRANT_CLIENT = shared_qdrant_client
|
| 19 |
+
|
| 20 |
+
def cleanup():
    """Release the Qdrant client and wipe runtime data directories on exit.

    Registered via ``atexit`` in :func:`main`, so it runs on normal exit,
    on unhandled exceptions, and after the Ctrl+C handler calls ``sys.exit``.
    """
    logger.info("--- Starting cleanup process ---")

    # --- Step 1: Close Qdrant connection ---
    # Closing the client releases its on-disk .lock file so the data
    # directory can actually be deleted below.
    global GLOBAL_QDRANT_CLIENT
    if GLOBAL_QDRANT_CLIENT:
        try:
            logger.info("Closing Qdrant client connection...")
            GLOBAL_QDRANT_CLIENT.close()
            logger.success("Qdrant client closed successfully.")
        except Exception as e:
            logger.error(f"Error closing Qdrant client: {e}")

    # Step 2: Clean up "qdrant_data" folder. Retry a few times because the
    # lock released above may take a moment to let go of the files.
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    if os.path.exists(qdrant_db_path) and os.path.isdir(qdrant_db_path):
        import time
        try:
            retries = 3
            for attempt in range(retries):
                try:
                    shutil.rmtree(qdrant_db_path)
                    logger.success(f"Successfully cleaned up Qdrant data at: {qdrant_db_path}")
                    break
                except OSError as e:
                    if attempt < retries - 1:
                        logger.warning(f"Cleanup attempt {attempt+1} failed: {e}. Retrying in 1 second...")
                        time.sleep(1)
                    else:
                        raise
        except Exception as e:
            logger.error(f"Error cleaning up Qdrant data after retries: {e}")
    else:
        logger.info("Qdrant data directory not found, skipping cleanup.")

    # Steps 3 & 4: empty the raw-input and processed-chunk directories.
    # The directories themselves are kept; only their contents are removed.
    _clear_directory_contents(settings.RAW_DATA_DIR, "raw data")
    _clear_directory_contents(settings.CHUNKS_DIR, "processed/chunks data")

    logger.info("--- Cleanup process finished ---")


def _clear_directory_contents(dir_path, label):
    """Delete every file and subdirectory inside *dir_path* (best-effort).

    The directory itself is preserved. *label* is only used in log messages.
    Missing directories are logged and skipped rather than treated as errors.
    """
    if not (os.path.exists(dir_path) and os.path.isdir(dir_path)):
        logger.info(f"{label} directory not found, skipping cleanup.")
        return
    try:
        for item in os.listdir(dir_path):
            item_path = os.path.join(dir_path, item)
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            elif os.path.isfile(item_path):
                os.remove(item_path)
        logger.success(f"Successfully cleaned up {label} directory: {dir_path}")
    except Exception as e:
        logger.error(f"Error cleaning up {label}: {e}")
|
| 87 |
+
def signal_handler(sig, frame):
    """Handle SIGINT (Ctrl+C) by shutting the application down gracefully.

    Exiting via ``sys.exit`` lets the ``atexit``-registered ``cleanup``
    hook run automatically, so no explicit cleanup call is needed here.
    """
    logger.warning("\nCtrl+C detected. Shutting down the application.")
    sys.exit(0)
|
| 94 |
+
|
| 95 |
+
def main():
    """Entry point: wire up shutdown hooks and launch the Gradio app."""
    # Ensure cleanup() runs whether the program exits normally or crashes.
    atexit.register(cleanup)

    # Route Ctrl+C through our graceful shutdown handler.
    signal.signal(signal.SIGINT, signal_handler)

    logger.info("--- Starting Multimedia RAG Assistant ---")

    demo = create_and_run_app()

    banner = "=" * 50
    print("\n" + banner)
    print("   Application is running. Press Ctrl+C to exit.   ")
    print("   Cleanup will be performed upon exit.   ")
    print(banner + "\n")

    demo.launch()

if __name__ == "__main__":
    main()
|
scripts/ingest_data.py
DELETED
|
@@ -1,203 +0,0 @@
|
|
| 1 |
-
# scripts/ingest_data.py
|
| 2 |
-
import os
|
| 3 |
-
import json
|
| 4 |
-
from tqdm import tqdm
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import Dict
|
| 7 |
-
import shutil
|
| 8 |
-
|
| 9 |
-
from config.settings import settings
|
| 10 |
-
from utils.logger import logger
|
| 11 |
-
from qdrant_client import QdrantClient
|
| 12 |
-
|
| 13 |
-
# Import các Processor (không thay đổi)
|
| 14 |
-
from core.data_processing.text_processor import TextProcessor
|
| 15 |
-
from core.data_processing.audio_processor import AudioProcessor
|
| 16 |
-
# from core.data_processing.video_processor import VideoProcessor
|
| 17 |
-
from core.data_processing.image_processor import ImageProcessor
|
| 18 |
-
|
| 19 |
-
# Import các Embedding Model (không thay đổi)
|
| 20 |
-
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
| 21 |
-
from core.embeddings.image_embedding_model import ImageEmbeddingModel
|
| 22 |
-
from core.embeddings.audio_embedding_model import AudioEmbeddingModel
|
| 23 |
-
|
| 24 |
-
# Import VectorDBManager phiên bản Qdrant MỚI
|
| 25 |
-
from core.retrieval.vector_db_manager import VectorDBManager
|
| 26 |
-
|
| 27 |
-
def walk_through_files(extentions, raw_dir: Path, all_raw_chunks_from_processors: list, processor) -> None:
    """Process every file under *raw_dir* whose suffix is in *extentions*.

    Args:
        extentions: Collection of file suffixes to accept (e.g. {".txt"}).
            Matching is case-insensitive, so ".JPG" files are also picked up.
        raw_dir: Directory to scan recursively (a pathlib.Path).
        all_raw_chunks_to_processors: Output list; chunks produced by the
            processor are appended in place.
        processor: Object exposing ``process(path: str) -> list`` that turns
            one file into a list of chunk dicts.
    """
    # Normalize once so the per-file comparison is case-insensitive.
    wanted = {ext.lower() for ext in extentions}
    all_files = list(raw_dir.rglob("*"))
    for filepath in tqdm(all_files, desc="Processing " + raw_dir.name):
        # Check is_file() first: directories also have suffixes sometimes.
        if filepath.is_file() and filepath.suffix.lower() in wanted:
            all_raw_chunks_from_processors.extend(
                processor.process(str(filepath))
            )
|
| 34 |
-
|
| 35 |
-
def ingest_data_pipeline():
    """Run the full offline ingestion: chunk raw files, embed them, and index
    the embeddings into per-modality Qdrant collections.

    Phases:
        1. Clean output dirs and re-chunk all raw text/audio/image files.
        2. Embed every chunk and upsert batches into Qdrant collections.
    """
    logger.info("Starting comprehensive data ingestion pipeline (Chunking + Embedding + Qdrant Indexing)...")

    # --- 1. Initialize the processors (one per modality) ---
    text_processor = TextProcessor(chunk_size=500, chunk_overlap=50)
    audio_processor = AudioProcessor(min_silence_len=1000, silence_thresh_db=-40, target_sr=16000)
    image_processor = ImageProcessor()
    # video_processor = VideoProcessor(chunk_duration_sec=15, frames_per_segment=5)

    # --- Clean up old chunk directories and old Qdrant data ---
    dirs_to_clean_and_create = [
        settings.CHUNKS_DIR,
        settings.METADATA_DIR
    ]
    # Qdrant's on-disk data directory
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    dirs_to_clean_and_create.append(qdrant_db_path)

    for dir_path in dirs_to_clean_and_create:
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
            logger.info(f"Cleaned up old directory: {dir_path}")
        # Recreate the chunking directories, but NOT the qdrant one
        # (the Qdrant client creates its own directory on connect).
        if dir_path != qdrant_db_path:
            os.makedirs(dir_path, exist_ok=True)

    logger.info("Output directories and previous Qdrant data are ready for fresh ingestion.")

    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    client = QdrantClient(path=qdrant_db_path)
    logger.info(f"Single Qdrant client initialized for ingestion, connected to: {qdrant_db_path}")

    all_raw_chunks_from_processors = []  # Holds every chunk (content + metadata) from all modalities

    # --- 2. Run data processing (chunking) ---
    logger.info("--- Phase 1: Processing Raw Data into Chunks ---")

    # Process text files
    text_extentions = {".txt"}
    text_raw_dir = Path(settings.RAW_DATA_DIR) / "texts"
    walk_through_files(text_extentions, text_raw_dir, all_raw_chunks_from_processors, text_processor)

    # Process audio files
    audio_extentions = {".wav", ".mp3"}
    audio_raw_dir = Path(settings.RAW_DATA_DIR) / "audios"
    walk_through_files(audio_extentions, audio_raw_dir, all_raw_chunks_from_processors, audio_processor)

    # process images
    image_extentions = {".jpg", ".png"}
    image_raw_dir = Path(settings.RAW_DATA_DIR) / "images"
    walk_through_files(image_extentions, image_raw_dir, all_raw_chunks_from_processors, image_processor)

    # Process video files (disabled for now)
    # video_raw_dir = os.path.join(settings.RAW_DATA_DIR, "videos")
    # for filename in tqdm(os.listdir(video_raw_dir), desc="Processing Video"):
    #     if filename.endswith((".mp4", ".avi", ".mov")):
    #         all_raw_chunks_from_processors.extend(video_processor.process_video(os.path.join(video_raw_dir, filename)))

    logger.info(f"Total raw chunks processed from all sources: {len(all_raw_chunks_from_processors)}")

    # --- 3. Generate embeddings and add them to Qdrant ---
    logger.info("--- Phase 2: Generating Embeddings and Building Qdrant Collections ---")

    # Initialize the embedding models
    text_embedder = TextEmbeddingModel()
    image_embedder = ImageEmbeddingModel()
    audio_embedder = AudioEmbeddingModel()

    # --- Initialize a VectorDBManager per Qdrant collection ---
    # Read the embedding size from the model to guarantee it matches.
    text_embedding_dim = text_embedder.model.get_sentence_embedding_dimension()
    text_vector_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_embedding_dim, client=client)

    # Embedding size for image/audio (assumed to be 512)
    image_embedding_dim = 512
    image_vector_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_embedding_dim, client=client)

    # video_frame_embedding_dim = 512
    # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)

    audio_embedding_dim = 512
    # NOTE(review): passes image_embedding_dim here — likely should be
    # audio_embedding_dim (both are 512 today, so behavior is unchanged,
    # but this would break silently if the dims ever diverge).
    audio_vector_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=image_embedding_dim, client=client)

    logger.info(f"Initialized Text Qdrant Collection Manager with {text_embedding_dim}D.")
    logger.info(f"Initialized Image Qdrant Collection Manager with {image_embedding_dim}D.")
    logger.info(f"Initialized Audio Qdrant Collection Manager with {audio_embedding_dim}D.")

    # Batch buffers so Qdrant upserts happen in groups rather than per chunk
    text_embeddings_batch = []
    text_metadatas_batch = []

    image_embeddings_batch = []
    image_metadatas_batch = []

    # video_frame_embeddings_batch = []
    # video_frame_metadatas_batch = []

    audio_embeddings_batch = []
    audio_metadatas_batch = []

    BATCH_SIZE = 32  # Upsert 32 points at a time

    for chunk_data in tqdm(all_raw_chunks_from_processors, desc="Generating Embeddings & Populating Qdrant"):
        chunk_type = chunk_data['metadata']['type']
        content = chunk_data['content']

        try:
            if chunk_type == "text":
                embedding = text_embedder.get_embeddings([content])[0]
                text_embeddings_batch.append(embedding)
                text_metadatas_batch.append(chunk_data)

            elif chunk_type == "audio":
                embedding = audio_embedder.get_embeddings([content])[0]
                audio_embeddings_batch.append(embedding)
                audio_metadatas_batch.append(chunk_data)

            elif chunk_type == "image":
                embedding = image_embedder.get_embeddings([content])[0]
                image_embeddings_batch.append(embedding)
                image_metadatas_batch.append(chunk_data)

            # elif chunk_type == "video_frame":
            #     if content and isinstance(content, list) and len(content) > 0:
            #         embedding = image_embedder.get_embeddings([content[0]])[0]  # Embed only the first frame
            #         video_frame_embeddings_batch.append(embedding)
            #         video_frame_metadatas_batch.append(chunk_data['metadata'])

            # Flush any batch that reached the threshold
            if len(text_embeddings_batch) >= BATCH_SIZE:
                text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
                text_embeddings_batch, text_metadatas_batch = [], []  # Reset batch

            if len(audio_embeddings_batch) >= BATCH_SIZE:
                audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
                audio_embeddings_batch, audio_metadatas_batch = [], []  # Reset batch

            if len(image_embeddings_batch) >= BATCH_SIZE:
                image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
                image_embeddings_batch, image_metadatas_batch = [], []  # Reset batch

            # if len(video_frame_embeddings_batch) >= BATCH_SIZE:
            #     video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
            #     video_frame_embeddings_batch, video_frame_metadatas_batch = [], []  # Reset batch

        except Exception as e:
            # Best-effort: log and continue with the next chunk
            logger.error(f"Error processing chunk {chunk_data['metadata']['chunk_id']}: {e}")

    # Flush the remaining, partially-filled batches
    if text_embeddings_batch:
        text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
    if audio_embeddings_batch:
        audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
    if image_embeddings_batch:
        image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
    # if video_frame_embeddings_batch:
    #     video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)

    logger.success("Finished populating Qdrant collections.")
    logger.info(f"Total vectors in 'text_collection': {text_vector_db_manager.get_total_vectors()}")
    logger.info(f"Total vectors in 'audio_collection': {audio_vector_db_manager.get_total_vectors()}")
    logger.info(f"Total vectors in 'image_collection': {image_vector_db_manager.get_total_vectors()}")
    # logger.info(f"Total vectors in 'video_frame_collection': {video_frame_vector_db_manager.get_total_vectors()}")

    logger.info("Data ingestion pipeline completed successfully!")


if __name__ == "__main__":
    ingest_data_pipeline()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/logger.py
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
|
|
| 1 |
import sys
|
| 2 |
from loguru import logger
|
| 3 |
-
from config.settings import settings
|
| 4 |
|
| 5 |
-
#
|
| 6 |
-
logger.remove() #
|
|
|
|
|
|
|
| 7 |
logger.add(
|
| 8 |
-
|
| 9 |
-
rotation="10 MB",
|
| 10 |
-
compression="zip",
|
| 11 |
-
level=settings.LOG_LEVEL, #
|
| 12 |
-
colorize=True,
|
| 13 |
format="{time} {level} {message}",
|
| 14 |
-
enqueue=True
|
| 15 |
)
|
| 16 |
logger.add(
|
| 17 |
-
sys.stderr, #
|
| 18 |
level=settings.LOG_LEVEL,
|
| 19 |
colorize=True,
|
| 20 |
format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
|
| 21 |
-
)
|
| 22 |
-
|
| 23 |
-
# Xuất logger để các module khác có thể import và sử dụng
|
| 24 |
-
__all__ = ["logger"]
|
|
|
|
| 1 |
+
import os
|
| 2 |
import sys
|
| 3 |
from loguru import logger
|
| 4 |
+
from config.settings import settings
|
| 5 |
|
| 6 |
+
# logger configuration
|
| 7 |
+
logger.remove() # remove default config
|
| 8 |
+
|
| 9 |
+
log_path = os.path.join(settings.LOG_DIR, "file_{time}.log")
|
| 10 |
logger.add(
|
| 11 |
+
log_path,
|
| 12 |
+
rotation="10 MB",
|
| 13 |
+
compression="zip",
|
| 14 |
+
level=settings.LOG_LEVEL, # log level from settings
|
| 15 |
+
colorize=True,
|
| 16 |
format="{time} {level} {message}",
|
| 17 |
+
enqueue=True
|
| 18 |
)
|
| 19 |
logger.add(
|
| 20 |
+
sys.stderr, # output to console
|
| 21 |
level=settings.LOG_LEVEL,
|
| 22 |
colorize=True,
|
| 23 |
format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
|
| 24 |
+
)
|
|
|
|
|
|
|
|
|