3v324v23 commited on
Commit
be398ac
·
1 Parent(s): a805413
.gitignore CHANGED
@@ -204,4 +204,9 @@ cython_debug/
204
  # Marimo
205
  marimo/_static/
206
  marimo/_lsp/
207
- __marimo__/
 
 
 
 
 
 
204
  # Marimo
205
  marimo/_static/
206
  marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # main function
210
+ video_processor.py
211
+ ingestions/ingest_data.py
212
+ test_config_log.py
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: purple
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.39.0
8
- app_file: app.py
9
  pinned: false
10
  short_description: My small project while preparing for AIC
11
  ---
 
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.39.0
8
+ app_file: main.py
9
  pinned: false
10
  short_description: My small project while preparing for AIC
11
  ---
app.py CHANGED
@@ -1,32 +1,23 @@
1
  # app/main.py
2
  import gradio as gr
3
  import os
4
- import sys
5
- import shutil
6
  import zipfile
7
- from typing import List, Dict, Any
8
- from pathlib import Path
9
-
10
- # Thêm thư mục gốc của dự án vào Python Path để có thể import các module
11
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
12
- if project_root not in sys.path:
13
- sys.path.insert(0, project_root)
14
 
 
15
  from utils.logger import logger
16
  from config.settings import settings
17
  from qdrant_client import QdrantClient
18
  from core.retrieval.retriever import Retriever
19
- from scripts.ingestion import IngestionService
20
 
21
- # --- 1. Khởi tạo các dịch vụ toàn cục ---
22
  logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
23
  try:
24
- # Tạo MỘT QdrantClient duy nhất để chia sẻ
25
  qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
26
  shared_qdrant_client = QdrantClient(path=qdrant_db_path)
27
  logger.info("Shared Qdrant client initialized.")
28
 
29
- # Khởi tạo các dịch vụ, chia sẻ client
30
  ingestion_service = IngestionService(client=shared_qdrant_client)
31
  retriever_instance = Retriever(client=shared_qdrant_client)
32
 
@@ -35,12 +26,7 @@ except Exception as e:
35
  logger.error(f"Failed to initialize global services: {e}")
36
  raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
37
 
38
-
39
- # ---- HÀM XỬ LÝ CHO TAB UPLOAD ----
40
  def upload_handler(zip_path: str, progress=gr.Progress()):
41
- """
42
- Hàm này xử lý việc upload file và thư mục với progress bar.
43
- """
44
  progress(0, desc="🚀 Starting upload process...")
45
 
46
  if not zip_path:
@@ -63,7 +49,7 @@ def upload_handler(zip_path: str, progress=gr.Progress()):
63
 
64
  logger.info(f"Handling upload of {len(settings.RAW_DATA_DIR)} items (files/folders)...")
65
 
66
- # --- Bước 1: Thu thập tất cả các đường dẫn file từ input ---
67
  path = Path(settings.RAW_DATA_DIR)
68
  all_temp_file_paths = list(path.rglob("*"))
69
  all_temp_file_paths = [str(p) for p in all_temp_file_paths if os.path.isfile(p)]
@@ -81,11 +67,11 @@ def upload_handler(zip_path: str, progress=gr.Progress()):
81
  if not files_to_ingest:
82
  return "No valid files were moved for ingestion."
83
 
84
- # --- Bước 3: Gọi dịch vụ để nạp tất cả các file một lần ---
85
  try:
86
  progress(0.4, desc="🔄 Starting file ingestion...")
87
  # Gọi hàm ingestion với progress callback
88
- ingestion_service.ingest_files_with_progress(files_to_ingest)
89
 
90
  success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
91
  logger.success(success_message)
@@ -268,6 +254,6 @@ def create_and_run_app():
268
  return demo
269
 
270
  # --- 4. Chạy ứng dụng ---
271
- logger.info("Launching Gradio interface...")
272
- demo = create_and_run_app()
273
- demo.launch()
 
1
  # app/main.py
2
  import gradio as gr
3
  import os
 
 
4
  import zipfile
 
 
 
 
 
 
 
5
 
6
+ from pathlib import Path
7
  from utils.logger import logger
8
  from config.settings import settings
9
  from qdrant_client import QdrantClient
10
  from core.retrieval.retriever import Retriever
11
+ from ingestions.ingestion import IngestionService
12
 
13
+ # --- Initialize global services ---
14
  logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
15
  try:
16
+ # Create ONE QdrantClient only for sharing
17
  qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
18
  shared_qdrant_client = QdrantClient(path=qdrant_db_path)
19
  logger.info("Shared Qdrant client initialized.")
20
 
 
21
  ingestion_service = IngestionService(client=shared_qdrant_client)
22
  retriever_instance = Retriever(client=shared_qdrant_client)
23
 
 
26
  logger.error(f"Failed to initialize global services: {e}")
27
  raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
28
 
 
 
29
  def upload_handler(zip_path: str, progress=gr.Progress()):
 
 
 
30
  progress(0, desc="🚀 Starting upload process...")
31
 
32
  if not zip_path:
 
49
 
50
  logger.info(f"Handling upload of {len(settings.RAW_DATA_DIR)} items (files/folders)...")
51
 
52
+ # --- Retrieve all file path from input ---
53
  path = Path(settings.RAW_DATA_DIR)
54
  all_temp_file_paths = list(path.rglob("*"))
55
  all_temp_file_paths = [str(p) for p in all_temp_file_paths if os.path.isfile(p)]
 
67
  if not files_to_ingest:
68
  return "No valid files were moved for ingestion."
69
 
70
+ # Start ingesting data
71
  try:
72
  progress(0.4, desc="🔄 Starting file ingestion...")
73
  # Gọi hàm ingestion với progress callback
74
+ ingestion_service.ingest_files_with_progress(files_to_ingest, progress)
75
 
76
  success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
77
  logger.success(success_message)
 
254
  return demo
255
 
256
  # --- 4. Chạy ứng dụng ---
257
+ # logger.info("Launching Gradio interface...")
258
+ # demo = create_and_run_app()
259
+ # demo.launch()
config/database_configs.py DELETED
File without changes
config/model_configs.py CHANGED
@@ -1,17 +1,14 @@
1
  # config/model_configs.py
2
 
3
  # Embedding Models
4
- TEXT_EMBEDDING_MODEL: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
5
- IMAGE_EMBEDDING_MODEL: str = "openai/clip-vit-base-patch32" # Hoặc các mô hình CLIP khác
6
- AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused" # Ví dụ về mô hình CLAP
7
 
8
  # Generator Model (LLM/LMM)
9
- GENERATOR_MODEL_NAME: str = "gpt-4o" # Hoặc "google/gemma-2b", "meta-llama/Llama-2-7b-chat-hf", "llava-hf/llava-1.5-7b-hf"
10
  GENERATOR_MODEL_MAX_TOKENS: int = 4096
11
  GENERATOR_MODEL_TEMPERATURE: float = 0.7
12
 
13
  # Reranker Model
14
- RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
15
-
16
- # Automatic Speech Recognition (ASR) Model (Ví dụ với Whisper của Hugging Face)
17
- ASR_MODEL: str = "openai/whisper-tiny" # Có thể dùng "base", "small", "medium" tùy tài nguyên GPU
 
1
  # config/model_configs.py
2
 
3
  # Embedding Models
4
+ TEXT_EMBEDDING_MODEL: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
5
+ IMAGE_EMBEDDING_MODEL: str = "google/vit-base-patch16-224-in21k"
6
+ AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"
7
 
8
  # Generator Model (LLM/LMM)
9
+ GENERATOR_MODEL_NAME: str = "gpt-4o"
10
  GENERATOR_MODEL_MAX_TOKENS: int = 4096
11
  GENERATOR_MODEL_TEMPERATURE: float = 0.7
12
 
13
  # Reranker Model
14
+ RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
 
 
 
config/settings.py CHANGED
@@ -6,25 +6,25 @@ from dotenv import load_dotenv
6
  load_dotenv()
7
 
8
  class Settings(BaseSettings):
 
 
9
  APP_NAME: str = "Multimedia RAG Assistant"
10
  APP_VERSION: str = "0.1.0"
11
  ENVIRONMENT: str = "development"
12
 
13
- DATA_DIR: str = "data"
14
- RAW_DATA_DIR: str = os.path.join("data", "raw")
15
- PROCESSED_DATA_DIR: str = os.path.join("data", "processed")
16
- CHUNKS_DIR: str = os.path.join("data", "processed", "chunks")
17
- METADATA_DIR: str = os.path.join("data", "processed", "metadata")
18
- EMBEDDINGS_DIR: str = os.path.join("data", "processed", "embeddings")
19
 
20
  API_HOST: str = "0.0.0.0"
21
  API_PORT: int = 8000
22
 
23
- # Cấu hình mô hình
24
- # Đây là nơi bạn sẽ thêm các API key hoặc model IDs sau này
25
- HUGGINGFACE_API_KEY: Optional[str] = None # Ví dụ: Nếu dùng Hugging Face models
26
 
27
- # Cấu hình logger
28
  LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
29
 
30
  model_config = SettingsConfigDict(
 
6
  load_dotenv()
7
 
8
  class Settings(BaseSettings):
9
+ BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
10
+
11
  APP_NAME: str = "Multimedia RAG Assistant"
12
  APP_VERSION: str = "0.1.0"
13
  ENVIRONMENT: str = "development"
14
 
15
+ DATA_DIR: str = os.path.join(BASE_DIR, "data")
16
+ RAW_DATA_DIR: str = os.path.join(DATA_DIR, "raw")
17
+ PROCESSED_DATA_DIR: str = os.path.join(DATA_DIR, "processed")
18
+ CHUNKS_DIR: str = os.path.join(DATA_DIR, "processed", "chunks")
19
+ METADATA_DIR: str = os.path.join(DATA_DIR, "processed", "metadata")
20
+ EMBEDDINGS_DIR: str = os.path.join(DATA_DIR, "processed", "embeddings")
21
 
22
  API_HOST: str = "0.0.0.0"
23
  API_PORT: int = 8000
24
 
25
+ HUGGINGFACE_API_KEY: Optional[str] = None
 
 
26
 
27
+ LOG_DIR: str = "logs"
28
  LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
29
 
30
  model_config = SettingsConfigDict(
core/data_processing/audio_processor.py CHANGED
@@ -37,7 +37,7 @@ class AudioProcessor:
37
  segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
38
  chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
39
 
40
- # Lưu segment thành file WAV tạm thời
41
  segment.export(chunk_file_path, format="wav")
42
 
43
  metadata = {
@@ -45,8 +45,6 @@ class AudioProcessor:
45
  "type": "audio",
46
  "chunk_id": segment_id,
47
  "chunk_data_path": chunk_file_path,
48
- # "start_time_ms": int(segment.start_time),
49
- # "end_time_ms": int(segment.end_time),
50
  "duration_ms": len(segment)
51
  }
52
  chunks.append({
@@ -60,21 +58,4 @@ class AudioProcessor:
60
  return []
61
  except Exception as e:
62
  logger.error(f"Error processing audio file {file_path}: {e}")
63
- return []
64
-
65
- # Ví dụ sử dụng (giữ nguyên để kiểm tra)
66
- if __name__ == "__main__":
67
- sample_audio_path = os.path.join(settings.RAW_DATA_DIR, "audios", "sample_audio.wav")
68
- if not os.path.exists(sample_audio_path):
69
- print(f"ERROR: Sample audio not found at {sample_audio_path}. Please create it first.")
70
- print("Make sure you have ffmpeg installed and available in your PATH for pydub to work.")
71
- else:
72
- processor = AudioProcessor()
73
- audio_chunks = processor.process(sample_audio_path)
74
-
75
- for i, chunk in enumerate(audio_chunks):
76
- print(f"\n--- Audio Chunk {i+1} ---")
77
- print(f"Type: {chunk['metadata']['type']}")
78
- print(f"Content (path): {chunk['content']}")
79
- print(f"Metadata: {chunk['metadata']}")
80
- # Bạn có thể thử mở file chunk['content'] để nghe
 
37
  segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
38
  chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
39
 
40
+ # Save segments into data/processed/chunks
41
  segment.export(chunk_file_path, format="wav")
42
 
43
  metadata = {
 
45
  "type": "audio",
46
  "chunk_id": segment_id,
47
  "chunk_data_path": chunk_file_path,
 
 
48
  "duration_ms": len(segment)
49
  }
50
  chunks.append({
 
58
  return []
59
  except Exception as e:
60
  logger.error(f"Error processing audio file {file_path}: {e}")
61
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/data_processing/image_processor.py CHANGED
@@ -1,7 +1,7 @@
1
  # core/data_processing/image_processor.py
2
- from typing import List, Dict, Any
3
  import os
4
- from PIL import Image
 
5
  from utils.logger import logger
6
 
7
  class ImageProcessor:
@@ -16,14 +16,7 @@ class ImageProcessor:
16
  logger.error(f"Image file not found: {file_path}")
17
  return []
18
 
19
- with Image.open(file_path) as img:
20
- width, height = img.size
21
- img_format = img.format
22
-
23
- file_size = os.path.getsize(file_path)
24
-
25
- # Tạo một ID duy nhất cho chunk này
26
- # Lấy tên file không bao gồm phần mở rộng
27
  base_name = os.path.basename(file_path)
28
  chunk_id = f"{os.path.splitext(base_name)[0]}_image_chunk"
29
 
@@ -31,59 +24,16 @@ class ImageProcessor:
31
  "source_id": base_name,
32
  "type": "image",
33
  "chunk_id": chunk_id,
34
- "chunk_data_path": file_path,
35
- "image_width": width,
36
- "image_height": height,
37
- "image_format": img_format,
38
- "file_size_bytes": file_size
39
  }
40
 
41
- # Tạo chunk
42
- # Content sẽ là đường dẫn đến file, giống như audio/video segments
43
  chunk = {
44
  "content": file_path,
45
  "metadata": metadata
46
  }
47
 
48
- # Trả về một danh sách chứa một chunk duy nhất
49
  return [chunk]
50
 
51
  except Exception as e:
52
  logger.error(f"Error processing image file {file_path}: {e}")
53
- return []
54
-
55
- # Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
56
- if __name__ == "__main__":
57
- from config.settings import settings
58
- import os
59
-
60
- # Tạo một ảnh dummy để kiểm tra
61
- dummy_image_dir = os.path.join(settings.RAW_DATA_DIR, "images")
62
- os.makedirs(dummy_image_dir, exist_ok=True)
63
- dummy_image_path = os.path.join(dummy_image_dir, "test_image.jpg")
64
-
65
- try:
66
- # Tạo một ảnh mẫu màu xanh
67
- dummy_img = Image.new('RGB', (100, 150), color = 'blue')
68
- dummy_img.save(dummy_image_path)
69
- print(f"Created a dummy image for testing at: {dummy_image_path}")
70
-
71
- # Khởi tạo processor và xử lý ảnh
72
- processor = ImageProcessor()
73
- image_chunks = processor.process(dummy_image_path)
74
-
75
- if image_chunks:
76
- print("\n--- Image Chunk Processed ---")
77
- chunk = image_chunks[0]
78
- print(f"Content (path): {chunk['content']}")
79
- print("Metadata:")
80
- for key, value in chunk['metadata'].items():
81
- print(f" - {key}: {value}")
82
- else:
83
- print("Failed to process the dummy image.")
84
-
85
- finally:
86
- # Dọn dẹp ảnh dummy
87
- if os.path.exists(dummy_image_path):
88
- os.remove(dummy_image_path)
89
- print(f"Cleaned up dummy image: {dummy_image_path}")
 
1
  # core/data_processing/image_processor.py
 
2
  import os
3
+
4
+ from typing import List, Dict, Any
5
  from utils.logger import logger
6
 
7
  class ImageProcessor:
 
16
  logger.error(f"Image file not found: {file_path}")
17
  return []
18
 
19
+ # create id by filename
 
 
 
 
 
 
 
20
  base_name = os.path.basename(file_path)
21
  chunk_id = f"{os.path.splitext(base_name)[0]}_image_chunk"
22
 
 
24
  "source_id": base_name,
25
  "type": "image",
26
  "chunk_id": chunk_id,
27
+ "chunk_data_path": file_path
 
 
 
 
28
  }
29
 
 
 
30
  chunk = {
31
  "content": file_path,
32
  "metadata": metadata
33
  }
34
 
 
35
  return [chunk]
36
 
37
  except Exception as e:
38
  logger.error(f"Error processing image file {file_path}: {e}")
39
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/data_processing/text_processor.py CHANGED
@@ -26,14 +26,11 @@ class TextProcessor:
26
 
27
  chunks = []
28
  for i, chunk_content in enumerate(split_texts):
29
- start_char_idx = text.find(chunk_content) # find start index of each chunk_content
30
  chunk_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_text_{i}"
31
  metadata = {
32
  "source_id": os.path.basename(file_path),
33
  "type": "text",
34
  "chunk_id": chunk_id,
35
- "start_char_index": start_char_idx, # Vị trí ký tự bắt đầu
36
- "end_char_index": start_char_idx + len(chunk_content), # Vị trí ký tự kết thúc
37
  "content_length": len(chunk_content)
38
  }
39
  chunks.append({
@@ -44,21 +41,4 @@ class TextProcessor:
44
  return chunks
45
  except Exception as e:
46
  logger.error(f"Error processing text document {file_path}: {e}")
47
- return []
48
-
49
- # Ví dụ sử dụng (giữ nguyên để kiểm tra)
50
- if __name__ == "__main__":
51
- from config.settings import settings
52
- import os
53
-
54
- sample_doc_path = os.path.join(settings.RAW_DATA_DIR, "documents", "sample_document.txt")
55
- if not os.path.exists(sample_doc_path):
56
- print(f"ERROR: Sample document not found at {sample_doc_path}. Please create it first.")
57
- else:
58
- processor = TextProcessor(chunk_size=100, chunk_overlap=20) # Thử kích thước nhỏ hơn để thấy rõ chunk
59
- text_chunks = processor.process(sample_doc_path)
60
-
61
- for i, chunk in enumerate(text_chunks): # In tất cả các chunk để kiểm tra
62
- print(f"\n--- Chunk {i+1} ---")
63
- print(f"Content: {chunk['content']}") # In toàn bộ nội dung chunk
64
- print(f"Metadata: {chunk['metadata']}")
 
26
 
27
  chunks = []
28
  for i, chunk_content in enumerate(split_texts):
 
29
  chunk_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_text_{i}"
30
  metadata = {
31
  "source_id": os.path.basename(file_path),
32
  "type": "text",
33
  "chunk_id": chunk_id,
 
 
34
  "content_length": len(chunk_content)
35
  }
36
  chunks.append({
 
41
  return chunks
42
  except Exception as e:
43
  logger.error(f"Error processing text document {file_path}: {e}")
44
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/data_processing/video_processor.py DELETED
@@ -1,118 +0,0 @@
1
- # core/data_processing/video_processor.py
2
- import os
3
- import torch
4
- import cv2
5
- import numpy as np
6
-
7
- from typing import List, Dict, Any
8
- from utils.logger import logger
9
- from moviepy.editor import VideoFileClip
10
- from config.settings import settings
11
-
12
- class VideoProcessor:
13
- def __init__(self, chunk_duration_sec: int = 10, frames_per_segment: int = 5):
14
- self.chunk_duration_sec = chunk_duration_sec
15
- self.frames_per_segment = frames_per_segment
16
- logger.info(f"VideoProcessor initialized (chunk_duration={chunk_duration_sec}s, frames_per_segment={frames_per_segment}).")
17
-
18
- def process_video(self, file_path: str) -> List[Dict[str, Any]]:
19
- try:
20
- logger.info(f"Processing video file: {file_path}")
21
- video_clip = VideoFileClip(file_path)
22
- total_duration = video_clip.duration # Tổng thời lượng video (giây)
23
-
24
- all_chunks = []
25
-
26
- # Tạo thư mục con để lưu các frame/ảnh tạm thời
27
- image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks", os.path.basename(file_path).split('.')[0])
28
- os.makedirs(image_chunks_dir, exist_ok=True)
29
-
30
- # Tạo thư mục con để lưu các video segment tạm thời
31
- video_segments_dir = os.path.join(settings.CHUNKS_DIR, "video/video_segments", os.path.basename(file_path).split('.')[0])
32
- os.makedirs(video_segments_dir, exist_ok=True)
33
-
34
- current_time = 0.0
35
- chunk_idx = 0
36
-
37
- while current_time < total_duration:
38
- end_time = min(current_time + self.chunk_duration_sec, total_duration) # end time of each segment
39
- segment_clip = video_clip.subclip(current_time, end_time)
40
-
41
- segment_base_name = f"{os.path.basename(file_path).split('.')[0]}_segment_{chunk_idx}"
42
-
43
- frames_paths = []
44
-
45
- frame_timestamps = np.linspace(0, segment_clip.duration, self.frames_per_segment + 2)[1:-1]
46
-
47
- for ts in frame_timestamps:
48
- frame = segment_clip.get_frame(ts)
49
- frame_filename = f"{segment_base_name}_frame_{int(ts*1000)}.jpg"
50
- frame_path = os.path.join(image_chunks_dir, frame_filename)
51
- cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
52
- frames_paths.append(frame_path)
53
-
54
- # Tạo chunk cho các khung hình
55
- image_chunk_id = f"{segment_base_name}_image"
56
- all_chunks.append({
57
- "content": frames_paths, # Danh sách đường dẫn đến các file ảnh
58
- "metadata": {
59
- "source_id": os.path.basename(file_path),
60
- "type": "video_frame", # Loại chunk
61
- "chunk_id": image_chunk_id,
62
- "start_time_sec": current_time,
63
- "end_time_sec": end_time,
64
- "frame_paths": frames_paths # Lưu lại đường dẫn trong metadata
65
- }
66
- })
67
-
68
- # 2. Lưu đoạn video clip (optional, nhưng hữu ích cho video retrieval)
69
- video_segment_path = os.path.join(video_segments_dir, f"{segment_base_name}.mp4")
70
- segment_clip.write_videofile(video_segment_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
71
-
72
- # Tạo chunk cho video segment
73
- video_chunk_id = f"{segment_base_name}_video_clip"
74
- all_chunks.append({
75
- "content": video_segment_path, # Đường dẫn đến file video clip
76
- "metadata": {
77
- "source_id": os.path.basename(file_path),
78
- "type": "video_segment_clip", # Loại chunk mới: video clip
79
- "chunk_id": video_chunk_id,
80
- "start_time_sec": current_time,
81
- "end_time_sec": end_time,
82
- "chunk_data_path": video_segment_path # Lưu lại đường dẫn trong metadata
83
- }
84
- })
85
-
86
- current_time = end_time
87
- chunk_idx += 1
88
-
89
- video_clip.close() # Đảm bảo giải phóng tài nguyên
90
- logger.info(f"Generated {len(all_chunks)} chunks (frames & video segments) from video {file_path}")
91
- return all_chunks
92
- except FileNotFoundError:
93
- logger.error(f"Video file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
94
- return []
95
- except Exception as e:
96
- logger.error(f"Error processing video file {file_path}: {e}")
97
- return []
98
-
99
- # Ví dụ sử dụng (giữ nguyên để kiểm tra)
100
- if __name__ == "__main__":
101
- sample_video_path = os.path.join(settings.RAW_DATA_DIR, "videos", "sample_video.mp4")
102
- if not os.path.exists(sample_video_path):
103
- print(f"ERROR: Sample video not found at {sample_video_path}. Please create it first.")
104
- print("Make sure you have ffmpeg installed and available in your PATH for moviepy to work.")
105
- else:
106
- processor = VideoProcessor(chunk_duration_sec=5, frames_per_segment=3)
107
- video_chunks = processor.process_video(sample_video_path)
108
-
109
- for i, chunk in enumerate(video_chunks):
110
- print(f"\n--- Video Chunk {i+1} ---")
111
- print(f"Type: {chunk['metadata']['type']}")
112
- if chunk['metadata']['type'] == 'video_frames':
113
- print(f"Content (paths): {chunk['content']}")
114
- if chunk['content']:
115
- print(f"Sample frame: {chunk['content'][0]}")
116
- elif chunk['metadata']['type'] == 'video_segment_clip':
117
- print(f"Content (path): {chunk['content']}")
118
- print(f"Metadata: {chunk['metadata']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/embeddings/audio_embedding_model.py CHANGED
@@ -1,7 +1,6 @@
1
  # models/embeddings/audio_embedding_model.py
2
  import torch
3
  import librosa
4
- import numpy as np
5
 
6
  from typing import List
7
  from transformers import AutoProcessor, AutoModel
@@ -44,43 +43,4 @@ class AudioEmbeddingModel:
44
 
45
  embeddings_list = embeddings.cpu().tolist()
46
  logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
47
- return embeddings_list
48
-
49
- # Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
50
- if __name__ == "__main__":
51
- from config.settings import settings
52
- import os
53
-
54
- model = AudioEmbeddingModel()
55
- sample_audio_dir = os.path.join(settings.PROCESSED_DATA_DIR, "audio_segments", "sample_audio") # Giả sử có thư mục audio từ file mẫu
56
-
57
- # Tạo một audio dummy nếu không có file audio mẫu
58
- if not os.path.exists(sample_audio_dir) or not os.listdir(sample_audio_dir):
59
- print(f"Creating a dummy audio for testing at {sample_audio_dir}...")
60
- os.makedirs(sample_audio_dir, exist_ok=True)
61
- from pydub import AudioSegment
62
- dummy_audio = AudioSegment.silent(duration=1000) # 1 giây im lặng
63
- dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
64
- dummy_audio.export(dummy_audio_path, format="wav")
65
- sample_audio_paths = [dummy_audio_path]
66
- else:
67
- sample_audio_paths = [os.path.join(sample_audio_dir, f) for f in os.listdir(sample_audio_dir) if f.endswith(('.wav', '.mp3'))]
68
- if not sample_audio_paths:
69
- print(f"No audio files found in {sample_audio_dir}. Please ensure sample audio was processed.")
70
- from pydub import AudioSegment
71
- dummy_audio = AudioSegment.silent(duration=1000)
72
- dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
73
- dummy_audio.export(dummy_audio_path, format="wav")
74
- sample_audio_paths = [dummy_audio_path]
75
-
76
- print(f"Using {len(sample_audio_paths)} sample audio clips: {sample_audio_paths[:2]}...")
77
- embeddings = model.get_embeddings(sample_audio_paths)
78
-
79
- print(f"Number of embeddings: {len(embeddings)}")
80
- if embeddings:
81
- print(f"Dimension of embeddings: {len(embeddings[0])}")
82
- print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
83
- if len(embeddings) > 1:
84
- from sklearn.metrics.pairwise import cosine_similarity
85
- sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
86
- print(f"Similarity between audio 1 and 2: {sim:.4f}")
 
1
  # models/embeddings/audio_embedding_model.py
2
  import torch
3
  import librosa
 
4
 
5
  from typing import List
6
  from transformers import AutoProcessor, AutoModel
 
43
 
44
  embeddings_list = embeddings.cpu().tolist()
45
  logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
46
+ return embeddings_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/embeddings/image_embedding_model.py CHANGED
@@ -1,8 +1,7 @@
1
  import torch
2
-
3
  from typing import List
4
  from PIL import Image
5
- from transformers import CLIPProcessor, CLIPModel
6
  from utils.logger import logger
7
  from config.model_configs import IMAGE_EMBEDDING_MODEL
8
 
@@ -11,70 +10,72 @@ class ImageEmbeddingModel:
11
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
12
  logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")
13
 
14
- self.model = CLIPModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(self.device)
15
- self.processor = CLIPProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
 
 
 
 
16
  logger.info("Image Embedding Model loaded successfully.")
17
 
18
  def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
19
  if not image_paths:
 
20
  return []
21
 
22
  images = []
 
 
23
  for img_path in image_paths:
24
  try:
25
- images.append(Image.open(img_path).convert("RGB"))
 
 
26
  except Exception as e:
27
  logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
28
  continue
29
 
30
  if not images:
 
31
  return []
32
 
33
- inputs = self.processor(images=images, return_tensors="pt").to(self.device)
34
-
35
- with torch.no_grad():
36
- image_features = self.model.get_image_features(pixel_values=inputs.pixel_values)
37
 
38
- embeddings = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
39
-
40
- embeddings_list = embeddings.cpu().tolist()
41
- logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(images)} images.")
42
- return embeddings_list
43
-
44
- # dụ sử dụng (chỉ để kiểm tra nội bộ module)
45
- if __name__ == "__main__":
46
- from config.settings import settings
47
- import os
48
-
49
- model = ImageEmbeddingModel()
50
- sample_image_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks/sample_video") # Giả sử thư mục ảnh từ video mẫu
51
-
52
- # Tạo một ảnh dummy nếu không có ảnh mẫu
53
- if not os.path.exists(sample_image_dir) or not os.listdir(sample_image_dir):
54
- print(f"Creating a dummy image for testing at {sample_image_dir}...")
55
- os.makedirs(sample_image_dir, exist_ok=True)
56
- dummy_image_path = os.path.join(sample_image_dir, "dummy_image.jpg")
57
- dummy_img = Image.new('RGB', (60, 30), color = 'red')
58
- dummy_img.save(dummy_image_path)
59
- sample_image_paths = [dummy_image_path]
60
- else:
61
- sample_image_paths = [os.path.join(sample_image_dir, f) for f in os.listdir(sample_image_dir) if f.endswith(('.jpg', '.png'))]
62
- if not sample_image_paths: # Nếu thư mục có nhưng không có ảnh
63
- print(f"No images found in {sample_image_dir}. Please ensure sample video was processed.")
64
- dummy_image_path = os.path.join(sample_image_dir, "dummy_image.jpg")
65
- dummy_img = Image.new('RGB', (60, 30), color = 'red')
66
- dummy_img.save(dummy_image_path)
67
- sample_image_paths = [dummy_image_path]
68
-
69
- print(f"Using {len(sample_image_paths)} sample images: {sample_image_paths[:2]}...")
70
- embeddings = model.get_embeddings(sample_image_paths)
71
-
72
- print(f"Number of embeddings: {len(embeddings)}")
73
- if embeddings:
74
- print(f"Dimension of embeddings: {len(embeddings[0])}")
75
- print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
76
- # Nếu có đủ ảnh, thử so sánh 2 ảnh đầu
77
- if len(embeddings) > 1:
78
- from sklearn.metrics.pairwise import cosine_similarity
79
- sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
80
- print(f"Similarity between image 1 and 2: {sim:.4f}")
 
1
  import torch
 
2
  from typing import List
3
  from PIL import Image
4
+ from transformers import ViTImageProcessor, ViTModel
5
  from utils.logger import logger
6
  from config.model_configs import IMAGE_EMBEDDING_MODEL
7
 
 
10
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
11
  logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")
12
 
13
+ self.model = ViTModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(self.device)
14
+ self.processor = ViTImageProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
15
+
16
+ # Set model to evaluation mode
17
+ self.model.eval()
18
+
19
  logger.info("Image Embedding Model loaded successfully.")
20
 
21
  def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
22
  if not image_paths:
23
+ logger.warning("No image paths provided")
24
  return []
25
 
26
  images = []
27
+ valid_paths = []
28
+
29
  for img_path in image_paths:
30
  try:
31
+ image = Image.open(img_path).convert("RGB")
32
+ images.append(image)
33
+ valid_paths.append(img_path)
34
  except Exception as e:
35
  logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
36
  continue
37
 
38
  if not images:
39
+ logger.warning("No valid images to process")
40
  return []
41
 
42
+ try:
43
+ # Process images
44
+ inputs = self.processor(images=images, return_tensors="pt").to(self.device)
 
45
 
46
+ with torch.no_grad():
47
+ # Get model outputs
48
+ outputs = self.model(**inputs)
49
+
50
+ # Extract embeddings from the [CLS] token (first token)
51
+ # Shape: (batch_size, sequence_length, hidden_size)
52
+ last_hidden_states = outputs.last_hidden_state
53
+
54
+ # Take the [CLS] token embedding (index 0)
55
+ # Shape: (batch_size, hidden_size)
56
+ cls_embeddings = last_hidden_states[:, 0, :]
57
+
58
+ # Alternatively, you can use pooler_output if available
59
+ # cls_embeddings = outputs.pooler_output
60
+
61
+ # Normalize embeddings (L2 normalization)
62
+ embeddings = cls_embeddings / cls_embeddings.norm(p=2, dim=-1, keepdim=True)
63
+
64
+ # Convert to list
65
+ embeddings_list = embeddings.cpu().tolist()
66
+
67
+ logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(images)} images.")
68
+
69
+ # Ensure we return the right number of embeddings
70
+ if len(embeddings_list) != len(image_paths):
71
+ logger.warning(f"Mismatch: {len(embeddings_list)} embeddings for {len(image_paths)} input paths")
72
+ # Pad with empty lists if needed
73
+ while len(embeddings_list) < len(image_paths):
74
+ embeddings_list.append([])
75
+
76
+ return embeddings_list
77
+
78
+ except Exception as e:
79
+ logger.error(f"Error generating embeddings: {e}")
80
+ # Return empty embeddings for all input paths
81
+ return [[] for _ in image_paths]
 
 
 
 
 
 
 
core/embeddings/text_embedding_model.py CHANGED
@@ -19,24 +19,4 @@ class TextEmbeddingModel:
19
 
20
  embeddings = self.model.encode(texts, convert_to_numpy=True).tolist()
21
  logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
22
- return embeddings
23
-
24
- # Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
25
- if __name__ == "__main__":
26
- model = TextEmbeddingModel()
27
- sample_texts = [
28
- "This is a test sentence.",
29
- "Another sentence for embedding.",
30
- "How about some natural language processing?",
31
- "Xe hơi màu đỏ đang chạy trên đường phố." # Thử với tiếng Việt
32
- ]
33
- embeddings = model.get_embeddings(sample_texts)
34
-
35
- print(f"Number of embeddings: {len(embeddings)}")
36
- if embeddings:
37
- print(f"Dimension of embeddings: {len(embeddings[0])}")
38
- print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
39
- # Bạn có thể thử tính cosine similarity giữa các embedding ở đây để thấy độ tương đồng
40
- from sklearn.metrics.pairwise import cosine_similarity
41
- sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
42
- print(f"Similarity between text 1 and 2: {sim:.4f}")
 
19
 
20
  embeddings = self.model.encode(texts, convert_to_numpy=True).tolist()
21
  logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
22
+ return embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/retrieval/retriever.py CHANGED
@@ -1,9 +1,9 @@
1
  # core/retrieval/retriever.py
2
  import os
3
 
4
- from typing import List, Tuple, Dict, Any, Union
5
  from utils.logger import logger
6
  from config.settings import settings
 
7
  from qdrant_client import QdrantClient
8
 
9
  from core.embeddings.text_embedding_model import TextEmbeddingModel
@@ -93,64 +93,9 @@ class Retriever:
93
  logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
94
  return formatted_results
95
 
96
- # def _get_content_from_payload(self, payload: Dict):
97
- # chunk_type = payload.get("type")
98
- # if chunk_type == "text":
99
- # return None # Sẽ cải thiện sau
100
- # elif chunk_type == 'image' or chunk_type == "audio":
101
- # return payload.get('chunk_data_path') # Trả về đường dẫn
102
- # return None
103
-
104
  def is_database_empty(self) -> bool:
105
  total_vectors = self.text_db_manager.get_total_vectors() \
106
  + self.image_db_manager.get_total_vectors() \
107
  + self.audio_db_manager.get_total_vectors()
108
 
109
- return total_vectors == 0
110
-
111
- if __name__ == "__main__":
112
- from config.settings import settings
113
-
114
- logger.info("--- Running Retriever Standalone Test (Qdrant version) ---")
115
-
116
- # Kiểm tra xem Qdrant đã có dữ liệu chưa
117
- qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
118
- if not os.path.exists(qdrant_db_path):
119
- print("\n\nERROR: Qdrant database not found. Please run 'python scripts/ingest_data.py' first to create the database.\n\n")
120
- else:
121
- retriever = Retriever()
122
-
123
- # --- 1. Thử truy vấn văn bản ---
124
- print("\n--- Testing Text Retrieval ---")
125
- text_query = "What is artificial intelligence?"
126
- text_results = retriever.retrieve(text_query, query_type="text", top_k=3)
127
- print(f"Query: '{text_query}'")
128
- for i, result in enumerate(text_results):
129
- print(f" Result {i+1}:")
130
- print(f" Score: {result['score']:.4f}")
131
- print(f" Type: {result['metadata']['type']}")
132
- print(f" Content Preview: {str(result.get('content'))[:200] if result.get('content') else 'N/A'}...")
133
- print(f" Source: {result['metadata']['source_id']}")
134
-
135
- # --- 2. Thử truy vấn hình ảnh ---
136
- print("\n--- Testing Image Retrieval ---")
137
- # Lấy một ảnh từ các chunk đã xử lý để làm truy vấn
138
- image_to_query = None
139
- image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks")
140
- if os.path.exists(image_chunks_dir):
141
- for root, _, files in os.walk(image_chunks_dir):
142
- if files:
143
- image_to_query = os.path.join(root, files[0])
144
- break
145
-
146
- if image_to_query and os.path.exists(image_to_query):
147
- print(f"Using image as query: {image_to_query}")
148
- image_results = retriever.retrieve(image_to_query, query_type="image", top_k=3)
149
- for i, result in enumerate(image_results):
150
- print(f" Result {i+1}:")
151
- print(f" Score: {result['score']:.4f}")
152
- print(f" Type: {result['metadata']['type']}")
153
- print(f" Content (Paths): {result['content']}")
154
- print(f" Source: {result['metadata']['source_id']}")
155
- else:
156
- print("Could not find a sample image to test image retrieval.")
 
1
  # core/retrieval/retriever.py
2
  import os
3
 
 
4
  from utils.logger import logger
5
  from config.settings import settings
6
+ from typing import List, Dict, Any, Union
7
  from qdrant_client import QdrantClient
8
 
9
  from core.embeddings.text_embedding_model import TextEmbeddingModel
 
93
  logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
94
  return formatted_results
95
 
 
 
 
 
 
 
 
 
96
  def is_database_empty(self) -> bool:
97
  total_vectors = self.text_db_manager.get_total_vectors() \
98
  + self.image_db_manager.get_total_vectors() \
99
  + self.audio_db_manager.get_total_vectors()
100
 
101
+ return total_vectors == 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/retrieval/vector_db_manager.py CHANGED
@@ -1,14 +1,13 @@
1
  import os
2
 
3
- from typing import List, Tuple, Dict, Any
 
4
  from uuid import uuid4
5
 
6
- from qdrant_client import QdrantClient, models
 
7
  from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
8
 
9
- from utils.logger import logger
10
- from config.settings import settings
11
-
12
  class VectorDBManager:
13
  def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
14
  logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")
@@ -110,60 +109,9 @@ class VectorDBManager:
110
  try:
111
  count_result = self.client.count(
112
  collection_name=self.collection_name,
113
- exact=True # Đếm chính xác
114
  )
115
  return count_result.count
116
  except Exception as e:
117
  logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
118
- return 0
119
-
120
- # Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
121
- if __name__ == "__main__":
122
- import numpy as np
123
-
124
- # Các thông số cho collection test
125
- TEST_COLLECTION_NAME = "my_test_collection"
126
- DUMMY_DIM = 128
127
-
128
- # --- Kiểm tra tạo collection ---
129
- print("\n--- Testing Collection Creation ---")
130
- db_manager = VectorDBManager(collection_name=TEST_COLLECTION_NAME, embedding_dim=DUMMY_DIM)
131
- print(f"Total vectors initially: {db_manager.get_total_vectors()}")
132
-
133
- # --- Kiểm tra thêm vector và payload ---
134
- print("\n--- Testing Add Vectors ---")
135
- dummy_embeddings = np.random.rand(10, DUMMY_DIM).tolist()
136
- dummy_metadatas = [
137
- {"chunk_id": f"dummy_chunk_{i}", "type": "text" if i < 5 else "image", "source_file": "test.txt"}
138
- for i in range(10)
139
- ]
140
- db_manager.add_vectors(dummy_embeddings, dummy_metadatas)
141
- print(f"Total vectors after adding: {db_manager.get_total_vectors()}")
142
-
143
- # --- Kiểm tra tìm kiếm ---
144
- print("\n--- Testing Search ---")
145
- dummy_query = np.random.rand(DUMMY_DIM).tolist()
146
- results = db_manager.search_vectors(dummy_query, top_k=3)
147
- print(f"Top 3 results (no filter):")
148
- for score, payload in results:
149
- print(f" Score: {score:.4f}, Payload: {payload}")
150
-
151
- # --- Kiểm tra tìm kiếm CÓ LỌC (Pre-filtering) ---
152
- print("\n--- Testing Search with Filter ---")
153
- filter_condition = models.Filter(
154
- must=[
155
- models.FieldCondition(
156
- key="type", # Lọc theo trường 'type' trong payload
157
- match=models.MatchValue(value="image"), # Giá trị phải là 'image'
158
- )
159
- ]
160
- )
161
- filtered_results = db_manager.search_vectors(dummy_query, top_k=3, filter_payload=filter_condition)
162
- print(f"Top 3 results (filtered for type='image'):")
163
- for score, payload in filtered_results:
164
- print(f" Score: {score:.4f}, Payload: {payload}")
165
-
166
- # --- Dọn dẹp collection test ---
167
- print("\n--- Cleaning up test collection ---")
168
- db_manager.client.delete_collection(collection_name=TEST_COLLECTION_NAME)
169
- print(f"Collection '{TEST_COLLECTION_NAME}' deleted.")
 
1
  import os
2
 
3
+ from utils.logger import logger
4
+ from config.settings import settings
5
  from uuid import uuid4
6
 
7
+ from typing import List, Tuple, Dict, Any
8
+ from qdrant_client import QdrantClient
9
  from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
10
 
 
 
 
11
  class VectorDBManager:
12
  def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
13
  logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")
 
109
  try:
110
  count_result = self.client.count(
111
  collection_name=self.collection_name,
112
+ exact=True
113
  )
114
  return count_result.count
115
  except Exception as e:
116
  logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
117
+ return 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{scripts → ingestions}/ingestion.py RENAMED
@@ -1,123 +1,108 @@
1
  # core/ingestion/ingestion_service.py
2
  import os
3
- import gradio as gr
4
  from typing import List, Optional, Callable
5
- from tqdm import tqdm
6
 
7
  from utils.logger import logger
8
- from config.settings import settings
9
  from qdrant_client import QdrantClient
10
 
11
- # Import các Processor (không thay đổi)
12
  from core.data_processing.text_processor import TextProcessor
13
  from core.data_processing.audio_processor import AudioProcessor
14
- # from core.data_processing.video_processor import VideoProcessor
15
  from core.data_processing.image_processor import ImageProcessor
16
 
17
- # Import các Embedding Model (không thay đổi)
18
  from core.embeddings.text_embedding_model import TextEmbeddingModel
19
  from core.embeddings.image_embedding_model import ImageEmbeddingModel
20
  from core.embeddings.audio_embedding_model import AudioEmbeddingModel
21
 
22
- # Import VectorDBManager phiên bản Qdrant MỚI
23
  from core.retrieval.vector_db_manager import VectorDBManager
24
 
25
  class IngestionService:
26
  def __init__(self, client: QdrantClient):
27
- """
28
- Khởi tạo IngestionService với một QdrantClient được chia sẻ.
29
- Phiên bản này không theo dõi trạng thái file.
30
- """
31
  logger.info("Initializing IngestionService (Stateless)...")
32
 
33
  self.client = client
 
34
  self.text_processor = TextProcessor()
35
  self.image_processor = ImageProcessor()
36
  self.audio_processor = AudioProcessor()
37
- # self.video_processor = VideoProcessor()
38
  self.text_embedder = TextEmbeddingModel()
39
  self.image_embedder = ImageEmbeddingModel()
40
  self.audio_embedder = AudioEmbeddingModel()
41
 
42
- text_dim = self.text_embedder.model.get_sentence_embedding_dimension()
43
  self.text_db_manager = VectorDBManager(
44
  client=self.client,
45
  collection_name="text_collection",
46
- embedding_dim=text_dim
47
  )
48
 
49
- image_embedding_dim = 512
50
  self.image_vector_db_manager = VectorDBManager(
51
  client=self.client,
52
  collection_name="image_collection",
53
  embedding_dim=image_embedding_dim
54
  )
55
 
56
- audio_embedding_dim = 512
57
  self.audio_vector_db_manager = VectorDBManager(
58
  client=self.client,
59
  collection_name="audio_collection",
60
  embedding_dim=audio_embedding_dim
61
  )
62
 
63
- # video_frame_embedding_dim = 512
64
- # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)
65
-
66
-
67
  logger.info("IngestionService initialized successfully.")
68
 
69
  def ingest_files(self, file_paths: List[str]):
70
- """
71
- Xử lý một danh sách các file, tạo embedding, và thêm vào Qdrant.
72
- Hàm này giả định các file đã được đặt vào đúng thư mục trong 'raw'.
73
- """
74
  return self.ingest_files_with_progress(file_paths, None)
75
 
76
  def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
77
  """
78
- Xử một danh sách các file với progress tracking.
79
  """
80
  logger.info(f"Starting ingestion for {len(file_paths)} files...")
81
 
82
- if progress_callback:
83
- progress_callback(0.4, desc="Starting file processing...")
 
 
 
 
 
 
 
84
 
85
  all_chunks_to_process = []
86
 
87
- # 1. Quét qua các file tạo chunk
88
  for i, file_path in enumerate(file_paths):
89
- base_progress = 0.4 + (i / len(file_paths)) * 0.3 # 40% -> 70%
90
- file_name = os.path.basename(file_path)
91
-
92
- if progress_callback:
93
- progress_callback(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")
94
-
95
- # Xác định loại dữ liệu dựa trên phần mở rộng file
96
- file_ext = os.path.splitext(file_path)[1].lower()
97
- data_type = None
98
  try:
99
- if progress_callback:
100
- progress_callback(base_progress + 0.01, desc=f"Reading {file_name}...")
 
 
 
 
 
 
 
101
 
102
  if file_ext in ['.txt']:
103
- data_type = 'text'
104
  chunks = self.text_processor.process(file_path)
105
  elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
106
- data_type = 'image'
107
  chunks = self.image_processor.process(file_path)
108
  elif file_ext in ['.wav', '.mp3']:
109
- data_type = 'audio'
110
  chunks = self.audio_processor.process(file_path)
111
- # elif file_ext in ['.mp4', '.avi', '.mov']:
112
- # data_type = 'video'
113
- # chunks = self.video_processor.process_video(file_path)
114
  else:
115
  logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
116
  continue
117
 
118
- if progress_callback:
119
- progress_callback(base_progress + 0.02, desc=f"Generated {len(chunks)} chunks from {file_name}")
 
 
120
 
 
121
  all_chunks_to_process.extend(chunks)
122
 
123
  except Exception as e:
@@ -126,77 +111,89 @@ class IngestionService:
126
 
127
  if not all_chunks_to_process:
128
  logger.warning("No processable chunks were generated from the provided files.")
 
129
  return
130
 
131
  logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")
132
 
133
- if progress_callback:
134
- progress_callback(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")
135
 
136
- # 2. Tạo embedding thêm vào Qdrant (theo batch)
137
  text_embeddings_batch, text_metadatas_batch = [], []
138
  audio_embeddings_batch, audio_metadatas_batch = [], []
139
  image_embeddings_batch, image_metadatas_batch = [], []
140
  BATCH_SIZE = 32
141
 
142
  for i, chunk_data in enumerate(all_chunks_to_process):
143
- # Tính toán progress chi tiết hơn
144
- base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25 # 70% -> 95%
145
-
146
- chunk_type = chunk_data['metadata']['type']
147
- content = chunk_data['content']
148
- chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')
149
-
150
  try:
151
- if progress_callback:
152
- progress_callback(base_progress, desc=f"Processing chunk {i+1}/{len(all_chunks_to_process)} ({chunk_type})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
- embedding = None
155
  if chunk_type == "text":
156
- if progress_callback:
157
- progress_callback(base_progress + 0.001, desc=f"Creating text embedding for chunk {i+1}")
158
- embedding = self.text_embedder.get_embeddings([content])[0]
159
- text_embeddings_batch.append(embedding)
160
- text_metadatas_batch.append(chunk_data)
 
 
 
161
  elif chunk_type == "audio":
162
- if progress_callback:
163
- progress_callback(base_progress + 0.001, desc=f"Creating audio embedding for chunk {i+1}")
164
- embedding = self.audio_embedder.get_embeddings([content])[0]
165
- audio_embeddings_batch.append(embedding)
166
- audio_metadatas_batch.append(chunk_data)
 
 
 
167
  elif chunk_type == "image":
168
- if progress_callback:
169
- progress_callback(base_progress + 0.001, desc=f"Creating image embedding for chunk {i+1}")
170
- embedding = self.image_embedder.get_embeddings([content])[0]
171
- image_embeddings_batch.append(embedding)
172
- image_metadatas_batch.append(chunk_data)
 
 
173
 
174
- # Thêm batch khi đủ kích thước với progress update
175
  if len(text_embeddings_batch) >= BATCH_SIZE:
176
- if progress_callback:
177
- progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
178
  self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
179
  text_embeddings_batch, text_metadatas_batch = [], []
180
 
181
  if len(audio_embeddings_batch) >= BATCH_SIZE:
182
- if progress_callback:
183
- progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
184
  self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
185
  audio_embeddings_batch, audio_metadatas_batch = [], []
186
 
187
  if len(image_embeddings_batch) >= BATCH_SIZE:
188
- if progress_callback:
189
- progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
190
  self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
191
  image_embeddings_batch, image_metadatas_batch = [], []
192
 
193
  except Exception as e:
194
- logger.error(f"Error ingesting chunk {chunk_id}: {e}")
 
195
 
196
- if progress_callback:
197
- progress_callback(0.95, desc="Saving final batches...")
198
 
199
- # Thêm các embedding còn lại trong batch cuối cùng
200
  final_operations = []
201
  if text_embeddings_batch:
202
  final_operations.append(("text", len(text_embeddings_batch)))
@@ -205,23 +202,27 @@ class IngestionService:
205
  if image_embeddings_batch:
206
  final_operations.append(("image", len(image_embeddings_batch)))
207
 
208
- for i, (batch_type, count) in enumerate(final_operations):
209
- current_progress = 0.95 + (i / len(final_operations)) * 0.04 # 95% -> 99%
210
-
211
- if batch_type == "text" and text_embeddings_batch:
212
- if progress_callback:
213
- progress_callback(current_progress, desc=f"Saving final {count} text embeddings...")
214
- self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
215
- elif batch_type == "audio" and audio_embeddings_batch:
216
- if progress_callback:
217
- progress_callback(current_progress, desc=f"Saving final {count} audio embeddings...")
218
- self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
219
- elif batch_type == "image" and image_embeddings_batch:
220
- if progress_callback:
221
- progress_callback(current_progress, desc=f"Saving final {count} image embeddings...")
222
- self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
223
-
224
- if progress_callback:
225
- progress_callback(1.0, desc=f"✅ Successfully ingested {len(file_paths)} files with {len(all_chunks_to_process)} chunks!")
 
 
 
 
226
 
227
  logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")
 
1
  # core/ingestion/ingestion_service.py
2
  import os
 
3
  from typing import List, Optional, Callable
 
4
 
5
  from utils.logger import logger
 
6
  from qdrant_client import QdrantClient
7
 
 
8
  from core.data_processing.text_processor import TextProcessor
9
  from core.data_processing.audio_processor import AudioProcessor
 
10
  from core.data_processing.image_processor import ImageProcessor
11
 
 
12
  from core.embeddings.text_embedding_model import TextEmbeddingModel
13
  from core.embeddings.image_embedding_model import ImageEmbeddingModel
14
  from core.embeddings.audio_embedding_model import AudioEmbeddingModel
15
 
 
16
  from core.retrieval.vector_db_manager import VectorDBManager
17
 
18
  class IngestionService:
19
    def __init__(self, client: QdrantClient):
        """Wire up per-modality processors, embedders, and Qdrant collections.

        Args:
            client: Shared QdrantClient instance; all three VectorDBManagers
                operate on this single connection.
        """
        logger.info("Initializing IngestionService (Stateless)...")

        self.client = client

        # Chunking stage: one processor per supported input modality.
        self.text_processor = TextProcessor()
        self.image_processor = ImageProcessor()
        self.audio_processor = AudioProcessor()

        # Embedding stage: one embedder per modality.
        self.text_embedder = TextEmbeddingModel()
        self.image_embedder = ImageEmbeddingModel()
        self.audio_embedder = AudioEmbeddingModel()

        # Embedding dimensions are read from the loaded models themselves so
        # the Qdrant collections always match the model actually in use.
        text_embedding_dim = self.text_embedder.model.get_sentence_embedding_dimension()
        self.text_db_manager = VectorDBManager(
            client=self.client,
            collection_name="text_collection",
            embedding_dim=text_embedding_dim
        )

        # NOTE(review): assumes the image embedder exposes a HF-style config
        # with hidden_size matching its output embedding width — confirm.
        image_embedding_dim = self.image_embedder.model.config.hidden_size
        self.image_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="image_collection",
            embedding_dim=image_embedding_dim
        )

        # NOTE(review): assumes a CLAP/CLIP-like audio model whose
        # config.projection_dim is the embedding width — confirm.
        audio_embedding_dim = self.audio_embedder.model.config.projection_dim
        self.audio_vector_db_manager = VectorDBManager(
            client=self.client,
            collection_name="audio_collection",
            embedding_dim=audio_embedding_dim
        )

        logger.info("IngestionService initialized successfully.")
54
 
55
  def ingest_files(self, file_paths: List[str]):
56
+ '''Ingest files without displaying progress bar'''
 
 
 
57
  return self.ingest_files_with_progress(file_paths, None)
58
 
59
  def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
60
  """
61
+ Turn on progress bar for tracking
62
  """
63
  logger.info(f"Starting ingestion for {len(file_paths)} files...")
64
 
65
+ # Kiểm tra và xử lý progress_callback an toàn
66
+ def safe_progress(value, desc=""):
67
+ try:
68
+ if progress_callback is not None:
69
+ progress_callback(value, desc=desc)
70
+ except Exception as e:
71
+ logger.warning(f"Progress callback error: {e}")
72
+
73
+ safe_progress(0.4, desc="Starting file processing...")
74
 
75
  all_chunks_to_process = []
76
 
77
+ # 1. Walk through files to split chunks
78
  for i, file_path in enumerate(file_paths):
 
 
 
 
 
 
 
 
 
79
  try:
80
+ base_progress = 0.4 + (i / len(file_paths)) * 0.3 # 40% -> 70%
81
+ file_name = os.path.basename(file_path)
82
+
83
+ safe_progress(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")
84
+
85
+ file_ext = os.path.splitext(file_path)[1].lower()
86
+ chunks = []
87
+
88
+ safe_progress(base_progress + 0.01, desc=f"Reading {file_name}...")
89
 
90
  if file_ext in ['.txt']:
 
91
  chunks = self.text_processor.process(file_path)
92
  elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
 
93
  chunks = self.image_processor.process(file_path)
94
  elif file_ext in ['.wav', '.mp3']:
 
95
  chunks = self.audio_processor.process(file_path)
 
 
 
96
  else:
97
  logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
98
  continue
99
 
100
+ # Kiểm tra chunks có hợp lệ không
101
+ if not chunks or len(chunks) == 0:
102
+ logger.warning(f"No chunks generated from file: {file_path}")
103
+ continue
104
 
105
+ safe_progress(base_progress + 0.02, desc=f"Generated {len(chunks)} chunks from {file_name}")
106
  all_chunks_to_process.extend(chunks)
107
 
108
  except Exception as e:
 
111
 
112
  if not all_chunks_to_process:
113
  logger.warning("No processable chunks were generated from the provided files.")
114
+ safe_progress(1.0, desc="No chunks to process")
115
  return
116
 
117
  logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")
118
 
119
+ safe_progress(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")
 
120
 
121
+ # 2. Create embeddings and add to batch
122
  text_embeddings_batch, text_metadatas_batch = [], []
123
  audio_embeddings_batch, audio_metadatas_batch = [], []
124
  image_embeddings_batch, image_metadatas_batch = [], []
125
  BATCH_SIZE = 32
126
 
127
  for i, chunk_data in enumerate(all_chunks_to_process):
 
 
 
 
 
 
 
128
  try:
129
+ base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25 # 70% -> 95%
130
+
131
+ # Kiểm tra chunk_data có hợp lệ không
132
+ if not chunk_data or 'metadata' not in chunk_data or 'content' not in chunk_data:
133
+ logger.warning(f"Invalid chunk data at index {i}, skipping...")
134
+ continue
135
+
136
+ chunk_type = chunk_data['metadata'].get('type', 'unknown')
137
+ content = chunk_data['content']
138
+ chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')
139
+
140
+ # Kiểm tra content có hợp lệ không
141
+ if not content:
142
+ logger.warning(f"Empty content for chunk {chunk_id}, skipping...")
143
+ continue
144
+
145
+ safe_progress(base_progress, desc=f"Processing chunk {i+1}/{len(all_chunks_to_process)} ({chunk_type})")
146
 
 
147
  if chunk_type == "text":
148
+ safe_progress(base_progress + 0.001, desc=f"Creating text embedding for chunk {i+1}")
149
+ embeddings = self.text_embedder.get_embeddings([content])
150
+ if embeddings and len(embeddings) > 0:
151
+ text_embeddings_batch.append(embeddings[0])
152
+ text_metadatas_batch.append(chunk_data)
153
+ else:
154
+ logger.warning(f"Failed to generate text embedding for chunk {chunk_id}")
155
+
156
  elif chunk_type == "audio":
157
+ safe_progress(base_progress + 0.001, desc=f"Creating audio embedding for chunk {i+1}")
158
+ embeddings = self.audio_embedder.get_embeddings([content])
159
+ if embeddings and len(embeddings) > 0:
160
+ audio_embeddings_batch.append(embeddings[0])
161
+ audio_metadatas_batch.append(chunk_data)
162
+ else:
163
+ logger.warning(f"Failed to generate audio embedding for chunk {chunk_id}")
164
+
165
  elif chunk_type == "image":
166
+ safe_progress(base_progress + 0.001, desc=f"Creating image embedding for chunk {i+1}")
167
+ embeddings = self.image_embedder.get_embeddings([content])
168
+ if embeddings and len(embeddings) > 0:
169
+ image_embeddings_batch.append(embeddings[0])
170
+ image_metadatas_batch.append(chunk_data)
171
+ else:
172
+ logger.warning(f"Failed to generate image embedding for chunk {chunk_id}")
173
 
174
+ # add batch when reaching BATCH_SIZE
175
  if len(text_embeddings_batch) >= BATCH_SIZE:
176
+ safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
 
177
  self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
178
  text_embeddings_batch, text_metadatas_batch = [], []
179
 
180
  if len(audio_embeddings_batch) >= BATCH_SIZE:
181
+ safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
 
182
  self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
183
  audio_embeddings_batch, audio_metadatas_batch = [], []
184
 
185
  if len(image_embeddings_batch) >= BATCH_SIZE:
186
+ safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
 
187
  self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
188
  image_embeddings_batch, image_metadatas_batch = [], []
189
 
190
  except Exception as e:
191
+ logger.error(f"Error ingesting chunk {i}: {e}")
192
+ continue
193
 
194
+ safe_progress(0.95, desc="Saving final batches...")
 
195
 
196
+ # adding maintaining embeddings
197
  final_operations = []
198
  if text_embeddings_batch:
199
  final_operations.append(("text", len(text_embeddings_batch)))
 
202
  if image_embeddings_batch:
203
  final_operations.append(("image", len(image_embeddings_batch)))
204
 
205
+ # Tránh chia cho 0
206
+ total_operations = len(final_operations)
207
+ if total_operations == 0:
208
+ safe_progress(1.0, desc="No final batches to save")
209
+ else:
210
+ for i, (batch_type, count) in enumerate(final_operations):
211
+ try:
212
+ current_progress = 0.95 + (i / total_operations) * 0.04 # 95% -> 99%
213
+
214
+ if batch_type == "text" and text_embeddings_batch:
215
+ safe_progress(current_progress, desc=f"Saving final {count} text embeddings...")
216
+ self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
217
+ elif batch_type == "audio" and audio_embeddings_batch:
218
+ safe_progress(current_progress, desc=f"Saving final {count} audio embeddings...")
219
+ self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
220
+ elif batch_type == "image" and image_embeddings_batch:
221
+ safe_progress(current_progress, desc=f"Saving final {count} image embeddings...")
222
+ self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
223
+ except Exception as e:
224
+ logger.error(f"Error saving final batch {batch_type}: {e}")
225
+
226
+ safe_progress(1.0, desc=f"✅ Successfully ingested {len(file_paths)} files with {len(all_chunks_to_process)} chunks!")
227
 
228
  logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")
main.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ import os
3
+ import sys
4
+ import shutil
5
+ import atexit
6
+ import signal
7
+
8
+ # add project folder to sys.path
9
+ project_root = os.path.dirname(os.path.abspath(__file__))
10
+ if project_root not in sys.path:
11
+ sys.path.insert(0, project_root)
12
+
13
+ from config.settings import settings
14
+ from utils.logger import logger
15
+ from app import create_and_run_app
16
+ from app import shared_qdrant_client
17
+
18
+ GLOBAL_QDRANT_CLIENT = shared_qdrant_client
19
+
20
def cleanup():
    """Release the shared Qdrant client and wipe all generated data.

    Registered with atexit in main(); also runs after Ctrl+C because the
    signal handler calls sys.exit(), which fires atexit hooks.

    Order matters: the client must be closed first so Qdrant's .lock file
    is released before its storage directory is removed.
    """
    logger.info("--- Starting cleanup process ---")

    # Step 1: close the Qdrant connection to release the on-disk .lock file.
    global GLOBAL_QDRANT_CLIENT
    if GLOBAL_QDRANT_CLIENT:
        try:
            logger.info("Closing Qdrant client connection...")
            GLOBAL_QDRANT_CLIENT.close()
            logger.success("Qdrant client closed successfully.")
        except Exception as e:
            logger.error(f"Error closing Qdrant client: {e}")

    # Step 2: remove the Qdrant storage directory (retried, since the lock
    # release above may take a moment on some platforms).
    _remove_qdrant_data(os.path.join(settings.DATA_DIR, "qdrant_data"))

    # Steps 3 & 4: empty the uploaded-raw and processed-chunks directories.
    # Identical logic, factored into _clear_directory to remove the
    # copy-pasted duplication of the original implementation.
    _clear_directory(settings.RAW_DATA_DIR, "raw data")
    _clear_directory(settings.CHUNKS_DIR, "processed/chunks data")

    logger.info("--- Cleanup process finished ---")


def _remove_qdrant_data(qdrant_db_path, retries=3, delay_sec=1.0):
    """Delete the Qdrant data directory, retrying because the file lock may
    not have been released yet when this runs."""
    import time  # local import: only needed for the retry back-off

    if not (os.path.exists(qdrant_db_path) and os.path.isdir(qdrant_db_path)):
        logger.info("Qdrant data directory not found, skipping cleanup.")
        return
    try:
        for attempt in range(retries):
            try:
                shutil.rmtree(qdrant_db_path)
                logger.success(f"Successfully cleaned up Qdrant data at: {qdrant_db_path}")
                break
            except OSError as e:
                if attempt < retries - 1:
                    logger.warning(f"Cleanup attempt {attempt+1} failed: {e}. Retrying in 1 second...")
                    time.sleep(delay_sec)
                else:
                    raise
    except Exception as e:
        logger.error(f"Error cleaning up Qdrant data after retries: {e}")


def _clear_directory(dir_path, label):
    """Delete every entry inside dir_path; the directory itself is kept.

    Args:
        dir_path: Directory whose contents are removed.
        label: Human-readable name used only in log messages (e.g. "raw data").
    """
    if not (os.path.exists(dir_path) and os.path.isdir(dir_path)):
        logger.info(f"{label} directory not found, skipping cleanup.")
        return
    try:
        for item in os.listdir(dir_path):
            item_path = os.path.join(dir_path, item)
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            elif os.path.isfile(item_path):
                os.remove(item_path)
        logger.success(f"Successfully cleaned up {label} directory: {dir_path}")
    except Exception as e:
        logger.error(f"Error cleaning up {label}: {e}")
86
+
87
def signal_handler(sig, frame):
    """Handle SIGINT (Ctrl+C) so the application shuts down cleanly."""
    logger.warning("\nCtrl+C detected. Shutting down the application.")
    # sys.exit() fires the atexit hooks, which run cleanup() automatically.
    sys.exit(0)
94
+
95
def main():
    """Entry point: install shutdown hooks, build the Gradio app, launch it."""
    # cleanup() must run on every exit path (normal return, error, SIGINT).
    atexit.register(cleanup)

    # Translate Ctrl+C into a clean sys.exit(0), which triggers atexit hooks.
    signal.signal(signal.SIGINT, signal_handler)

    logger.info("--- Starting Multimedia RAG Assistant ---")

    demo = create_and_run_app()

    separator = "=" * 50
    print("\n" + separator)
    print("    Application is running. Press Ctrl+C to exit.    ")
    print("    Cleanup will be performed upon exit.    ")
    print(separator + "\n")

    demo.launch()


if __name__ == "__main__":
    main()
scripts/ingest_data.py DELETED
@@ -1,203 +0,0 @@
1
- # scripts/ingest_data.py
2
- import os
3
- import json
4
- from tqdm import tqdm
5
- from pathlib import Path
6
- from typing import Dict
7
- import shutil
8
-
9
- from config.settings import settings
10
- from utils.logger import logger
11
- from qdrant_client import QdrantClient
12
-
13
- # Import các Processor (không thay đổi)
14
- from core.data_processing.text_processor import TextProcessor
15
- from core.data_processing.audio_processor import AudioProcessor
16
- # from core.data_processing.video_processor import VideoProcessor
17
- from core.data_processing.image_processor import ImageProcessor
18
-
19
- # Import các Embedding Model (không thay đổi)
20
- from core.embeddings.text_embedding_model import TextEmbeddingModel
21
- from core.embeddings.image_embedding_model import ImageEmbeddingModel
22
- from core.embeddings.audio_embedding_model import AudioEmbeddingModel
23
-
24
- # Import VectorDBManager phiên bản Qdrant MỚI
25
- from core.retrieval.vector_db_manager import VectorDBManager
26
-
27
- def walk_through_files(extentions: Dict, raw_dir: str, all_raw_chunks_from_processors, processor):
28
- all_files = list(raw_dir.rglob("*"))
29
- for filepath in tqdm(all_files, desc="Processing " + raw_dir.name):
30
- if filepath.suffix in extentions and filepath.is_file():
31
- all_raw_chunks_from_processors.extend(
32
- processor.process(str(filepath))
33
- )
34
-
35
- def ingest_data_pipeline():
36
- logger.info("Starting comprehensive data ingestion pipeline (Chunking + Embedding + Qdrant Indexing)...")
37
-
38
- # --- 1. Khởi tạo các Processor --- (Không thay đổi)
39
- text_processor = TextProcessor(chunk_size=500, chunk_overlap=50)
40
- audio_processor = AudioProcessor(min_silence_len=1000, silence_thresh_db=-40, target_sr=16000)
41
- image_processor = ImageProcessor()
42
- # video_processor = VideoProcessor(chunk_duration_sec=15, frames_per_segment=5)
43
-
44
- # --- Dọn dẹp các thư mục chunk và Qdrant data cũ ---
45
- dirs_to_clean_and_create = [
46
- settings.CHUNKS_DIR,
47
- settings.METADATA_DIR
48
- ]
49
- # Thư mục dữ liệu của Qdrant
50
- qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
51
- dirs_to_clean_and_create.append(qdrant_db_path)
52
-
53
- for dir_path in dirs_to_clean_and_create:
54
- if os.path.exists(dir_path):
55
- shutil.rmtree(dir_path)
56
- logger.info(f"Cleaned up old directory: {dir_path}")
57
- # Tạo lại các thư mục cho chunking, trừ thư mục qdrant (client sẽ tự tạo)
58
- if dir_path != qdrant_db_path:
59
- os.makedirs(dir_path, exist_ok=True)
60
-
61
- logger.info("Output directories and previous Qdrant data are ready for fresh ingestion.")
62
-
63
- qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
64
- client = QdrantClient(path=qdrant_db_path)
65
- logger.info(f"Single Qdrant client initialized for ingestion, connected to: {qdrant_db_path}")
66
-
67
- all_raw_chunks_from_processors = [] # Chứa tất cả các chunk (bao gồm content và metadata)
68
-
69
- # --- 2. Chạy Data Processing (Chunking) --- (Không thay đổi)
70
- logger.info("--- Phase 1: Processing Raw Data into Chunks ---")
71
-
72
- # Xử lý Văn bản
73
- text_extentions = {".txt"}
74
- text_raw_dir = Path(settings.RAW_DATA_DIR) / "texts"
75
- walk_through_files(text_extentions, text_raw_dir, all_raw_chunks_from_processors, text_processor)
76
-
77
- # Xử lý Âm thanh
78
- audio_extentions = {".wav", ".mp3"}
79
- audio_raw_dir = Path(settings.RAW_DATA_DIR) / "audios"
80
- walk_through_files(audio_extentions, audio_raw_dir, all_raw_chunks_from_processors, audio_processor)
81
-
82
- # process images
83
- image_extentions = {".jpg", ".png"}
84
- image_raw_dir = Path(settings.RAW_DATA_DIR) / "images"
85
- walk_through_files(image_extentions, image_raw_dir, all_raw_chunks_from_processors, image_processor)
86
-
87
- # Xử lý Video
88
- # video_raw_dir = os.path.join(settings.RAW_DATA_DIR, "videos")
89
- # for filename in tqdm(os.listdir(video_raw_dir), desc="Processing Video"):
90
- # if filename.endswith((".mp4", ".avi", ".mov")):
91
- # all_raw_chunks_from_processors.extend(video_processor.process_video(os.path.join(video_raw_dir, filename)))
92
-
93
- logger.info(f"Total raw chunks processed from all sources: {len(all_raw_chunks_from_processors)}")
94
-
95
- # --- 3. Tạo Embedding và Thêm vào Qdrant ---
96
- logger.info("--- Phase 2: Generating Embeddings and Building Qdrant Collections ---")
97
-
98
- # Khởi tạo các Embedding Model
99
- text_embedder = TextEmbeddingModel()
100
- image_embedder = ImageEmbeddingModel()
101
- audio_embedder = AudioEmbeddingModel()
102
-
103
- # --- Khởi tạo các VectorDBManager cho Qdrant ---
104
- # Lấy kích thước embedding từ model để đảm bảo chính xác
105
- text_embedding_dim = text_embedder.model.get_sentence_embedding_dimension()
106
- text_vector_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_embedding_dim, client=client)
107
-
108
- # Kích thước embedding cho image/audio (giả định là 512)
109
- image_embedding_dim = 512
110
- image_vector_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_embedding_dim, client=client)
111
-
112
- # video_frame_embedding_dim = 512
113
- # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)
114
-
115
- audio_embedding_dim = 512
116
- audio_vector_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=image_embedding_dim, client=client)
117
-
118
- logger.info(f"Initialized Text Qdrant Collection Manager with {text_embedding_dim}D.")
119
- logger.info(f"Initialized Image Qdrant Collection Manager with {image_embedding_dim}D.")
120
- logger.info(f"Initialized Audio Qdrant Collection Manager with {audio_embedding_dim}D.")
121
-
122
- # Tạo các batch để thêm vào Qdrant hiệu quả hơn
123
- text_embeddings_batch = []
124
- text_metadatas_batch = []
125
-
126
- image_embeddings_batch = []
127
- image_metadatas_batch = []
128
-
129
- # video_frame_embeddings_batch = []
130
- # video_frame_metadatas_batch = []
131
-
132
- audio_embeddings_batch = []
133
- audio_metadatas_batch = []
134
-
135
- BATCH_SIZE = 32 # Thêm 32 điểm một lần
136
-
137
- for chunk_data in tqdm(all_raw_chunks_from_processors, desc="Generating Embeddings & Populating Qdrant"):
138
- chunk_type = chunk_data['metadata']['type']
139
- content = chunk_data['content']
140
-
141
- try:
142
- if chunk_type == "text":
143
- embedding = text_embedder.get_embeddings([content])[0]
144
- text_embeddings_batch.append(embedding)
145
- text_metadatas_batch.append(chunk_data)
146
-
147
- elif chunk_type == "audio":
148
- embedding = audio_embedder.get_embeddings([content])[0]
149
- audio_embeddings_batch.append(embedding)
150
- audio_metadatas_batch.append(chunk_data)
151
-
152
- elif chunk_type == "image":
153
- embedding = image_embedder.get_embeddings([content])[0]
154
- image_embeddings_batch.append(embedding)
155
- image_metadatas_batch.append(chunk_data)
156
-
157
- # elif chunk_type == "video_frame":
158
- # if content and isinstance(content, list) and len(content) > 0:
159
- # embedding = image_embedder.get_embeddings([content[0]])[0] # Chỉ nhúng ảnh đầu tiên
160
- # video_frame_embeddings_batch.append(embedding)
161
- # video_frame_metadatas_batch.append(chunk_data['metadata'])
162
-
163
- # Xử lý batch
164
- if len(text_embeddings_batch) >= BATCH_SIZE:
165
- text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
166
- text_embeddings_batch, text_metadatas_batch = [], [] # Reset batch
167
-
168
- if len(audio_embeddings_batch) >= BATCH_SIZE:
169
- audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
170
- audio_embeddings_batch, audio_metadatas_batch = [], [] # Reset batch
171
-
172
- if len(image_embeddings_batch) >= BATCH_SIZE:
173
- image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
174
- image_embeddings_batch, image_metadatas_batch = [], [] # Reset batch
175
-
176
- # if len(video_frame_embeddings_batch) >= BATCH_SIZE:
177
- # video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
178
- # video_frame_embeddings_batch, video_frame_metadatas_batch = [], [] # Reset batch
179
-
180
- except Exception as e:
181
- logger.error(f"Error processing chunk {chunk_data['metadata']['chunk_id']}: {e}")
182
-
183
- # Thêm các embedding còn lại trong batch cuối cùng
184
- if text_embeddings_batch:
185
- text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
186
- if audio_embeddings_batch:
187
- audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
188
- if image_embeddings_batch:
189
- image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
190
- # if video_frame_embeddings_batch:
191
- # video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
192
-
193
- logger.success("Finished populating Qdrant collections.")
194
- logger.info(f"Total vectors in 'text_collection': {text_vector_db_manager.get_total_vectors()}")
195
- logger.info(f"Total vectors in 'audio_collection': {audio_vector_db_manager.get_total_vectors()}")
196
- logger.info(f"Total vectors in 'image_collection': {image_vector_db_manager.get_total_vectors()}")
197
- # logger.info(f"Total vectors in 'video_frame_collection': {video_frame_vector_db_manager.get_total_vectors()}")
198
-
199
- logger.info("Data ingestion pipeline completed successfully!")
200
-
201
-
202
- if __name__ == "__main__":
203
- ingest_data_pipeline()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/logger.py CHANGED
@@ -1,24 +1,24 @@
 
1
  import sys
2
  from loguru import logger
3
- from config.settings import settings # Import settings của chúng ta
4
 
5
- # Cấu hình logger
6
- logger.remove() # Gỡ bỏ cấu hình mặc định
 
 
7
  logger.add(
8
- "logs/file_{time}.log", # Lưu log vào tệp, với tên tệp theo thời gian
9
- rotation="10 MB", # Xoay tệp log khi đạt 10MB
10
- compression="zip", # Nén tệp log cũ
11
- level=settings.LOG_LEVEL, # Mức độ log từ settings
12
- colorize=True, # Tô màu output trên console
13
  format="{time} {level} {message}",
14
- enqueue=True # Sử dụng hàng đợi để ghi log không chặn (quan trọng cho các ứng dụng đa luồng/async)
15
  )
16
  logger.add(
17
- sys.stderr, # Ghi log ra console
18
  level=settings.LOG_LEVEL,
19
  colorize=True,
20
  format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
21
- )
22
-
23
- # Xuất logger để các module khác có thể import và sử dụng
24
- __all__ = ["logger"]
 
1
+ import os
2
  import sys
3
  from loguru import logger
4
+ from config.settings import settings
5
 
6
+ # logger configuration
7
+ logger.remove() # remove default config
8
+
9
+ log_path = os.path.join(settings.LOG_DIR, "file_{time}.log")
10
  logger.add(
11
+ log_path,
12
+ rotation="10 MB",
13
+ compression="zip",
14
+ level=settings.LOG_LEVEL, # log level from settings
15
+ colorize=True,
16
  format="{time} {level} {message}",
17
+ enqueue=True
18
  )
19
  logger.add(
20
+ sys.stderr, # output to console
21
  level=settings.LOG_LEVEL,
22
  colorize=True,
23
  format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
24
+ )