Spaces:
Runtime error
Runtime error
fix
Browse files- .gitignore +6 -1
- README.md +1 -1
- app.py +10 -24
- config/database_configs.py +0 -0
- config/model_configs.py +5 -8
- config/settings.py +10 -10
- core/data_processing/audio_processor.py +2 -21
- core/data_processing/image_processor.py +5 -55
- core/data_processing/text_processor.py +1 -21
- core/data_processing/video_processor.py +0 -118
- core/embeddings/audio_embedding_model.py +1 -41
- core/embeddings/image_embedding_model.py +53 -52
- core/embeddings/text_embedding_model.py +1 -21
- core/retrieval/retriever.py +2 -57
- core/retrieval/vector_db_manager.py +6 -58
- {scripts → ingestions}/ingestion.py +105 -104
- main.py +114 -0
- scripts/ingest_data.py +0 -203
- utils/logger.py +14 -14
.gitignore
CHANGED
|
@@ -204,4 +204,9 @@ cython_debug/
|
|
| 204 |
# Marimo
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
-
__marimo__/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
# Marimo
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
# main function
|
| 210 |
+
video_processor.py
|
| 211 |
+
ingestions/ingest_data.py
|
| 212 |
+
test_config_log.py
|
README.md
CHANGED
|
@@ -5,7 +5,7 @@ colorFrom: purple
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.39.0
|
| 8 |
-
app_file:
|
| 9 |
pinned: false
|
| 10 |
short_description: My small project while preparing for AIC
|
| 11 |
---
|
|
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.39.0
|
| 8 |
+
app_file: main.py
|
| 9 |
pinned: false
|
| 10 |
short_description: My small project while preparing for AIC
|
| 11 |
---
|
app.py
CHANGED
|
@@ -1,32 +1,23 @@
|
|
| 1 |
# app/main.py
|
| 2 |
import gradio as gr
|
| 3 |
import os
|
| 4 |
-
import sys
|
| 5 |
-
import shutil
|
| 6 |
import zipfile
|
| 7 |
-
from typing import List, Dict, Any
|
| 8 |
-
from pathlib import Path
|
| 9 |
-
|
| 10 |
-
# Thêm thư mục gốc của dự án vào Python Path để có thể import các module
|
| 11 |
-
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
| 12 |
-
if project_root not in sys.path:
|
| 13 |
-
sys.path.insert(0, project_root)
|
| 14 |
|
|
|
|
| 15 |
from utils.logger import logger
|
| 16 |
from config.settings import settings
|
| 17 |
from qdrant_client import QdrantClient
|
| 18 |
from core.retrieval.retriever import Retriever
|
| 19 |
-
from
|
| 20 |
|
| 21 |
-
# ---
|
| 22 |
logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
|
| 23 |
try:
|
| 24 |
-
#
|
| 25 |
qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
|
| 26 |
shared_qdrant_client = QdrantClient(path=qdrant_db_path)
|
| 27 |
logger.info("Shared Qdrant client initialized.")
|
| 28 |
|
| 29 |
-
# Khởi tạo các dịch vụ, chia sẻ client
|
| 30 |
ingestion_service = IngestionService(client=shared_qdrant_client)
|
| 31 |
retriever_instance = Retriever(client=shared_qdrant_client)
|
| 32 |
|
|
@@ -35,12 +26,7 @@ except Exception as e:
|
|
| 35 |
logger.error(f"Failed to initialize global services: {e}")
|
| 36 |
raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
|
| 37 |
|
| 38 |
-
|
| 39 |
-
# ---- HÀM XỬ LÝ CHO TAB UPLOAD ----
|
| 40 |
def upload_handler(zip_path: str, progress=gr.Progress()):
|
| 41 |
-
"""
|
| 42 |
-
Hàm này xử lý việc upload file và thư mục với progress bar.
|
| 43 |
-
"""
|
| 44 |
progress(0, desc="🚀 Starting upload process...")
|
| 45 |
|
| 46 |
if not zip_path:
|
|
@@ -63,7 +49,7 @@ def upload_handler(zip_path: str, progress=gr.Progress()):
|
|
| 63 |
|
| 64 |
logger.info(f"Handling upload of {len(settings.RAW_DATA_DIR)} items (files/folders)...")
|
| 65 |
|
| 66 |
-
# ---
|
| 67 |
path = Path(settings.RAW_DATA_DIR)
|
| 68 |
all_temp_file_paths = list(path.rglob("*"))
|
| 69 |
all_temp_file_paths = [str(p) for p in all_temp_file_paths if os.path.isfile(p)]
|
|
@@ -81,11 +67,11 @@ def upload_handler(zip_path: str, progress=gr.Progress()):
|
|
| 81 |
if not files_to_ingest:
|
| 82 |
return "No valid files were moved for ingestion."
|
| 83 |
|
| 84 |
-
#
|
| 85 |
try:
|
| 86 |
progress(0.4, desc="🔄 Starting file ingestion...")
|
| 87 |
# Gọi hàm ingestion với progress callback
|
| 88 |
-
ingestion_service.ingest_files_with_progress(files_to_ingest)
|
| 89 |
|
| 90 |
success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
|
| 91 |
logger.success(success_message)
|
|
@@ -268,6 +254,6 @@ def create_and_run_app():
|
|
| 268 |
return demo
|
| 269 |
|
| 270 |
# --- 4. Chạy ứng dụng ---
|
| 271 |
-
logger.info("Launching Gradio interface...")
|
| 272 |
-
demo = create_and_run_app()
|
| 273 |
-
demo.launch()
|
|
|
|
| 1 |
# app/main.py
|
| 2 |
import gradio as gr
|
| 3 |
import os
|
|
|
|
|
|
|
| 4 |
import zipfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
from pathlib import Path
|
| 7 |
from utils.logger import logger
|
| 8 |
from config.settings import settings
|
| 9 |
from qdrant_client import QdrantClient
|
| 10 |
from core.retrieval.retriever import Retriever
|
| 11 |
+
from ingestions.ingestion import IngestionService
|
| 12 |
|
| 13 |
+
# --- Initialize global services ---
|
| 14 |
logger.info("--- Initializing Global Services (Upload-Only Mode) ---")
|
| 15 |
try:
|
| 16 |
+
# Create ONE QdrantClient only for sharing
|
| 17 |
qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
|
| 18 |
shared_qdrant_client = QdrantClient(path=qdrant_db_path)
|
| 19 |
logger.info("Shared Qdrant client initialized.")
|
| 20 |
|
|
|
|
| 21 |
ingestion_service = IngestionService(client=shared_qdrant_client)
|
| 22 |
retriever_instance = Retriever(client=shared_qdrant_client)
|
| 23 |
|
|
|
|
| 26 |
logger.error(f"Failed to initialize global services: {e}")
|
| 27 |
raise RuntimeError(f"Could not initialize services. Please check logs. Error: {e}")
|
| 28 |
|
|
|
|
|
|
|
| 29 |
def upload_handler(zip_path: str, progress=gr.Progress()):
|
|
|
|
|
|
|
|
|
|
| 30 |
progress(0, desc="🚀 Starting upload process...")
|
| 31 |
|
| 32 |
if not zip_path:
|
|
|
|
| 49 |
|
| 50 |
logger.info(f"Handling upload of {len(settings.RAW_DATA_DIR)} items (files/folders)...")
|
| 51 |
|
| 52 |
+
# --- Retrieve all file path from input ---
|
| 53 |
path = Path(settings.RAW_DATA_DIR)
|
| 54 |
all_temp_file_paths = list(path.rglob("*"))
|
| 55 |
all_temp_file_paths = [str(p) for p in all_temp_file_paths if os.path.isfile(p)]
|
|
|
|
| 67 |
if not files_to_ingest:
|
| 68 |
return "No valid files were moved for ingestion."
|
| 69 |
|
| 70 |
+
# Start ingesting data
|
| 71 |
try:
|
| 72 |
progress(0.4, desc="🔄 Starting file ingestion...")
|
| 73 |
# Gọi hàm ingestion với progress callback
|
| 74 |
+
ingestion_service.ingest_files_with_progress(files_to_ingest, progress)
|
| 75 |
|
| 76 |
success_message = f"Successfully uploaded and ingested {len(files_to_ingest)} file(s)."
|
| 77 |
logger.success(success_message)
|
|
|
|
| 254 |
return demo
|
| 255 |
|
| 256 |
# --- 4. Chạy ứng dụng ---
|
| 257 |
+
# logger.info("Launching Gradio interface...")
|
| 258 |
+
# demo = create_and_run_app()
|
| 259 |
+
# demo.launch()
|
config/database_configs.py
DELETED
|
File without changes
|
config/model_configs.py
CHANGED
|
@@ -1,17 +1,14 @@
|
|
| 1 |
# config/model_configs.py
|
| 2 |
|
| 3 |
# Embedding Models
|
| 4 |
-
TEXT_EMBEDDING_MODEL: str = "sentence-transformers/
|
| 5 |
-
IMAGE_EMBEDDING_MODEL: str = "
|
| 6 |
-
AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"
|
| 7 |
|
| 8 |
# Generator Model (LLM/LMM)
|
| 9 |
-
GENERATOR_MODEL_NAME: str = "gpt-4o"
|
| 10 |
GENERATOR_MODEL_MAX_TOKENS: int = 4096
|
| 11 |
GENERATOR_MODEL_TEMPERATURE: float = 0.7
|
| 12 |
|
| 13 |
# Reranker Model
|
| 14 |
-
RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
| 15 |
-
|
| 16 |
-
# Automatic Speech Recognition (ASR) Model (Ví dụ với Whisper của Hugging Face)
|
| 17 |
-
ASR_MODEL: str = "openai/whisper-tiny" # Có thể dùng "base", "small", "medium" tùy tài nguyên GPU
|
|
|
|
| 1 |
# config/model_configs.py
|
| 2 |
|
| 3 |
# Embedding Models
|
| 4 |
+
TEXT_EMBEDDING_MODEL: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
| 5 |
+
IMAGE_EMBEDDING_MODEL: str = "google/vit-base-patch16-224-in21k"
|
| 6 |
+
AUDIO_EMBEDDING_MODEL: str = "laion/clap-htsat-unfused"
|
| 7 |
|
| 8 |
# Generator Model (LLM/LMM)
|
| 9 |
+
GENERATOR_MODEL_NAME: str = "gpt-4o"
|
| 10 |
GENERATOR_MODEL_MAX_TOKENS: int = 4096
|
| 11 |
GENERATOR_MODEL_TEMPERATURE: float = 0.7
|
| 12 |
|
| 13 |
# Reranker Model
|
| 14 |
+
RERANKER_MODEL: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
|
|
|
|
|
|
|
|
config/settings.py
CHANGED
|
@@ -6,25 +6,25 @@ from dotenv import load_dotenv
|
|
| 6 |
load_dotenv()
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
|
|
|
|
|
|
| 9 |
APP_NAME: str = "Multimedia RAG Assistant"
|
| 10 |
APP_VERSION: str = "0.1.0"
|
| 11 |
ENVIRONMENT: str = "development"
|
| 12 |
|
| 13 |
-
DATA_DIR: str = "data"
|
| 14 |
-
RAW_DATA_DIR: str = os.path.join(
|
| 15 |
-
PROCESSED_DATA_DIR: str = os.path.join(
|
| 16 |
-
CHUNKS_DIR: str = os.path.join(
|
| 17 |
-
METADATA_DIR: str = os.path.join(
|
| 18 |
-
EMBEDDINGS_DIR: str = os.path.join(
|
| 19 |
|
| 20 |
API_HOST: str = "0.0.0.0"
|
| 21 |
API_PORT: int = 8000
|
| 22 |
|
| 23 |
-
|
| 24 |
-
# Đây là nơi bạn sẽ thêm các API key hoặc model IDs sau này
|
| 25 |
-
HUGGINGFACE_API_KEY: Optional[str] = None # Ví dụ: Nếu dùng Hugging Face models
|
| 26 |
|
| 27 |
-
|
| 28 |
LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
| 29 |
|
| 30 |
model_config = SettingsConfigDict(
|
|
|
|
| 6 |
load_dotenv()
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
| 9 |
+
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 10 |
+
|
| 11 |
APP_NAME: str = "Multimedia RAG Assistant"
|
| 12 |
APP_VERSION: str = "0.1.0"
|
| 13 |
ENVIRONMENT: str = "development"
|
| 14 |
|
| 15 |
+
DATA_DIR: str = os.path.join(BASE_DIR, "data")
|
| 16 |
+
RAW_DATA_DIR: str = os.path.join(DATA_DIR, "raw")
|
| 17 |
+
PROCESSED_DATA_DIR: str = os.path.join(DATA_DIR, "processed")
|
| 18 |
+
CHUNKS_DIR: str = os.path.join(DATA_DIR, "processed", "chunks")
|
| 19 |
+
METADATA_DIR: str = os.path.join(DATA_DIR, "processed", "metadata")
|
| 20 |
+
EMBEDDINGS_DIR: str = os.path.join(DATA_DIR, "processed", "embeddings")
|
| 21 |
|
| 22 |
API_HOST: str = "0.0.0.0"
|
| 23 |
API_PORT: int = 8000
|
| 24 |
|
| 25 |
+
HUGGINGFACE_API_KEY: Optional[str] = None
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
LOG_DIR: str = "logs"
|
| 28 |
LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
| 29 |
|
| 30 |
model_config = SettingsConfigDict(
|
core/data_processing/audio_processor.py
CHANGED
|
@@ -37,7 +37,7 @@ class AudioProcessor:
|
|
| 37 |
segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
|
| 38 |
chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
|
| 39 |
|
| 40 |
-
#
|
| 41 |
segment.export(chunk_file_path, format="wav")
|
| 42 |
|
| 43 |
metadata = {
|
|
@@ -45,8 +45,6 @@ class AudioProcessor:
|
|
| 45 |
"type": "audio",
|
| 46 |
"chunk_id": segment_id,
|
| 47 |
"chunk_data_path": chunk_file_path,
|
| 48 |
-
# "start_time_ms": int(segment.start_time),
|
| 49 |
-
# "end_time_ms": int(segment.end_time),
|
| 50 |
"duration_ms": len(segment)
|
| 51 |
}
|
| 52 |
chunks.append({
|
|
@@ -60,21 +58,4 @@ class AudioProcessor:
|
|
| 60 |
return []
|
| 61 |
except Exception as e:
|
| 62 |
logger.error(f"Error processing audio file {file_path}: {e}")
|
| 63 |
-
return []
|
| 64 |
-
|
| 65 |
-
# Ví dụ sử dụng (giữ nguyên để kiểm tra)
|
| 66 |
-
if __name__ == "__main__":
|
| 67 |
-
sample_audio_path = os.path.join(settings.RAW_DATA_DIR, "audios", "sample_audio.wav")
|
| 68 |
-
if not os.path.exists(sample_audio_path):
|
| 69 |
-
print(f"ERROR: Sample audio not found at {sample_audio_path}. Please create it first.")
|
| 70 |
-
print("Make sure you have ffmpeg installed and available in your PATH for pydub to work.")
|
| 71 |
-
else:
|
| 72 |
-
processor = AudioProcessor()
|
| 73 |
-
audio_chunks = processor.process(sample_audio_path)
|
| 74 |
-
|
| 75 |
-
for i, chunk in enumerate(audio_chunks):
|
| 76 |
-
print(f"\n--- Audio Chunk {i+1} ---")
|
| 77 |
-
print(f"Type: {chunk['metadata']['type']}")
|
| 78 |
-
print(f"Content (path): {chunk['content']}")
|
| 79 |
-
print(f"Metadata: {chunk['metadata']}")
|
| 80 |
-
# Bạn có thể thử mở file chunk['content'] để nghe
|
|
|
|
| 37 |
segment_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_audio_{i}"
|
| 38 |
chunk_file_path = os.path.join(audio_chunks_dir, f"{segment_id}.wav")
|
| 39 |
|
| 40 |
+
# Save segments into data/processed/chunks
|
| 41 |
segment.export(chunk_file_path, format="wav")
|
| 42 |
|
| 43 |
metadata = {
|
|
|
|
| 45 |
"type": "audio",
|
| 46 |
"chunk_id": segment_id,
|
| 47 |
"chunk_data_path": chunk_file_path,
|
|
|
|
|
|
|
| 48 |
"duration_ms": len(segment)
|
| 49 |
}
|
| 50 |
chunks.append({
|
|
|
|
| 58 |
return []
|
| 59 |
except Exception as e:
|
| 60 |
logger.error(f"Error processing audio file {file_path}: {e}")
|
| 61 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/data_processing/image_processor.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# core/data_processing/image_processor.py
|
| 2 |
-
from typing import List, Dict, Any
|
| 3 |
import os
|
| 4 |
-
|
|
|
|
| 5 |
from utils.logger import logger
|
| 6 |
|
| 7 |
class ImageProcessor:
|
|
@@ -16,14 +16,7 @@ class ImageProcessor:
|
|
| 16 |
logger.error(f"Image file not found: {file_path}")
|
| 17 |
return []
|
| 18 |
|
| 19 |
-
|
| 20 |
-
width, height = img.size
|
| 21 |
-
img_format = img.format
|
| 22 |
-
|
| 23 |
-
file_size = os.path.getsize(file_path)
|
| 24 |
-
|
| 25 |
-
# Tạo một ID duy nhất cho chunk này
|
| 26 |
-
# Lấy tên file không bao gồm phần mở rộng
|
| 27 |
base_name = os.path.basename(file_path)
|
| 28 |
chunk_id = f"{os.path.splitext(base_name)[0]}_image_chunk"
|
| 29 |
|
|
@@ -31,59 +24,16 @@ class ImageProcessor:
|
|
| 31 |
"source_id": base_name,
|
| 32 |
"type": "image",
|
| 33 |
"chunk_id": chunk_id,
|
| 34 |
-
"chunk_data_path": file_path
|
| 35 |
-
"image_width": width,
|
| 36 |
-
"image_height": height,
|
| 37 |
-
"image_format": img_format,
|
| 38 |
-
"file_size_bytes": file_size
|
| 39 |
}
|
| 40 |
|
| 41 |
-
# Tạo chunk
|
| 42 |
-
# Content sẽ là đường dẫn đến file, giống như audio/video segments
|
| 43 |
chunk = {
|
| 44 |
"content": file_path,
|
| 45 |
"metadata": metadata
|
| 46 |
}
|
| 47 |
|
| 48 |
-
# Trả về một danh sách chứa một chunk duy nhất
|
| 49 |
return [chunk]
|
| 50 |
|
| 51 |
except Exception as e:
|
| 52 |
logger.error(f"Error processing image file {file_path}: {e}")
|
| 53 |
-
return []
|
| 54 |
-
|
| 55 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 56 |
-
if __name__ == "__main__":
|
| 57 |
-
from config.settings import settings
|
| 58 |
-
import os
|
| 59 |
-
|
| 60 |
-
# Tạo một ảnh dummy để kiểm tra
|
| 61 |
-
dummy_image_dir = os.path.join(settings.RAW_DATA_DIR, "images")
|
| 62 |
-
os.makedirs(dummy_image_dir, exist_ok=True)
|
| 63 |
-
dummy_image_path = os.path.join(dummy_image_dir, "test_image.jpg")
|
| 64 |
-
|
| 65 |
-
try:
|
| 66 |
-
# Tạo một ảnh mẫu màu xanh
|
| 67 |
-
dummy_img = Image.new('RGB', (100, 150), color = 'blue')
|
| 68 |
-
dummy_img.save(dummy_image_path)
|
| 69 |
-
print(f"Created a dummy image for testing at: {dummy_image_path}")
|
| 70 |
-
|
| 71 |
-
# Khởi tạo processor và xử lý ảnh
|
| 72 |
-
processor = ImageProcessor()
|
| 73 |
-
image_chunks = processor.process(dummy_image_path)
|
| 74 |
-
|
| 75 |
-
if image_chunks:
|
| 76 |
-
print("\n--- Image Chunk Processed ---")
|
| 77 |
-
chunk = image_chunks[0]
|
| 78 |
-
print(f"Content (path): {chunk['content']}")
|
| 79 |
-
print("Metadata:")
|
| 80 |
-
for key, value in chunk['metadata'].items():
|
| 81 |
-
print(f" - {key}: {value}")
|
| 82 |
-
else:
|
| 83 |
-
print("Failed to process the dummy image.")
|
| 84 |
-
|
| 85 |
-
finally:
|
| 86 |
-
# Dọn dẹp ảnh dummy
|
| 87 |
-
if os.path.exists(dummy_image_path):
|
| 88 |
-
os.remove(dummy_image_path)
|
| 89 |
-
print(f"Cleaned up dummy image: {dummy_image_path}")
|
|
|
|
| 1 |
# core/data_processing/image_processor.py
|
|
|
|
| 2 |
import os
|
| 3 |
+
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
from utils.logger import logger
|
| 6 |
|
| 7 |
class ImageProcessor:
|
|
|
|
| 16 |
logger.error(f"Image file not found: {file_path}")
|
| 17 |
return []
|
| 18 |
|
| 19 |
+
# create id by filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
base_name = os.path.basename(file_path)
|
| 21 |
chunk_id = f"{os.path.splitext(base_name)[0]}_image_chunk"
|
| 22 |
|
|
|
|
| 24 |
"source_id": base_name,
|
| 25 |
"type": "image",
|
| 26 |
"chunk_id": chunk_id,
|
| 27 |
+
"chunk_data_path": file_path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
}
|
| 29 |
|
|
|
|
|
|
|
| 30 |
chunk = {
|
| 31 |
"content": file_path,
|
| 32 |
"metadata": metadata
|
| 33 |
}
|
| 34 |
|
|
|
|
| 35 |
return [chunk]
|
| 36 |
|
| 37 |
except Exception as e:
|
| 38 |
logger.error(f"Error processing image file {file_path}: {e}")
|
| 39 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/data_processing/text_processor.py
CHANGED
|
@@ -26,14 +26,11 @@ class TextProcessor:
|
|
| 26 |
|
| 27 |
chunks = []
|
| 28 |
for i, chunk_content in enumerate(split_texts):
|
| 29 |
-
start_char_idx = text.find(chunk_content) # find start index of each chunk_content
|
| 30 |
chunk_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_text_{i}"
|
| 31 |
metadata = {
|
| 32 |
"source_id": os.path.basename(file_path),
|
| 33 |
"type": "text",
|
| 34 |
"chunk_id": chunk_id,
|
| 35 |
-
"start_char_index": start_char_idx, # Vị trí ký tự bắt đầu
|
| 36 |
-
"end_char_index": start_char_idx + len(chunk_content), # Vị trí ký tự kết thúc
|
| 37 |
"content_length": len(chunk_content)
|
| 38 |
}
|
| 39 |
chunks.append({
|
|
@@ -44,21 +41,4 @@ class TextProcessor:
|
|
| 44 |
return chunks
|
| 45 |
except Exception as e:
|
| 46 |
logger.error(f"Error processing text document {file_path}: {e}")
|
| 47 |
-
return []
|
| 48 |
-
|
| 49 |
-
# Ví dụ sử dụng (giữ nguyên để kiểm tra)
|
| 50 |
-
if __name__ == "__main__":
|
| 51 |
-
from config.settings import settings
|
| 52 |
-
import os
|
| 53 |
-
|
| 54 |
-
sample_doc_path = os.path.join(settings.RAW_DATA_DIR, "documents", "sample_document.txt")
|
| 55 |
-
if not os.path.exists(sample_doc_path):
|
| 56 |
-
print(f"ERROR: Sample document not found at {sample_doc_path}. Please create it first.")
|
| 57 |
-
else:
|
| 58 |
-
processor = TextProcessor(chunk_size=100, chunk_overlap=20) # Thử kích thước nhỏ hơn để thấy rõ chunk
|
| 59 |
-
text_chunks = processor.process(sample_doc_path)
|
| 60 |
-
|
| 61 |
-
for i, chunk in enumerate(text_chunks): # In tất cả các chunk để kiểm tra
|
| 62 |
-
print(f"\n--- Chunk {i+1} ---")
|
| 63 |
-
print(f"Content: {chunk['content']}") # In toàn bộ nội dung chunk
|
| 64 |
-
print(f"Metadata: {chunk['metadata']}")
|
|
|
|
| 26 |
|
| 27 |
chunks = []
|
| 28 |
for i, chunk_content in enumerate(split_texts):
|
|
|
|
| 29 |
chunk_id = f"{os.path.basename(file_path).split('.')[0]}_chunk_text_{i}"
|
| 30 |
metadata = {
|
| 31 |
"source_id": os.path.basename(file_path),
|
| 32 |
"type": "text",
|
| 33 |
"chunk_id": chunk_id,
|
|
|
|
|
|
|
| 34 |
"content_length": len(chunk_content)
|
| 35 |
}
|
| 36 |
chunks.append({
|
|
|
|
| 41 |
return chunks
|
| 42 |
except Exception as e:
|
| 43 |
logger.error(f"Error processing text document {file_path}: {e}")
|
| 44 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/data_processing/video_processor.py
DELETED
|
@@ -1,118 +0,0 @@
|
|
| 1 |
-
# core/data_processing/video_processor.py
|
| 2 |
-
import os
|
| 3 |
-
import torch
|
| 4 |
-
import cv2
|
| 5 |
-
import numpy as np
|
| 6 |
-
|
| 7 |
-
from typing import List, Dict, Any
|
| 8 |
-
from utils.logger import logger
|
| 9 |
-
from moviepy.editor import VideoFileClip
|
| 10 |
-
from config.settings import settings
|
| 11 |
-
|
| 12 |
-
class VideoProcessor:
|
| 13 |
-
def __init__(self, chunk_duration_sec: int = 10, frames_per_segment: int = 5):
|
| 14 |
-
self.chunk_duration_sec = chunk_duration_sec
|
| 15 |
-
self.frames_per_segment = frames_per_segment
|
| 16 |
-
logger.info(f"VideoProcessor initialized (chunk_duration={chunk_duration_sec}s, frames_per_segment={frames_per_segment}).")
|
| 17 |
-
|
| 18 |
-
def process_video(self, file_path: str) -> List[Dict[str, Any]]:
|
| 19 |
-
try:
|
| 20 |
-
logger.info(f"Processing video file: {file_path}")
|
| 21 |
-
video_clip = VideoFileClip(file_path)
|
| 22 |
-
total_duration = video_clip.duration # Tổng thời lượng video (giây)
|
| 23 |
-
|
| 24 |
-
all_chunks = []
|
| 25 |
-
|
| 26 |
-
# Tạo thư mục con để lưu các frame/ảnh tạm thời
|
| 27 |
-
image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks", os.path.basename(file_path).split('.')[0])
|
| 28 |
-
os.makedirs(image_chunks_dir, exist_ok=True)
|
| 29 |
-
|
| 30 |
-
# Tạo thư mục con để lưu các video segment tạm thời
|
| 31 |
-
video_segments_dir = os.path.join(settings.CHUNKS_DIR, "video/video_segments", os.path.basename(file_path).split('.')[0])
|
| 32 |
-
os.makedirs(video_segments_dir, exist_ok=True)
|
| 33 |
-
|
| 34 |
-
current_time = 0.0
|
| 35 |
-
chunk_idx = 0
|
| 36 |
-
|
| 37 |
-
while current_time < total_duration:
|
| 38 |
-
end_time = min(current_time + self.chunk_duration_sec, total_duration) # end time of each segment
|
| 39 |
-
segment_clip = video_clip.subclip(current_time, end_time)
|
| 40 |
-
|
| 41 |
-
segment_base_name = f"{os.path.basename(file_path).split('.')[0]}_segment_{chunk_idx}"
|
| 42 |
-
|
| 43 |
-
frames_paths = []
|
| 44 |
-
|
| 45 |
-
frame_timestamps = np.linspace(0, segment_clip.duration, self.frames_per_segment + 2)[1:-1]
|
| 46 |
-
|
| 47 |
-
for ts in frame_timestamps:
|
| 48 |
-
frame = segment_clip.get_frame(ts)
|
| 49 |
-
frame_filename = f"{segment_base_name}_frame_{int(ts*1000)}.jpg"
|
| 50 |
-
frame_path = os.path.join(image_chunks_dir, frame_filename)
|
| 51 |
-
cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
|
| 52 |
-
frames_paths.append(frame_path)
|
| 53 |
-
|
| 54 |
-
# Tạo chunk cho các khung hình
|
| 55 |
-
image_chunk_id = f"{segment_base_name}_image"
|
| 56 |
-
all_chunks.append({
|
| 57 |
-
"content": frames_paths, # Danh sách đường dẫn đến các file ảnh
|
| 58 |
-
"metadata": {
|
| 59 |
-
"source_id": os.path.basename(file_path),
|
| 60 |
-
"type": "video_frame", # Loại chunk
|
| 61 |
-
"chunk_id": image_chunk_id,
|
| 62 |
-
"start_time_sec": current_time,
|
| 63 |
-
"end_time_sec": end_time,
|
| 64 |
-
"frame_paths": frames_paths # Lưu lại đường dẫn trong metadata
|
| 65 |
-
}
|
| 66 |
-
})
|
| 67 |
-
|
| 68 |
-
# 2. Lưu đoạn video clip (optional, nhưng hữu ích cho video retrieval)
|
| 69 |
-
video_segment_path = os.path.join(video_segments_dir, f"{segment_base_name}.mp4")
|
| 70 |
-
segment_clip.write_videofile(video_segment_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
|
| 71 |
-
|
| 72 |
-
# Tạo chunk cho video segment
|
| 73 |
-
video_chunk_id = f"{segment_base_name}_video_clip"
|
| 74 |
-
all_chunks.append({
|
| 75 |
-
"content": video_segment_path, # Đường dẫn đến file video clip
|
| 76 |
-
"metadata": {
|
| 77 |
-
"source_id": os.path.basename(file_path),
|
| 78 |
-
"type": "video_segment_clip", # Loại chunk mới: video clip
|
| 79 |
-
"chunk_id": video_chunk_id,
|
| 80 |
-
"start_time_sec": current_time,
|
| 81 |
-
"end_time_sec": end_time,
|
| 82 |
-
"chunk_data_path": video_segment_path # Lưu lại đường dẫn trong metadata
|
| 83 |
-
}
|
| 84 |
-
})
|
| 85 |
-
|
| 86 |
-
current_time = end_time
|
| 87 |
-
chunk_idx += 1
|
| 88 |
-
|
| 89 |
-
video_clip.close() # Đảm bảo giải phóng tài nguyên
|
| 90 |
-
logger.info(f"Generated {len(all_chunks)} chunks (frames & video segments) from video {file_path}")
|
| 91 |
-
return all_chunks
|
| 92 |
-
except FileNotFoundError:
|
| 93 |
-
logger.error(f"Video file not found: {file_path}. Please ensure ffmpeg is installed and accessible.")
|
| 94 |
-
return []
|
| 95 |
-
except Exception as e:
|
| 96 |
-
logger.error(f"Error processing video file {file_path}: {e}")
|
| 97 |
-
return []
|
| 98 |
-
|
| 99 |
-
# Ví dụ sử dụng (giữ nguyên để kiểm tra)
|
| 100 |
-
if __name__ == "__main__":
|
| 101 |
-
sample_video_path = os.path.join(settings.RAW_DATA_DIR, "videos", "sample_video.mp4")
|
| 102 |
-
if not os.path.exists(sample_video_path):
|
| 103 |
-
print(f"ERROR: Sample video not found at {sample_video_path}. Please create it first.")
|
| 104 |
-
print("Make sure you have ffmpeg installed and available in your PATH for moviepy to work.")
|
| 105 |
-
else:
|
| 106 |
-
processor = VideoProcessor(chunk_duration_sec=5, frames_per_segment=3)
|
| 107 |
-
video_chunks = processor.process_video(sample_video_path)
|
| 108 |
-
|
| 109 |
-
for i, chunk in enumerate(video_chunks):
|
| 110 |
-
print(f"\n--- Video Chunk {i+1} ---")
|
| 111 |
-
print(f"Type: {chunk['metadata']['type']}")
|
| 112 |
-
if chunk['metadata']['type'] == 'video_frames':
|
| 113 |
-
print(f"Content (paths): {chunk['content']}")
|
| 114 |
-
if chunk['content']:
|
| 115 |
-
print(f"Sample frame: {chunk['content'][0]}")
|
| 116 |
-
elif chunk['metadata']['type'] == 'video_segment_clip':
|
| 117 |
-
print(f"Content (path): {chunk['content']}")
|
| 118 |
-
print(f"Metadata: {chunk['metadata']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/embeddings/audio_embedding_model.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# models/embeddings/audio_embedding_model.py
|
| 2 |
import torch
|
| 3 |
import librosa
|
| 4 |
-
import numpy as np
|
| 5 |
|
| 6 |
from typing import List
|
| 7 |
from transformers import AutoProcessor, AutoModel
|
|
@@ -44,43 +43,4 @@ class AudioEmbeddingModel:
|
|
| 44 |
|
| 45 |
embeddings_list = embeddings.cpu().tolist()
|
| 46 |
logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
|
| 47 |
-
return embeddings_list
|
| 48 |
-
|
| 49 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 50 |
-
if __name__ == "__main__":
|
| 51 |
-
from config.settings import settings
|
| 52 |
-
import os
|
| 53 |
-
|
| 54 |
-
model = AudioEmbeddingModel()
|
| 55 |
-
sample_audio_dir = os.path.join(settings.PROCESSED_DATA_DIR, "audio_segments", "sample_audio") # Giả sử có thư mục audio từ file mẫu
|
| 56 |
-
|
| 57 |
-
# Tạo một audio dummy nếu không có file audio mẫu
|
| 58 |
-
if not os.path.exists(sample_audio_dir) or not os.listdir(sample_audio_dir):
|
| 59 |
-
print(f"Creating a dummy audio for testing at {sample_audio_dir}...")
|
| 60 |
-
os.makedirs(sample_audio_dir, exist_ok=True)
|
| 61 |
-
from pydub import AudioSegment
|
| 62 |
-
dummy_audio = AudioSegment.silent(duration=1000) # 1 giây im lặng
|
| 63 |
-
dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
|
| 64 |
-
dummy_audio.export(dummy_audio_path, format="wav")
|
| 65 |
-
sample_audio_paths = [dummy_audio_path]
|
| 66 |
-
else:
|
| 67 |
-
sample_audio_paths = [os.path.join(sample_audio_dir, f) for f in os.listdir(sample_audio_dir) if f.endswith(('.wav', '.mp3'))]
|
| 68 |
-
if not sample_audio_paths:
|
| 69 |
-
print(f"No audio files found in {sample_audio_dir}. Please ensure sample audio was processed.")
|
| 70 |
-
from pydub import AudioSegment
|
| 71 |
-
dummy_audio = AudioSegment.silent(duration=1000)
|
| 72 |
-
dummy_audio_path = os.path.join(sample_audio_dir, "dummy_audio.wav")
|
| 73 |
-
dummy_audio.export(dummy_audio_path, format="wav")
|
| 74 |
-
sample_audio_paths = [dummy_audio_path]
|
| 75 |
-
|
| 76 |
-
print(f"Using {len(sample_audio_paths)} sample audio clips: {sample_audio_paths[:2]}...")
|
| 77 |
-
embeddings = model.get_embeddings(sample_audio_paths)
|
| 78 |
-
|
| 79 |
-
print(f"Number of embeddings: {len(embeddings)}")
|
| 80 |
-
if embeddings:
|
| 81 |
-
print(f"Dimension of embeddings: {len(embeddings[0])}")
|
| 82 |
-
print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
|
| 83 |
-
if len(embeddings) > 1:
|
| 84 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 85 |
-
sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
|
| 86 |
-
print(f"Similarity between audio 1 and 2: {sim:.4f}")
|
|
|
|
| 1 |
# models/embeddings/audio_embedding_model.py
|
| 2 |
import torch
|
| 3 |
import librosa
|
|
|
|
| 4 |
|
| 5 |
from typing import List
|
| 6 |
from transformers import AutoProcessor, AutoModel
|
|
|
|
| 43 |
|
| 44 |
embeddings_list = embeddings.cpu().tolist()
|
| 45 |
logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(audio_inputs)} audio clips.")
|
| 46 |
+
return embeddings_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/embeddings/image_embedding_model.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
import torch
|
| 2 |
-
|
| 3 |
from typing import List
|
| 4 |
from PIL import Image
|
| 5 |
-
from transformers import
|
| 6 |
from utils.logger import logger
|
| 7 |
from config.model_configs import IMAGE_EMBEDDING_MODEL
|
| 8 |
|
|
@@ -11,70 +10,72 @@ class ImageEmbeddingModel:
|
|
| 11 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 12 |
logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")
|
| 13 |
|
| 14 |
-
self.model =
|
| 15 |
-
self.processor =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
logger.info("Image Embedding Model loaded successfully.")
|
| 17 |
|
| 18 |
def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
|
| 19 |
if not image_paths:
|
|
|
|
| 20 |
return []
|
| 21 |
|
| 22 |
images = []
|
|
|
|
|
|
|
| 23 |
for img_path in image_paths:
|
| 24 |
try:
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
except Exception as e:
|
| 27 |
logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
|
| 28 |
continue
|
| 29 |
|
| 30 |
if not images:
|
|
|
|
| 31 |
return []
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
image_features = self.model.get_image_features(pixel_values=inputs.pixel_values)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
print(f"Dimension of embeddings: {len(embeddings[0])}")
|
| 75 |
-
print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
|
| 76 |
-
# Nếu có đủ ảnh, thử so sánh 2 ảnh đầu
|
| 77 |
-
if len(embeddings) > 1:
|
| 78 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 79 |
-
sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
|
| 80 |
-
print(f"Similarity between image 1 and 2: {sim:.4f}")
|
|
|
|
| 1 |
import torch
|
|
|
|
| 2 |
from typing import List
|
| 3 |
from PIL import Image
|
| 4 |
+
from transformers import ViTImageProcessor, ViTModel
|
| 5 |
from utils.logger import logger
|
| 6 |
from config.model_configs import IMAGE_EMBEDDING_MODEL
|
| 7 |
|
|
|
|
| 10 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 11 |
logger.info(f"Loading Image Embedding Model '{IMAGE_EMBEDDING_MODEL}' to device: {self.device}")
|
| 12 |
|
| 13 |
+
self.model = ViTModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(self.device)
|
| 14 |
+
self.processor = ViTImageProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
|
| 15 |
+
|
| 16 |
+
# Set model to evaluation mode
|
| 17 |
+
self.model.eval()
|
| 18 |
+
|
| 19 |
logger.info("Image Embedding Model loaded successfully.")
|
| 20 |
|
| 21 |
def get_embeddings(self, image_paths: List[str]) -> List[List[float]]:
|
| 22 |
if not image_paths:
|
| 23 |
+
logger.warning("No image paths provided")
|
| 24 |
return []
|
| 25 |
|
| 26 |
images = []
|
| 27 |
+
valid_paths = []
|
| 28 |
+
|
| 29 |
for img_path in image_paths:
|
| 30 |
try:
|
| 31 |
+
image = Image.open(img_path).convert("RGB")
|
| 32 |
+
images.append(image)
|
| 33 |
+
valid_paths.append(img_path)
|
| 34 |
except Exception as e:
|
| 35 |
logger.warning(f"Could not load image {img_path}: {e}. Skipping.")
|
| 36 |
continue
|
| 37 |
|
| 38 |
if not images:
|
| 39 |
+
logger.warning("No valid images to process")
|
| 40 |
return []
|
| 41 |
|
| 42 |
+
try:
|
| 43 |
+
# Process images
|
| 44 |
+
inputs = self.processor(images=images, return_tensors="pt").to(self.device)
|
|
|
|
| 45 |
|
| 46 |
+
with torch.no_grad():
|
| 47 |
+
# Get model outputs
|
| 48 |
+
outputs = self.model(**inputs)
|
| 49 |
+
|
| 50 |
+
# Extract embeddings from the [CLS] token (first token)
|
| 51 |
+
# Shape: (batch_size, sequence_length, hidden_size)
|
| 52 |
+
last_hidden_states = outputs.last_hidden_state
|
| 53 |
+
|
| 54 |
+
# Take the [CLS] token embedding (index 0)
|
| 55 |
+
# Shape: (batch_size, hidden_size)
|
| 56 |
+
cls_embeddings = last_hidden_states[:, 0, :]
|
| 57 |
+
|
| 58 |
+
# Alternatively, you can use pooler_output if available
|
| 59 |
+
# cls_embeddings = outputs.pooler_output
|
| 60 |
+
|
| 61 |
+
# Normalize embeddings (L2 normalization)
|
| 62 |
+
embeddings = cls_embeddings / cls_embeddings.norm(p=2, dim=-1, keepdim=True)
|
| 63 |
+
|
| 64 |
+
# Convert to list
|
| 65 |
+
embeddings_list = embeddings.cpu().tolist()
|
| 66 |
+
|
| 67 |
+
logger.debug(f"Generated {len(embeddings_list)} embeddings for {len(images)} images.")
|
| 68 |
+
|
| 69 |
+
# Ensure we return the right number of embeddings
|
| 70 |
+
if len(embeddings_list) != len(image_paths):
|
| 71 |
+
logger.warning(f"Mismatch: {len(embeddings_list)} embeddings for {len(image_paths)} input paths")
|
| 72 |
+
# Pad with empty lists if needed
|
| 73 |
+
while len(embeddings_list) < len(image_paths):
|
| 74 |
+
embeddings_list.append([])
|
| 75 |
+
|
| 76 |
+
return embeddings_list
|
| 77 |
+
|
| 78 |
+
except Exception as e:
|
| 79 |
+
logger.error(f"Error generating embeddings: {e}")
|
| 80 |
+
# Return empty embeddings for all input paths
|
| 81 |
+
return [[] for _ in image_paths]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/embeddings/text_embedding_model.py
CHANGED
|
@@ -19,24 +19,4 @@ class TextEmbeddingModel:
|
|
| 19 |
|
| 20 |
embeddings = self.model.encode(texts, convert_to_numpy=True).tolist()
|
| 21 |
logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
|
| 22 |
-
return embeddings
|
| 23 |
-
|
| 24 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 25 |
-
if __name__ == "__main__":
|
| 26 |
-
model = TextEmbeddingModel()
|
| 27 |
-
sample_texts = [
|
| 28 |
-
"This is a test sentence.",
|
| 29 |
-
"Another sentence for embedding.",
|
| 30 |
-
"How about some natural language processing?",
|
| 31 |
-
"Xe hơi màu đỏ đang chạy trên đường phố." # Thử với tiếng Việt
|
| 32 |
-
]
|
| 33 |
-
embeddings = model.get_embeddings(sample_texts)
|
| 34 |
-
|
| 35 |
-
print(f"Number of embeddings: {len(embeddings)}")
|
| 36 |
-
if embeddings:
|
| 37 |
-
print(f"Dimension of embeddings: {len(embeddings[0])}")
|
| 38 |
-
print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
|
| 39 |
-
# Bạn có thể thử tính cosine similarity giữa các embedding ở đây để thấy độ tương đồng
|
| 40 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 41 |
-
sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
|
| 42 |
-
print(f"Similarity between text 1 and 2: {sim:.4f}")
|
|
|
|
| 19 |
|
| 20 |
embeddings = self.model.encode(texts, convert_to_numpy=True).tolist()
|
| 21 |
logger.debug(f"Generated {len(embeddings)} embeddings for {len(texts)} texts.")
|
| 22 |
+
return embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/retrieval/retriever.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
# core/retrieval/retriever.py
|
| 2 |
import os
|
| 3 |
|
| 4 |
-
from typing import List, Tuple, Dict, Any, Union
|
| 5 |
from utils.logger import logger
|
| 6 |
from config.settings import settings
|
|
|
|
| 7 |
from qdrant_client import QdrantClient
|
| 8 |
|
| 9 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
|
@@ -93,64 +93,9 @@ class Retriever:
|
|
| 93 |
logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
|
| 94 |
return formatted_results
|
| 95 |
|
| 96 |
-
# def _get_content_from_payload(self, payload: Dict):
|
| 97 |
-
# chunk_type = payload.get("type")
|
| 98 |
-
# if chunk_type == "text":
|
| 99 |
-
# return None # Sẽ cải thiện sau
|
| 100 |
-
# elif chunk_type == 'image' or chunk_type == "audio":
|
| 101 |
-
# return payload.get('chunk_data_path') # Trả về đường dẫn
|
| 102 |
-
# return None
|
| 103 |
-
|
| 104 |
def is_database_empty(self) -> bool:
|
| 105 |
total_vectors = self.text_db_manager.get_total_vectors() \
|
| 106 |
+ self.image_db_manager.get_total_vectors() \
|
| 107 |
+ self.audio_db_manager.get_total_vectors()
|
| 108 |
|
| 109 |
-
return total_vectors == 0
|
| 110 |
-
|
| 111 |
-
if __name__ == "__main__":
|
| 112 |
-
from config.settings import settings
|
| 113 |
-
|
| 114 |
-
logger.info("--- Running Retriever Standalone Test (Qdrant version) ---")
|
| 115 |
-
|
| 116 |
-
# Kiểm tra xem Qdrant đã có dữ liệu chưa
|
| 117 |
-
qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
|
| 118 |
-
if not os.path.exists(qdrant_db_path):
|
| 119 |
-
print("\n\nERROR: Qdrant database not found. Please run 'python scripts/ingest_data.py' first to create the database.\n\n")
|
| 120 |
-
else:
|
| 121 |
-
retriever = Retriever()
|
| 122 |
-
|
| 123 |
-
# --- 1. Thử truy vấn văn bản ---
|
| 124 |
-
print("\n--- Testing Text Retrieval ---")
|
| 125 |
-
text_query = "What is artificial intelligence?"
|
| 126 |
-
text_results = retriever.retrieve(text_query, query_type="text", top_k=3)
|
| 127 |
-
print(f"Query: '{text_query}'")
|
| 128 |
-
for i, result in enumerate(text_results):
|
| 129 |
-
print(f" Result {i+1}:")
|
| 130 |
-
print(f" Score: {result['score']:.4f}")
|
| 131 |
-
print(f" Type: {result['metadata']['type']}")
|
| 132 |
-
print(f" Content Preview: {str(result.get('content'))[:200] if result.get('content') else 'N/A'}...")
|
| 133 |
-
print(f" Source: {result['metadata']['source_id']}")
|
| 134 |
-
|
| 135 |
-
# --- 2. Thử truy vấn hình ảnh ---
|
| 136 |
-
print("\n--- Testing Image Retrieval ---")
|
| 137 |
-
# Lấy một ảnh từ các chunk đã xử lý để làm truy vấn
|
| 138 |
-
image_to_query = None
|
| 139 |
-
image_chunks_dir = os.path.join(settings.CHUNKS_DIR, "video/image_chunks")
|
| 140 |
-
if os.path.exists(image_chunks_dir):
|
| 141 |
-
for root, _, files in os.walk(image_chunks_dir):
|
| 142 |
-
if files:
|
| 143 |
-
image_to_query = os.path.join(root, files[0])
|
| 144 |
-
break
|
| 145 |
-
|
| 146 |
-
if image_to_query and os.path.exists(image_to_query):
|
| 147 |
-
print(f"Using image as query: {image_to_query}")
|
| 148 |
-
image_results = retriever.retrieve(image_to_query, query_type="image", top_k=3)
|
| 149 |
-
for i, result in enumerate(image_results):
|
| 150 |
-
print(f" Result {i+1}:")
|
| 151 |
-
print(f" Score: {result['score']:.4f}")
|
| 152 |
-
print(f" Type: {result['metadata']['type']}")
|
| 153 |
-
print(f" Content (Paths): {result['content']}")
|
| 154 |
-
print(f" Source: {result['metadata']['source_id']}")
|
| 155 |
-
else:
|
| 156 |
-
print("Could not find a sample image to test image retrieval.")
|
|
|
|
| 1 |
# core/retrieval/retriever.py
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
from utils.logger import logger
|
| 5 |
from config.settings import settings
|
| 6 |
+
from typing import List, Dict, Any, Union
|
| 7 |
from qdrant_client import QdrantClient
|
| 8 |
|
| 9 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
|
|
|
| 93 |
logger.info(f"Retrieval complete. Found {len(formatted_results)} results.")
|
| 94 |
return formatted_results
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def is_database_empty(self) -> bool:
|
| 97 |
total_vectors = self.text_db_manager.get_total_vectors() \
|
| 98 |
+ self.image_db_manager.get_total_vectors() \
|
| 99 |
+ self.audio_db_manager.get_total_vectors()
|
| 100 |
|
| 101 |
+
return total_vectors == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
core/retrieval/vector_db_manager.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
from
|
|
|
|
| 4 |
from uuid import uuid4
|
| 5 |
|
| 6 |
-
from
|
|
|
|
| 7 |
from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
|
| 8 |
|
| 9 |
-
from utils.logger import logger
|
| 10 |
-
from config.settings import settings
|
| 11 |
-
|
| 12 |
class VectorDBManager:
|
| 13 |
def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
|
| 14 |
logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")
|
|
@@ -110,60 +109,9 @@ class VectorDBManager:
|
|
| 110 |
try:
|
| 111 |
count_result = self.client.count(
|
| 112 |
collection_name=self.collection_name,
|
| 113 |
-
exact=True
|
| 114 |
)
|
| 115 |
return count_result.count
|
| 116 |
except Exception as e:
|
| 117 |
logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
|
| 118 |
-
return 0
|
| 119 |
-
|
| 120 |
-
# Ví dụ sử dụng (chỉ để kiểm tra nội bộ module)
|
| 121 |
-
if __name__ == "__main__":
|
| 122 |
-
import numpy as np
|
| 123 |
-
|
| 124 |
-
# Các thông số cho collection test
|
| 125 |
-
TEST_COLLECTION_NAME = "my_test_collection"
|
| 126 |
-
DUMMY_DIM = 128
|
| 127 |
-
|
| 128 |
-
# --- Kiểm tra tạo collection ---
|
| 129 |
-
print("\n--- Testing Collection Creation ---")
|
| 130 |
-
db_manager = VectorDBManager(collection_name=TEST_COLLECTION_NAME, embedding_dim=DUMMY_DIM)
|
| 131 |
-
print(f"Total vectors initially: {db_manager.get_total_vectors()}")
|
| 132 |
-
|
| 133 |
-
# --- Kiểm tra thêm vector và payload ---
|
| 134 |
-
print("\n--- Testing Add Vectors ---")
|
| 135 |
-
dummy_embeddings = np.random.rand(10, DUMMY_DIM).tolist()
|
| 136 |
-
dummy_metadatas = [
|
| 137 |
-
{"chunk_id": f"dummy_chunk_{i}", "type": "text" if i < 5 else "image", "source_file": "test.txt"}
|
| 138 |
-
for i in range(10)
|
| 139 |
-
]
|
| 140 |
-
db_manager.add_vectors(dummy_embeddings, dummy_metadatas)
|
| 141 |
-
print(f"Total vectors after adding: {db_manager.get_total_vectors()}")
|
| 142 |
-
|
| 143 |
-
# --- Kiểm tra tìm kiếm ---
|
| 144 |
-
print("\n--- Testing Search ---")
|
| 145 |
-
dummy_query = np.random.rand(DUMMY_DIM).tolist()
|
| 146 |
-
results = db_manager.search_vectors(dummy_query, top_k=3)
|
| 147 |
-
print(f"Top 3 results (no filter):")
|
| 148 |
-
for score, payload in results:
|
| 149 |
-
print(f" Score: {score:.4f}, Payload: {payload}")
|
| 150 |
-
|
| 151 |
-
# --- Kiểm tra tìm kiếm CÓ LỌC (Pre-filtering) ---
|
| 152 |
-
print("\n--- Testing Search with Filter ---")
|
| 153 |
-
filter_condition = models.Filter(
|
| 154 |
-
must=[
|
| 155 |
-
models.FieldCondition(
|
| 156 |
-
key="type", # Lọc theo trường 'type' trong payload
|
| 157 |
-
match=models.MatchValue(value="image"), # Giá trị phải là 'image'
|
| 158 |
-
)
|
| 159 |
-
]
|
| 160 |
-
)
|
| 161 |
-
filtered_results = db_manager.search_vectors(dummy_query, top_k=3, filter_payload=filter_condition)
|
| 162 |
-
print(f"Top 3 results (filtered for type='image'):")
|
| 163 |
-
for score, payload in filtered_results:
|
| 164 |
-
print(f" Score: {score:.4f}, Payload: {payload}")
|
| 165 |
-
|
| 166 |
-
# --- Dọn dẹp collection test ---
|
| 167 |
-
print("\n--- Cleaning up test collection ---")
|
| 168 |
-
db_manager.client.delete_collection(collection_name=TEST_COLLECTION_NAME)
|
| 169 |
-
print(f"Collection '{TEST_COLLECTION_NAME}' deleted.")
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
from utils.logger import logger
|
| 4 |
+
from config.settings import settings
|
| 5 |
from uuid import uuid4
|
| 6 |
|
| 7 |
+
from typing import List, Tuple, Dict, Any
|
| 8 |
+
from qdrant_client import QdrantClient
|
| 9 |
from qdrant_client.http.models import Distance, VectorParams, PointStruct, UpdateStatus
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
class VectorDBManager:
|
| 12 |
def __init__(self, collection_name: str, embedding_dim: int, client: QdrantClient = None):
|
| 13 |
logger.info(f"Initializing Qdrant VectorDBManager for collection: '{collection_name}'")
|
|
|
|
| 109 |
try:
|
| 110 |
count_result = self.client.count(
|
| 111 |
collection_name=self.collection_name,
|
| 112 |
+
exact=True
|
| 113 |
)
|
| 114 |
return count_result.count
|
| 115 |
except Exception as e:
|
| 116 |
logger.error(f"Error counting vectors in collection '{self.collection_name}': {e}")
|
| 117 |
+
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{scripts → ingestions}/ingestion.py
RENAMED
|
@@ -1,123 +1,108 @@
|
|
| 1 |
# core/ingestion/ingestion_service.py
|
| 2 |
import os
|
| 3 |
-
import gradio as gr
|
| 4 |
from typing import List, Optional, Callable
|
| 5 |
-
from tqdm import tqdm
|
| 6 |
|
| 7 |
from utils.logger import logger
|
| 8 |
-
from config.settings import settings
|
| 9 |
from qdrant_client import QdrantClient
|
| 10 |
|
| 11 |
-
# Import các Processor (không thay đổi)
|
| 12 |
from core.data_processing.text_processor import TextProcessor
|
| 13 |
from core.data_processing.audio_processor import AudioProcessor
|
| 14 |
-
# from core.data_processing.video_processor import VideoProcessor
|
| 15 |
from core.data_processing.image_processor import ImageProcessor
|
| 16 |
|
| 17 |
-
# Import các Embedding Model (không thay đổi)
|
| 18 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
| 19 |
from core.embeddings.image_embedding_model import ImageEmbeddingModel
|
| 20 |
from core.embeddings.audio_embedding_model import AudioEmbeddingModel
|
| 21 |
|
| 22 |
-
# Import VectorDBManager phiên bản Qdrant MỚI
|
| 23 |
from core.retrieval.vector_db_manager import VectorDBManager
|
| 24 |
|
| 25 |
class IngestionService:
|
| 26 |
def __init__(self, client: QdrantClient):
|
| 27 |
-
"""
|
| 28 |
-
Khởi tạo IngestionService với một QdrantClient được chia sẻ.
|
| 29 |
-
Phiên bản này không theo dõi trạng thái file.
|
| 30 |
-
"""
|
| 31 |
logger.info("Initializing IngestionService (Stateless)...")
|
| 32 |
|
| 33 |
self.client = client
|
|
|
|
| 34 |
self.text_processor = TextProcessor()
|
| 35 |
self.image_processor = ImageProcessor()
|
| 36 |
self.audio_processor = AudioProcessor()
|
| 37 |
-
|
| 38 |
self.text_embedder = TextEmbeddingModel()
|
| 39 |
self.image_embedder = ImageEmbeddingModel()
|
| 40 |
self.audio_embedder = AudioEmbeddingModel()
|
| 41 |
|
| 42 |
-
|
| 43 |
self.text_db_manager = VectorDBManager(
|
| 44 |
client=self.client,
|
| 45 |
collection_name="text_collection",
|
| 46 |
-
embedding_dim=
|
| 47 |
)
|
| 48 |
|
| 49 |
-
image_embedding_dim =
|
| 50 |
self.image_vector_db_manager = VectorDBManager(
|
| 51 |
client=self.client,
|
| 52 |
collection_name="image_collection",
|
| 53 |
embedding_dim=image_embedding_dim
|
| 54 |
)
|
| 55 |
|
| 56 |
-
audio_embedding_dim =
|
| 57 |
self.audio_vector_db_manager = VectorDBManager(
|
| 58 |
client=self.client,
|
| 59 |
collection_name="audio_collection",
|
| 60 |
embedding_dim=audio_embedding_dim
|
| 61 |
)
|
| 62 |
|
| 63 |
-
# video_frame_embedding_dim = 512
|
| 64 |
-
# video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)
|
| 65 |
-
|
| 66 |
-
|
| 67 |
logger.info("IngestionService initialized successfully.")
|
| 68 |
|
| 69 |
def ingest_files(self, file_paths: List[str]):
|
| 70 |
-
|
| 71 |
-
Xử lý một danh sách các file, tạo embedding, và thêm vào Qdrant.
|
| 72 |
-
Hàm này giả định các file đã được đặt vào đúng thư mục trong 'raw'.
|
| 73 |
-
"""
|
| 74 |
return self.ingest_files_with_progress(file_paths, None)
|
| 75 |
|
| 76 |
def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
|
| 77 |
"""
|
| 78 |
-
|
| 79 |
"""
|
| 80 |
logger.info(f"Starting ingestion for {len(file_paths)} files...")
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
all_chunks_to_process = []
|
| 86 |
|
| 87 |
-
# 1.
|
| 88 |
for i, file_path in enumerate(file_paths):
|
| 89 |
-
base_progress = 0.4 + (i / len(file_paths)) * 0.3 # 40% -> 70%
|
| 90 |
-
file_name = os.path.basename(file_path)
|
| 91 |
-
|
| 92 |
-
if progress_callback:
|
| 93 |
-
progress_callback(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")
|
| 94 |
-
|
| 95 |
-
# Xác định loại dữ liệu dựa trên phần mở rộng file
|
| 96 |
-
file_ext = os.path.splitext(file_path)[1].lower()
|
| 97 |
-
data_type = None
|
| 98 |
try:
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
if file_ext in ['.txt']:
|
| 103 |
-
data_type = 'text'
|
| 104 |
chunks = self.text_processor.process(file_path)
|
| 105 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
|
| 106 |
-
data_type = 'image'
|
| 107 |
chunks = self.image_processor.process(file_path)
|
| 108 |
elif file_ext in ['.wav', '.mp3']:
|
| 109 |
-
data_type = 'audio'
|
| 110 |
chunks = self.audio_processor.process(file_path)
|
| 111 |
-
# elif file_ext in ['.mp4', '.avi', '.mov']:
|
| 112 |
-
# data_type = 'video'
|
| 113 |
-
# chunks = self.video_processor.process_video(file_path)
|
| 114 |
else:
|
| 115 |
logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
|
| 116 |
continue
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
| 120 |
|
|
|
|
| 121 |
all_chunks_to_process.extend(chunks)
|
| 122 |
|
| 123 |
except Exception as e:
|
|
@@ -126,77 +111,89 @@ class IngestionService:
|
|
| 126 |
|
| 127 |
if not all_chunks_to_process:
|
| 128 |
logger.warning("No processable chunks were generated from the provided files.")
|
|
|
|
| 129 |
return
|
| 130 |
|
| 131 |
logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")
|
| 132 |
|
| 133 |
-
|
| 134 |
-
progress_callback(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")
|
| 135 |
|
| 136 |
-
# 2.
|
| 137 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 138 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 139 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 140 |
BATCH_SIZE = 32
|
| 141 |
|
| 142 |
for i, chunk_data in enumerate(all_chunks_to_process):
|
| 143 |
-
# Tính toán progress chi tiết hơn
|
| 144 |
-
base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25 # 70% -> 95%
|
| 145 |
-
|
| 146 |
-
chunk_type = chunk_data['metadata']['type']
|
| 147 |
-
content = chunk_data['content']
|
| 148 |
-
chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')
|
| 149 |
-
|
| 150 |
try:
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
embedding = None
|
| 155 |
if chunk_type == "text":
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
| 161 |
elif chunk_type == "audio":
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
| 167 |
elif chunk_type == "image":
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
#
|
| 175 |
if len(text_embeddings_batch) >= BATCH_SIZE:
|
| 176 |
-
|
| 177 |
-
progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
|
| 178 |
self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
|
| 179 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 180 |
|
| 181 |
if len(audio_embeddings_batch) >= BATCH_SIZE:
|
| 182 |
-
|
| 183 |
-
progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
|
| 184 |
self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
|
| 185 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 186 |
|
| 187 |
if len(image_embeddings_batch) >= BATCH_SIZE:
|
| 188 |
-
|
| 189 |
-
progress_callback(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
|
| 190 |
self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
|
| 191 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 192 |
|
| 193 |
except Exception as e:
|
| 194 |
-
logger.error(f"Error ingesting chunk {
|
|
|
|
| 195 |
|
| 196 |
-
|
| 197 |
-
progress_callback(0.95, desc="Saving final batches...")
|
| 198 |
|
| 199 |
-
#
|
| 200 |
final_operations = []
|
| 201 |
if text_embeddings_batch:
|
| 202 |
final_operations.append(("text", len(text_embeddings_batch)))
|
|
@@ -205,23 +202,27 @@ class IngestionService:
|
|
| 205 |
if image_embeddings_batch:
|
| 206 |
final_operations.append(("image", len(image_embeddings_batch)))
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")
|
|
|
|
| 1 |
# core/ingestion/ingestion_service.py
|
| 2 |
import os
|
|
|
|
| 3 |
from typing import List, Optional, Callable
|
|
|
|
| 4 |
|
| 5 |
from utils.logger import logger
|
|
|
|
| 6 |
from qdrant_client import QdrantClient
|
| 7 |
|
|
|
|
| 8 |
from core.data_processing.text_processor import TextProcessor
|
| 9 |
from core.data_processing.audio_processor import AudioProcessor
|
|
|
|
| 10 |
from core.data_processing.image_processor import ImageProcessor
|
| 11 |
|
|
|
|
| 12 |
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
| 13 |
from core.embeddings.image_embedding_model import ImageEmbeddingModel
|
| 14 |
from core.embeddings.audio_embedding_model import AudioEmbeddingModel
|
| 15 |
|
|
|
|
| 16 |
from core.retrieval.vector_db_manager import VectorDBManager
|
| 17 |
|
| 18 |
class IngestionService:
|
| 19 |
def __init__(self, client: QdrantClient):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
logger.info("Initializing IngestionService (Stateless)...")
|
| 21 |
|
| 22 |
self.client = client
|
| 23 |
+
|
| 24 |
self.text_processor = TextProcessor()
|
| 25 |
self.image_processor = ImageProcessor()
|
| 26 |
self.audio_processor = AudioProcessor()
|
| 27 |
+
|
| 28 |
self.text_embedder = TextEmbeddingModel()
|
| 29 |
self.image_embedder = ImageEmbeddingModel()
|
| 30 |
self.audio_embedder = AudioEmbeddingModel()
|
| 31 |
|
| 32 |
+
text_embedding_dim = self.text_embedder.model.get_sentence_embedding_dimension()
|
| 33 |
self.text_db_manager = VectorDBManager(
|
| 34 |
client=self.client,
|
| 35 |
collection_name="text_collection",
|
| 36 |
+
embedding_dim=text_embedding_dim
|
| 37 |
)
|
| 38 |
|
| 39 |
+
image_embedding_dim = self.image_embedder.model.config.hidden_size
|
| 40 |
self.image_vector_db_manager = VectorDBManager(
|
| 41 |
client=self.client,
|
| 42 |
collection_name="image_collection",
|
| 43 |
embedding_dim=image_embedding_dim
|
| 44 |
)
|
| 45 |
|
| 46 |
+
audio_embedding_dim = self.audio_embedder.model.config.projection_dim
|
| 47 |
self.audio_vector_db_manager = VectorDBManager(
|
| 48 |
client=self.client,
|
| 49 |
collection_name="audio_collection",
|
| 50 |
embedding_dim=audio_embedding_dim
|
| 51 |
)
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
logger.info("IngestionService initialized successfully.")
|
| 54 |
|
| 55 |
def ingest_files(self, file_paths: List[str]):
|
| 56 |
+
'''Ingest files without displaying progress bar'''
|
|
|
|
|
|
|
|
|
|
| 57 |
return self.ingest_files_with_progress(file_paths, None)
|
| 58 |
|
| 59 |
def ingest_files_with_progress(self, file_paths: List[str], progress_callback: Optional[Callable] = None):
|
| 60 |
"""
|
| 61 |
+
Turn on progress bar for tracking
|
| 62 |
"""
|
| 63 |
logger.info(f"Starting ingestion for {len(file_paths)} files...")
|
| 64 |
|
| 65 |
+
# Kiểm tra và xử lý progress_callback an toàn
|
| 66 |
+
def safe_progress(value, desc=""):
|
| 67 |
+
try:
|
| 68 |
+
if progress_callback is not None:
|
| 69 |
+
progress_callback(value, desc=desc)
|
| 70 |
+
except Exception as e:
|
| 71 |
+
logger.warning(f"Progress callback error: {e}")
|
| 72 |
+
|
| 73 |
+
safe_progress(0.4, desc="Starting file processing...")
|
| 74 |
|
| 75 |
all_chunks_to_process = []
|
| 76 |
|
| 77 |
+
# 1. Walk through files to split chunks
|
| 78 |
for i, file_path in enumerate(file_paths):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
try:
|
| 80 |
+
base_progress = 0.4 + (i / len(file_paths)) * 0.3 # 40% -> 70%
|
| 81 |
+
file_name = os.path.basename(file_path)
|
| 82 |
+
|
| 83 |
+
safe_progress(base_progress, desc=f"Processing file {i+1}/{len(file_paths)}: {file_name}")
|
| 84 |
+
|
| 85 |
+
file_ext = os.path.splitext(file_path)[1].lower()
|
| 86 |
+
chunks = []
|
| 87 |
+
|
| 88 |
+
safe_progress(base_progress + 0.01, desc=f"Reading {file_name}...")
|
| 89 |
|
| 90 |
if file_ext in ['.txt']:
|
|
|
|
| 91 |
chunks = self.text_processor.process(file_path)
|
| 92 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
|
|
|
|
| 93 |
chunks = self.image_processor.process(file_path)
|
| 94 |
elif file_ext in ['.wav', '.mp3']:
|
|
|
|
| 95 |
chunks = self.audio_processor.process(file_path)
|
|
|
|
|
|
|
|
|
|
| 96 |
else:
|
| 97 |
logger.warning(f"Unsupported file type '{file_ext}' for file: {file_path}. Skipping.")
|
| 98 |
continue
|
| 99 |
|
| 100 |
+
# Kiểm tra chunks có hợp lệ không
|
| 101 |
+
if not chunks or len(chunks) == 0:
|
| 102 |
+
logger.warning(f"No chunks generated from file: {file_path}")
|
| 103 |
+
continue
|
| 104 |
|
| 105 |
+
safe_progress(base_progress + 0.02, desc=f"Generated {len(chunks)} chunks from {file_name}")
|
| 106 |
all_chunks_to_process.extend(chunks)
|
| 107 |
|
| 108 |
except Exception as e:
|
|
|
|
| 111 |
|
| 112 |
if not all_chunks_to_process:
|
| 113 |
logger.warning("No processable chunks were generated from the provided files.")
|
| 114 |
+
safe_progress(1.0, desc="No chunks to process")
|
| 115 |
return
|
| 116 |
|
| 117 |
logger.info(f"Generated {len(all_chunks_to_process)} total chunks. Now generating embeddings...")
|
| 118 |
|
| 119 |
+
safe_progress(0.7, desc=f"Generated {len(all_chunks_to_process)} chunks. Starting embeddings...")
|
|
|
|
| 120 |
|
| 121 |
+
# 2. Create embeddings and add to batch
|
| 122 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 123 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 124 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 125 |
BATCH_SIZE = 32
|
| 126 |
|
| 127 |
for i, chunk_data in enumerate(all_chunks_to_process):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
try:
|
| 129 |
+
base_progress = 0.7 + (i / len(all_chunks_to_process)) * 0.25 # 70% -> 95%
|
| 130 |
+
|
| 131 |
+
# Kiểm tra chunk_data có hợp lệ không
|
| 132 |
+
if not chunk_data or 'metadata' not in chunk_data or 'content' not in chunk_data:
|
| 133 |
+
logger.warning(f"Invalid chunk data at index {i}, skipping...")
|
| 134 |
+
continue
|
| 135 |
+
|
| 136 |
+
chunk_type = chunk_data['metadata'].get('type', 'unknown')
|
| 137 |
+
content = chunk_data['content']
|
| 138 |
+
chunk_id = chunk_data['metadata'].get('chunk_id', f'chunk_{i}')
|
| 139 |
+
|
| 140 |
+
# Kiểm tra content có hợp lệ không
|
| 141 |
+
if not content:
|
| 142 |
+
logger.warning(f"Empty content for chunk {chunk_id}, skipping...")
|
| 143 |
+
continue
|
| 144 |
+
|
| 145 |
+
safe_progress(base_progress, desc=f"Processing chunk {i+1}/{len(all_chunks_to_process)} ({chunk_type})")
|
| 146 |
|
|
|
|
| 147 |
if chunk_type == "text":
|
| 148 |
+
safe_progress(base_progress + 0.001, desc=f"Creating text embedding for chunk {i+1}")
|
| 149 |
+
embeddings = self.text_embedder.get_embeddings([content])
|
| 150 |
+
if embeddings and len(embeddings) > 0:
|
| 151 |
+
text_embeddings_batch.append(embeddings[0])
|
| 152 |
+
text_metadatas_batch.append(chunk_data)
|
| 153 |
+
else:
|
| 154 |
+
logger.warning(f"Failed to generate text embedding for chunk {chunk_id}")
|
| 155 |
+
|
| 156 |
elif chunk_type == "audio":
|
| 157 |
+
safe_progress(base_progress + 0.001, desc=f"Creating audio embedding for chunk {i+1}")
|
| 158 |
+
embeddings = self.audio_embedder.get_embeddings([content])
|
| 159 |
+
if embeddings and len(embeddings) > 0:
|
| 160 |
+
audio_embeddings_batch.append(embeddings[0])
|
| 161 |
+
audio_metadatas_batch.append(chunk_data)
|
| 162 |
+
else:
|
| 163 |
+
logger.warning(f"Failed to generate audio embedding for chunk {chunk_id}")
|
| 164 |
+
|
| 165 |
elif chunk_type == "image":
|
| 166 |
+
safe_progress(base_progress + 0.001, desc=f"Creating image embedding for chunk {i+1}")
|
| 167 |
+
embeddings = self.image_embedder.get_embeddings([content])
|
| 168 |
+
if embeddings and len(embeddings) > 0:
|
| 169 |
+
image_embeddings_batch.append(embeddings[0])
|
| 170 |
+
image_metadatas_batch.append(chunk_data)
|
| 171 |
+
else:
|
| 172 |
+
logger.warning(f"Failed to generate image embedding for chunk {chunk_id}")
|
| 173 |
|
| 174 |
+
# add batch when reaching BATCH_SIZE
|
| 175 |
if len(text_embeddings_batch) >= BATCH_SIZE:
|
| 176 |
+
safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(text_embeddings_batch)} text embeddings...")
|
|
|
|
| 177 |
self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
|
| 178 |
text_embeddings_batch, text_metadatas_batch = [], []
|
| 179 |
|
| 180 |
if len(audio_embeddings_batch) >= BATCH_SIZE:
|
| 181 |
+
safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(audio_embeddings_batch)} audio embeddings...")
|
|
|
|
| 182 |
self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
|
| 183 |
audio_embeddings_batch, audio_metadatas_batch = [], []
|
| 184 |
|
| 185 |
if len(image_embeddings_batch) >= BATCH_SIZE:
|
| 186 |
+
safe_progress(base_progress + 0.002, desc=f"Saving batch of {len(image_embeddings_batch)} image embeddings...")
|
|
|
|
| 187 |
self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
|
| 188 |
image_embeddings_batch, image_metadatas_batch = [], []
|
| 189 |
|
| 190 |
except Exception as e:
|
| 191 |
+
logger.error(f"Error ingesting chunk {i}: {e}")
|
| 192 |
+
continue
|
| 193 |
|
| 194 |
+
safe_progress(0.95, desc="Saving final batches...")
|
|
|
|
| 195 |
|
| 196 |
+
# adding maintaining embeddings
|
| 197 |
final_operations = []
|
| 198 |
if text_embeddings_batch:
|
| 199 |
final_operations.append(("text", len(text_embeddings_batch)))
|
|
|
|
| 202 |
if image_embeddings_batch:
|
| 203 |
final_operations.append(("image", len(image_embeddings_batch)))
|
| 204 |
|
| 205 |
+
# Tránh chia cho 0
|
| 206 |
+
total_operations = len(final_operations)
|
| 207 |
+
if total_operations == 0:
|
| 208 |
+
safe_progress(1.0, desc="No final batches to save")
|
| 209 |
+
else:
|
| 210 |
+
for i, (batch_type, count) in enumerate(final_operations):
|
| 211 |
+
try:
|
| 212 |
+
current_progress = 0.95 + (i / total_operations) * 0.04 # 95% -> 99%
|
| 213 |
+
|
| 214 |
+
if batch_type == "text" and text_embeddings_batch:
|
| 215 |
+
safe_progress(current_progress, desc=f"Saving final {count} text embeddings...")
|
| 216 |
+
self.text_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
|
| 217 |
+
elif batch_type == "audio" and audio_embeddings_batch:
|
| 218 |
+
safe_progress(current_progress, desc=f"Saving final {count} audio embeddings...")
|
| 219 |
+
self.audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
|
| 220 |
+
elif batch_type == "image" and image_embeddings_batch:
|
| 221 |
+
safe_progress(current_progress, desc=f"Saving final {count} image embeddings...")
|
| 222 |
+
self.image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
|
| 223 |
+
except Exception as e:
|
| 224 |
+
logger.error(f"Error saving final batch {batch_type}: {e}")
|
| 225 |
+
|
| 226 |
+
safe_progress(1.0, desc=f"✅ Successfully ingested {len(file_paths)} files with {len(all_chunks_to_process)} chunks!")
|
| 227 |
|
| 228 |
logger.success(f"Successfully completed ingestion for {len(file_paths)} files.")
|
main.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# main.py
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import shutil
|
| 5 |
+
import atexit
|
| 6 |
+
import signal
|
| 7 |
+
|
| 8 |
+
# add project folder to sys.path
|
| 9 |
+
project_root = os.path.dirname(os.path.abspath(__file__))
|
| 10 |
+
if project_root not in sys.path:
|
| 11 |
+
sys.path.insert(0, project_root)
|
| 12 |
+
|
| 13 |
+
from config.settings import settings
|
| 14 |
+
from utils.logger import logger
|
| 15 |
+
from app import create_and_run_app
|
| 16 |
+
from app import shared_qdrant_client
|
| 17 |
+
|
| 18 |
+
GLOBAL_QDRANT_CLIENT = shared_qdrant_client
|
| 19 |
+
|
| 20 |
+
def cleanup():
    """Release the Qdrant client and wipe runtime data directories on exit.

    Registered via ``atexit`` in :func:`main`, so it runs on normal exit,
    on unhandled exceptions, and after the Ctrl+C handler calls ``sys.exit``.
    """
    logger.info("--- Starting cleanup process ---")

    # --- Step 1: Close Qdrant connection ---
    # Closing the client releases its on-disk .lock file so the data
    # directory can actually be deleted below.
    global GLOBAL_QDRANT_CLIENT
    if GLOBAL_QDRANT_CLIENT:
        try:
            logger.info("Closing Qdrant client connection...")
            GLOBAL_QDRANT_CLIENT.close()
            logger.success("Qdrant client closed successfully.")
        except Exception as e:
            logger.error(f"Error closing Qdrant client: {e}")

    # Step 2: Clean up "qdrant_data" folder. Retry a few times because the
    # lock released above may take a moment to let go of the files.
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    if os.path.exists(qdrant_db_path) and os.path.isdir(qdrant_db_path):
        import time
        try:
            retries = 3
            for attempt in range(retries):
                try:
                    shutil.rmtree(qdrant_db_path)
                    logger.success(f"Successfully cleaned up Qdrant data at: {qdrant_db_path}")
                    break
                except OSError as e:
                    if attempt < retries - 1:
                        logger.warning(f"Cleanup attempt {attempt+1} failed: {e}. Retrying in 1 second...")
                        time.sleep(1)
                    else:
                        raise
        except Exception as e:
            logger.error(f"Error cleaning up Qdrant data after retries: {e}")
    else:
        logger.info("Qdrant data directory not found, skipping cleanup.")

    # Steps 3 & 4: empty the raw-input and processed-chunk directories.
    # The directories themselves are kept; only their contents are removed.
    _clear_directory_contents(settings.RAW_DATA_DIR, "raw data")
    _clear_directory_contents(settings.CHUNKS_DIR, "processed/chunks data")

    logger.info("--- Cleanup process finished ---")


def _clear_directory_contents(dir_path, label):
    """Delete every file and subdirectory inside *dir_path* (best-effort).

    The directory itself is preserved. *label* is only used in log messages.
    Missing directories are logged and skipped rather than treated as errors.
    """
    if not (os.path.exists(dir_path) and os.path.isdir(dir_path)):
        logger.info(f"{label} directory not found, skipping cleanup.")
        return
    try:
        for item in os.listdir(dir_path):
            item_path = os.path.join(dir_path, item)
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            elif os.path.isfile(item_path):
                os.remove(item_path)
        logger.success(f"Successfully cleaned up {label} directory: {dir_path}")
    except Exception as e:
        logger.error(f"Error cleaning up {label}: {e}")
|
| 87 |
+
def signal_handler(sig, frame):
    """Handle SIGINT (Ctrl+C) by shutting the application down gracefully.

    Exiting via ``sys.exit`` lets the ``atexit``-registered ``cleanup``
    hook run automatically, so no explicit cleanup call is needed here.
    """
    logger.warning("\nCtrl+C detected. Shutting down the application.")
    sys.exit(0)
|
| 94 |
+
|
| 95 |
+
def main():
    """Entry point: wire up shutdown hooks and launch the Gradio app."""
    # Ensure cleanup() runs whether the program exits normally or crashes.
    atexit.register(cleanup)

    # Route Ctrl+C through our graceful shutdown handler.
    signal.signal(signal.SIGINT, signal_handler)

    logger.info("--- Starting Multimedia RAG Assistant ---")

    demo = create_and_run_app()

    banner = "=" * 50
    print("\n" + banner)
    print("   Application is running. Press Ctrl+C to exit.   ")
    print("   Cleanup will be performed upon exit.   ")
    print(banner + "\n")

    demo.launch()

if __name__ == "__main__":
    main()
|
scripts/ingest_data.py
DELETED
|
@@ -1,203 +0,0 @@
|
|
| 1 |
-
# scripts/ingest_data.py
|
| 2 |
-
import os
|
| 3 |
-
import json
|
| 4 |
-
from tqdm import tqdm
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import Dict
|
| 7 |
-
import shutil
|
| 8 |
-
|
| 9 |
-
from config.settings import settings
|
| 10 |
-
from utils.logger import logger
|
| 11 |
-
from qdrant_client import QdrantClient
|
| 12 |
-
|
| 13 |
-
# Import các Processor (không thay đổi)
|
| 14 |
-
from core.data_processing.text_processor import TextProcessor
|
| 15 |
-
from core.data_processing.audio_processor import AudioProcessor
|
| 16 |
-
# from core.data_processing.video_processor import VideoProcessor
|
| 17 |
-
from core.data_processing.image_processor import ImageProcessor
|
| 18 |
-
|
| 19 |
-
# Import các Embedding Model (không thay đổi)
|
| 20 |
-
from core.embeddings.text_embedding_model import TextEmbeddingModel
|
| 21 |
-
from core.embeddings.image_embedding_model import ImageEmbeddingModel
|
| 22 |
-
from core.embeddings.audio_embedding_model import AudioEmbeddingModel
|
| 23 |
-
|
| 24 |
-
# Import VectorDBManager phiên bản Qdrant MỚI
|
| 25 |
-
from core.retrieval.vector_db_manager import VectorDBManager
|
| 26 |
-
|
| 27 |
-
def walk_through_files(extentions, raw_dir: Path, all_raw_chunks_from_processors: list, processor) -> None:
    """Process every file under *raw_dir* whose suffix is in *extentions*.

    Args:
        extentions: Collection of file suffixes to accept (e.g. {".txt"}).
            Matching is case-insensitive, so ".JPG" files are also picked up.
        raw_dir: Directory to scan recursively (a pathlib.Path).
        all_raw_chunks_to_processors: Output list; chunks produced by the
            processor are appended in place.
        processor: Object exposing ``process(path: str) -> list`` that turns
            one file into a list of chunk dicts.
    """
    # Normalize once so the per-file comparison is case-insensitive.
    wanted = {ext.lower() for ext in extentions}
    all_files = list(raw_dir.rglob("*"))
    for filepath in tqdm(all_files, desc="Processing " + raw_dir.name):
        # Check is_file() first: directories also have suffixes sometimes.
        if filepath.is_file() and filepath.suffix.lower() in wanted:
            all_raw_chunks_from_processors.extend(
                processor.process(str(filepath))
            )
|
| 34 |
-
|
| 35 |
-
def ingest_data_pipeline():
    """Run the full offline ingestion: chunk raw files, embed them, and index
    the embeddings into per-modality Qdrant collections.

    Phases:
        1. Clean output dirs and re-chunk all raw text/audio/image files.
        2. Embed every chunk and upsert batches into Qdrant collections.
    """
    logger.info("Starting comprehensive data ingestion pipeline (Chunking + Embedding + Qdrant Indexing)...")

    # --- 1. Initialize the processors (one per modality) ---
    text_processor = TextProcessor(chunk_size=500, chunk_overlap=50)
    audio_processor = AudioProcessor(min_silence_len=1000, silence_thresh_db=-40, target_sr=16000)
    image_processor = ImageProcessor()
    # video_processor = VideoProcessor(chunk_duration_sec=15, frames_per_segment=5)

    # --- Clean up old chunk directories and old Qdrant data ---
    dirs_to_clean_and_create = [
        settings.CHUNKS_DIR,
        settings.METADATA_DIR
    ]
    # Qdrant's on-disk data directory
    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    dirs_to_clean_and_create.append(qdrant_db_path)

    for dir_path in dirs_to_clean_and_create:
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
            logger.info(f"Cleaned up old directory: {dir_path}")
        # Recreate the chunking directories, but NOT the qdrant one
        # (the Qdrant client creates its own directory on connect).
        if dir_path != qdrant_db_path:
            os.makedirs(dir_path, exist_ok=True)

    logger.info("Output directories and previous Qdrant data are ready for fresh ingestion.")

    qdrant_db_path = os.path.join(settings.DATA_DIR, "qdrant_data")
    client = QdrantClient(path=qdrant_db_path)
    logger.info(f"Single Qdrant client initialized for ingestion, connected to: {qdrant_db_path}")

    all_raw_chunks_from_processors = []  # Holds every chunk (content + metadata) from all modalities

    # --- 2. Run data processing (chunking) ---
    logger.info("--- Phase 1: Processing Raw Data into Chunks ---")

    # Process text files
    text_extentions = {".txt"}
    text_raw_dir = Path(settings.RAW_DATA_DIR) / "texts"
    walk_through_files(text_extentions, text_raw_dir, all_raw_chunks_from_processors, text_processor)

    # Process audio files
    audio_extentions = {".wav", ".mp3"}
    audio_raw_dir = Path(settings.RAW_DATA_DIR) / "audios"
    walk_through_files(audio_extentions, audio_raw_dir, all_raw_chunks_from_processors, audio_processor)

    # process images
    image_extentions = {".jpg", ".png"}
    image_raw_dir = Path(settings.RAW_DATA_DIR) / "images"
    walk_through_files(image_extentions, image_raw_dir, all_raw_chunks_from_processors, image_processor)

    # Process video files (disabled for now)
    # video_raw_dir = os.path.join(settings.RAW_DATA_DIR, "videos")
    # for filename in tqdm(os.listdir(video_raw_dir), desc="Processing Video"):
    #     if filename.endswith((".mp4", ".avi", ".mov")):
    #         all_raw_chunks_from_processors.extend(video_processor.process_video(os.path.join(video_raw_dir, filename)))

    logger.info(f"Total raw chunks processed from all sources: {len(all_raw_chunks_from_processors)}")

    # --- 3. Generate embeddings and add them to Qdrant ---
    logger.info("--- Phase 2: Generating Embeddings and Building Qdrant Collections ---")

    # Initialize the embedding models
    text_embedder = TextEmbeddingModel()
    image_embedder = ImageEmbeddingModel()
    audio_embedder = AudioEmbeddingModel()

    # --- Initialize a VectorDBManager per Qdrant collection ---
    # Read the embedding size from the model to guarantee it matches.
    text_embedding_dim = text_embedder.model.get_sentence_embedding_dimension()
    text_vector_db_manager = VectorDBManager(collection_name="text_collection", embedding_dim=text_embedding_dim, client=client)

    # Embedding size for image/audio (assumed to be 512)
    image_embedding_dim = 512
    image_vector_db_manager = VectorDBManager(collection_name="image_collection", embedding_dim=image_embedding_dim, client=client)

    # video_frame_embedding_dim = 512
    # video_frame_vector_db_manager = VectorDBManager(collection_name="video_frame_collection", embedding_dim=video_frame_embedding_dim, client=client)

    audio_embedding_dim = 512
    # NOTE(review): passes image_embedding_dim here — likely should be
    # audio_embedding_dim (both are 512 today, so behavior is unchanged,
    # but this would break silently if the dims ever diverge).
    audio_vector_db_manager = VectorDBManager(collection_name="audio_collection", embedding_dim=image_embedding_dim, client=client)

    logger.info(f"Initialized Text Qdrant Collection Manager with {text_embedding_dim}D.")
    logger.info(f"Initialized Image Qdrant Collection Manager with {image_embedding_dim}D.")
    logger.info(f"Initialized Audio Qdrant Collection Manager with {audio_embedding_dim}D.")

    # Batch buffers so Qdrant upserts happen in groups rather than per chunk
    text_embeddings_batch = []
    text_metadatas_batch = []

    image_embeddings_batch = []
    image_metadatas_batch = []

    # video_frame_embeddings_batch = []
    # video_frame_metadatas_batch = []

    audio_embeddings_batch = []
    audio_metadatas_batch = []

    BATCH_SIZE = 32  # Upsert 32 points at a time

    for chunk_data in tqdm(all_raw_chunks_from_processors, desc="Generating Embeddings & Populating Qdrant"):
        chunk_type = chunk_data['metadata']['type']
        content = chunk_data['content']

        try:
            if chunk_type == "text":
                embedding = text_embedder.get_embeddings([content])[0]
                text_embeddings_batch.append(embedding)
                text_metadatas_batch.append(chunk_data)

            elif chunk_type == "audio":
                embedding = audio_embedder.get_embeddings([content])[0]
                audio_embeddings_batch.append(embedding)
                audio_metadatas_batch.append(chunk_data)

            elif chunk_type == "image":
                embedding = image_embedder.get_embeddings([content])[0]
                image_embeddings_batch.append(embedding)
                image_metadatas_batch.append(chunk_data)

            # elif chunk_type == "video_frame":
            #     if content and isinstance(content, list) and len(content) > 0:
            #         embedding = image_embedder.get_embeddings([content[0]])[0]  # Embed only the first frame
            #         video_frame_embeddings_batch.append(embedding)
            #         video_frame_metadatas_batch.append(chunk_data['metadata'])

            # Flush any batch that reached the threshold
            if len(text_embeddings_batch) >= BATCH_SIZE:
                text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
                text_embeddings_batch, text_metadatas_batch = [], []  # Reset batch

            if len(audio_embeddings_batch) >= BATCH_SIZE:
                audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
                audio_embeddings_batch, audio_metadatas_batch = [], []  # Reset batch

            if len(image_embeddings_batch) >= BATCH_SIZE:
                image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
                image_embeddings_batch, image_metadatas_batch = [], []  # Reset batch

            # if len(video_frame_embeddings_batch) >= BATCH_SIZE:
            #     video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)
            #     video_frame_embeddings_batch, video_frame_metadatas_batch = [], []  # Reset batch

        except Exception as e:
            # Best-effort: log and continue with the next chunk
            logger.error(f"Error processing chunk {chunk_data['metadata']['chunk_id']}: {e}")

    # Flush the remaining, partially-filled batches
    if text_embeddings_batch:
        text_vector_db_manager.add_vectors(text_embeddings_batch, text_metadatas_batch)
    if audio_embeddings_batch:
        audio_vector_db_manager.add_vectors(audio_embeddings_batch, audio_metadatas_batch)
    if image_embeddings_batch:
        image_vector_db_manager.add_vectors(image_embeddings_batch, image_metadatas_batch)
    # if video_frame_embeddings_batch:
    #     video_frame_vector_db_manager.add_vectors(video_frame_embeddings_batch, video_frame_metadatas_batch)

    logger.success("Finished populating Qdrant collections.")
    logger.info(f"Total vectors in 'text_collection': {text_vector_db_manager.get_total_vectors()}")
    logger.info(f"Total vectors in 'audio_collection': {audio_vector_db_manager.get_total_vectors()}")
    logger.info(f"Total vectors in 'image_collection': {image_vector_db_manager.get_total_vectors()}")
    # logger.info(f"Total vectors in 'video_frame_collection': {video_frame_vector_db_manager.get_total_vectors()}")

    logger.info("Data ingestion pipeline completed successfully!")


if __name__ == "__main__":
    ingest_data_pipeline()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/logger.py
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
|
|
| 1 |
import sys
|
| 2 |
from loguru import logger
|
| 3 |
-
from config.settings import settings
|
| 4 |
|
| 5 |
-
#
|
| 6 |
-
logger.remove() #
|
|
|
|
|
|
|
| 7 |
logger.add(
|
| 8 |
-
|
| 9 |
-
rotation="10 MB",
|
| 10 |
-
compression="zip",
|
| 11 |
-
level=settings.LOG_LEVEL, #
|
| 12 |
-
colorize=True,
|
| 13 |
format="{time} {level} {message}",
|
| 14 |
-
enqueue=True
|
| 15 |
)
|
| 16 |
logger.add(
|
| 17 |
-
sys.stderr, #
|
| 18 |
level=settings.LOG_LEVEL,
|
| 19 |
colorize=True,
|
| 20 |
format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
|
| 21 |
-
)
|
| 22 |
-
|
| 23 |
-
# Xuất logger để các module khác có thể import và sử dụng
|
| 24 |
-
__all__ = ["logger"]
|
|
|
|
| 1 |
+
import os
|
| 2 |
import sys
|
| 3 |
from loguru import logger
|
| 4 |
+
from config.settings import settings
|
| 5 |
|
| 6 |
+
# logger configuration
|
| 7 |
+
logger.remove() # remove default config
|
| 8 |
+
|
| 9 |
+
log_path = os.path.join(settings.LOG_DIR, "file_{time}.log")
|
| 10 |
logger.add(
|
| 11 |
+
log_path,
|
| 12 |
+
rotation="10 MB",
|
| 13 |
+
compression="zip",
|
| 14 |
+
level=settings.LOG_LEVEL, # log level from settings
|
| 15 |
+
colorize=True,
|
| 16 |
format="{time} {level} {message}",
|
| 17 |
+
enqueue=True
|
| 18 |
)
|
| 19 |
logger.add(
|
| 20 |
+
sys.stderr, # output to console
|
| 21 |
level=settings.LOG_LEVEL,
|
| 22 |
colorize=True,
|
| 23 |
format="<green>{time}</green> <level>{level}</level> <bold>{message}</bold>"
|
| 24 |
+
)
|
|
|
|
|
|
|
|
|