Spaces:

m97j
/

knowledge-engine

Sleeping

App Files Files Community

m97j committed 16 days ago

Commit

cda6eee

1 Parent(s): 972e4b1

feat: Change qdrant from local mode to server mode.

Browse files

Files changed (9) hide show

Dockerfile +13 -1
core/config.py +3 -2
main.py +1 -6
scripts/data_pipeline.py +444 -281
scripts/setup_db.py +4 -2
services/search_service.py +10 -2
start.sh +23 -0
storage/qdrant_client.py +4 -4
storage/sqlite_client.py +30 -25

Dockerfile CHANGED Viewed

@@ -7,16 +7,28 @@ ENV PYTHONUNBUFFERED=1
 WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
 RUN pip install --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 COPY . .
 VOLUME ["/app/data"]
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

 WORKDIR /app
+# Install essential system packages and wget for downloading Qdrant binary
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
+    wget \
     && rm -rf /var/lib/apt/lists/*
+# Download Qdrant Binaries (Based on v1.16.2, for Linux)
+RUN wget https://github.com/qdrant/qdrant/releases/download/v1.16.2/qdrant-x86_64-unknown-linux-gnu.tar.gz && \
+    tar -xzf qdrant-x86_64-unknown-linux-gnu.tar.gz && \
+    mv qdrant /usr/local/bin/ && \
+    rm qdrant-x86_64-unknown-linux-gnu.tar.gz
 COPY requirements.txt .
 RUN pip install --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 COPY . .
+# Grant execution permissions to the startup script
+RUN chmod +x start.sh
 VOLUME ["/app/data"]
+# Control multiple processes via start.sh
+CMD ["./start.sh"]

core/config.py CHANGED Viewed

@@ -21,9 +21,10 @@ class Settings(BaseSettings):
     REPO_ID: str = Field(default="m97j/ke-store", description="Hugging Face repository ID")
     # 2. Storage Settings (Vector DB & RDBMS)
-    QDRANT_PATH: str = Field(default="./data/qdrant", description="Qdrant local storage path")
     QDRANT_COLLECTION: str = Field(default="knowledge_base", description="Qdrant collection name")
-    SQLITE_PATH: str = Field(default="./data/corpus/corpus.sqlite", description="SQLite DB file path")
     # 3. Model Settings (Embedder & Reranker)
     EMBEDDER_NAME: str = Field(default="BAAI/bge-m3", description="FlagEmbedding model name")

     REPO_ID: str = Field(default="m97j/ke-store", description="Hugging Face repository ID")
     # 2. Storage Settings (Vector DB & RDBMS)
+    SQLITE_PATH: str = Field(default="{DATA_DIR}/knowledge_base/corpus.sqlite", description="SQLite DB file path")
+    QDRANT_PATH: str = Field(default="{DATA_DIR}/vector_store/qdrant", description="Qdrant local storage path")
     QDRANT_COLLECTION: str = Field(default="knowledge_base", description="Qdrant collection name")
+    QDRANT_URL: str = Field(default="http://localhost:6333", description="Qdrant server URL (if using client-server mode)")
     # 3. Model Settings (Embedder & Reranker)
     EMBEDDER_NAME: str = Field(default="BAAI/bge-m3", description="FlagEmbedding model name")

main.py CHANGED Viewed

@@ -12,7 +12,6 @@ from core.exceptions import setup_exception_handlers
 from core.logger import setup_logger
 from models.embedder import TextEmbedder
 from models.reranker import TextReranker
-from scripts.setup_db import download_knowledge_base
 from services.search_service import HybridSearchService
 from storage.qdrant_client import QdrantStorage
 from storage.sqlite_client import SQLiteStorage
@@ -34,12 +33,8 @@ async def lifespan(app: FastAPI):
     sqlite_client = None
     try:
-        # 0. Prepare dependency data (DB) (Download if unavailable, skip if available)
-        logger.info("Checking and preparing Knowledge Base data...")
-        download_knowledge_base()
         # 1. Infrastructure Connection (Database)
-        qdrant_client = QdrantStorage(path=settings.QDRANT_PATH, collection_name=settings.QDRANT_COLLECTION)
         sqlite_client = SQLiteStorage(db_path=settings.SQLITE_PATH)
         # 2. Load AI Model (Singleton)

 from core.logger import setup_logger
 from models.embedder import TextEmbedder
 from models.reranker import TextReranker
 from services.search_service import HybridSearchService
 from storage.qdrant_client import QdrantStorage
 from storage.sqlite_client import SQLiteStorage
     sqlite_client = None
     try:
         # 1. Infrastructure Connection (Database)
+        qdrant_client = QdrantStorage(url=settings.QDRANT_URL, collection_name=settings.QDRANT_COLLECTION)
         sqlite_client = SQLiteStorage(db_path=settings.SQLITE_PATH)
         # 2. Load AI Model (Singleton)

scripts/data_pipeline.py CHANGED Viewed

@@ -1,49 +1,79 @@
 # scripts/data_pipeline.py
-import json
 import os
 import re
 import sqlite3
 import numpy as np
 from datasets import load_dataset
 from FlagEmbedding import BGEM3FlagModel
 from qdrant_client import QdrantClient
-from qdrant_client.models import (Distance, OptimizersConfigDiff, PointStruct,
-                                  ScalarQuantization, ScalarQuantizationConfig,
-                                  ScalarType, SparseIndexParams, SparseVector,
                                   SparseVectorParams, VectorParams)
 from tqdm import tqdm
 from transformers import AutoTokenizer
 class KnowledgeEngineBuilder:
-    def __init__(self, base_dir="ke_store", dim=1024):
         self.base_dir = base_dir
         self.dim = dim
-        print("Loading BGE-M3 Model and Tokenizer...")
-        self.model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
-        self.tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')
-        self.max_tokens = 384
-        self.overlap_count = 2
         self._init_dirs()
-        self._init_sqlite()
-        self._init_meta()
-        self._init_qdrant()
-    # ---------------------------
-    # INIT & SETUP
-    # ---------------------------
-    def _init_dirs(self):
-        for d in ["corpus", "qdrant", "build_cache/embeddings"]:
-            os.makedirs(os.path.join(self.base_dir, d), exist_ok=True)
-    def _init_qdrant(self):
-        self.qdrant_path = f"{self.base_dir}/qdrant"
-        self.qdrant_client = QdrantClient(path=self.qdrant_path)
         self.collection_name = "knowledge_base"
         if not self.qdrant_client.collection_exists(self.collection_name):
@@ -57,331 +87,464 @@ class KnowledgeEngineBuilder:
                     "sparse": SparseVectorParams(index=SparseIndexParams(on_disk=True))
                 },
                 quantization_config=ScalarQuantization(
-                    scalar=ScalarQuantizationConfig(type=ScalarType.INT8, always_ram=True)
                 ),
-                optimizers_config=OptimizersConfigDiff(indexing_threshold=0)
             )
-    def _optimize_sqlite(self, conn):
-        conn.execute("PRAGMA journal_mode=WAL;")
-        conn.execute("PRAGMA synchronous=NORMAL;")
-        conn.execute("PRAGMA temp_store=MEMORY;")
-        conn.execute("PRAGMA cache_size=-2000000")
     def _init_sqlite(self):
-        self.conn = sqlite3.connect(f"{self.base_dir}/corpus/corpus.sqlite")
-        self._optimize_sqlite(self.conn)
-        cur = self.conn.cursor()
         cur.execute("""
         CREATE TABLE IF NOT EXISTS documents (
-            doc_id INTEGER PRIMARY KEY AUTOINCREMENT,
             external_id TEXT, title TEXT, lang TEXT, url TEXT,
             wikidata_id TEXT, date_modified TEXT, full_text TEXT)
         """)
         cur.execute("""
         CREATE TABLE IF NOT EXISTS chunks (
-            chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,
             doc_id INTEGER, chunk_index INTEGER, text TEXT,
-            token_length INTEGER, section TEXT, lang TEXT)
         """)
         cur.execute("""
         CREATE TABLE IF NOT EXISTS spans (
-            span_id INTEGER PRIMARY KEY AUTOINCREMENT,
-            chunk_id INTEGER, span_index INTEGER, text TEXT, char_length INTEGER)
         """)
         cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_doc_id ON chunks(doc_id)")
         cur.execute("CREATE INDEX IF NOT EXISTS idx_spans_chunk_id ON spans(chunk_id)")
         cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_lang ON chunks(lang)")
         self.conn.commit()
-    def _init_meta(self):
-        self.meta_path = f"{self.base_dir}/corpus/meta.json"
-        cur = self.conn.cursor()
-        cur.execute("SELECT MAX(doc_id) FROM documents")
-        db_doc = cur.fetchone()[0] or 0
-        cur.execute("SELECT MAX(chunk_id) FROM chunks")
-        db_chunk = cur.fetchone()[0] or 0
-        cur.execute("SELECT MAX(span_id) FROM spans")
-        db_span = cur.fetchone()[0] or 0
-        self.meta = {
-            "last_doc_id": db_doc + 1,
-            "last_chunk_id": db_chunk + 1,
-            "last_span_id": db_span + 1
-        }
-        self._save_meta()
-    def _save_meta(self):
-        with open(self.meta_path, "w") as f:
-            json.dump(self.meta, f, indent=4)
-    # ---------------------------
-    # TEXT PROCESSING & INGESTION
-    # ---------------------------
-    def split_sentences(self, text):
-        text = re.sub(r'[ \t]+', ' ', text)
-        pattern = r'(?<=[.!?。！？])(?<![Ar|Dr|Mr|Ms|St]\.)(?<![A-Z]\.)\s+'
-        sentences = re.split(pattern, text)
-        final_sentences = []
-        for s in sentences:
-            sub_parts = [p.strip() for p in s.split('\n') if p.strip()]
-            final_sentences.extend(sub_parts)
-        return [s for s in final_sentences if len(s) > 1]
-    def count_tokens(self, text):
-        return len(self.tokenizer.encode(text, add_special_tokens=False))
-    def get_token_counts_batch(self, texts):
-        if not texts: return []
-        encodings = self.tokenizer(texts, add_special_tokens=False, padding=False, truncation=False)
-        return [len(ids) for ids in encodings['input_ids']]
-    def _split_monster_sentence(self, sentence):
-        words = sentence.split(' ')
-        sub_spans, current_sub, current_toks = [], [], 0
-        for word in words:
-            word_toks = self.count_tokens(word)
-            if word_toks > self.max_tokens:
-                if current_sub:
-                    sub_spans.append(" ".join(current_sub))
-                    current_sub, current_toks = [], 0
-                half = len(word) // 2
-                sub_spans.extend([word[:half], word[half:]])
                 continue
-            space_tok = 1 if current_sub else 0
-            if current_toks + word_toks + space_tok > self.max_tokens and current_sub:
-                sub_spans.append(" ".join(current_sub))
-                current_sub, current_toks = [word], word_toks
             else:
-                current_sub.append(word)
-                current_toks += word_toks + space_tok
-        if current_sub: sub_spans.append(" ".join(current_sub))
-        return sub_spans
-    def chunk_text(self, text):
-        raw_sentences = self.split_sentences(text)
-        sentence_lengths = self.get_token_counts_batch(raw_sentences)
-        refined_spans = []
-        for s, length in zip(raw_sentences, sentence_lengths):
-            if length > self.max_tokens: refined_spans.extend(self._split_monster_sentence(s))
-            else: refined_spans.append(s)
-        span_toks_list = self.get_token_counts_batch(refined_spans)
-        chunks, current_spans, current_tokens = [], [], 0
-        for span, span_toks in zip(refined_spans, span_toks_list):
-            if current_tokens + span_toks > self.max_tokens and current_spans:
-                chunk_text = " ".join(current_spans)
-                chunks.append((chunk_text, self.count_tokens(chunk_text), list(current_spans)))
-                actual_overlap = min(self.overlap_count, len(current_spans) - 1)
-                if actual_overlap > 0:
-                    current_spans = current_spans[-actual_overlap:]
-                    current_tokens = self.count_tokens(" ".join(current_spans)) + 1
-                else:
-                    current_spans, current_tokens = [], 0
-            current_spans.append(span)
-            current_tokens += span_toks + 1
         if current_spans:
-            chunk_text = " ".join(current_spans)
-            chunks.append((chunk_text, self.count_tokens(chunk_text), list(current_spans)))
         return chunks
-    def ingest(self, lang="ko", batch_size=32, limit=None):
-        """
-         - The dataset is read in a streaming manner to handle large corpora without memory issues.
-         - Each document is processed to create chunks based on token limits, with an overlap strategy to ensure comprehensive coverage of the text.
-            - The processed documents, chunks, and spans are stored in SQLite with appropriate indexing for efficient retrieval during search.
-         """
         ds = load_dataset("HuggingFaceFW/finewiki", lang, split="train", streaming=True)
         cur = self.conn.cursor()
         count = 0
-        batch_docs, batch_chunks, batch_spans = [], [], []
-        for item in tqdm(ds, desc=f"Ingesting {lang}"):
             if limit and count >= limit: break
-            doc_id = self.meta["last_doc_id"]
-            batch_docs.append((doc_id, item["id"], item["title"], lang, item["url"], item.get("wikidata_id", ""), item.get("date_modified", ""), item["text"]))
-            for c_idx, (chunk_text, token_len, span_list) in enumerate(self.chunk_text(item["text"])):
-                chunk_id = self.meta["last_chunk_id"]
-                batch_chunks.append((chunk_id, doc_id, c_idx, chunk_text, token_len, item["title"], lang))
-                for s_idx, span_text in enumerate(span_list):
-                    batch_spans.append((self.meta["last_span_id"], chunk_id, s_idx, span_text, len(span_text)))
-                    self.meta["last_span_id"] += 1
-                self.meta["last_chunk_id"] += 1
-            self.meta["last_doc_id"] += 1
             count += 1
-            if len(batch_docs) >= batch_size:
-                self._commit_batch(cur, batch_docs, batch_chunks, batch_spans)
-                batch_docs, batch_chunks, batch_spans = [], [], []
-                if count % (batch_size * 10) == 0: self._save_meta()
-        self._commit_batch(cur, batch_docs, batch_chunks, batch_spans)
         self.conn.commit()
-        self.conn.execute("PRAGMA wal_checkpoint(FULL);")
-        self._save_meta()
-    def _commit_batch(self, cur, docs, chunks, spans):
-        if not docs: return
-        cur.executemany("INSERT INTO documents VALUES (?,?,?,?,?,?,?,?)", docs)
-        cur.executemany("INSERT INTO chunks VALUES (?,?,?,?,?,?,?)", chunks)
-        cur.executemany("INSERT INTO spans VALUES (?,?,?,?,?)", spans)
-    # ---------------------------
-    # EMBED TO DISK
-    # ---------------------------
-    def embed_corpus(self, lang="ko", batch_size=128, save_interval=100000):
-        """
-        Text is read in batches from SQLite, embeddings are generated using BGE-M3, and then saved to disk.
-         - Embedding generation is performed on the GPU, and data is saved to disk in fixed batches to manage memory.
-         - Dense vectors are saved in NumPy's .npz format to ensure fast loading and low disk usage.
-         - Sparse vectors are saved in JSONL format to provide flexibility and readability.
-         - The saved embeddings are subsequently uploaded to Qdrant for use in searches.
-         - This method is designed to reliably generate and save embeddings even on large-scale datasets.
-        """
         cur = self.conn.cursor()
-        cur.execute("SELECT chunk_id, text FROM chunks WHERE lang=?", (lang,))
-        rows = cur.fetchall()
-        part_id = 0
-        id_buffer = []
-        dense_buffer = []
-        sparse_buffer = []
-        save_dir = f"{self.base_dir}/build_cache/embeddings"
-        for i in tqdm(range(0, len(rows), batch_size), desc=f"1/2 GPU Embedding ({lang})"):
-            batch = rows[i:i+batch_size]
             ids = [r[0] for r in batch]
             texts = [r[1] for r in batch]
-            output = self.model.encode(
-                texts, batch_size=len(texts), max_length=self.max_tokens,
-                return_dense=True, return_sparse=True, return_colbert_vecs=False
-            )
-            id_buffer.extend(ids)
-            dense_buffer.append(output['dense_vecs'])
-            for sp_dict in output['lexical_weights']:
-                sparse_buffer.append({str(k): float(v) for k, v in sp_dict.items()})
-            # Save to disk when a certain number is reached (prevents memory explosion)
-            if len(id_buffer) >= save_interval:
-                self._save_embedding_part(save_dir, lang, part_id, id_buffer, dense_buffer, sparse_buffer)
-                part_id += 1
-                id_buffer, dense_buffer, sparse_buffer = [], [], []
-        # Save the last remaining scraps
-        self._save_embedding_part(save_dir, lang, part_id, id_buffer, dense_buffer, sparse_buffer)
-        print(f"Embedding Generation Complete. Saved to {save_dir}")
-    def _save_embedding_part(self, save_dir, lang, part_id, ids, dense_chunks, sparse_list):
-        if not ids: return
-        # Dense & IDs: High-speed storage as NumPy binaries
-        np.savez(f"{save_dir}/ebd_{lang}_{part_id}.npz",
-                 ids=np.array(ids, dtype=np.int64),
-                 dense=np.vstack(dense_chunks))
-        # Sparse: Save in JSONL format (one line at a time)
-        with open(f"{save_dir}/sparse_{lang}_{part_id}.jsonl", 'w', encoding='utf-8') as f:
-            for sp in sparse_list:
-                f.write(json.dumps(sp) + '\n')
-    # ---------------------------
-    # BUILD QDRANT INDEX
-    # ---------------------------
-    def build_qdrant_index(self, lang="ko", batch_size=2000):
-        """
-        The generated embeddings are read from disk and uploaded to Qdrant in batches.
-         - This method reads the saved dense and sparse embeddings, constructs the appropriate data structures for Qdrant, and uploads them in batches to manage memory and ensure efficient indexing.
-         - After all data is uploaded, it triggers Qdrant's indexing process to optimize search performance.
-         - The use of batch uploads and on-disk storage allows this process to scale to large datasets without overwhelming system memory.
-        """
-        save_dir = f"{self.base_dir}/build_cache/embeddings"
-        files = sorted([f for f in os.listdir(save_dir) if f.startswith(f"ebd_{lang}_") and f.endswith(".npz")])
-        for file_name in files:
-            part_id = file_name.split("_")[-1].split(".")[0]
-            # 1. Load file and convert to Qdrant point structure
-            npz_path = os.path.join(save_dir, file_name)
-            sparse_path = os.path.join(save_dir, f"sparse_{lang}_{part_id}.jsonl")
-            data = np.load(npz_path)
-            ids = data['ids']
-            dense_vecs = data['dense']
-            with open(sparse_path, 'r', encoding='utf-8') as f:
-                sparse_vecs = [json.loads(line) for line in f]
-            points_batch = []
-            # 2. Qdrant Upload Loop
-            for i in tqdm(range(len(ids)), desc=f"2/2 Qdrant Uploading (Part {part_id})"):
-                chunk_id = int(ids[i])
-                sparse_dict = sparse_vecs[i]
-                point = PointStruct(
-                    id=chunk_id,
                     vector={
-                        "dense": dense_vecs[i].tolist(),
-                        "sparse": SparseVector(
-                            indices=[int(k) for k in sparse_dict.keys()],
-                            values=list(sparse_dict.values())
-                        )
                     },
-                    payload={"chunk_id": chunk_id, "lang": lang}
-                )
-                points_batch.append(point)
-                # Upload when stacked to batch size
-                if len(points_batch) >= batch_size:
-                    self.qdrant_client.upload_points(
-                        collection_name=self.collection_name,
-                        points=points_batch
-                    )
-                    points_batch = []
-            # Uploading leftover scraps
-            if points_batch:
-                self.qdrant_client.upload_points(
-                    collection_name=self.collection_name,
-                    points=points_batch
-                )
-        print("Data upload complete. Enabling HNSW Indexing...")
-        # 3. [Key] After all uploads are complete, re-enable indexing (default 20,000) to optimize the graph
         self.qdrant_client.update_collection(
             collection_name=self.collection_name,
-            optimizer_config=OptimizersConfigDiff(indexing_threshold=20000)
         )
-        print("Qdrant Indexing Complete!")
     def close(self):
-        if hasattr(self, 'conn') and self.conn:
             self.conn.close()
 if __name__ == "__main__":
-    builder = KnowledgeEngineBuilder()
     try:
-        builder.ingest(lang="ko", batch_size=32, limit=10000)  # Process only 10,000 documents as an example
-        builder.embed_corpus(lang="ko", batch_size=128, save_interval=5000)
-        builder.build_qdrant_index(lang="ko", batch_size=2000)
     finally:
-        builder.close()

 # scripts/data_pipeline.py
+import argparse
 import os
 import re
 import sqlite3
+import subprocess
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
 import numpy as np
+import orjson
 from datasets import load_dataset
 from FlagEmbedding import BGEM3FlagModel
+from huggingface_hub import HfApi, upload_folder
 from qdrant_client import QdrantClient
+from qdrant_client.models import (Distance, HnswConfigDiff,
+                                  OptimizersConfigDiff, PayloadSchemaType,
+                                  PointStruct, ScalarQuantization,
+                                  ScalarQuantizationConfig, ScalarType,
+                                  SparseIndexParams, SparseVector,
                                   SparseVectorParams, VectorParams)
 from tqdm import tqdm
 from transformers import AutoTokenizer
 class KnowledgeEngineBuilder:
+    def __init__(self, base_dir="ke_store", dim=1024, host="localhost", port=6333, grpc_port=6334):
+        """
+        Set the chunking parameters, create the working directories, open SQLite and connect to Qdrant.
+        Args:
+            base_dir: Root directory; ``knowledge_base`` and ``artifacts/bge_m3_cache`` live under it.
+            dim: Stored on ``self.dim`` (presumably the dense vector size -- TODO confirm where it is read).
+            host: Forwarded unchanged to ``_init_qdrant``.
+            port: Forwarded unchanged to ``_init_qdrant``.
+            grpc_port: Forwarded unchanged to ``_init_qdrant``.
+        """
         self.base_dir = base_dir
         self.dim = dim
+        # Token budget of one chunk; chunk_text subtracts the title prefix from it.
+        self.max_tokens = 512
+        # Dynamic Overlap setting constants
+        self.overlap_ratio = 0.12  # Use 12% of the chunk length as overlap (Sweet Spot)
+        self.min_overlap = 30      # Minimum guaranteed overlap token count
+        self.kb_dir = os.path.join(self.base_dir, "knowledge_base")
+        self.artifacts_dir = os.path.join(self.base_dir, "artifacts/bge_m3_cache")
+        print("Loading Initial Setup...")
+        # Directories first: _init_sqlite opens its database file inside kb_dir.
         self._init_dirs()
+        self._init_sqlite()
+        self._init_qdrant(host, port, grpc_port)
+        # Model and tokenizer stay unset here; _load_models() fills them in on demand.
+        self.model = None
+        self.tokenizer = None
+        # Language code -> label that chunk_text writes in front of the document title.
+        self.prefix_map = {
+            "ko": "문서 제목",
+            "en": "Document Title",
+            "zh": "文档标题",
+            "ja": "ドキュメントタイトル",
+            "es": "Título del documento",
+            "fr": "Titre du document",
+            "de": "Dokumenttitel",
+        }
+    def _load_models(self):
+        """
+        Load the BGE-M3 model and its tokenizer the first time this is called.
+        Later calls return immediately because ``self.model`` is no longer ``None``.
+        """
+        if self.model is None:
+            print("Loading BGE-M3 Model and Tokenizer to GPU...")
+            self.model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
+            self.tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-m3')
+    def _init_dirs(self):
+        """Create ``self.kb_dir`` and ``self.artifacts_dir``; directories that already exist are left alone."""
+        os.makedirs(self.kb_dir, exist_ok=True)
+        os.makedirs(self.artifacts_dir, exist_ok=True)
+    def _init_qdrant(self, host, port, grpc_port):
+        self.qdrant_client = QdrantClient(
+            host=host,
+            port=port,
+            grpc_port=grpc_port,
+            prefer_grpc=True,
+            timeout=300
+        )
         self.collection_name = "knowledge_base"
         if not self.qdrant_client.collection_exists(self.collection_name):
                     "sparse": SparseVectorParams(index=SparseIndexParams(on_disk=True))
                 },
                 quantization_config=ScalarQuantization(
+                    scalar=ScalarQuantizationConfig(
+                        type=ScalarType.INT8,
+                        always_ram=False
+                    )
                 ),
+                hnsw_config=HnswConfigDiff(on_disk=True),
+                optimizers_config=OptimizersConfigDiff(indexing_threshold=0)
             )
+            # Index for metadata-based filtering search (e.g., language)
+            self.qdrant_client.create_payload_index(
+                collection_name=self.collection_name, field_name="lang", field_schema=PayloadSchemaType.KEYWORD
+            )
     def _init_sqlite(self):
+        """
+        Open ``<kb_dir>/corpus.sqlite``, apply the connection PRAGMAs and create the schema if missing.
+        The schema is three tables -- ``documents``, ``chunks`` (references documents) and ``spans``
+        (references chunks), both references declared ON DELETE CASCADE -- plus three lookup indexes.
+        The connection is kept on ``self.conn`` and the schema changes are committed before returning.
+        """
+        # check_same_thread=False lifts sqlite3's same-thread check for this connection.
+        # NOTE(review): presumably because worker threads share it -- confirm against the callers.
+        self.conn = sqlite3.connect(f"{self.kb_dir}/corpus.sqlite", check_same_thread=False)
+        self.conn.execute("PRAGMA journal_mode=WAL;")
+        self.conn.execute("PRAGMA synchronous=NORMAL;")
+        self.conn.execute("PRAGMA cache_size=-10000000;") # 10GB cache
+        # Enable foreign-key enforcement on this connection; the tables below declare cascading keys.
+        self.conn.execute("PRAGMA foreign_keys=ON;")
+        cur = self.conn.cursor()
+        # One row per source document, including its full text.
         cur.execute("""
         CREATE TABLE IF NOT EXISTS documents (
+            doc_id INTEGER PRIMARY KEY,
             external_id TEXT, title TEXT, lang TEXT, url TEXT,
             wikidata_id TEXT, date_modified TEXT, full_text TEXT)
         """)
+        # One row per chunk of a document.
         cur.execute("""
         CREATE TABLE IF NOT EXISTS chunks (
+            chunk_id INTEGER PRIMARY KEY,
             doc_id INTEGER, chunk_index INTEGER, text TEXT,
+            token_length INTEGER, section TEXT, lang TEXT,
+            FOREIGN KEY (doc_id) REFERENCES documents (doc_id) ON DELETE CASCADE)
         """)
+        # One row per span inside a chunk.
         cur.execute("""
         CREATE TABLE IF NOT EXISTS spans (
+            span_id INTEGER PRIMARY KEY,
+            chunk_id INTEGER, span_index INTEGER, text TEXT, char_length INTEGER,
+            FOREIGN KEY (chunk_id) REFERENCES chunks (chunk_id) ON DELETE CASCADE)
         """)
+        # Indexes on the columns used to look chunks up by document or language and spans by chunk.
         cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_doc_id ON chunks(doc_id)")
         cur.execute("CREATE INDEX IF NOT EXISTS idx_spans_chunk_id ON spans(chunk_id)")
         cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_lang ON chunks(lang)")
         self.conn.commit()
+    # ---------------------------------------------------------------
+    # PHASE 1: Sophisticated Semantic Chunking and SQLite Ingestion
+    # ---------------------------------------------------------------
+    def split_sentences(self, text, lang="ko"):
+        """
+        Split ``text`` into sentence-like spans in two passes.
+        Pass 1 cuts on line breaks and drops empty lines, so table rows and list items stay separate.
+        Pass 2 cuts each line after sentence-ending punctuation, using a pattern chosen by ``lang``.
+        Args:
+            text: Raw document text.
+            lang: Language code; "ko", "zh" and "ja" use the CJK pattern, anything else the global one.
+        Returns:
+            List of stripped, non-empty spans in document order.
+        """
+        # 1st physical line break separation (remove empty strings)
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+        # 2nd pass: pick the punctuation pattern for the language
+        if lang in ["ko", "zh", "ja"]:
+            # CJK: full-width marks included; ``\s*`` may match nothing, so the cut happens
+            # directly after the punctuation mark even when no whitespace follows it.
+            pattern = r'(?<=[.!?。！？])\s*'
+        else:
+            # Global: needs whitespace after the mark, and does not cut after
+            # "Mr.", "Dr.", "Ms.", "St." or a single capital letter followed by a period.
+            pattern = r'(?<=[.!?。！？।॥؟۔])(?<!\bMr\.)(?<!\bDr\.)(?<!\bMs\.)(?<!\bSt\.)(?<!\b[A-Z]\.)\s+'
+        final_spans = []
+        for line in lines:
+            # Collapse runs of spaces and tabs inside the line to one space
+            line = re.sub(r'[ \t]+', ' ', line)
+            # Punctuation-based separation; pieces that are empty after stripping are dropped
+            spans = [s.strip() for s in re.split(pattern, line) if len(s.strip()) > 0]
+            final_spans.extend(spans)
+        return final_spans
+    def chunk_text(self, text, title="", lang="ko"):
+        """
+        Context-Aware Dynamic Overlap Chunker
+        Injects the document's title at the top of each chunk to maximize BGE-M3 embedding context retention.
+        """
+        raw_sentences = self.split_sentences(text, lang)
+        chunks = []
+        # 1. Context Injection Format Settings Optimized for BGE-M3 (Fixed Prefix)
+        prefix_label = self.prefix_map.get(lang, "Document Title")
+        prefix = f"{prefix_label}: [{title}]\n" if title else ""
+        prefix_toks = self.tokenizer.encode(prefix, add_special_tokens=False) if prefix else []
+        prefix_len = len(prefix_toks)
+        # [Safety Mechanism] If the title itself is abnormally long and consumes all tokens, a forced cutoff is set to a maximum of 100 tokens.
+        if prefix_len > 100:
+            prefix_toks = prefix_toks[:100]
+            prefix = self.tokenizer.decode(prefix_toks) + "...\n"
+            prefix_len = len(prefix_toks)
+        # 2. Calculation of the actual maximum number of tokens that can be inserted into the body (Span combinations + Overlap)
+        eff_max_tokens = self.max_tokens - prefix_len
+        current_spans = []
+        current_tokens = 0 # Cumulative number of tokens in the body (excluding prefix)
+        for span in raw_sentences:
+            span_toks = len(self.tokenizer.encode(span, add_special_tokens=False))
+            # ---------------------------------------------------------
+            # Case 1: Monster Sentence (when a single Span exceeds eff_max_tokens)
+            # ---------------------------------------------------------
+            if span_toks > eff_max_tokens:
+                # 1. If there is accumulated span, release it first.
+                if current_spans:
+                    chunk_body = " ".join(current_spans)
+                    chunk_text_final = prefix + chunk_body
+                    final_tokens = prefix_len + len(self.tokenizer.encode(chunk_body, add_special_tokens=False))
+                    chunks.append((chunk_text_final, final_tokens, list(current_spans)))
+                    # Dynamic Overlap Calculation (Based on Emitted 'Body')
+                    target_overlap = max(self.min_overlap, int(current_tokens * self.overlap_ratio))
+                    prev_tokens = self.tokenizer.encode(chunk_body, add_special_tokens=False)
+                    overlap_tokens = prev_tokens[-target_overlap:]
+                    overlap_text = self.tokenizer.decode(overlap_tokens)
+                    current_spans = [overlap_text]
+                # 2. Merging Overlap and Monster Sentences
+                combined_text = " ".join(current_spans + [span]) if current_spans else span
+                combined_tokens = self.tokenizer.encode(combined_text, add_special_tokens=False)
+                # 3. Slicing Monster Sentences into eff_max_tokens (Sliding Window)
+                i = 0
+                while i + eff_max_tokens < len(combined_tokens):
+                    slice_toks = combined_tokens[i : i + eff_max_tokens]
+                    slice_text = self.tokenizer.decode(slice_toks)
+                    chunk_text_final = prefix + slice_text
+                    # Configure db_spans to store only the text (slice_text)
+                    chunks.append((chunk_text_final, prefix_len + len(slice_toks), [slice_text]))
+                    # Overlap calculation when moving to the next window (Overlap inside monster sentences)
+                    dyn_overlap = max(self.min_overlap, int(eff_max_tokens * self.overlap_ratio))
+                    i += (eff_max_tokens - dyn_overlap)
+                # 4. Save the remaining tail portion after the loop
+                remainder_toks = combined_tokens[i:]
+                if remainder_toks:
+                    rem_text = self.tokenizer.decode(remainder_toks)
+                    current_spans = [rem_text]
+                    current_tokens = len(self.tokenizer.encode(rem_text, add_special_tokens=False))
+                else:
+                    current_spans = []
+                    current_tokens = 0
                 continue
+            # ---------------------------------------------------------
+            # Case 2: General Sentence (Accumulation of general sentences)
+            # ---------------------------------------------------------
+            # The +1 approximates the token cost of the space that joins consecutive sentences
+            if current_tokens + span_toks + 1 <= eff_max_tokens:
+                current_spans.append(span)
+                current_tokens += span_toks + 1
             else:
+                # 1. Release accumulated span upon overflow
+                chunk_body = " ".join(current_spans)
+                body_tokens = self.tokenizer.encode(chunk_body, add_special_tokens=False)
+                chunk_text_final = prefix + chunk_body
+                final_tokens = prefix_len + len(body_tokens)
+                chunks.append((chunk_text_final, final_tokens, list(current_spans)))
+                # 2. Dynamic Overlap Calculation (Based on Emitted 'Body')
+                target_overlap = max(self.min_overlap, int(len(body_tokens) * self.overlap_ratio))
+                overlap_tokens = body_tokens[-target_overlap:]
+                overlap_text = self.tokenizer.decode(overlap_tokens)
+                # 3. Start of new chunk (previous chunk overlap + current span)
+                current_spans = [overlap_text, span]
+                current_tokens = len(self.tokenizer.encode(" ".join(current_spans), add_special_tokens=False))
+        # ---------------------------------------------------------
+        # Handle remaining spans after loop termination
+        # ---------------------------------------------------------
         if current_spans:
+            chunk_body = " ".join(current_spans)
+            chunk_text_final = prefix + chunk_body
+            final_tokens = prefix_len + len(self.tokenizer.encode(chunk_body, add_special_tokens=False))
+            chunks.append((chunk_text_final, final_tokens, list(current_spans)))
         return chunks
+    def ingest_to_db(self, lang="ko", chunk_batch_size=10000, limit=None):
+        self._load_models()
         ds = load_dataset("HuggingFaceFW/finewiki", lang, split="train", streaming=True)
         cur = self.conn.cursor()
+        cur.execute("SELECT MAX(doc_id) FROM documents")
+        next_doc_id = (cur.fetchone()[0] or 0) + 1
+        cur.execute("SELECT MAX(chunk_id) FROM chunks")
+        next_chunk_id = (cur.fetchone()[0] or 0) + 1
+        cur.execute("SELECT MAX(span_id) FROM spans")
+        next_span_id = (cur.fetchone()[0] or 0) + 1
         count = 0
+        b_docs, b_chunks, b_spans = [], [], []
+        for item in tqdm(ds, desc=f"1/3: Ingesting {lang}wiki to SQLite"):
             if limit and count >= limit: break
+            doc_id = next_doc_id
+            doc_title = item.get("title", "")
+            b_docs.append((doc_id, item["id"], doc_title, lang, item.get("url", ""),
+                           item.get("wikidata_id", ""), item.get("date_modified", ""), item["text"]))
+            for c_idx, (c_text, c_len, span_list) in enumerate(self.chunk_text(item["text"], doc_title, lang)):
+                chunk_id = next_chunk_id
+                b_chunks.append((chunk_id, doc_id, c_idx, c_text, c_len, doc_title, lang))
+                for s_idx, s_text in enumerate(span_list):
+                    span_id = next_span_id
+                    b_spans.append((span_id, chunk_id, s_idx, s_text, len(s_text)))
+                    next_span_id += 1
+                next_chunk_id += 1
+            next_doc_id += 1
             count += 1
+            if len(b_chunks) >= chunk_batch_size:
+                self._commit(cur, b_docs, b_chunks, b_spans)
+                b_docs, b_chunks, b_spans = [], [], []
+        self._commit(cur, b_docs, b_chunks, b_spans)
         self.conn.commit()
+    def _commit(self, cur, d, c, s):
+        if d: cur.executemany("INSERT INTO documents VALUES (?,?,?,?,?,?,?,?)", d)
+        if c: cur.executemany("INSERT INTO chunks VALUES (?,?,?,?,?,?,?)", c)
+        if s: cur.executemany("INSERT INTO spans VALUES (?,?,?,?,?)", s)
+    # --------------------------------------------------------------
+    # PHASE 2: GPU Embedding and Disk Caching (Full Resume Support)
+    # --------------------------------------------------------------
+    def embed_corpus(self, lang="ko", batch_size=1024):
+        self._load_models()
         cur = self.conn.cursor()
+        cur.execute("SELECT COUNT(*) FROM chunks WHERE lang=?", (lang,))
+        total_chunks = cur.fetchone()[0]
+        cur.execute("SELECT chunk_id, text FROM chunks WHERE lang=? ORDER BY chunk_id ASC", (lang,))
+        batch_idx = 0
+        pbar = tqdm(total=total_chunks, desc="2/3 GPU Embedding to Disk")
+        while True:
+            batch = cur.fetchmany(batch_size)
+            if not batch: break
+            npz_path = f"{self.artifacts_dir}/chunk_{lang}_{batch_idx}.npz"
+            jsonl_path = f"{self.artifacts_dir}/chunk_{lang}_{batch_idx}.jsonl"
+            # Resume Defense Logic: Skip embedding if both .npz and .jsonl files for the batch already exist (Assumes that if .npz exists, .jsonl also exists, but double-checking for safety)
+            if os.path.exists(npz_path) and os.path.exists(jsonl_path):
+                batch_idx += 1
+                pbar.update(len(batch))
+                continue
             ids = [r[0] for r in batch]
             texts = [r[1] for r in batch]
+            # GPU Batch Embedding with BGE-M3 (Dense + Sparse Extraction)
+            output = self.model.encode(texts, batch_size=len(texts), max_length=self.max_tokens, return_dense=True, return_sparse=True)
+            np.savez(npz_path, ids=np.array(ids), dense=output['dense_vecs'])
+            # Ultra-fast serialization using orjson for sparse vectors (List of Dicts) to JSONL format
+            with open(jsonl_path, 'wb') as f:
+                for sp in output['lexical_weights']:
+                    f.write(orjson.dumps({str(k): float(v) for k, v in sp.items()}) + b'\n')
+            batch_idx += 1
+            pbar.update(len(batch))
+        pbar.close()
+    # ----------------------------------------------------------------------
+    # PHASE 3: Qdrant Server Parallel Upload and Indexing Finalized on Disk
+    # ----------------------------------------------------------------------
+    def upload_to_qdrant(self, lang="ko", parallel_workers=None):
+        save_dir = self.artifacts_dir
+        files = [f for f in os.listdir(save_dir) if f.startswith(f"chunk_{lang}_") and f.endswith(".npz")]
+        if parallel_workers is None:
+            num_cores = os.cpu_count() or 1
+            parallel_workers = max(1, min(8, int(num_cores * 0.2)))  # Use up to 20% of CPU cores, capped at 8 workers
+        def upload_worker(file_name):
+            data = np.load(os.path.join(save_dir, file_name))
+            ids, dense = data['ids'], data['dense']
+            # Ultra-fast deserialization using orjson for sparse vectors (List of Dicts) from JSONL format
+            with open(os.path.join(save_dir, file_name.replace(".npz", ".jsonl")), 'rb') as f:
+                sparse = [orjson.loads(line) for line in f]
+            points = []
+            for j in range(len(ids)):
+                points.append(PointStruct(
+                    id=int(ids[j]),
                     vector={
+                        "dense": dense[j].tolist(),
+                        "sparse": SparseVector(indices=[int(k) for k in sparse[j].keys()],
+                                               values=list(sparse[j].values()))
                     },
+                    payload={"lang": lang, "chunk_id": int(ids[j])}
+                ))
+            self.qdrant_client.upload_points(
+                collection_name=self.collection_name,
+                points=points,
+                wait=False
+            )
+        print(f"3/3 Starting Qdrant parallel upload with {parallel_workers} workers...")
+        with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
+            list(tqdm(executor.map(upload_worker, files), total=len(files), desc="Qdrant Upload"))
+        print("Upload complete. Finalizing HNSW Index on Disk...")
         self.qdrant_client.update_collection(
             collection_name=self.collection_name,
+            optimizer_config=OptimizersConfigDiff(indexing_threshold=20000)
         )
+        print("Pipeline Complete!")
     def close(self):
         """Close the SQLite connection, if this instance ever opened one."""
         if not hasattr(self, 'conn'):
             # Construction may have failed before `conn` was assigned; nothing to release.
             return
         self.conn.close()
         print("SQLite connection closed.")
+    def wait_for_indexing(self):
+        """
+        Wait until optimizer_status is 'ok' and there are no ongoing tasks
+        (indicating that indexing is complete and the collection is fully optimized on disk)
+        """
+        print("Waiting for Qdrant to finish indexing (HNSW Merging)...")
+        while True:
+            try:
+                info = self.qdrant_client.get_collection(self.collection_name)
+                if info.status == "green":
+                    print("Indexing confirmed complete.")
+                    break
+            except Exception as e:
+                print(f"Checking index status... (Error: {e})")
+                print("Retrying in 10 seconds...")
+            time.sleep(10)
+    # Magic method to support Python's 'with' statement for automatic resource management
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        # Wait for indexing only when there are no exceptions (exc_type) (normal exit).
+        if exc_type is None:
+            self.wait_for_indexing()
+        else:
+            print(f"Pipeline failed with error, skipping index wait: {exc_val}")
+def manage_qdrant_server(storage_path, http_port=6333, grpc_port=6334):
+    """Helper function that manages the lifecycle of the Qdrant server"""
+    abs_storage_path = os.path.abspath(storage_path)
+    os.makedirs(abs_storage_path, exist_ok=True)
+    # 1. Terminate existing processes (prevent port conflicts)
+    subprocess.run(["pkill", "-9", "qdrant"], capture_output=True)
+    # 2. Check for Binary Existence (Installation Guide)
+    if not os.path.exists("./qdrant"):
+        print("Error: 'qdrant' binary not found in current directory.")
+        print("Please download it first: wget https://github.com/qdrant/qdrant/releases/download/v1.16.2/qdrant-x86_64-unknown-linux-gnu.tar.gz")
+        sys.exit(1)
+    print(f"Starting Qdrant server [Storage: {abs_storage_path}]...")
+    env = os.environ.copy()
+    env["QDRANT__SERVICE__HTTP_PORT"] = str(http_port)
+    env["QDRANT__SERVICE__GRPC_PORT"] = str(grpc_port)
+    env["QDRANT__STORAGE__STORAGE_PATH"] = abs_storage_path
+    log_file = open("qdrant_log.txt", "w")
+    process = subprocess.Popen(
+        ["./qdrant"],
+        env=env,
+        stdout=log_file,
+        stderr=log_file,
+        preexec_fn=os.setpgrp
+    )
+    time.sleep(10) # Waiting for server initialization
+    return process
 if __name__ == "__main__":
     # --- 1. CLI Argument Settings ---
     parser = argparse.ArgumentParser(description="Knowledge Engine Data Pipeline Runner")
     parser.add_argument("--lang", type=str, default="ko", help="Language code (e.g., ko, en)")
     parser.add_argument("--chunk_batch_size", type=int, default=10000, help="Batch size for SQLite ingestion")
     parser.add_argument("--limit", type=int, default=50000, help="Ingestion document limit")
     parser.add_argument("--batch_size", type=int, default=1024, help="Embedding batch size")
     parser.add_argument("--workers", type=int, default=4, help="Number of parallel workers for Qdrant upload")
     parser.add_argument("--upload", action="store_true", help="Upload to HuggingFace after completion")
     parser.add_argument("--repo_id", type=str, default="user_id/repo", help="Hugging Face repository ID for upload (e.g., user_id/repo)")
     args = parser.parse_args()
     # --- 2. Environment Setup ---
     STORAGE_PATH = "./ke_store/qdrant_storage"
     # --- 3. Server Execution ---
     server_process = manage_qdrant_server(STORAGE_PATH)
     # --- 4. Pipeline Execution (Utilizing Context Manager) ---
     pipeline_ok = False
     try:
         print(f"--- Starting Pipeline for language: {args.lang} ---")
         with KnowledgeEngineBuilder() as builder:
             builder.ingest_to_db(lang=args.lang, chunk_batch_size=args.chunk_batch_size, limit=args.limit)
             builder.embed_corpus(lang=args.lang, batch_size=args.batch_size)
             builder.upload_to_qdrant(lang=args.lang, parallel_workers=args.workers)
         pipeline_ok = True
         print("--- Pipeline Execution Successful ---")
     except Exception as e:
         print(f"Critical Error during pipeline: {e}")
     finally:
         # --- 5. Graceful Shutdown ---
         # Signal the server that was started above and wait for it to exit, so the
         # storage directory is no longer being written before anything reads it.
         print("Shutting down Qdrant server safely...")
         server_process.terminate()
         try:
             server_process.wait(timeout=30)
         except subprocess.TimeoutExpired:
             server_process.kill()
             server_process.wait()
     # A failed run must not publish a partial store, and must report a non-zero status.
     if not pipeline_ok:
         sys.exit(1)
     # --- 6. Hugging Face Upload (Optional) ---
     if args.upload:
         print("Uploading to Hugging Face Hub...")
         upload_folder(
             repo_id=args.repo_id,
             folder_path="ke_store",
             repo_type="dataset"
         )
         print("Upload complete!")

scripts/setup_db.py CHANGED Viewed

@@ -6,6 +6,8 @@ import sys
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import HfHubHTTPError
 from core.config import settings
 from core.logger import setup_logger
@@ -34,8 +36,8 @@ def download_knowledge_base():
             repo_id=repo_id,
             repo_type="dataset",
             local_dir=local_dir,
-            allow_patterns=["corpus/*", "qdrant/*"],
-            ignore_patterns=["build_cache/*", ".gitattributes"],
             max_workers=4
         )
         logger.info(f"✅ Download complete! Data is ready at: {download_path}")

 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import HfHubHTTPError
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from core.config import settings
 from core.logger import setup_logger
             repo_id=repo_id,
             repo_type="dataset",
             local_dir=local_dir,
+            allow_patterns=["knowledge_base/*", "vector_store/qdrant/*"],
+            ignore_patterns=["artifacts/*", ".gitattributes"],
             max_workers=4
         )
         logger.info(f"✅ Download complete! Data is ready at: {download_path}")

services/search_service.py CHANGED Viewed

@@ -24,13 +24,15 @@ class HybridSearchService:
         self.embedder = embedder
         self.reranker = reranker
-    def search(self, query: str, top_k: int = 5, limit: int = 50) -> Dict[str, Any]:
         """
         Receives user queries and performs hybrid search and reranking.
         :param query: User search query
         :param top_k: Number of documents to return (after reranking)
         :param limit: Number of candidate documents to fetch from Qdrant (after RRF fusion, before reranking)
         """
         start_time = time.time()
         logger.info(f"🔍 Starting search pipeline for query: '{query}'")
@@ -97,12 +99,16 @@ class HybridSearchService:
             latency_ms = int((time.time() - start_time) * 1000)
             logger.info(f"✅ Search completed in {latency_ms}ms. Found {len(final_results)} final chunks.")
-            return {
                 "query": query,
                 "results": final_results,
                 "latency_ms": latency_ms
             }
         except Exception as e:
             # Wrap unexpected errors in custom errors and throw them to the router
             logger.error(f"❌ Pipeline failed: {str(e)}", exc_info=True)
@@ -113,6 +119,7 @@ class HybridSearchService:
         return {
             "query": query,
             "results": [],
             "latency_ms": int((time.time() - start_time) * 1000)
         }
@@ -138,6 +145,7 @@ class HybridSearchService:
                 f"<doc id=\"{i}\" source=\"{source}\" "
                 f"url=\"{meta.get('url', 'N/A')}\" "
                 f"relevance_score=\"{res['score']}\">\n"
                 f"{res['text']}\n"
                 f"</doc>"
             )

         self.embedder = embedder
         self.reranker = reranker
+    def search(self, query: str, top_k: int = 5, limit: int = 50, include_llm_context: bool = True) -> Dict[str, Any]:
         """
         Receives user queries and performs hybrid search and reranking.
         :param query: User search query
         :param top_k: Number of documents to return (after reranking)
         :param limit: Number of candidate documents to fetch from Qdrant (after RRF fusion, before reranking)
+        :param include_llm_context: Whether to include LLM context in the response (formatted text for LLM consumption)
+        :return: A dictionary containing the original query, a list of search results, and latency information. Each search result includes chunk_id, text, relevance score, and metadata.
         """
         start_time = time.time()
         logger.info(f"🔍 Starting search pipeline for query: '{query}'")
             latency_ms = int((time.time() - start_time) * 1000)
             logger.info(f"✅ Search completed in {latency_ms}ms. Found {len(final_results)} final chunks.")
+            response = {
                 "query": query,
                 "results": final_results,
                 "latency_ms": latency_ms
             }
+            if include_llm_context:
+                # 7. Optional: Format results into LLM-friendly context (Markdown/XML mixed format)
+                response["llm_context"] = self.format_for_llm(final_results)
         except Exception as e:
             # Wrap unexpected errors in custom errors and throw them to the router
             logger.error(f"❌ Pipeline failed: {str(e)}", exc_info=True)
         return {
             "query": query,
             "results": [],
+            "llm_context": "No relevant knowledge (documents) available.",
             "latency_ms": int((time.time() - start_time) * 1000)
         }
                 f"<doc id=\"{i}\" source=\"{source}\" "
                 f"url=\"{meta.get('url', 'N/A')}\" "
                 f"relevance_score=\"{res['score']}\">\n"
+                f"date_modified=\"{meta.get('date_modified', 'N/A')}\">\n"
                 f"{res['text']}\n"
                 f"</doc>"
             )

start.sh ADDED Viewed

	@@ -0,0 +1,23 @@

+#!/bin/bash
+# start.sh
+echo "1. Downloading Knowledge Base Data (Syncing with Hugging Face)..."
+# Download data first before executing FastAPI so that Qdrant can recognize it.
+python scripts/setup_db.py
+echo "2. Starting Qdrant Server in background..."
+# Run in the background by explicitly specifying the repository path of Qdrant
+export QDRANT__STORAGE__STORAGE_PATH="/app/data/vector_store/qdrant"
+/usr/local/bin/qdrant &
+# Wait until the Qdrant server is fully running before starting FastAPI
+echo "Waiting for Qdrant to initialize..."
+until curl -s http://localhost:6333/readyz > /dev/null; do
+    echo "Qdrant is not ready yet. Retrying in 2 seconds..."
+    sleep 2
+done
+echo "Qdrant is fully initialized!"
+echo "3. Starting FastAPI Server..."
+# Run Uvicorn in the foreground
+uvicorn main:app --host 0.0.0.0 --port 7860

storage/qdrant_client.py CHANGED Viewed

@@ -13,13 +13,13 @@ class QdrantStorage:
     """
     Qdrant client performing hybrid search based on dense and sparse vectors
     """
-    def __init__(self, path: str, collection_name: str = "knowledge_base"):
-        self.path = path
         self.collection_name = collection_name
         try:
             # Local file system-based Qdrant connection (v1.10+)
-            self.client = QdrantClient(path=self.path)
-            logger.info(f"✅ Connected to local Qdrant at {self.path} (Collection: {self.collection_name})")
         except Exception as e:
             logger.critical(f"❌ Qdrant connection failed: {e}")
             raise e

     """
     Qdrant client performing hybrid search based on dense and sparse vectors
     """
+    def __init__(self, url: str, collection_name: str = "knowledge_base"):
+        self.url = url
         self.collection_name = collection_name
         try:
             # Network connection to a running Qdrant server (60-second request timeout)
+            self.client = QdrantClient(url=self.url, timeout=60.0)
+            logger.info(f"✅ Connected to local Qdrant at {self.url} (Collection: {self.collection_name})")
         except Exception as e:
             logger.critical(f"❌ Qdrant connection failed: {e}")
             raise e

storage/sqlite_client.py CHANGED Viewed

@@ -31,35 +31,40 @@ class SQLiteStorage:
         if not chunk_ids:
             return {}
-        placeholders = ",".join("?" * len(chunk_ids))
-        query = f"""
-            SELECT
-                c.chunk_id, c.text AS chunk_text,
-                d.doc_id, d.title, d.lang, d.url, d.date_modified
-            FROM chunks c
-            JOIN documents d ON c.doc_id = d.doc_id
-            WHERE c.chunk_id IN ({placeholders})
-        """
         try:
             cur = self.conn.cursor()
-            cur.execute(query, chunk_ids)
-            rows = cur.fetchall()
-            # Transform the result into a dictionary for O(1) access: { chunk_id: { "text": "...", "metadata": {...} } }
-            result_dict = {}
-            for row in rows:
-                result_dict[row["chunk_id"]] = {
-                    "text": row["chunk_text"],
-                    "metadata": {
-                        "doc_id": row["doc_id"],
-                        "title": row["title"],
-                        "lang": row["lang"],
-                        "url": row["url"],
-                        "date_modified": row["date_modified"]
                     }
-                }
             return result_dict
         except sqlite3.Error as e:

         if not chunk_ids:
             return {}
+        CHUNK_SIZE_LIMIT = 900  # SQLite has a default limit of 999 variables per query, so we use 900 to be safe
+        result_dict = {}
         try:
             cur = self.conn.cursor()
+            for i in range(0, len(chunk_ids), CHUNK_SIZE_LIMIT):
+                batch_ids = chunk_ids[i:i + CHUNK_SIZE_LIMIT]
+                placeholders = ",".join("?" * len(batch_ids))
+                query = f"""
+                    SELECT
+                        c.chunk_id, c.text AS chunk_text,
+                        d.doc_id, d.title, d.lang, d.url, d.date_modified
+                    FROM chunks c
+                    JOIN documents d ON c.doc_id = d.doc_id
+                    WHERE c.chunk_id IN ({placeholders})
+                """
+                cur.execute(query, batch_ids)
+                rows = cur.fetchall()
+                # Transform the result into a dictionary for O(1) access: { chunk_id: { "text": "...", "metadata": {...} } }
+                for row in rows:
+                    result_dict[row["chunk_id"]] = {
+                        "text": row["chunk_text"],
+                        "metadata": {
+                            "doc_id": row["doc_id"],
+                            "title": row["title"],
+                            "lang": row["lang"],
+                            "url": row["url"],
+                            "date_modified": row["date_modified"]
+                        }
                     }
             return result_dict
         except sqlite3.Error as e: