diff --git a/space/README.md b/space/README.md
index 180cc0e79b51e0d5d32a5c5f0a411ed8fe718cee..e77d468b8225709cdb1d5bfb6a6037a76902131d 100644
--- a/space/README.md
+++ b/space/README.md
@@ -70,7 +70,6 @@ python main.py --mode rtc-gpt-server --port 7862
 
 ### Chatbot Interface
 ```bash
-cd app
 python main.py --mode chatbot --port 7861
 ```
 
diff --git a/space/space/space/Dockerfile b/space/space/space/Dockerfile
index a54879e434bbcf0ee9205f62a2d14f64ac086afa..b80b189b9c8d5e33b3fbdb4ce76255c50e921215 100644
--- a/space/space/space/Dockerfile
+++ b/space/space/space/Dockerfile
@@ -1,13 +1,10 @@
-# Gunakan image dasar Python versi 3.13
+
 FROM python:3.13
 
-# Tambahkan user non-root untuk keamanan
 RUN useradd -m -u 1001 appuser
 
-# Set working directory
 WORKDIR /rag_be
 
-# Set cache directories ke writable location
 ENV HF_HOME=/tmp/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/tmp/.cache/transformers
 ENV TORCH_HOME=/tmp/.cache/torch
@@ -15,35 +12,26 @@ ENV XDG_CACHE_HOME=/tmp/.cache
 ENV TMPDIR=/tmp
 ENV WHISPER_CACHE_DIR=/tmp/.cache/whisper
 
-# Copy requirements dan install dependencies
-COPY requirements.txt ./
+COPY requirements.txt ./ 
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
-# Copy aplikasi dengan ownership ke appuser
 COPY --chown=appuser:appuser . /rag_be
 
-# Buat file .env dengan variabel environment menggunakan Hugging Face secrets
-RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=false \
-    --mount=type=secret,id=HF_TOKEN,mode=0444,required=false \
-    --mount=type=secret,id=ELEVENLABS_API_KEY,mode=0444,required=false \
-    echo "OPENAI_API_KEY=$(cat /run/secrets/OPENAI_API_KEY 2>/dev/null || echo '')" >> .env && \
-    echo "HF_TOKEN=$(cat /run/secrets/HF_TOKEN 2>/dev/null || echo '')" >> .env && \
-    echo "ELEVENLABS_API_KEY=$(cat /run/secrets/ELEVENLABS_API_KEY 2>/dev/null || echo '')" >> .env
-    
-RUN ls -l /rag_be/app && whoami && id
-
-# Buat directories yang diperlukan dengan permissions yang tepat
-RUN mkdir -p /tmp/.cache /tmp/.cache/whisper /tmp/.cache/huggingface /rag_be/vectorstore  /tmp/.cache/transformers /tmp/.cache/torch \
-             /rag_be/app/vectorstore /rag_be/documents  && \
-    chmod -R 777 /tmp/.cache /rag_be/app /rag_be/app/vectorstore /rag_be/vectorstore /rag_be/documents && \
-    chown -R appuser:appuser /tmp/.cache /rag_be/app /rag_be/app/vectorstore /rag_be/vectorstore  /rag_be/documents /rag_be/.env
-
-RUN apt-get update && apt-get install -y ffmpeg
-# Beralih ke user non-root
+RUN mkdir -p /tmp/.cache \
+    /tmp/.cache/whisper \
+    /tmp/.cache/huggingface \
+    /tmp/.cache/transformers \
+    /tmp/.cache/torch \
+    /rag_be/vectorstore \
+    /rag_be/app/vectorstore \
+    /rag_be/documents && \
+    chmod -R 777 /tmp/.cache /rag_be/app /rag_be/vectorstore /rag_be/documents
+
+
+RUN apt-get update && apt-get install -y ffmpeg && apt-get clean && rm -rf /var/lib/apt/lists/*
+
 USER appuser
 
-# Expose port untuk Hugging Face Spaces
-EXPOSE 7860
+EXPOSE 7860
 
-# Jalankan aplikasi
-CMD ["python", "app/__test__.py"]
\ No newline at end of file
+CMD ["python", "main.py", "--mode", "rtc-ui", "--port", "7860"]
diff --git a/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/README.md
index 4e60b6b87669e85251a87d95f1fa5a45df3bdb4f..18091cf8398ea9acfa35da01074f253fb6ca05b8 100644
--- a/space/space/space/space/space/space/space/README.md
+++ b/space/space/space/space/space/space/space/README.md
@@ -104,21 +104,3 @@ docker run -p 8080:8080 cs-ai-sakura-dev
 Once the server is running, you can access the API documentation at:
 - `http://localhost:{port}/docs` (if using FastAPI)
 - `http://localhost:{port}` (for Gradio interface)
-
-
-## 🏗️ Project Structure
-
-```
-cs-ai-sakura-dev/
-├── app/
-│   └── main.py          # Chatbot application
-├── main.py              # Main application entry point
-├── requirements.txt     # Python dependencies
-├── .env                 # Environment variables (create this)
-├── Dockerfile          # Docker configuration
-└── README.md           # Project documentation
-```
-
----
-
-**Happy coding! 🌸**
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/space/space/README.md
index 71a1645bca7ac06725b65503b43ebcd8ecc36631..4e60b6b87669e85251a87d95f1fa5a45df3bdb4f 100644
--- a/space/space/space/space/space/space/space/space/space/README.md
+++ b/space/space/space/space/space/space/space/space/space/README.md
@@ -1,34 +1,124 @@
----
-title: Cs Ai Sakura Dev
-emoji: 🏢
-colorFrom: indigo
-colorTo: indigo
-sdk: docker
-pinned: false
----
+# CS AI Sakura Dev 🏢
+
+A comprehensive AI-powered application with multiple modes including RTC (Real-Time Communication), GPT integration, and chatbot functionality.
+
+## 🚀 Features
+
+- **RTC Mode**: Real-time communication interface
+- **GPT Integration**: Enhanced AI capabilities with OpenAI GPT models
+- **Chatbot Interface**: Interactive chat functionality
+- **Gradio UI**: User-friendly web interface
+- **API Server**: RESTful API endpoints
+- **Docker Support**: Containerized deployment
 
-**Install The Requirements**
+## 📋 Prerequisites
 
-1.Create a virtual environment and install the dependencies
+- Python 3.8 or higher
+- OpenAI API Key
+- Docker (optional)
+
+## ⚙️ Installation
+
+### 1. Clone the Repository
+```bash
+git clone <repository-url>
+cd cs-ai-sakura-dev
 ```
+
+### 2. Create Virtual Environment
+```bash
 python3 -m venv env
-source env/bin/activate
+source env/bin/activate  # On Windows: env\Scripts\activate
+```
+
+### 3. Install Dependencies
+```bash
 pip install -r requirements.txt
 ```
 
-2. Set your OPENAI_API_KEY in .env file
+### 4. Environment Configuration
+Create a `.env` file in the root directory and add your OpenAI API key:
+```bash
+OPENAI_API_KEY=your_openai_api_key_here
+```
+
+## 🖥️ Usage
+
+### Gradio Web Interface
 
-3. **TO LAUNCH THE GRADIO UI** Run the command below :
+#### Non-GPT Based UI
+```bash
+python main.py --mode rtc-ui --port 8080
 ```
-python main.py --mode rtc-ui --port {your_port}
+
+#### GPT-Powered UI
+```bash
+python main.py --mode rtc-gpt-ui --port 8080
 ```
 
-4. **TO LAUNCH THE API ENDPOINT (SERVER)** Run the command below :
+### API Server
+
+#### Non-GPT Based Server
+```bash
+python main.py --mode rtc-server --port 8080
 ```
-python main.py --mode rtc-server --port {your_port}
+
+#### GPT-Powered Server
+```bash
+python main.py --mode rtc-gpt-server --port 8080
 ```
 
-5. **TO LAUNCH THE CHATBOT UI** Run the command below :
+### Chatbot Interface
+```bash
+cd app
+python main.py --mode chatbot --port 8080
 ```
-python main.py --mode chatbot --port {your_port}
+
+## 🐳 Docker Deployment
+
+The application supports Docker deployment. Build and run the container:
+
+```bash
+docker build -t cs-ai-sakura-dev .
+docker run -p 8080:8080 cs-ai-sakura-dev
+```
+
+## 📚 Available Modes
+
+| Mode | Description | Command |
+|------|-------------|---------|
+| `rtc-ui` | Real-time communication web interface | `python main.py --mode rtc-ui --port {port}` |
+| `rtc-gpt-ui` | GPT-powered real-time communication UI | `python main.py --mode rtc-gpt-ui --port {port}` |
+| `rtc-server` | Real-time communication API server | `python main.py --mode rtc-server --port {port}` |
+| `rtc-gpt-server` | GPT-powered API server | `python main.py --mode rtc-gpt-server --port {port}` |
+| `chatbot` | Interactive chatbot interface | `cd app && python main.py --mode chatbot --port {port}` |
+
+## 🔧 Configuration
+
+### Environment Variables
+- `OPENAI_API_KEY`: Your OpenAI API key (required for GPT modes)
+- `PORT`: Application port (default: 8080)
+
+## 📖 API Documentation
+
+Once the server is running, you can access the API documentation at:
+- `http://localhost:{port}/docs` (if using FastAPI)
+- `http://localhost:{port}` (for Gradio interface)
+
+
+## 🏗️ Project Structure
+
 ```
+cs-ai-sakura-dev/
+├── app/
+│   └── main.py          # Chatbot application
+├── main.py              # Main application entry point
+├── requirements.txt     # Python dependencies
+├── .env                 # Environment variables (create this)
+├── Dockerfile          # Docker configuration
+└── README.md           # Project documentation
+```
+
+---
+
+**Happy coding! 🌸**
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/data/prompt_template/customer_service.txt b/space/space/space/space/space/space/space/space/space/data/prompt_template/customer_service.txt
index 0eca246a79d34b17715ad5e17d4a5bfd1140a75b..ce2540910cc60e421512ab37c1a6ee0a119f42cb 100644
--- a/space/space/space/space/space/space/space/space/space/data/prompt_template/customer_service.txt
+++ b/space/space/space/space/space/space/space/space/space/data/prompt_template/customer_service.txt
@@ -1,12 +1,12 @@
-You are a friendly and professional Customer Service for Human Resource Information System (HRIS) field,
-representative, fluent in Indonesian. Your job is to assist customers with accurate information based on your company's basic knowledge. Follow these guidelines:
+Anda adalah seorang Customer Service yang ramah dan profesional di bidang Human Resource Information System (HRIS),
+fasih berbahasa Indonesia. Tugas Anda adalah membantu pelanggan dengan informasi yang akurat berdasarkan pengetahuan dasar perusahaan Anda. Ikuti panduan berikut:
 
-- Always greet customers in a friendly and professional manner.
-- Your answers are contextual and objective.
-- Provide clear, easy-to-understand, and structured answers based on the context provided by the user.
-- If information is not available, offer alternative assistance or direct them to the appropriate channel.
-- Use polite language and empathize with the customer's needs.
-- Conclude by offering further assistance.
-- You are highly skilled in the area relevant to the given context.
+- Selalu menyapa pelanggan dengan ramah dan profesional.
+- Jawaban Anda kontekstual dan objektif.
+- Berikan jawaban yang jelas, mudah dipahami, dan terstruktur berdasarkan konteks yang diberikan oleh pengguna.
+- Jika informasi tidak tersedia, tawarkan bantuan alternatif atau arahkan mereka ke saluran yang tepat.
+- Gunakan bahasa yang sopan dan berempati terhadap kebutuhan pelanggan.
+- Akhiri dengan menawarkan bantuan lebih lanjut.
+- Anda sangat terampil di bidang yang relevan dengan konteks yang diberikan.
 
-Please use the given context to answer accurately.
\ No newline at end of file
+Harap gunakan konteks yang diberikan untuk menjawab dengan akurat.
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/main.py b/space/space/space/space/space/space/space/space/space/main.py
index 4ef3560650d8230c5ff8f323979653124b46b0aa..23fd955dd06c6ea966d7f01e0ca75a8c762b53ae 100644
--- a/space/space/space/space/space/space/space/space/space/main.py
+++ b/space/space/space/space/space/space/space/space/space/main.py
@@ -1,10 +1,14 @@
 import argparse
 from src.provider import AppProvider
+from src.config import OPENAI_API_KEY
+from openai import OpenAI
 
-app = AppProvider()
+openai_client = OpenAI(api_key = OPENAI_API_KEY)
+app = AppProvider(openai_client)
 chatbot_ui = app.provide_chatbot().provide_chatbot_ui()
 rtc = app.provide_rtc()
 rtc_handler = rtc.provide_rtc_handler()
+rtc_gpt_handler = rtc.provide_rtc_gpt_handler()
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--mode", choices=[
@@ -27,6 +31,11 @@ elif(args.mode == "rtc-server"):
 elif(args.mode == "rtc-ui"):
     print("launching RTC UI Mode ... ")
     rtc_handler.launch_ui(port = int(args.port))
+elif(args.mode == "rtc-gpt-ui"):
+    print("RTC GPT UI mode ...")
+    rtc_gpt_handler.launch_ui(port = int(args.port))
+elif(args.mode == "rtc-gpt-server"):
+    rtc_gpt_handler.start_server(port = int(args.port))
 else:
-    print("ERROR : INVALID ARGUMENT | PLEASE CHOOSE ONE BETWEEN chatbot/rtc-server/rtc-ui mode ")
+    print("ERROR : INVALID ARGUMENT | PLEASE CHOOSE ONE BETWEEN chatbot / rtc-server/ rtc-ui / rtc-gpt-server / rtc-gpt-ui mode ")
 
diff --git a/space/space/space/space/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/space/space/space/space/README.md
index 67e59869e6d850c1be2132b833e009b288454320..71a1645bca7ac06725b65503b43ebcd8ecc36631 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/README.md
+++ b/space/space/space/space/space/space/space/space/space/space/space/README.md
@@ -25,12 +25,10 @@ python main.py --mode rtc-ui --port {your_port}
 
 4. **TO LAUNCH THE API ENDPOINT (SERVER)** Run the command below :
 ```
-cd app
 python main.py --mode rtc-server --port {your_port}
 ```
 
 5. **TO LAUNCH THE CHATBOT UI** Run the command below :
 ```
-cd app
 python main.py --mode chatbot --port {your_port}
 ```
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
index 25f5df9e7b98eba45f5c9ffe8caf5c9f2d672762..67e59869e6d850c1be2132b833e009b288454320 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
@@ -29,7 +29,7 @@ cd app
 python main.py --mode rtc-server --port {your_port}
 ```
 
-54. **TO LAUNCH THE CHATBOT UI** Run the command below :
+5. **TO LAUNCH THE CHATBOT UI** Run the command below :
 ```
 cd app
 python main.py --mode chatbot --port {your_port}
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
index ef267daf6fea185f4ceaf65100ca030d8583a6f5..25f5df9e7b98eba45f5c9ffe8caf5c9f2d672762 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
@@ -20,12 +20,17 @@ pip install -r requirements.txt
 
 3. **TO LAUNCH THE GRADIO UI** Run the command below :
 ```
-cd app
-python __test__.py
+python main.py --mode rtc-ui --port {your_port}
 ```
 
 4. **TO LAUNCH THE API ENDPOINT (SERVER)** Run the command below :
 ```
 cd app
-python __server__.py
+python main.py --mode rtc-server --port {your_port}
+```
+
+54. **TO LAUNCH THE CHATBOT UI** Run the command below :
+```
+cd app
+python main.py --mode chatbot --port {your_port}
 ```
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/data/prompt_template/customer_service.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/data/prompt_template/customer_service.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0eca246a79d34b17715ad5e17d4a5bfd1140a75b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/data/prompt_template/customer_service.txt
@@ -0,0 +1,12 @@
+You are a friendly and professional Customer Service for Human Resource Information System (HRIS) field,
+representative, fluent in Indonesian. Your job is to assist customers with accurate information based on your company's basic knowledge. Follow these guidelines:
+
+- Always greet customers in a friendly and professional manner.
+- Your answers are contextual and objective.
+- Provide clear, easy-to-understand, and structured answers based on the context provided by the user.
+- If information is not available, offer alternative assistance or direct them to the appropriate channel.
+- Use polite language and empathize with the customer's needs.
+- Conclude by offering further assistance.
+- You are highly skilled in the area relevant to the given context.
+
+Please use the given context to answer accurately.
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/data/prompt_template/query_maker.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/data/prompt_template/query_maker.txt
new file mode 100644
index 0000000000000000000000000000000000000000..15e21bccb8a5229bf21853055bceb700f7fcddf7
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/data/prompt_template/query_maker.txt
@@ -0,0 +1,35 @@
+Anda adalah agen AI yang tepat dan objektif, 
+Anda bertugas mengubah pertanyaan atau pernyataan pengguna menjadi query yang eksplisit dan efisien untuk keperluan pencarian dokumen dalam sistem RAG (Retrieval-Augmented Generation).
+
+Ikuti langkah-langkah berikut:
+
+1. Ekstrak bagian-bagian penting dari input pengguna:
+   - **Intent**: Tujuan utama atau jenis permintaan (misalnya: apa itu, cara, syarat, apakah bisa, berapa).
+   - **Entity/Noun Phrase**: Objek utama yang dibahas (misalnya: BPJS, tokenizer truncation, RWKV, gaji).
+   - **Context**: Informasi pendukung yang menyempitkan fokus (misalnya: kecelakaan kerja, gaji 1 juta per bulan, perusahaan mitra BPJS).
+   - **Question**: Pertanyaan spesifik yang ingin dijawab (misalnya: bagaimana prosesnya, apa manfaatnya, berapa jumlahnya).
+
+2. Setelah semua elemen diidentifikasi, bentuk **Query RAG** dengan struktur: [INTENT] + [ENTITY] + [CONTEXT] + [QUESTION]
+3. Gunakan bahasa natural yang ringkas, namun informatif dan eksplisit.
+4. Generate hanya hasil akhirnya saja berupa satu buah kalimat
+
+Contoh 0 :
+User Input:
+> Apa itu BPJS
+Output : Pengertian BPJS
+
+Contoh 1 :
+User Input:
+> Di mana lokasi PT Sakura System Solution ?
+
+Output: Lokasi PT Sakura System Solution
+
+Contoh 2:
+User Input:
+> Saya mengalami kecelakaan di kantor dan ingin tahu apakah bisa klaim BPJS karena perusahaan saya adalah mitra.
+
+Output: apakah bisa klaim BPJS kecelakaan kerja di kantor jika perusahaan mitra dan apakah saya memenuhi syarat
+
+
+**Tugas Anda sekarang:**
+Lakukan proses di atas untuk setiap input pengguna yang diberikan. Hasilkan query RAG akhir yang siap digunakan dalam pencarian dokumen.
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/main.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ef3560650d8230c5ff8f323979653124b46b0aa
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/main.py
@@ -0,0 +1,32 @@
+import argparse
+from src.provider import AppProvider
+
+app = AppProvider()
+chatbot_ui = app.provide_chatbot().provide_chatbot_ui()
+rtc = app.provide_rtc()
+rtc_handler = rtc.provide_rtc_handler()
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--mode", choices=[
+    "rtc-server", 
+    "rtc-ui", 
+    "rtc-gpt-server",
+    "rtc-gpt-ui",
+    "chatbot",
+    ], required=True)
+
+parser.add_argument("--port", default=7861, required=True)
+args = parser.parse_args()
+
+if(args.mode == "chatbot"):
+    print("Launching Chabot UI :))))))")
+    chatbot_ui.launch(port = int(args.port))
+elif(args.mode == "rtc-server"):
+    print("launching RTC Server Mode ... ")
+    rtc_handler.start_server(port = int(args.port))
+elif(args.mode == "rtc-ui"):
+    print("launching RTC UI Mode ... ")
+    rtc_handler.launch_ui(port = int(args.port))
+else:
+    print("ERROR : INVALID ARGUMENT | PLEASE CHOOSE ONE BETWEEN chatbot/rtc-server/rtc-ui mode ")
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
index 866d8600acf58b14a7f0075fff1e549e2d103267..db167cc935318eae668b96f316bc41d06fb03e1c 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
@@ -6,8 +6,6 @@ warnings.filterwarnings("ignore")
 import asyncio
 def run_test():
     try:
-        # await test_document_retriever()
-        # await test_language_model()
         test_inference()
     except Exception as e:
         print(e)
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
index 2fbfd49af3beada80f73b0b75cc8bd5f88a7d9ad..6339c42af4b2b9c60220d68da5d18966023d66f7 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
@@ -29,7 +29,7 @@ bnb = BitsAndBytesConfig(
 
 
 config = LMConfig(
-                model_name = "Qwen/Qwen2.5-1.5B-Instruct",
+                model_name = "meta-llama/Llama-3.1-8B",
                 temperature=0.3,
                 max_length=512,
                 generation_timeout=100,
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/language_model.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/language_model.py
index 3e3cb62b6843c9aacfa6039d2e7dc2e1ea77d36f..8971ec89a93387491fdab95059d58b9a453c782b 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/language_model.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/language_model.py
@@ -26,17 +26,17 @@ class LMConfig:
     quantization_config: any = None
     pad_token_id: Optional[int] = None
     eos_token_id: Optional[int] = None
-    # RAG-specific configs
+    
     max_context_length: int = 1500
     context_separator: str = "\n---\n"
-    instruction_template: str = "system"  # "system", "instruction", "custom"
-    # Async-specific configs
+    instruction_template: str = "system"  
+    
     max_workers: int = 2
     generation_timeout: float = 30
     repetition_penalty: float = 1.0
-    # Streaming-specific configs
-    stream_timeout: float = 100  # timeout untuk stream chunk
-    skip_prompt: bool = True     # skip prompt dari streaming output
+    
+    stream_timeout: float = 100  
+    skip_prompt: bool = True     
 
 class LM:
     """
@@ -65,11 +65,11 @@ class LM:
         self.is_loaded = False
         self.executor = ThreadPoolExecutor(max_workers=self.config.max_workers)
         self._lock = asyncio.Lock()
-        # Setup logging
+        
         logging.basicConfig(level=logging.INFO)
         self.logger = logging.getLogger(__name__)
         
-        # RAG prompt templates
+        
         self.prompt_template = prompt_template
     
     async def load_model(self) -> None:
@@ -82,7 +82,7 @@ class LM:
             try:
                 self.logger.info(f"Loading model: {self.config.model_name}")
                 
-                # Load tokenizer dalam thread pool
+                
                 self.tokenizer = await asyncio.get_event_loop().run_in_executor(
                     self.executor,
                     lambda: AutoTokenizer.from_pretrained(
@@ -93,7 +93,7 @@ class LM:
                     )
                 )
                 
-                # Load model dalam thread pool
+                
                 self.model = await asyncio.get_event_loop().run_in_executor(
                     self.executor,
                     lambda: AutoModelForCausalLM.from_pretrained(
@@ -105,7 +105,7 @@ class LM:
                     )
                 )
                 
-                # Setup generation config
+                
                 self.generation_config = GenerationConfig(
                     max_length=self.config.max_length,
                     temperature=self.config.temperature,
@@ -150,7 +150,7 @@ class LM:
             return f"Template '{template_type}' tidak tersedia. Available: {self.get_available_templates()}"
         
         template_data = copy.deepcopy(self.prompt_template)
-        # template_key = "user_template" if "user_template" in template_data else "template"
+        
         
         return template_data["content"].format(
             context=sample_context,
@@ -210,7 +210,7 @@ class LM:
         if len(context) <= max_length:
             return context
         
-        # Truncate dan tambahkan indicator
+        
         truncated = context[:max_length - 50]
         return truncated + "\n\n[... Context dipotong karena terlalu panjang ...]"
 
@@ -228,7 +228,7 @@ class LM:
         
         def _format_sync():
             
-            # Handle RetrievalResult secara eksplisit
+            
             if isinstance(contexts, RetrievalResult):
                 docs = contexts.documents
                 if max_contexts:
@@ -241,38 +241,38 @@ class LM:
                     metadata=contexts.metadata
                 )
             else:
-                # contexts diasumsikan sebagai list biasa (list[str] atau list[Document])
+                
                 processed_contexts = contexts[:max_contexts] if max_contexts and len(contexts) > max_contexts else contexts
 
-            # Format context menjadi string
+            
             formatted_context = self._format_context(processed_contexts, context_numbering)
 
-            # Truncate jika panjang melebihi batas
+            
             formatted_context = self._truncate_context(
                 formatted_context, 
                 self.config.max_context_length
             )
 
-            # Tambah metadata jika diizinkan dan konteks adalah RetrievalResult
+            
             if include_metadata and isinstance(processed_contexts, RetrievalResult):
                 metadata_info = []
                 for i, doc in enumerate(processed_contexts.documents, 1):
                     if hasattr(doc, "metadata") and doc.metadata:
                         metadata_info.append(f"Dokumen {i}: {doc.metadata}")
-                # if metadata_info:
-                #     formatted_context += f"\n\n[Metadata]\n" + "\n".join(metadata_info)
+                
+                
 
             return formatted_context
 
-        # Jalankan _format_sync di thread pool
+        
         formatted_context = await asyncio.get_event_loop().run_in_executor(
             self.executor, _format_sync
         )
         self.logger.info(f"Formatted Context {formatted_context}")
-        # Tentukan template yang akan dipakai
+        
         if(template_type == ""):
             self.config.instruction_template = "system"
-        # Gunakan custom template jika disediakan
+        
         if custom_template:
             return custom_template.format(
                 context=formatted_context,
@@ -283,14 +283,14 @@ class LM:
            
             template_data = copy.deepcopy(self.prompt_template)
             print("template = ", template_type, "rag template = ", template_data)
-            # template_key = "user_template" if "user_template" in template_data else "template"
+            
 
             formatted_template = []
             for cht in template_data:
-                    # Create a copy of the content to avoid modifying the original
+                    
                 content = cht["content"]
                 
-                # Format both placeholders at once to avoid KeyError
+                
                 if "{context}" in content or "{question}" in content:
                     try:
                         content = content.format(
@@ -299,29 +299,29 @@ class LM:
                         )
                     except KeyError as e:
                         self.logger.error(f"Missing placeholder in template: {e}")
-                        # Fallback: format only available placeholders
+                        
                         if "{context}" in content:
                             content = content.replace("{context}", formatted_context)
                         if "{question}" in content:
                             content = content.replace("{question}", question)
                 
-                # Create new dict with formatted content
+                
                 formatted_chat = {
                     "role": cht["role"],
                     "content": content
                 }
                 
-                # Copy other fields if they exist
+                
                 if "description" in cht:
                     formatted_chat["description"] = cht["description"]
                     
                 formatted_template.append(formatted_chat)
 
-            # self.logger.info(f"Formatted Template {formatted_template}")
-            # print("Forrmatted Template", formatted_template)
+            
+            
             return formatted_template
         else:
-            # Fallback default template
+            
             return [
                  {"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": question}
@@ -348,7 +348,7 @@ class LM:
         """
         await self._check_model_loaded()
         
-        # Setup streamer
+        
         streamer = TextIteratorStreamer(
             self.tokenizer, 
             timeout=self.config.stream_timeout,
@@ -358,14 +358,14 @@ class LM:
         
         def _generate_sync():
             try:
-                # Tokenize input
+                
                 inputs = self.tokenizer.apply_chat_template(
                     prompt,
                     add_generation_prompt=True,
                     return_tensors="pt"
                 )
                 
-                # Override generation config jika diperlukan
+                
                 gen_config = self.generation_config
                 if any([max_new_tokens, temperature, top_p]):
                     gen_config = GenerationConfig(
@@ -380,11 +380,11 @@ class LM:
                         **kwargs
                     )
                 
-                # Move to GPU
+                
                 self.model.to("cuda")
                 input_ids = inputs.to("cuda")
                 
-                # Generate dalam thread terpisah
+                
                 generation_kwargs = {
                     "input_ids": input_ids,
                     "generation_config": gen_config,
@@ -401,25 +401,25 @@ class LM:
                 self.logger.error(f"Error during stream generation setup: {e}")
                 raise
         
-        # Setup generation thread
+        
         generation_thread = await asyncio.get_event_loop().run_in_executor(
             self.executor, _generate_sync
         )
         err = None
         try:
-            # Stream tokens
+            
             for token in streamer:
-                if token:  # Skip empty tokens
+                if token:  
                     yield token
                     
-            # Wait for generation thread to finish
+            
             err = await asyncio.get_event_loop().run_in_executor(
                 self.executor, generation_thread.join
             )
             
         except Exception as e:
             self.logger.error(f"Error during streaming: {e}, {err}")
-            # Make sure thread is cleaned up
+            
             if generation_thread.is_alive():
                 generation_thread.join(timeout=1.0)
             raise
@@ -447,10 +447,10 @@ class LM:
         """
         await self._check_model_loaded()
         
-        # Format prompt
+        
         prompt = await self.format_rag_prompt(question, contexts, template_type)
         
-        # Generate dengan temperature yang lebih rendah untuk RAG (lebih faktual)
+        
         temp = temperature if temperature is not None else 0.3
         
         async for chunk in self.generate_stream(
@@ -480,7 +480,7 @@ class LM:
         
         def _format_chat():
             try:
-                # Format messages untuk chat
+                
                 formatted_prompt = self.tokenizer.apply_chat_template(
                     messages,
                     tokenize=False,
@@ -492,7 +492,7 @@ class LM:
                 self.logger.error(f"Error during chat formatting: {e}")
                 raise
         
-        # Format chat template dalam thread pool
+        
         formatted_prompt = await asyncio.get_event_loop().run_in_executor(
             self.executor, _format_chat
         )
@@ -525,14 +525,14 @@ class LM:
         """
         await self._check_model_loaded()
         
-        # Ambil last user message sebagai question
+        
         user_messages = [msg for msg in messages if msg.get("role") == "user"]
         if not user_messages:
             raise ValueError("No user message found in conversation")
         
         last_question = user_messages[-1]["content"]
         
-        # Generate RAG response secara streaming
+        
         async for chunk in self.rag_generate_stream(
             question=last_question,
             contexts=contexts,
@@ -542,7 +542,7 @@ class LM:
         ):
             yield chunk
 
-    # Utility method untuk collect full response dari stream
+    
     async def collect_stream(self, stream_generator: AsyncGenerator[str, None]) -> str:
         """
         Collect semua chunks dari stream generator menjadi full text
@@ -579,7 +579,7 @@ class LM:
         """
         await self._check_model_loaded()
         
-        # Create tasks untuk concurrent generation
+        
         tasks = []
         for template_type in template_types:
             task = asyncio.create_task(
@@ -589,7 +589,7 @@ class LM:
             )
             tasks.append((template_type, task))
         
-        # Wait for all tasks
+        
         results = {}
         for template_type, task in tasks:
             try:
@@ -639,10 +639,10 @@ class LM:
         """
         await self._check_model_loaded()
         
-        # Format prompt
+        
         prompt = await self.format_rag_prompt(question, contexts, template_type)
         
-        # Generate dengan temperature yang lebih rendah untuk RAG (lebih faktual)
+        
         temp = temperature if temperature is not None else 0.3
         
         return await self.generate(
@@ -673,14 +673,14 @@ class LM:
         """
         await self._check_model_loaded()
         
-        # Ambil last user message sebagai question
+        
         user_messages = [msg for msg in messages if msg.get("role") == "user"]
         if not user_messages:
             raise ValueError("No user message found in conversation")
         
         last_question = user_messages[-1]["content"]
         
-        # Generate RAG response
+        
         return await self.rag_generate(
             question=last_question,
             contexts=contexts,
@@ -718,14 +718,14 @@ class LM:
         
         def _generate_sync():
             try:
-                # Tokenize input
+                
                 inputs = self.tokenizer.apply_chat_template(
                     prompt,
                     add_generation_prompt=True,
                     return_tensors="pt"
                 )
                 
-                # Override generation config jika diperlukan
+                
                 gen_config = self.generation_config
                 if any([max_new_tokens, temperature, top_p]):
                     gen_config = GenerationConfig(
@@ -740,7 +740,7 @@ class LM:
                         **kwargs
                     )
                 
-                # Generate
+                
                 with torch.no_grad():
                     
                     self.model.to("cuda")
@@ -752,21 +752,21 @@ class LM:
                         **kwargs
                     )
                 
-                # Decode output
+                
                 generated_text = self.tokenizer.decode(
                     outputs[0][prompt_length:], 
                     skip_special_tokens=True
                 )
 
                 print("Generated Text", generated_text)
-                # Remove input prompt dari output
+                
                 return generated_text
                 
             except Exception as e:
                 self.logger.error(f"Error during generation: {e}")
                 raise
         
-        # Run generation in thread pool dengan timeout
+        
         try:
             result = await asyncio.wait_for(
                 asyncio.get_event_loop().run_in_executor(self.executor, _generate_sync),
@@ -796,7 +796,7 @@ class LM:
         
         def _format_chat():
             try:
-                # Format messages untuk chat
+                
                 formatted_prompt = self.tokenizer.apply_chat_template(
                     messages,
                     chat_template="rag",
@@ -808,7 +808,7 @@ class LM:
                 self.logger.error(f"Error during chat formatting: {e}")
                 raise
         
-        # Format chat template dalam thread pool
+        
         formatted_prompt = await asyncio.get_event_loop().run_in_executor(
             self.executor, _format_chat
         )
@@ -834,7 +834,7 @@ class LM:
                 else:
                     self.logger.warning(f"Unknown config parameter: {key}")
             
-            # Update generation config jika model sudah loaded
+            
             if self.is_loaded:
                 self.generation_config = GenerationConfig(
                     max_length=self.config.max_length,
@@ -862,7 +862,7 @@ class LM:
         }
         
         if self.is_loaded:
-            # Get model info dalam thread pool
+            
             def _get_info():
                 return {
                     "vocab_size": self.tokenizer.vocab_size,
@@ -894,7 +894,7 @@ class LM:
         """
         await self._check_model_loaded()
         
-        # Create tasks untuk concurrent generation
+        
         tasks = [
             asyncio.create_task(
                 self.generate(prompt, max_new_tokens=max_new_tokens, **kwargs)
@@ -902,10 +902,10 @@ class LM:
             for prompt in prompts
         ]
         
-        # Wait for all tasks
+        
         results = await asyncio.gather(*tasks, return_exceptions=True)
         
-        # Process results
+        
         processed_results = []
         for i, result in enumerate(results):
             if isinstance(result, Exception):
@@ -922,10 +922,10 @@ class LM:
         """
         self.logger.info("Closing LM...")
         
-        # Shutdown executor
+        
         self.executor.shutdown(wait=True)
         
-        # Clear GPU memory
+        
         if hasattr(self, 'model') and self.model is not None:
             del self.model
         if hasattr(self, 'tokenizer') and self.tokenizer is not None:
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/preprocessing.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/preprocessing.py
index 6b439abce750b049976b4f68b2094ba665f078eb..5cd29bd3ecdf5b356f996fec503fdf9fa5db644f 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/preprocessing.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/preprocessing.py
@@ -6,7 +6,7 @@ import logging
 from datetime import datetime
 import hashlib
 
-# Import types yang sudah ada
+
 from typing import List, Dict, Any, Optional, Union
 from dataclasses import dataclass
 from enum import Enum
@@ -15,36 +15,36 @@ from rag.retriever.retriever_types import *
 @dataclass
 class PreprocessingConfig:
     """Konfigurasi untuk preprocessing"""
-    # Text cleaning options
+    
     remove_extra_whitespace: bool = True
     remove_special_chars: bool = False
     normalize_unicode: bool = True
     remove_urls: bool = False
     remove_emails: bool = False
     
-    # Chunking options
-    enable_chunking: bool = False        # Apakah perlu chunking lagi
+    
+    enable_chunking: bool = False        
     chunk_size: int = 500
     chunk_overlap: int = 50
-    chunk_method: str = "sentence"       # "sentence", "paragraph", "fixed"
+    chunk_method: str = "sentence"       
+    
     
-    # Content filtering
     min_content_length: int = 20
     max_content_length: int = 3000
     filter_empty_content: bool = True
     filter_duplicate_content: bool = True
     
-    # Metadata options
+    
     extract_metadata: bool = True
     include_retrieval_info: bool = True
     include_document_info: bool = True
     include_timestamps: bool = True
     
-    # Scoring options
-    use_retrieval_scores: bool = True    # Use scores dari retrieval system
-    normalize_scores: bool = True        # Normalize scores ke range 0-1
-    min_score_threshold: float = 0.0    # Filter berdasarkan minimum score
-    score_boost_factor: float = 1.0     # Boost factor untuk scores
+    
+    use_retrieval_scores: bool = True    
+    normalize_scores: bool = True        
+    min_score_threshold: float = 0.0    
+    score_boost_factor: float = 1.0     
 
 class RetrievalPreprocessor:
     """
@@ -61,17 +61,17 @@ class RetrievalPreprocessor:
         """
         self.config = config or PreprocessingConfig()
         
-        # Setup logging
+        
         logging.basicConfig(level=logging.INFO)
         self.logger = logging.getLogger(__name__)
         
-        # Regex patterns untuk cleaning
+        
         self.url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
         self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
         self.special_chars_pattern = re.compile(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/]')
         self.whitespace_pattern = re.compile(r'\s+')
         
-        # Cache untuk duplicate detection
+        
         self._seen_content_hashes = set()
     
     def process_retrieval_result(self, retrieval_result: RetrievalResult) -> List[RetrievalResult]:
@@ -98,18 +98,18 @@ class RetrievalPreprocessor:
             f"Processing {len(retrieval_result.documents)} documents from retrieval result for query: '{retrieval_result.query}'"
         )
         
-        # Clear cache untuk setiap batch baru
+        
         self._seen_content_hashes.clear()
         
         contexts = []
         
-        # Process setiap document
+        
         for i, doc in enumerate(retrieval_result.documents):
             try:
-                # Get corresponding score
+                
                 score = retrieval_result.scores[i] if i < len(retrieval_result.scores) else 0.0
                 
-                # Process single document
+                
                 processed_contexts = self._process_single_document(
                     document=doc,
                     retrieval_score=score,
@@ -124,7 +124,7 @@ class RetrievalPreprocessor:
                 self.logger.error(f"Error processing document {i}: {e}")
                 continue
         
-        # Post-processing
+        
         contexts = self._post_process_contexts(contexts)
         
         self.logger.info(f"Successfully processed {len(contexts)} contexts from retrieval result")
@@ -154,23 +154,23 @@ class RetrievalPreprocessor:
             self.logger.warning(f"Empty content in document {document_index}")
             return []
         
-        # Clean content
+        
         cleaned_content = self._clean_text(document.page_content)
         
         if not cleaned_content:
             return []
         
-        # Filter by length
+        
         if len(cleaned_content) < self.config.min_content_length:
             self.logger.debug(f"Content too short in document {document_index}: {len(cleaned_content)} chars")
             return []
         
         if len(cleaned_content) > self.config.max_content_length:
-            # Truncate content
+            
             cleaned_content = self._truncate_content(cleaned_content)
             self.logger.debug(f"Content truncated in document {document_index}")
         
-        # Check for duplicates
+        
         if self.config.filter_duplicate_content:
             content_hash = hashlib.md5(cleaned_content.encode()).hexdigest()
             if content_hash in self._seen_content_hashes:
@@ -178,12 +178,12 @@ class RetrievalPreprocessor:
                 return []
             self._seen_content_hashes.add(content_hash)
         
-        # Filter by score threshold
+        
         if self.config.use_retrieval_scores and retrieval_score < self.config.min_score_threshold:
             self.logger.debug(f"Score too low in document {document_index}: {retrieval_score}")
             return []
         
-        # Chunking (if enabled)
+        
         if self.config.enable_chunking:
             chunks = self._chunk_content(cleaned_content)
             contexts = []
@@ -203,7 +203,7 @@ class RetrievalPreprocessor:
             
             return contexts
         else:
-            # Single context per document
+            
             context = self._create_retrieved_context(
                 content=cleaned_content,
                 document=document,
@@ -229,13 +229,13 @@ class RetrievalPreprocessor:
         """
         Create RetrievalResult object
         """
-        # Process score
+        
         final_score = self._process_score(retrieval_score, document_index, total_documents)
         
-        # Extract source
+        
         source = self._extract_source(document)
         
-        # Build metadata
+        
         metadata = self._build_metadata(
             document=document,
             retrieval_result=retrieval_result,
@@ -260,24 +260,24 @@ class RetrievalPreprocessor:
         
         cleaned = text
         
-        # Normalize unicode
+        
         if self.config.normalize_unicode:
             import unicodedata
             cleaned = unicodedata.normalize('NFKC', cleaned)
         
-        # Remove URLs
+        
         if self.config.remove_urls:
             cleaned = self.url_pattern.sub('', cleaned)
         
-        # Remove emails
+        
         if self.config.remove_emails:
             cleaned = self.email_pattern.sub('', cleaned)
         
-        # Remove special characters
+        
         if self.config.remove_special_chars:
             cleaned = self.special_chars_pattern.sub(' ', cleaned)
         
-        # Remove extra whitespace
+        
         if self.config.remove_extra_whitespace:
             cleaned = self.whitespace_pattern.sub(' ', cleaned)
         
@@ -290,7 +290,7 @@ class RetrievalPreprocessor:
         if len(content) <= max_length:
             return content
         
-        # Try to cut at sentence boundary
+        
         truncated = content[:max_length - 20]
         last_sentence_end = max(
             truncated.rfind('.'),
@@ -301,7 +301,7 @@ class RetrievalPreprocessor:
         if last_sentence_end > len(truncated) * 0.7:
             return truncated[:last_sentence_end + 1]
         else:
-            # Cut at word boundary
+            
             last_space = truncated.rfind(' ')
             if last_space > len(truncated) * 0.8:
                 return truncated[:last_space] + "..."
@@ -320,7 +320,7 @@ class RetrievalPreprocessor:
         elif self.config.chunk_method == "fixed":
             return self._chunk_by_fixed_size(content)
         else:
-            return [content]  # No chunking
+            return [content]  
     
     def _chunk_by_sentence(self, text: str) -> List[str]:
         """Chunk by sentences"""
@@ -334,7 +334,7 @@ class RetrievalPreprocessor:
             if len(current_chunk) + len(sentence) > self.config.chunk_size and current_chunk:
                 chunks.append(current_chunk.strip())
                 
-                # Handle overlap
+                
                 if self.config.chunk_overlap > 0:
                     overlap_text = current_chunk[-self.config.chunk_overlap:]
                     current_chunk = overlap_text + " " + sentence
@@ -383,7 +383,7 @@ class RetrievalPreprocessor:
             end = start + self.config.chunk_size
             chunk = text[start:end]
             
-            # Try to break at word boundary
+            
             if end < len(text):
                 last_space = chunk.rfind(' ')
                 if last_space > len(chunk) * 0.8:
@@ -392,7 +392,7 @@ class RetrievalPreprocessor:
             
             chunks.append(chunk.strip())
             
-            # Move with overlap
+            
             start = end - self.config.chunk_overlap
             if start <= 0:
                 start = end
@@ -406,9 +406,9 @@ class RetrievalPreprocessor:
         
         score = retrieval_score * self.config.score_boost_factor
         
-        # Normalize to 0-1 range jika diperlukan
+        
         if self.config.normalize_scores:
-            # Assume retrieval scores are already normalized, but ensure they are in range
+            
             score = max(0.0, min(1.0, score))
         
         return round(score, 4)
@@ -417,14 +417,14 @@ class RetrievalPreprocessor:
         """Extract source dari document metadata"""
         metadata = document.metadata or {}
         
-        # Try different metadata keys for source
+        
         source_keys = ['source', 'file_name', 'filename', 'title', 'file_path', 'path']
         
         for key in source_keys:
             if key in metadata and metadata[key]:
                 return str(metadata[key])
         
-        # Fallback to generic source
+        
         return "unknown_source"
     
     def _build_metadata(self,
@@ -439,7 +439,7 @@ class RetrievalPreprocessor:
         metadata = {}
         
         if self.config.extract_metadata:
-            # Include original document metadata
+            
             if document.metadata and self.config.include_document_info:
                 metadata.update({
                     "original_metadata": document.metadata,
@@ -447,7 +447,7 @@ class RetrievalPreprocessor:
                     "total_documents": total_documents
                 })
             
-            # Include chunking info
+            
             if chunk_index is not None:
                 metadata.update({
                     "chunk_index": chunk_index,
@@ -455,7 +455,7 @@ class RetrievalPreprocessor:
                     "is_chunked": total_chunks > 1
                 })
             
-            # Include retrieval info
+            
             if self.config.include_retrieval_info:
                 metadata.update({
                     "retrieval_query": retrieval_result.query,
@@ -463,7 +463,7 @@ class RetrievalPreprocessor:
                     "retrieval_metadata": retrieval_result.metadata
                 })
             
-            # Include processing info
+            
             if self.config.include_timestamps:
                 metadata.update({
                     "processed_at": datetime.now().isoformat(),
@@ -480,7 +480,7 @@ class RetrievalPreprocessor:
                     }
                 })
             
-            # Content statistics
+            
             word_count = len(content.split())
             sentence_count = len(re.split(r'[.!?]+', content))
             
@@ -500,11 +500,11 @@ class RetrievalPreprocessor:
         if not contexts:
             return contexts
         
-        # Sort by score (descending)
+        
         if self.config.use_retrieval_scores:
             contexts.sort(key=lambda x: x.score or 0.0, reverse=True)
         
-        # Additional filtering jika diperlukan
+        
         filtered_contexts = []
         for ctx in contexts:
             if self.config.filter_empty_content and not ctx.content.strip():
@@ -522,16 +522,16 @@ class RetrievalPreprocessor:
         total_words = sum(len(ctx.content.split()) for ctx in contexts)
         total_chars = sum(len(ctx.content) for ctx in contexts)
         
-        # Score distribution
+        
         scores = [ctx.score for ctx in contexts if ctx.score is not None]
         
-        # Source distribution
+        
         sources = {}
         for ctx in contexts:
             if ctx.source:
                 sources[ctx.source] = sources.get(ctx.source, 0) + 1
         
-        # Chunking stats
+        
         chunked_contexts = sum(1 for ctx in contexts 
                              if ctx.metadata and ctx.metadata.get("is_chunked", False))
         
@@ -557,7 +557,7 @@ class RetrievalPreprocessor:
             stats["source_distribution"] = sources
             stats["unique_sources"] = len(sources)
         
-        # Content length distribution
+        
         lengths = [len(ctx.content) for ctx in contexts]
         stats["content_length_stats"] = {
             "min_length": min(lengths),
@@ -589,7 +589,7 @@ class RetrievalPreprocessor:
             try:
                 contexts = self.process_retrieval_result(result)
                 
-                # Add batch info to metadata
+                
                 for ctx in contexts:
                     if ctx.metadata:
                         ctx.metadata["batch_index"] = i
@@ -606,7 +606,7 @@ class RetrievalPreprocessor:
                 self.logger.error(f"Error processing retrieval result {i}: {e}")
                 continue
         
-        # Final post-processing untuk batch
+        
         all_contexts = self._post_process_contexts(all_contexts)
         
         self.logger.info(f"Batch processing completed: {len(all_contexts)} total contexts")
@@ -637,12 +637,12 @@ class RetrievalPreprocessor:
         for ctx in contexts:
             content_words = set(ctx.content.lower().split())
             
-            # Simple relevance calculation: overlap of words
+            
             overlap = len(query_words.intersection(content_words))
             relevance_score = overlap / len(query_words) if query_words else 0.0
             
             if relevance_score >= min_relevance_score:
-                # Update metadata dengan relevance info
+                
                 if ctx.metadata:
                     ctx.metadata["query_relevance_score"] = round(relevance_score, 3)
                     ctx.metadata["matched_query_words"] = list(query_words.intersection(content_words))
@@ -654,7 +654,7 @@ class RetrievalPreprocessor:
                 
                 filtered_contexts.append(ctx)
         
-        # Sort by relevance score
+        
         filtered_contexts.sort(
             key=lambda x: x.metadata.get("query_relevance_score", 0.0), 
             reverse=True
@@ -699,9 +699,9 @@ class RetrievalPreprocessor:
                 if sim_score >= similarity_threshold:
                     is_duplicate = True
                     
-                    # Keep the one with higher score
+                    
                     if (ctx.score or 0.0) > (existing_ctx.score or 0.0):
-                        # Replace existing with current
+                        
                         idx = deduplicated.index(existing_ctx)
                         deduplicated[idx] = ctx
                     
@@ -741,12 +741,12 @@ class RetrievalPreprocessor:
             if not proc_result.chunks:
                 continue
             
-            # Convert Document chunks to RetrievalResult
+            
             for j, chunk in enumerate(proc_result.chunks):
-                # Extract source dari document metadata
+                
                 source = self._extract_source(chunk)
                 
-                # Build metadata from ProcessingResult
+                
                 metadata = {
                     "document_metadata": proc_result.document_metadata.__dict__,
                     "chunk_index": j,
@@ -755,21 +755,21 @@ class RetrievalPreprocessor:
                     "processed_at": datetime.now().isoformat()
                 }
                 
-                # Include original chunk metadata
+                
                 if chunk.metadata:
                     metadata["original_chunk_metadata"] = chunk.metadata
                 
-                # Clean content
+                
                 cleaned_content = self._clean_text(chunk.page_content)
                 
                 if not cleaned_content or len(cleaned_content) < self.config.min_content_length:
                     continue
                 
-                # Create RetrievalResult
+                
                 context = RetrievalResult(
                     content=cleaned_content,
                     source=source,
-                    score=1.0,  # Default score for processing results
+                    score=1.0,  
                     metadata=metadata
                 )
                 
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/base_retriever.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/base_retriever.py
index 10627dc41ef2d73995f43870f8b1ed9dd1eb1ad3..3379066126a80213df782a2c86244d070f1ec0df 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/base_retriever.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/base_retriever.py
@@ -47,7 +47,7 @@ class BaseRetriever(ABC):
         """Delete documents by IDs"""
         pass
 
-# ===== DOCUMENT LOADERS =====
+
 
 class MultiFormatDocumentLoader(BaseDocumentLoader):
     """Document loader supporting multiple formats"""
@@ -68,10 +68,10 @@ class MultiFormatDocumentLoader(BaseDocumentLoader):
             if not file_path.exists():
                 raise FileNotFoundError(f"File not found: {file_path}")
             
-            # Determine document type
+            
             doc_type = self._get_document_type(file_path)
             
-            # Load document
+            
             loader_func = self.loaders.get(doc_type)
             if not loader_func:
                 raise ValueError(f"Unsupported file type: {doc_type}")
@@ -79,7 +79,7 @@ class MultiFormatDocumentLoader(BaseDocumentLoader):
             logger.info(f"Loading {doc_type} document: {file_path}")
             documents = await loader_func(str(file_path))
             
-            # Add metadata to documents
+            
             for doc in documents:
                 doc.metadata.update({
                     "file_path": str(file_path),
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_loader.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_loader.py
index af371daefcf29622bd7aab9a5089e6b382a9eea4..38f0e48f5791f9c785563d2568c1b88791185936 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_loader.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_loader.py
@@ -56,10 +56,10 @@ class MultiFormatDocumentLoader(BaseDocumentLoader):
             if not file_path.exists():
                 raise FileNotFoundError(f"File not found: {file_path}")
             
-            # Determine document type
+            
             doc_type = self._get_document_type(file_path)
             
-            # Load document
+            
             loader_func = self.loaders.get(doc_type)
             if not loader_func:
                 raise ValueError(f"Unsupported file type: {doc_type}")
@@ -67,7 +67,7 @@ class MultiFormatDocumentLoader(BaseDocumentLoader):
             logger.info(f"Loading {doc_type} document: {file_path}")
             documents = await loader_func(str(file_path))
             
-            # Add metadata to documents
+            
             for doc in documents:
                 doc.metadata.update({
                     "file_path": str(file_path),
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_processor.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_processor.py
index 9f7fc12d3e1adab606dae3c82631ef7233f8db8b..7596d32e8ea3c72685b0ced26eb711503e59656b 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_processor.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_processor.py
@@ -18,7 +18,7 @@ class DocumentProcessor:
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         
-        # Default separators for better chunking
+        
         if separators is None:
             separators = ["\n\n", "\n", " ", ""]
         
@@ -34,12 +34,12 @@ class DocumentProcessor:
         try:
             logger.info(f"Processing {len(documents)} documents")
             
-            # Split documents into chunks
+            
             chunks = await asyncio.get_event_loop().run_in_executor(
                 None, self.text_splitter.split_documents, documents
             )
             
-            # Add chunk metadata
+            
             for i, chunk in enumerate(chunks):
                 chunk.metadata.update({
                     "chunk_id": i,
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
index 4de11510f04804a54e5d163933c3005340d5836e..1f10acaff0ad106bb98449ce65e50894945de11d 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
@@ -1,14 +1,14 @@
 from rag.retriever.base_retriever import BaseRetriever
 
-# Embeddings
+
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_openai import OpenAIEmbeddings
 
-# Vector stores
+
 from langchain_community.vectorstores import Chroma, FAISS, Pinecone
 from langchain.retrievers import EnsembleRetriever
 
-# Retriever base
+
 from langchain_core.vectorstores import VectorStoreRetriever
 from langchain_community.retrievers import BM25Retriever
 from langchain.retrievers import ContextualCompressionRetriever
@@ -85,8 +85,8 @@ class LangChainRetriever(BaseRetriever):
                 search_kwargs={"k": 10}
             )
             if self.use_hybrid_search:
-                self.bm25_retriever = None  # initialized later after adding docs
-                return vector_retriever  # temporary fallback
+                self.bm25_retriever = None  
+                return vector_retriever  
             else:
                 return vector_retriever
         except Exception as e:
@@ -162,13 +162,13 @@ class LangChainRetriever(BaseRetriever):
             return False
     async def _update_bm25_retriever(self, documents: List[Document]):
         try:
-            # Create BM25 retriever from documents
+            
             self.bm25_retriever = BM25Retriever.from_documents(documents)
-            self.bm25_retriever.k = 10  # Set number of documents to retrieve
+            self.bm25_retriever.k = 10  
+            
+            
             
-            # For hybrid search, you have several options:
             
-            # Option 1: Use only BM25 retriever (simplest fix)
             self.retriever = self.bm25_retriever
             
             vector_retriever = VectorStoreRetriever(
@@ -178,12 +178,12 @@ class LangChainRetriever(BaseRetriever):
 
             self.retriever = EnsembleRetriever(
                 retrievers=[vector_retriever, self.bm25_retriever],
-                weights=[0.5, 0.5]  # Equal weight to both retrievers
+                weights=[0.5, 0.5]  
             )
             
         except Exception as e:
             logger.error(f"Error updating BM25 retriever: {str(e)}")
-            # Fallback to vector retriever only
+            
             self.retriever = VectorStoreRetriever(
                 vectorstore=self.vectorstore,
                 search_kwargs={"k": 10}
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/duckduckgo_search.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/duckduckgo_search.py
index 29729e28665083b36ecda3b202f673dccfcd1584..9948924e29c694b0876a7edce7ecf6e946d34a8b 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/duckduckgo_search.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/duckduckgo_search.py
@@ -7,7 +7,7 @@ from typing import AsyncGenerator, List
 
 class DuckDuckGoSearch:
     def __init__(self, html_loader: AsyncChromiumLoader = None, html_parser = None):
-        # Initialize dengan default values jika tidak diberikan
+        
         self.html_loader = html_loader or AsyncChromiumLoader([])
         self.html_parser = html_parser or BeautifulSoupTransformer()
         self.logger = logging.getLogger("ddgs_logger")
@@ -16,7 +16,7 @@ class DuckDuckGoSearch:
         """Get page content from URLs - returns list of documents"""
         try:
             self.html_loader.urls = urls
-            html = await self.html_loader.aload()  # This returns a LIST
+            html = await self.html_loader.aload()  
             self.logger.info(f"search engine aload result: {len(html)} documents loaded")
             
             docs_transformed = self.html_parser.transform_documents(
@@ -24,11 +24,11 @@ class DuckDuckGoSearch:
                 tags_to_extract=["p"], 
                 remove_unwanted_tags=["a"]
             )
-            return docs_transformed  # Returns LIST of documents
+            return docs_transformed  
             
         except Exception as e:
             self.logger.error(f"Error loading pages: {e}", exc_info=True)
-            return []  # Return empty list on error
+            return []  
     
     def truncate(self, text: str, max_words: int = 400) -> str:
         """Truncate text to specified number of words"""
@@ -51,12 +51,12 @@ class DuckDuckGoSearch:
         try:
             self.logger.info(f"Searching for: {query} (max_results: {max_results})")
             
-            # Step 1: Get search results from DDGS (regular iterator)
+            
             results = DDGS().text(query, max_results=max_results)
             urls = []
             
-            # Step 2: Extract URLs using regular for loop (NOT async for)
-            for result in results:  # ← FIXED: Regular for loop
+            
+            for result in results:  
                 url = result.get('href')
                 if url:
                     urls.append(url)
@@ -67,20 +67,20 @@ class DuckDuckGoSearch:
                 self.logger.warning("No URLs found from search results")
                 return
             
-            # Step 3: Get page content (await the coroutine first)
-            docs = await self.get_page(urls)  # ← FIXED: Await first, get list
             
-            # Step 4: Process documents using regular for loop (NOT async for)
-            for doc in docs:  # ← FIXED: Regular for loop on list
+            docs = await self.get_page(urls)  
+            
+            
+            for doc in docs:  
                 try:
                     if hasattr(doc, 'page_content') and doc.page_content:
-                        # Clean up text
+                        
                         page_text = re.sub(r"\n\n+", "\n", doc.page_content)
                         page_text = page_text.strip()
                         
-                        if page_text:  # Only yield if there's actual content
+                        if page_text:  
                             text = self.truncate(page_text)
-                            yield text  # Yield makes this an async generator
+                            yield text  
                         
                 except Exception as e:
                     self.logger.error(f"Error processing document: {e}")
@@ -88,7 +88,7 @@ class DuckDuckGoSearch:
                     
         except Exception as e:
             self.logger.error(f"Error in search method: {e}", exc_info=True)
-            # Don't re-raise, just log and return (generator will be empty)
+            
     
     async def search_with_metadata(self, query: str, max_results: int = 5) -> AsyncGenerator[dict, None]:
         """
@@ -98,7 +98,7 @@ class DuckDuckGoSearch:
             results = DDGS().text(query, max_results=max_results)
             urls_and_titles = []
             
-            # Collect URLs and titles
+            
             for result in results:
                 url = result.get('href')
                 title = result.get('title', 'No title')
@@ -108,11 +108,11 @@ class DuckDuckGoSearch:
             if not urls_and_titles:
                 return
             
-            # Get page content
+            
             urls = [item['url'] for item in urls_and_titles]
             docs = await self.get_page(urls)
             
-            # Process and yield with metadata
+            
             for i, doc in enumerate(docs):
                 try:
                     if hasattr(doc, 'page_content') and doc.page_content:
@@ -122,7 +122,7 @@ class DuckDuckGoSearch:
                         if page_text:
                             text = self.truncate(page_text)
                             
-                            # Get metadata if available
+                            
                             metadata = {}
                             if i < len(urls_and_titles):
                                 metadata = urls_and_titles[i]
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
index 7604bca5419b74a7cca98aa8e74fe5f4a83ded1c..ef300335424842ef980dd375870741a66a55012a 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
@@ -31,7 +31,7 @@ import re
 
 
 from rag import cs_agent
-# Load .env
+
 load_dotenv()
 logging.basicConfig(level=logging.INFO)
 
@@ -101,7 +101,7 @@ class RTCHandler:
                 llm_time = time.time()
                 self.full_response = ""
 
-                # Single async function to handle both text streaming and audio generation
+                
                 async def stream_text_to_audio():
                     chunk_size = 1024
                     no_buffer = 0
@@ -113,7 +113,7 @@ class RTCHandler:
                             chunk = stream_data["data"]["chunk"]
                             self.full_response += chunk
                             text_buffer += chunk
-                            # Generate audio immediately for each text chunk
+                            
                             if re.search(r'[.,?;!]', chunk):
                                 try:
                                     audio_buffer_gen =  await self.edge_tts.generate_audio_buffer(text_buffer)
@@ -121,37 +121,37 @@ class RTCHandler:
                                     
                                     audio_buffer.seek(0)
                                     
-                                    # Convert MP3 to PCM
+                                    
                                     audio_segment = AudioSegment.from_file(audio_buffer, format="mp3")
                                     samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / (2 ** 15)
                                     
-                                    # Handle stereo to mono
+                                    
                                     if audio_segment.channels == 2:
                                         samples = samples.reshape((-1, 2)).mean(axis=1)
                                     
-                                    # # Resample to 24kHz
-                                    # resampled = librosa.resample(samples, orig_sr=audio_segment.frame_rate, target_sr=24000)
+                                    
+                                    
                                     import torch
                                     import torchaudio
                                     
-                                    # Check if CUDA is available
+                                    
                                     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                                     
-                                    # Convert numpy array to torch tensor and move to GPU
-                                    audio_tensor = torch.from_numpy(samples).unsqueeze(0).to(device)  # Add batch dimension and move to GPU
                                     
-                                    # Create resampler and move to GPU
+                                    audio_tensor = torch.from_numpy(samples).unsqueeze(0).to(device)  
+                                    
+                                    
                                     resampler = torchaudio.transforms.Resample(
                                         orig_freq=audio_segment.frame_rate,
                                         new_freq=24000
                                     ).to(device)
                                     
-                                    # Apply resampling on GPU
+                                    
                                     resampled_tensor = resampler(audio_tensor)
                                     
-                                    # Convert back to numpy (move to CPU first)
+                                    
                                     resampled = resampled_tensor.squeeze(0).cpu().numpy()
-                                    # Yield audio chunks
+                                    
                                     for i in range(0, len(resampled), chunk_size):
                                         yield (24000, resampled[i:i + chunk_size])
                                     no_buffer = 0
@@ -169,7 +169,7 @@ class RTCHandler:
                             print(f"\nTotal time: {total_time:.2f}s")
                             break
 
-                # Run the single async function
+                
                 loop = asyncio.new_event_loop()
                 asyncio.set_event_loop(loop)
                 
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
index b8ac480e4ee24cf791f17fb003bc58c359c5de79..da6978aaeaa6fabe29ef7c6c1a064e360280bd74 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
@@ -15,27 +15,27 @@ class WhisperSTT:
             model_size: Model size (tiny, base, small, medium, large)
             device: Device to use ("auto", "cuda", "cpu")
         """
-        # Set up cache directory
+        
         cache_dir = os.environ.get('WHISPER_CACHE_DIR', '/tmp/.cache/whisper')
         os.makedirs(cache_dir, exist_ok=True)
         
-        # Determine device
+        
         if device == "auto":
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
         else:
             self.device = device
             
-        # Validate CUDA availability if requested
+        
         if self.device == "cuda" and not torch.cuda.is_available():
             print("Warning: CUDA requested but not available. Falling back to CPU.")
             self.device = "cpu"
         
-        # Load model with device specification
+        
         print(f"Loading Whisper model '{model_size}' on device: {self.device}")
         self.model = whisper.load_model(model_size, device=self.device, download_root=cache_dir)
-        self.language = "id"  # ISO-639-1 code for Bahasa Indonesia
+        self.language = "id"  
+        
         
-        # Print GPU info if using CUDA
         if self.device == "cuda":
             gpu_name = torch.cuda.get_device_name(0)
             gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
@@ -52,23 +52,23 @@ class WhisperSTT:
         Returns:
             Transcribed text
         """
-        # Save audio to temporary file
+        
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
             tmp.write(audio.read())
             tmp.flush()
             tmp_path = tmp.name
 
         try:
-            # Transcribe with GPU acceleration if available
+            
             result = self.model.transcribe(
                 tmp_path, 
                 language=language,
-                # Optional: Add fp16 for faster inference on supported GPUs
+                
                 fp16=self.device == "cuda"
             )
             return result.get("text", "")
         finally:
-            # Clean up temporary file
+            
             os.remove(tmp_path)
     
     def get_device_info(self) -> dict:
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/audio_edge_tts.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/audio_edge_tts.py
index c9f13412885295c84efe9596e6b21f76a38008f6..003f26c7416dbd504b619e054bc73749dd76b53f 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/audio_edge_tts.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/audio_edge_tts.py
@@ -29,7 +29,7 @@ class EdgeTTS:
             pitch=self.pitch_str
         )
         
-        # Stream audio chunks
+        
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 yield chunk["data"]
@@ -52,7 +52,7 @@ class EdgeTTS:
                 pitch=self.pitch_str
             )
             
-            # Collect all audio chunks into a buffer
+            
             audio_buffer = io.BytesIO()
             async for chunk in communicate.stream():
                 if chunk["type"] == "audio":
@@ -85,7 +85,7 @@ class EdgeTTS:
             
             async for chunk in communicate.stream():
                 if chunk["type"] == "audio":
-                    # Call callback with audio chunk
+                    
                     callback_func(chunk["data"], None)
                     
         except Exception as e:
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
index ab67cd6148a447c67ff80a387f51cf9a90076a64..2fbfd49af3beada80f73b0b75cc8bd5f88a7d9ad 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
@@ -49,11 +49,11 @@ inferencer_config = InferencerConfig(
 )
 
 document_retriever = LangChainRetriever(
-        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+        embedding_model="BAAI/bge-large-en",
         vectorstore_type="chroma",
         vectorstore_path="vectorstore/",
         use_hybrid_search=True,
-        chunk_size=1000,
+        chunk_size=3000,
         chunk_overlap=200
 )
 
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/agents.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/agents.py
new file mode 100644
index 0000000000000000000000000000000000000000..3244631eb0d065aa97f2d0b41f59e3b25a7b20b1
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/agents.py
@@ -0,0 +1,37 @@
+from rag.pipeline.language_model import LM
+from rag.inference.inferencer import Inferencer
+from abc import ABC, abstractmethod
+class Agent(ABC):
+    """Base class for agents built around an ``Inferencer`` and a chat prompt.
+
+    Concrete agents implement ``get_result``; the subclasses in this package
+    take the user's question as an extra argument.
+    """
+
+    def __init__(self, inferencer:Inferencer, prompt_template = None):
+        """Bind the agent to an inferencer and install its prompt template.
+
+        Args:
+            inferencer: Object whose ``model.prompt_template`` attribute is
+                overwritten with ``prompt_template``.
+            prompt_template: List of chat messages (``role``/``content``
+                dicts). When omitted, a one-message system prompt is used.
+        """
+        # Build the default per call: a list literal in the signature is
+        # created once and would be shared by every agent (and by the model
+        # it is assigned to), so one in-place edit would leak to all of them.
+        if prompt_template is None:
+            prompt_template = [
+                {
+                    "role" : "system",
+                    "content":"You are an agent that doing some specic task"
+                }
+            ]
+        self.inferencer = inferencer
+        self.inferencer.model.prompt_template = prompt_template
+        self.prompt = prompt_template
+
+    @abstractmethod
+    async def get_result(self):
+        """Produce the agent's answer; must be overridden by subclasses."""
+        pass
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/customer_service_agent.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/customer_service_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce0382e8a3d7917f84ef2f8f927b2b3942c0aef5
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/customer_service_agent.py
@@ -0,0 +1,46 @@
+from rag.agents.agents import Agent
+from rag.inference.inferencer import Inferencer
+
+class CSAgent(Agent):
+    """Agent that adds the listed PDF files to the retriever and streams answers."""
+
+    def __init__(self, inferencer : Inferencer , prompt_template):
+        """Store the inferencer and prompt template and list the source PDFs."""
+        super().__init__(inferencer, prompt_template)
+        self.inferencer = inferencer
+        self.prompt_template = prompt_template
+        # Paths are relative; commented-out entries are currently not loaded.
+        self.file_paths = [
+            "../documents/bpjs.pdf",
+            # "../documents/pph21.pdf",
+            # "../documents/lembur.pdf",
+            # "../documents/uu13.pdf",
+            "../documents/file.pdf",
+        ]
+
+    async def load_documents(self):
+        """Add every path in ``self.file_paths`` to the retriever, one at a time."""
+        for pdf_path in self.file_paths:
+            await self.add_doc(pdf_path)
+
+    async def add_doc(self, file_path):
+        """Hand one file to the retriever and print the outcome."""
+        outcome = await self.inferencer.retriever.add_document_from_file(file_path)
+        if not outcome.success:
+            print(f"Failed to process: {outcome.error_message}")
+            return
+        doc_meta = outcome.document_metadata
+        print(f"Successfully processed: {doc_meta.file_name}")
+        print(f"Chunks created: {doc_meta.chunk_count}")
+
+    async def get_result(self, question):
+        """Stream inference items for ``question`` using this agent's prompt."""
+        # Set the model's template on each call before inferring.
+        self.inferencer.model.prompt_template = self.prompt_template
+        answer_stream = self.inferencer.infer_stream(
+            query = question,
+            enable_reranking=False,
+            k=3,
+        )
+        async for item in answer_stream:
+            yield item
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/gpt_customer_service_agent.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/gpt_customer_service_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e56dc1483e6cd38123269d3080064746753abb
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/gpt_customer_service_agent.py
@@ -0,0 +1,20 @@
+from rag.agents.agents import Agent
+from rag.pipeline.language_model import LM
+from rag.inference.inferencer import Inferencer
+
+class GPTCSAgent(Agent):
+    """Agent that answers a question with a single ``inferencer.infer`` call."""
+
+    def __init__(self, inferencer : Inferencer , prompt_template):
+        """Keep the inferencer and the prompt template used for each query."""
+        super().__init__(inferencer, prompt_template)
+        self.inferencer = inferencer
+        self.prompt_template = prompt_template
+
+    async def get_result(self, question : str):
+        """Apply this agent's template, print the question, return the inference."""
+        model = self.inferencer.model
+        model.prompt_template = self.prompt_template
+        print("Question received :", question)
+        answer = await self.inferencer.infer(query = question)
+        return answer
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/query_maker_agent.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/query_maker_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d3573736e94e0b081d5e16d7ca22da22916e34e
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/agents/query_maker_agent.py
@@ -0,0 +1,23 @@
+from rag.agents.agents import Agent
+from rag.pipeline.language_model import LM
+from rag.inference.inferencer import Inferencer
+
+class QueryMakerAgent(Agent):
+    """Agent that sends a question to ``inferencer.infer`` under its own prompt.
+
+    NOTE(review): presumably paired with the query-rewriting template; confirm
+    against the code that constructs it.
+    """
+
+    def __init__(self, inferencer : Inferencer , prompt_template):
+        """Keep the inferencer and the prompt template used for each query."""
+        super().__init__(inferencer, prompt_template)
+        self.inferencer = inferencer
+        self.prompt_template = prompt_template
+
+    async def get_result(self, question : str):
+        """Apply this agent's template, print the question, return the inference."""
+        inferencer = self.inferencer
+        inferencer.model.prompt_template = self.prompt_template
+        print("Question received :", question)
+        return await inferencer.infer(query = question)
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbe5746260322b5b3e739ee643af9c39e06b62b8
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/__init__.py
@@ -0,0 +1,48 @@
+from pathlib import Path
+
+# Templates sit next to this module; resolving against it (instead of the
+# current working directory) keeps loading independent of where the app starts.
+_TEMPLATE_DIR = Path(__file__).resolve().parent
+
+
+def read_template_txt(file_path):
+    """Return the text of the ``<file_path>.txt`` template in this package.
+
+    Args:
+        file_path: Template name without the ``.txt`` extension.
+
+    Raises:
+        OSError: If the template file cannot be opened.
+    """
+    return (_TEMPLATE_DIR / f"{file_path}.txt").read_text(encoding='utf-8')
+
+
+def get_chat_template(file_name):
+    """Build the two-message chat template for the named system prompt.
+
+    The system message embeds the template text; the user message is a plain
+    (non-f) string, so ``{question}`` and ``{context}`` stay as placeholders.
+    """
+    sys_prompt = read_template_txt(file_name)
+    return [
+        {
+            "role" : "system",
+            "content" : f"""
+            {sys_prompt}
+            """
+        },
+        {
+            "role" : "user",
+            "content" : """ 
+            
+            Please answer properly:  
+            {question} 
+
+            From given context :
+            {context}
+             
+            
+
+            """
+        }
+    ]
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/customer_service.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/customer_service.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0eca246a79d34b17715ad5e17d4a5bfd1140a75b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/customer_service.txt
@@ -0,0 +1,12 @@
+You are a friendly and professional Customer Service representative for the Human Resource Information System (HRIS) field,
+fluent in Indonesian. Your job is to assist customers with accurate information based on your company's basic knowledge. Follow these guidelines:
+
+- Always greet customers in a friendly and professional manner.
+- Your answers are contextual and objective.
+- Provide clear, easy-to-understand, and structured answers based on the context provided by the user.
+- If information is not available, offer alternative assistance or direct them to the appropriate channel.
+- Use polite language and empathize with the customer's needs.
+- Conclude by offering further assistance.
+- You are highly skilled in the area relevant to the given context.
+
+Please use the given context to answer accurately.
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/query_maker.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/query_maker.txt
new file mode 100644
index 0000000000000000000000000000000000000000..15e21bccb8a5229bf21853055bceb700f7fcddf7
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/query_maker.txt
@@ -0,0 +1,35 @@
+Anda adalah agen AI yang tepat dan objektif, 
+Anda bertugas mengubah pertanyaan atau pernyataan pengguna menjadi query yang eksplisit dan efisien untuk keperluan pencarian dokumen dalam sistem RAG (Retrieval-Augmented Generation).
+
+Ikuti langkah-langkah berikut:
+
+1. Ekstrak bagian-bagian penting dari input pengguna:
+   - **Intent**: Tujuan utama atau jenis permintaan (misalnya: apa itu, cara, syarat, apakah bisa, berapa).
+   - **Entity/Noun Phrase**: Objek utama yang dibahas (misalnya: BPJS, tokenizer truncation, RWKV, gaji).
+   - **Context**: Informasi pendukung yang menyempitkan fokus (misalnya: kecelakaan kerja, gaji 1 juta per bulan, perusahaan mitra BPJS).
+   - **Question**: Pertanyaan spesifik yang ingin dijawab (misalnya: bagaimana prosesnya, apa manfaatnya, berapa jumlahnya).
+
+2. Setelah semua elemen diidentifikasi, bentuk **Query RAG** dengan struktur: [INTENT] + [ENTITY] + [CONTEXT] + [QUESTION]
+3. Gunakan bahasa natural yang ringkas, namun informatif dan eksplisit.
+4. Generate hanya hasil akhirnya saja berupa satu buah kalimat
+
+Contoh 0 :
+User Input:
+> Apa itu BPJS
+Output : Pengertian BPJS
+
+Contoh 1 :
+User Input:
+> Di mana lokasi PT Sakura System Solution ?
+
+Output: Lokasi PT Sakura System Solution
+
+Contoh 2:
+User Input:
+> Saya mengalami kecelakaan di kantor dan ingin tahu apakah bisa klaim BPJS karena perusahaan saya adalah mitra.
+
+Output: apakah bisa klaim BPJS kecelakaan kerja di kantor jika perusahaan mitra dan apakah saya memenuhi syarat
+
+
+**Tugas Anda sekarang:**
+Lakukan proses di atas untuk setiap input pengguna yang diberikan. Hasilkan query RAG akhir yang siap digunakan dalam pencarian dokumen.
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/query_maker_temp.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/query_maker_temp.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7b372a6635d7dda81fd378566696ae72593067d
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/chat_template/query_maker_temp.txt
@@ -0,0 +1,30 @@
+Anda adalah agen AI yang tepat dan objektif, 
+Anda bertugas mengubah pertanyaan atau pernyataan pengguna menjadi query yang eksplisit dan efisien untuk keperluan pencarian dokumen dalam sistem RAG (Retrieval-Augmented Generation).
+
+Ikuti langkah-langkah berikut:
+
+1. Ekstrak bagian-bagian penting dari input pengguna:
+   - **Intent**: Tujuan utama atau jenis permintaan (misalnya: apa itu, cara, syarat, apakah bisa, berapa).
+   - **Entity/Noun Phrase**: Objek utama yang dibahas (misalnya: BPJS, tokenizer truncation, RWKV, gaji).
+   - **Context**: Informasi pendukung yang menyempitkan fokus (misalnya: kecelakaan kerja, gaji 1 juta per bulan, perusahaan mitra BPJS).
+   - **Question**: Pertanyaan spesifik yang ingin dijawab (misalnya: bagaimana prosesnya, apa manfaatnya, berapa jumlahnya).
+
+2. Setelah semua elemen diidentifikasi, bentuk **Query RAG** dengan struktur: [INTENT] + [ENTITY] + [CONTEXT] + [QUESTION]
+3. Gunakan bahasa natural yang ringkas, namun informatif dan eksplisit.
+4. Generate hanya hasil akhirnya saja berupa satu buah kalimat
+
+Contoh 1 :
+User Input:
+> Di mana lokasi PT Sakura System Solution ?
+
+Output: Lokasi PT Sakura System Solution
+
+Contoh 2:
+User Input:
+> Saya mengalami kecelakaan di kantor dan ingin tahu apakah bisa klaim BPJS karena perusahaan saya adalah mitra.
+
+Output: apakah bisa klaim BPJS kecelakaan kerja di kantor jika perusahaan mitra dan apakah saya memenuhi syarat
+
+
+**Tugas Anda sekarang:**
+Lakukan proses di atas untuk setiap input pengguna yang diberikan. Hasilkan query RAG akhir yang siap digunakan dalam pencarian dokumen.
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/language_model.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/language_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e3cb62b6843c9aacfa6039d2e7dc2e1ea77d36f
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/language_model.py
@@ -0,0 +1,947 @@
+import torch
+import asyncio
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer, BitsAndBytesConfig
+import torch
+from typing import Optional, Dict, Any, List, Union, Callable, Awaitable, AsyncGenerator
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from threading import Thread
+from rag.retriever.retriever_types import RetrievalResult
+from langchain_core.documents import Document
+import copy
+
+@dataclass
+class LMConfig:
+    """Settings for the LM wrapper: model loading, sampling, RAG prompt building, async and streaming."""
+
+    # Model loading
+    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"
+    device: str = "cuda"  # handed to from_pretrained as device_map
+    torch_dtype: torch.dtype = torch.float16
+    # Sampling defaults copied into GenerationConfig
+    max_length: int = 2048
+    temperature: float = 0.7
+    top_p: float = 0.8
+    top_k: int = 50
+    do_sample: bool = True
+    # typing.Any rather than the builtin function `any`, which is not a type
+    quantization_config: Any = None
+    pad_token_id: Optional[int] = None  # tokenizer EOS id is used when unset
+    eos_token_id: Optional[int] = None  # tokenizer EOS id is used when unset
+    # RAG-specific configs
+    max_context_length: int = 1500  # measured in characters
+    context_separator: str = "\n---\n"
+    instruction_template: str = "system"  # "system", "instruction", "custom"
+    # Async-specific configs
+    max_workers: int = 2  # size of the LM thread pool
+    generation_timeout: float = 30  # seconds
+    repetition_penalty: float = 1.0
+    # Streaming-specific configs
+    stream_timeout: float = 100  # timeout passed to the token streamer
+    skip_prompt: bool = True     # skip the prompt in streamed output
+
+class LM:
+    """
+    Async wrapper around a Hugging Face causal language model (default: Qwen2.5-1.5B-Instruct)
+    Includes prompt formatting tailored for RAG (Retrieval-Augmented Generation)
+    and supports streaming text generation
+    """
+    
+    def __init__(self, config: Optional[LMConfig] = None, prompt_template=None):
+        """
+        Initialize the LM wrapper without loading any model weights.
+
+        Args:
+            config: Model configuration; a default LMConfig() is used when None.
+            prompt_template: Chat-style list of {"role", "content"} messages whose
+                content may contain "{context}" / "{question}" placeholders.
+                When None, a fresh default system + user template is created.
+        """
+        # Build the default per instance: a list literal in the signature is a
+        # single object shared by every LM created without an explicit template.
+        if prompt_template is None:
+            prompt_template = [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "{question}"},
+            ]
+
+        self.config = LMConfig() if config is None else config
+        self.tokenizer: AutoTokenizer = None
+        self.model = None
+        self.generation_config = None
+        self.is_loaded = False
+        # Thread pool used for blocking tokenizer/model calls
+        self.executor = ThreadPoolExecutor(max_workers=self.config.max_workers)
+        # Serializes load_model() and config updates
+        self._lock = asyncio.Lock()
+        # Setup logging
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
+
+        # RAG prompt templates
+        self.prompt_template = prompt_template
+    
+    async def load_model(self) -> None:
+        """
+        Load the tokenizer and model in the thread pool, then build the default GenerationConfig.
+
+        The whole load runs under self._lock and returns immediately when
+        is_loaded is already set.
+
+        Raises:
+            Exception: Any error raised while loading is logged and re-raised.
+        """
+        async with self._lock:
+            if self.is_loaded:
+                self.logger.info("Model already loaded")
+                return
+
+            loop = asyncio.get_running_loop()
+            try:
+                self.logger.info(f"Loading model: {self.config.model_name}")
+
+                # Load the tokenizer in the thread pool. torch_dtype / device_map are
+                # model-loading options and are not passed to the tokenizer.
+                self.tokenizer = await loop.run_in_executor(
+                    self.executor,
+                    partial(
+                        AutoTokenizer.from_pretrained,
+                        self.config.model_name,
+                        trust_remote_code=True,
+                    ),
+                )
+
+                # Load the model in the thread pool
+                self.model = await loop.run_in_executor(
+                    self.executor,
+                    partial(
+                        AutoModelForCausalLM.from_pretrained,
+                        self.config.model_name,
+                        quantization_config=self.config.quantization_config,
+                        torch_dtype=self.config.torch_dtype,
+                        device_map=self.config.device,
+                        trust_remote_code=True,
+                    ),
+                )
+
+                # Token id 0 is a valid id, so test against None instead of truthiness
+                tokenizer_eos = self.tokenizer.eos_token_id
+                pad_id = self.config.pad_token_id
+                if pad_id is None:
+                    pad_id = tokenizer_eos
+                eos_id = self.config.eos_token_id
+                if eos_id is None:
+                    eos_id = tokenizer_eos
+
+                # Setup generation config
+                self.generation_config = GenerationConfig(
+                    max_length=self.config.max_length,
+                    temperature=self.config.temperature,
+                    top_p=self.config.top_p,
+                    top_k=self.config.top_k,
+                    do_sample=self.config.do_sample,
+                    pad_token_id=pad_id,
+                    eos_token_id=eos_id,
+                    repetition_penalty=self.config.repetition_penalty,
+                )
+
+                self.is_loaded = True
+                self.logger.info("Model loaded successfully!")
+
+            except Exception as e:
+                self.logger.error(f"Error loading model: {e}")
+                raise
+    
+    def get_available_templates(self) -> List[str]:
+        """
+        Return the entries of the configured prompt template as a new list.
+
+        Returns:
+            A list built by iterating self.prompt_template. For a mapping this
+            yields its keys; for a list of chat messages it yields the message
+            dicts themselves.
+        """
+        return [entry for entry in self.prompt_template]
+    
+    def preview_template(self, template_type: str, sample_question: str = "Apa itu AI?", 
+                        sample_context: str = "Artificial Intelligence adalah teknologi...") -> str:
+        """
+        Render a template with sample data so it can be inspected.
+
+        For a list-of-messages template, template_type selects messages by their
+        "role" and the matching contents are joined with newlines. For a mapping,
+        template_type is looked up as a key.
+
+        Args:
+            template_type: Role (list template) or key (mapping template) to preview
+            sample_question: Value substituted for "{question}"
+            sample_context: Value substituted for "{context}"
+
+        Returns:
+            The formatted preview, or a "tidak tersedia" message when nothing matches
+        """
+        templates = self.prompt_template or []
+
+        if isinstance(templates, dict):
+            if template_type not in templates:
+                return f"Template '{template_type}' tidak tersedia. Available: {self.get_available_templates()}"
+            entry = templates[template_type]
+            if isinstance(entry, dict) and "content" in entry:
+                raw = entry["content"]
+            else:
+                # Single-message mapping: fall back to its own "content" field
+                raw = templates["content"]
+        else:
+            # Indexing the message list with ["content"] can never work, so pick
+            # the messages whose role matches instead.
+            matches = [
+                msg["content"]
+                for msg in templates
+                if isinstance(msg, dict) and msg.get("role") == template_type and "content" in msg
+            ]
+            if not matches:
+                roles = [msg.get("role") for msg in templates if isinstance(msg, dict)]
+                return f"Template '{template_type}' tidak tersedia. Available: {roles}"
+            raw = "\n".join(matches)
+
+        return raw.format(
+            context=sample_context,
+            question=sample_question
+        )
+    
+    def _format_context(self, contexts: Union[List[str], RetrievalResult], numbering: bool = True) -> str:
+        """
+        Join retrieved contexts into one string, each under a "[Dokumen ...]" header.
+
+        Args:
+            contexts: A RetrievalResult, or a list of strings / arbitrary objects
+                (non-strings are rendered with str())
+            numbering: Whether headers carry the 1-based document number; for a
+                RetrievalResult the score is added to numbered headers when available
+
+        Returns:
+            Blocks joined with config.context_separator, or "" for empty input
+        """
+        if not contexts:
+            return ""
+
+        blocks = []
+        self.logger.info(f"Context : {contexts}")
+        self.logger.info(f"Is RetrievalResult Contexts =  {isinstance(contexts, RetrievalResult)}")
+        if isinstance(contexts, RetrievalResult):
+            # scores may be empty or shorter than documents; never index past its end
+            scores = contexts.scores or []
+            for i, doc in enumerate(contexts.documents, 1):
+                if numbering:
+                    header = f"[Dokumen {i}"
+                    score = scores[i - 1] if i <= len(scores) else None
+                    # A score of 0.0 is a real value, so only skip missing scores
+                    if score is not None:
+                        header += f" (Skor: {score:.3f})"
+                    header += "]"
+                else:
+                    header = "[Dokumen]"
+                blocks.append(f"{header}\n{doc.page_content}")
+        else:
+            for i, ctx in enumerate(contexts, 1):
+                header = f"[Dokumen {i}]" if numbering else "[Dokumen]"
+                blocks.append(f"{header}\n{str(ctx)}")
+
+        return self.config.context_separator.join(blocks)
+    
+    def _truncate_context(self, context: str, max_length: int) -> str:
+        """
+        Shorten context so the result never exceeds max_length characters.
+
+        Args:
+            context: Context string
+            max_length: Maximum length in characters
+
+        Returns:
+            The context unchanged when it fits; otherwise a prefix followed by a
+            truncation marker. When max_length is too small to hold the marker,
+            a plain prefix of max_length characters is returned.
+        """
+        if len(context) <= max_length:
+            return context
+
+        marker = "\n\n[... Context dipotong karena terlalu panjang ...]"
+        # Reserve exactly the marker's length so prefix + marker fits the budget
+        keep = max_length - len(marker)
+        if keep <= 0:
+            # A negative slice bound would count from the end; clamp instead
+            return context[:max(max_length, 0)]
+        return context[:keep] + marker
+
+    async def format_rag_prompt(self, 
+                                question: str, 
+                                contexts: Union[List[str], RetrievalResult],
+                                template_type: Optional[str] = None,
+                                custom_template: Optional[str] = None,
+                                include_metadata: bool = True,
+                                context_numbering: bool = True,
+                                max_contexts: Optional[int] = None) -> Union[str, List[Dict[str, str]]]:
+        """
+        Build the RAG prompt from a question and retrieved contexts (async).
+
+        Args:
+            question: User question substituted for "{question}"
+            contexts: RetrievalResult or list of contexts substituted for "{context}"
+            template_type: Only an empty string has an effect: it resets
+                config.instruction_template to "system"
+            custom_template: Format string with "{context}" / "{question}"; when
+                given, the formatted string is returned instead of a message list
+            include_metadata: Accepted for compatibility; document metadata is
+                not appended to the context
+            context_numbering: Whether document headers are numbered
+            max_contexts: Keep at most this many contexts when set
+
+        Returns:
+            A string when custom_template is used, otherwise a list of
+            {"role", "content"} chat messages
+        """
+
+        def _format_sync():
+            # Limit the number of contexts, keeping RetrievalResult fields aligned
+            if isinstance(contexts, RetrievalResult):
+                docs = contexts.documents
+                if max_contexts:
+                    docs = docs[:max_contexts]
+                selected = RetrievalResult(
+                    documents=docs,
+                    scores=contexts.scores[:len(docs)] if contexts.scores else [],
+                    query=contexts.query,
+                    retrieval_time=contexts.retrieval_time,
+                    metadata=contexts.metadata
+                )
+            elif max_contexts and len(contexts) > max_contexts:
+                # Plain list (list[str] or list[Document])
+                selected = contexts[:max_contexts]
+            else:
+                selected = contexts
+
+            # Render to a string, then cut it down to the configured budget
+            return self._truncate_context(
+                self._format_context(selected, context_numbering),
+                self.config.max_context_length
+            )
+
+        # Run the formatting in the thread pool
+        formatted_context = await asyncio.get_running_loop().run_in_executor(
+            self.executor, _format_sync
+        )
+        self.logger.info(f"Formatted Context {formatted_context}")
+
+        if template_type == "":
+            self.config.instruction_template = "system"
+
+        # A custom template takes precedence over the configured chat template
+        if custom_template:
+            return custom_template.format(
+                context=formatted_context,
+                question=question
+            )
+
+        if not self.prompt_template:
+            # Fallback default template
+            return [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question}
+            ]
+
+        # Debug details go to the logger rather than stdout
+        self.logger.debug("question %s", question)
+        self.logger.debug("template = %s rag template = %s", template_type, self.prompt_template)
+
+        formatted_template = []
+        # Work on a copy so the stored template is never modified
+        for message in copy.deepcopy(self.prompt_template):
+            content = message["content"]
+
+            if "{context}" in content or "{question}" in content:
+                try:
+                    content = content.format(
+                        context=formatted_context,
+                        question=question
+                    )
+                except (KeyError, IndexError, ValueError) as e:
+                    # Templates with other braces (e.g. JSON examples) make
+                    # str.format fail; substitute the two placeholders literally.
+                    self.logger.error(f"Missing placeholder in template: {e}")
+                    content = content.replace("{context}", formatted_context)
+                    content = content.replace("{question}", question)
+
+            formatted_chat = {
+                "role": message["role"],
+                "content": content
+            }
+
+            # Carry over the optional description field
+            if "description" in message:
+                formatted_chat["description"] = message["description"]
+
+            formatted_template.append(formatted_chat)
+
+        return formatted_template
+
+    async def generate_stream(self, 
+                             prompt: List[Dict], 
+                             max_new_tokens: Optional[int] = None,
+                             temperature: Optional[float] = None,
+                             top_p: Optional[float] = None,
+                             **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Stream generated text for a chat-style prompt.
+
+        model.generate runs on its own thread and feeds a TextIteratorStreamer.
+        Each chunk is fetched from the streamer in a worker thread so the event
+        loop is not blocked while waiting for the next token.
+
+        Args:
+            prompt: Chat messages passed to tokenizer.apply_chat_template
+            max_new_tokens: Maximum number of new tokens to generate
+            temperature: Temperature for generation (overrides config)
+            top_p: Top-p for generation (overrides config)
+            **kwargs: Extra generation parameters
+
+        Yields:
+            Non-empty generated text chunks
+
+        Raises:
+            RuntimeError: If the model has not been loaded
+        """
+        await self._check_model_loaded()
+        loop = asyncio.get_running_loop()
+
+        # Setup streamer
+        streamer = TextIteratorStreamer(
+            self.tokenizer, 
+            timeout=self.config.stream_timeout,
+            skip_prompt=self.config.skip_prompt,
+            skip_special_tokens=True
+        )
+
+        def _generate_sync():
+            try:
+                # Tokenize input
+                inputs = self.tokenizer.apply_chat_template(
+                    prompt,
+                    add_generation_prompt=True,
+                    return_tensors="pt"
+                )
+
+                # Build a per-call generation config when overrides are given
+                gen_config = self.generation_config
+                if any([max_new_tokens, temperature, top_p]):
+                    gen_config = GenerationConfig(
+                        max_new_tokens=max_new_tokens or self.config.max_length,
+                        temperature=temperature or self.config.temperature,
+                        top_p=top_p or self.config.top_p,
+                        top_k=self.config.top_k,
+                        do_sample=self.config.do_sample,
+                        pad_token_id=self.config.pad_token_id or self.tokenizer.eos_token_id,
+                        eos_token_id=self.config.eos_token_id or self.tokenizer.eos_token_id,
+                        repetition_penalty=self.config.repetition_penalty,
+                        **kwargs
+                    )
+
+                # Put the inputs on the device the model was loaded on; the model is
+                # not moved, since a hard-coded "cuda" ignores config.device
+                input_ids = inputs.to(self.model.device)
+
+                generation_kwargs = {
+                    "input_ids": input_ids,
+                    "generation_config": gen_config,
+                    "streamer": streamer,
+                    **kwargs
+                }
+
+                # Generate on a separate thread so tokens can be consumed as they arrive
+                thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+                thread.start()
+
+                return thread
+
+            except Exception as e:
+                self.logger.error(f"Error during stream generation setup: {e}")
+                raise
+
+        # Setup generation thread
+        generation_thread = await loop.run_in_executor(
+            self.executor, _generate_sync
+        )
+        token_iter = iter(streamer)
+        # StopIteration cannot cross a Future, so a sentinel marks the end of the stream
+        exhausted = object()
+        err = None
+        try:
+            while True:
+                # The default executor is used for the blocking wait so that
+                # self.executor stays free for tokenization and setup
+                token = await loop.run_in_executor(None, next, token_iter, exhausted)
+                if token is exhausted:
+                    break
+                if token:  # Skip empty tokens
+                    yield token
+
+            # Wait for generation thread to finish
+            err = await loop.run_in_executor(
+                self.executor, generation_thread.join
+            )
+
+        except Exception as e:
+            self.logger.error(f"Error during streaming: {e}, {err}")
+            # Make sure thread is cleaned up
+            if generation_thread.is_alive():
+                generation_thread.join(timeout=1.0)
+            raise
+
+    async def rag_generate_stream(self,
+                                 question: str,
+                                 contexts: Union[List[str], RetrievalResult],
+                                 template_type: Optional[str] = None,
+                                 max_new_tokens: Optional[int] = None,
+                                 temperature: Optional[float] = None,
+                                 **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Stream a RAG answer: build the prompt from the contexts, then stream the generation.
+
+        Args:
+            question: User question
+            contexts: Retrieved contexts
+            template_type: Forwarded to format_rag_prompt
+            max_new_tokens: Maximum number of new tokens to generate
+            temperature: Sampling temperature; 0.3 is used when None
+            **kwargs: Extra generation parameters
+
+        Yields:
+            Generated answer chunks
+        """
+        await self._check_model_loaded()
+
+        chat_prompt = await self.format_rag_prompt(question, contexts, template_type)
+
+        # RAG answers default to a low temperature to stay closer to the sources
+        if temperature is None:
+            temperature = 0.3
+
+        token_stream = self.generate_stream(
+            prompt=chat_prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            **kwargs
+        )
+        async for piece in token_stream:
+            yield piece
+
+    async def chat_stream(self, 
+                         messages: List[Dict[str, str]], 
+                         max_new_tokens: Optional[int] = None,
+                         **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Stream a reply for a conversation.
+
+        Args:
+            messages: List of messages in the form [{"role": "user", "content": "..."}]
+            max_new_tokens: Maximum number of new tokens to generate
+            **kwargs: Extra generation parameters
+
+        Yields:
+            Response text chunks
+
+        Raises:
+            RuntimeError: If the model has not been loaded
+        """
+        await self._check_model_loaded()
+
+        # generate_stream applies the chat template itself. Rendering the messages
+        # to a string first would have the template applied a second time, to a
+        # value that is no longer a message list.
+        async for chunk in self.generate_stream(
+            messages,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        ):
+            yield chunk
+
+    async def rag_chat_stream(self,
+                             messages: List[Dict[str, str]],
+                             contexts: Union[List[str], RetrievalResult],
+                             template_type: Optional[str] = None,
+                             max_new_tokens: Optional[int] = None,
+                             **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Stream a RAG answer to the most recent user message of a conversation.
+
+        Args:
+            messages: List of messages in the form [{"role": "user", "content": "..."}]
+            contexts: Retrieved contexts
+            template_type: Forwarded to rag_generate_stream
+            max_new_tokens: Maximum number of new tokens to generate
+            **kwargs: Extra generation parameters
+
+        Yields:
+            Response text chunks
+
+        Raises:
+            ValueError: If the conversation contains no user message
+        """
+        await self._check_model_loaded()
+
+        # Only the latest user turn is used as the question
+        user_turns = list(filter(lambda m: m.get("role") == "user", messages))
+        if len(user_turns) == 0:
+            raise ValueError("No user message found in conversation")
+        latest_question = user_turns[-1]["content"]
+
+        answer_stream = self.rag_generate_stream(
+            question=latest_question,
+            contexts=contexts,
+            template_type=template_type,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+        async for piece in answer_stream:
+            yield piece
+
+    # Utility method to collect a full response from a stream
+    async def collect_stream(self, stream_generator: AsyncGenerator[str, None]) -> str:
+        """
+        Drain a stream generator and return its chunks concatenated.
+
+        Args:
+            stream_generator: AsyncGenerator producing text chunks
+
+        Returns:
+            Complete generated text ("" when the stream yields nothing)
+        """
+        pieces = [piece async for piece in stream_generator]
+        return "".join(pieces)
+    
+    async def multi_template_generate(self,
+                                    question: str,
+                                    contexts: Union[List[str], RetrievalResult],
+                                    template_types: List[str],
+                                    max_new_tokens: Optional[int] = None,
+                                    **kwargs) -> Dict[str, str]:
+        """
+        Generate one answer per template type, with all generations scheduled up front.
+
+        Args:
+            question: User question
+            contexts: Retrieved contexts
+            template_types: Template types to generate with
+            max_new_tokens: Maximum number of new tokens to generate
+            **kwargs: Extra generation parameters
+
+        Returns:
+            Dictionary mapping each template type to its response, or to an
+            "Error: ..." string when that generation failed
+        """
+        await self._check_model_loaded()
+
+        # Schedule every generation before awaiting any of them
+        scheduled = [
+            (
+                name,
+                asyncio.create_task(
+                    self._generate_single_template(
+                        question, contexts, name, max_new_tokens, **kwargs
+                    )
+                ),
+            )
+            for name in template_types
+        ]
+
+        # Collect in submission order; a failure is recorded instead of raised
+        outcome = {}
+        for name, job in scheduled:
+            try:
+                outcome[name] = await job
+            except Exception as exc:
+                self.logger.error(f"Error generating with template {name}: {exc}")
+                outcome[name] = f"Error: {str(exc)}"
+
+        return outcome
+    
+    async def _generate_single_template(self,
+                                      question: str,
+                                      contexts: Union[List[str], RetrievalResult],
+                                      template_type: str,
+                                      max_new_tokens: Optional[int] = None,
+                                      **kwargs) -> str:
+        """Generate a single RAG answer for one template type by delegating to rag_generate."""
+        answer = await self.rag_generate(
+            question=question,
+            contexts=contexts,
+            template_type=template_type,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+        return answer
+    
+    async def rag_generate(self,
+                          question: str,
+                          contexts: Union[List[str], RetrievalResult],
+                          template_type: Optional[str] = None,
+                          max_new_tokens: Optional[int] = None,
+                          temperature: Optional[float] = None,
+                          **kwargs) -> str:
+        """
+        Produce a complete RAG answer: build the prompt from the contexts, then generate.
+
+        Args:
+            question: User question
+            contexts: Retrieved contexts
+            template_type: Forwarded to format_rag_prompt
+            max_new_tokens: Maximum number of new tokens to generate
+            temperature: Sampling temperature; 0.3 is used when None
+            **kwargs: Extra generation parameters
+
+        Returns:
+            Generated answer
+        """
+        await self._check_model_loaded()
+
+        chat_prompt = await self.format_rag_prompt(question, contexts, template_type)
+
+        # RAG answers default to a low temperature to stay closer to the sources
+        if temperature is None:
+            temperature = 0.3
+
+        answer = await self.generate(
+            prompt=chat_prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            **kwargs
+        )
+        return answer
+    
+    async def rag_chat(self,
+                      messages: List[Dict[str, str]],
+                      contexts: Union[List[str], RetrievalResult],
+                      template_type: Optional[str] = None,
+                      max_new_tokens: Optional[int] = None,
+                      **kwargs) -> str:
+        """
+        Answer the most recent user message of a conversation using RAG.
+
+        Args:
+            messages: List of messages in the form [{"role": "user", "content": "..."}]
+            contexts: Retrieved contexts
+            template_type: Forwarded to rag_generate
+            max_new_tokens: Maximum number of new tokens to generate
+            **kwargs: Extra generation parameters
+
+        Returns:
+            Response text
+
+        Raises:
+            ValueError: If the conversation contains no user message
+        """
+        await self._check_model_loaded()
+
+        # Only the latest user turn is used as the question
+        user_turns = list(filter(lambda m: m.get("role") == "user", messages))
+        if len(user_turns) == 0:
+            raise ValueError("No user message found in conversation")
+        latest_question = user_turns[-1]["content"]
+
+        answer = await self.rag_generate(
+            question=latest_question,
+            contexts=contexts,
+            template_type=template_type,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+        return answer
+    
+    async def _check_model_loaded(self) -> None:
+        """
+        Guard used before any generation call.
+
+        Raises:
+            RuntimeError: If is_loaded is not set
+        """
+        if self.is_loaded:
+            return
+        raise RuntimeError("Model belum di-load. Panggil await load_model() terlebih dahulu.")
+    
+    async def generate(self, 
+                      prompt: Union[List[Dict], str], 
+                      max_new_tokens: Optional[int] = None,
+                      temperature: Optional[float] = None,
+                      top_p: Optional[float] = None,
+                      **kwargs) -> str:
+        """
+        Generate a complete response for a prompt (async, non-streaming).
+
+        Args:
+            prompt: Chat messages, or a plain string which is sent as a single
+                user message
+            max_new_tokens: Maximum number of new tokens to generate
+            temperature: Temperature for generation (overrides config)
+            top_p: Top-p for generation (overrides config)
+            **kwargs: Extra generation parameters
+
+        Returns:
+            Generated text without the prompt
+
+        Raises:
+            RuntimeError: If the model has not been loaded
+            TimeoutError: If generation exceeds config.generation_timeout seconds
+        """
+        await self._check_model_loaded()
+
+        def _generate_sync():
+            try:
+                # apply_chat_template expects a message list, so wrap bare strings
+                chat = [{"role": "user", "content": prompt}] if isinstance(prompt, str) else prompt
+
+                # Tokenize input
+                inputs = self.tokenizer.apply_chat_template(
+                    chat,
+                    add_generation_prompt=True,
+                    return_tensors="pt"
+                )
+
+                # Build a per-call generation config when overrides are given
+                gen_config = self.generation_config
+                if any([max_new_tokens, temperature, top_p]):
+                    gen_config = GenerationConfig(
+                        max_new_tokens=max_new_tokens or self.config.max_length,
+                        temperature=temperature or self.config.temperature,
+                        top_p=top_p or self.config.top_p,
+                        top_k=self.config.top_k,
+                        do_sample=self.config.do_sample,
+                        pad_token_id=self.config.pad_token_id or self.tokenizer.eos_token_id,
+                        eos_token_id=self.config.eos_token_id or self.tokenizer.eos_token_id,
+                        repetition_penalty=self.config.repetition_penalty,
+                        **kwargs
+                    )
+
+                # Generate
+                with torch.no_grad():
+                    # Put the inputs on the device the model was loaded on; the model
+                    # is not moved, since a hard-coded "cuda" ignores config.device
+                    input_ids = inputs.to(self.model.device)
+                    prompt_length = input_ids.shape[-1]
+                    outputs = self.model.generate(
+                        input_ids,
+                        generation_config=gen_config,
+                        **kwargs
+                    )
+
+                # Decode only the newly generated tokens (the prompt is sliced off)
+                generated_text = self.tokenizer.decode(
+                    outputs[0][prompt_length:], 
+                    skip_special_tokens=True
+                )
+
+                self.logger.debug("Generated Text %s", generated_text)
+                return generated_text
+
+            except Exception as e:
+                self.logger.error(f"Error during generation: {e}")
+                raise
+
+        # Run generation in the thread pool with a timeout.
+        # NOTE(review): on timeout the await is abandoned, but a worker thread that
+        # has already started presumably runs to completion - confirm if this matters.
+        try:
+            result = await asyncio.wait_for(
+                asyncio.get_running_loop().run_in_executor(self.executor, _generate_sync),
+                timeout=self.config.generation_timeout
+            )
+            return result
+        except asyncio.TimeoutError:
+            self.logger.error(f"Generation timeout after {self.config.generation_timeout} seconds")
+            raise TimeoutError(f"Generation timeout after {self.config.generation_timeout} seconds")
+    
+    async def chat(self, 
+                  messages: List[Dict[str, str]], 
+                  max_new_tokens: Optional[int] = None,
+                  **kwargs) -> str:
+        """
+        Format a conversation with the tokenizer's chat template and generate a reply.
+        
+        Args:
+            messages: Conversation turns, e.g. [{"role": "user", "content": "..."}]
+            max_new_tokens: Passed through unchanged to generate()
+            **kwargs: Extra keyword arguments passed through to generate()
+            
+        Returns:
+            The value returned by generate() for the formatted prompt
+        """
+        await self._check_model_loaded()
+        
+        def _format_chat():
+            try:
+                # Apply the chat template (chat_template="rag") and request "pt" tensors
+                formatted_prompt = self.tokenizer.apply_chat_template(
+                    messages,
+                    chat_template="rag",
+                    return_tensors="pt"
+                )
+                return formatted_prompt
+                
+            except Exception as e:
+                self.logger.error(f"Error during chat formatting: {e}")
+                raise
+        
+        # Run the template formatting on self.executor instead of the event-loop thread
+        formatted_prompt = await asyncio.get_event_loop().run_in_executor(
+            self.executor, _format_chat
+        )
+        
+        return await self.generate(
+            formatted_prompt, 
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+    
+    async def update_config(self, **kwargs) -> None:
+        """
+        Update attributes of self.config while holding self._lock.
+        
+        Args:
+            **kwargs: Config attributes to overwrite; names the config does not have are logged and skipped
+        """
+        async with self._lock:
+            for key, value in kwargs.items():
+                if hasattr(self.config, key):
+                    setattr(self.config, key, value)
+                    self.logger.info(f"Updated {key} to {value}")
+                else:
+                    self.logger.warning(f"Unknown config parameter: {key}")
+            
+            # Rebuild the generation config when the model is already loaded
+            if self.is_loaded:
+                self.generation_config = GenerationConfig(
+                    max_length=self.config.max_length,
+                    temperature=self.config.temperature,
+                    top_p=self.config.top_p,
+                    top_k=self.config.top_k,
+                    do_sample=self.config.do_sample,
+                    pad_token_id=self.config.pad_token_id or self.tokenizer.eos_token_id,
+                    eos_token_id=self.config.eos_token_id or self.tokenizer.eos_token_id,
+                    repetition_penalty = self.config.repetition_penalty,
+
+                )
+    
+    async def get_model_info(self) -> Dict[str, Any]:
+        """
+        Collect basic information about the model and its configuration.
+        
+        Returns:
+            Dict with model_name, is_loaded and config; plus vocab_size, model_parameters and device once loaded
+        """
+        info = {
+            "model_name": self.config.model_name,
+            "is_loaded": self.is_loaded,
+            "config": self.config.__dict__  # the config object's own attribute dict, not a copy
+        }
+        
+        if self.is_loaded:
+            # Gather the model details on self.executor
+            def _get_info():
+                return {
+                    "vocab_size": self.tokenizer.vocab_size,
+                    "model_parameters": sum(p.numel() for p in self.model.parameters()),
+                    "device": str(next(self.model.parameters()).device)  # device of the first parameter
+                }
+            
+            model_info = await asyncio.get_event_loop().run_in_executor(
+                self.executor, _get_info
+            )
+            info.update(model_info)
+        
+        return info
+    
+    async def batch_generate(self, 
+                           prompts: List[str], 
+                           max_new_tokens: Optional[int] = None,
+                           **kwargs) -> List[str]:
+        """
+        Start one generate() task per prompt and wait for all of them.
+        
+        Args:
+            prompts: List of prompts to generate
+            max_new_tokens: Passed through unchanged to every generate() call
+            **kwargs: Extra keyword arguments passed through to every generate() call
+            
+        Returns:
+            One entry per prompt, in order; a failed prompt yields the string "Error: <message>" instead of raising
+        """
+        await self._check_model_loaded()
+        
+        # Create one task per prompt
+        tasks = [
+            asyncio.create_task(
+                self.generate(prompt, max_new_tokens=max_new_tokens, **kwargs)
+            )
+            for prompt in prompts
+        ]
+        
+        # Wait for all tasks; exceptions are returned as values rather than raised
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        
+        # Replace each exception with an "Error: ..." string and log it
+        processed_results = []
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                self.logger.error(f"Error generating prompt {i}: {result}")
+                processed_results.append(f"Error: {str(result)}")
+            else:
+                processed_results.append(result)
+        
+        return processed_results
+    
+    async def close(self) -> None:
+        """
+        Shut down the executor, drop the model and tokenizer, empty the CUDA cache and mark the LM as not loaded.
+        """
+        self.logger.info("Closing LM...")
+        
+        # Shutdown executor (wait=True: returns only after running work has finished)
+        self.executor.shutdown(wait=True)
+        
+        # Delete the model/tokenizer attributes; afterwards they no longer exist on self
+        if hasattr(self, 'model') and self.model is not None:
+            del self.model
+        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
+            del self.tokenizer
+        
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        
+        self.is_loaded = False
+        self.logger.info("LM closed successfully")
+    
+    async def __aenter__(self):
+        """Async context manager entry: load the model, then return this instance"""
+        await self.load_model()
+        return self
+    
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit: always calls close(); the exception arguments are not inspected"""
+        await self.close()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
index 26d9f3719dc160e0aa462994b108322fa1c8ea00..4de11510f04804a54e5d163933c3005340d5836e 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
@@ -6,6 +6,7 @@ from langchain_openai import OpenAIEmbeddings
 
 # Vector stores
 from langchain_community.vectorstores import Chroma, FAISS, Pinecone
+from langchain.retrievers import EnsembleRetriever
 
 # Retriever base
 from langchain_core.vectorstores import VectorStoreRetriever
@@ -24,7 +25,6 @@ from langchain_core.documents import Document
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-
 class LangChainRetriever(BaseRetriever):
     """LangChain-based retriever with multiple format support"""
 
@@ -160,17 +160,34 @@ class LangChainRetriever(BaseRetriever):
         except Exception as e:
             logger.error(f"Error adding documents: {str(e)}")
             return False
-
     async def _update_bm25_retriever(self, documents: List[Document]):
         try:
+            # Build a BM25 retriever over the given documents
             self.bm25_retriever = BM25Retriever.from_documents(documents)
-            self.retriever = ContextualCompressionRetriever(
-                base_compressor=None,  # Optional: add compressor like CohereRerank or LLM-based
-                base_retriever=self.bm25_retriever  # Example: use BM25 as base, can combine
+            self.bm25_retriever.k = 10  # Number of documents BM25 should return
+            
+            # Hybrid search: BM25 is installed first, then replaced by the ensemble built below
+            
+            # NOTE(review): this BM25-only assignment is overwritten a few lines further down
+            self.retriever = self.bm25_retriever
+            
+            vector_retriever = VectorStoreRetriever(
+                vectorstore=self.vectorstore,
+                search_kwargs={"k": 10}
+            )
+
+            self.retriever = EnsembleRetriever(
+                retrievers=[vector_retriever, self.bm25_retriever],
+                weights=[0.5, 0.5]  # Equal weight to both retrievers
             )
+            
         except Exception as e:
             logger.error(f"Error updating BM25 retriever: {str(e)}")
-
+            # Fallback: use the vector-store retriever alone (k=10)
+            self.retriever = VectorStoreRetriever(
+                vectorstore=self.vectorstore,
+                search_kwargs={"k": 10}
+            )
     async def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
         try:
             import time
@@ -181,6 +198,7 @@ class LangChainRetriever(BaseRetriever):
                 None, self.retriever.get_relevant_documents, query
             )
             retrieved_docs = retrieved_docs[:k]
+
             scores = [0.9 - (i * 0.1) for i in range(len(retrieved_docs))]
 
             retrieval_time = time.time() - start_time
@@ -222,4 +240,4 @@ class LangChainRetriever(BaseRetriever):
         return list(self.processed_documents.values())
 
     def get_supported_formats(self) -> List[str]:
-        return self.document_loader.get_supported_extensions()
+        return self.document_loader.get_supported_extensions()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call_gpt.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call_gpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..7891dced987f7cd7a455b5d4810fc275fa871b6e
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call_gpt.py
@@ -0,0 +1,364 @@
+import fastapi
+from fastapi.middleware.cors import CORSMiddleware
+
+from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials
+from fastrtc.utils import audio_to_int16
+from openai import OpenAI
+from elevenlabs.client import ElevenLabs
+from dotenv import load_dotenv
+from tts.audio_edge_tts import EdgeTTS
+from rag import document_retriever
+import logging
+import time
+import platform
+import socket
+import os
+import numpy as np
+import io
+import wave
+import asyncio
+import librosa
+from pydub import AudioSegment
+# from stt.whisper_stt import WhisperSTT
+from collections import deque
+import torch
+import torchaudio.transforms as T
+import asyncio
+import concurrent.futures
+import threading
+from config.constant import HF_TOKEN
+import threading
+import re
+from openai import OpenAI
+from langchain_core.documents import Document
+
+from rag import ddgs
+# Load variables from a .env file (python-dotenv)
+load_dotenv()
+logging.basicConfig(level=logging.INFO)  # module-level side effect: configures the root logger at INFO
+
+class RTCHandler:
+    def __init__(self, openai_client: OpenAI,  whisper_stt = None, edge_tts : EdgeTTS = None):
+        """Store the OpenAI client and the optional STT/TTS backends, seed the chat
+        history with the system prompt, and run the Windows-only WebRTC IP setup."""
+        self.whisper_stt = whisper_stt  # stored only; echo() transcribes through the OpenAI API instead
+        self.edge_tts = edge_tts
+        self.prompt = ""
+        self.sys_prompt = """
+        
+        Kamu adalah customer service yang berbahasa Indonesia dengan baik sopan, santun, tapi santai pembawaannya.
+        Kamu bisa menjelaskan sesuatu secara baik dan membimbing customer dalam menghadapi masalah yang ada!
+
+        Kamu akan menjawab customer dengan media call /telepon jadi anda harus memberikan respon seperlunya saja
+        Tidak kepanjanngan, dan sangat jelas, 
+
+
+        Tidak lebih dari 50 kata.
+        """
+        self.openai_client = openai_client
+        self.messages = [
+            
+            {
+             "role": "system", 
+             "content": self.sys_prompt
+             }
+            
+            ]
+        self.full_response = ""  # text of the most recent LLM reply
+        self.stream = None  # set by create_stream()
+        self.app = None  # set by create_fastapi_app()
+
+        self._setup_webrtc_ip()
+
+    def _setup_webrtc_ip(self) -> None:
+        """On Windows, export this machine's local IP as WEBRTC_IP (127.0.0.1 if it cannot be determined)"""
+        if platform.system() == 'Windows':
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            try:
+                s.connect(('8.8.8.8', 80))  # connect a UDP socket, then read back the local address it bound to
+                local_ip = s.getsockname()[0]
+            except Exception:
+                local_ip = '127.0.0.1'
+            finally:
+                s.close()
+            os.environ['WEBRTC_IP'] = local_ip
+
+    def audio_to_bytes(self, audio_tuple, sample_rate=24000) -> io.BytesIO:
+        """Encode a (sample_rate, samples) tuple as an in-memory mono 16-bit WAV named "audio.wav"."""
+        sr, audio_data = audio_tuple  # NOTE: the `sample_rate` parameter is never read; the tuple's rate is used
+        audio_int16 = audio_to_int16(audio_tuple)
+        buffer = io.BytesIO()
+        with wave.open(buffer, "wb") as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)  # 2 bytes per sample = 16-bit
+            wf.setframerate(sr)
+            wf.writeframes(audio_int16.tobytes())
+        buffer.seek(0)  # rewind so the consumer reads from the start
+        buffer.name = "audio.wav"
+        return buffer
+    def echo(self, audio):
+            """Process audio input and generate audio response - Optimized version"""
+            try:
+                stt_time = time.time()
+                logging.info("Performing STT")
+
+                
+                # transcription = self.whisper_stt.transcribe(self.audio_to_bytes(audio))
+                transcription = self.openai_client.audio.transcriptions.create(
+                    model="whisper-1",
+                    file=self.audio_to_bytes(audio),
+                    language="id"
+                )
+                
+                self.prompt = transcription.text
+                if self.prompt == "":
+                    logging.info("STT returned empty string")
+                    return
+                
+                logging.info(f"STT response: {transcription}")
+                
+                logging.info(f"STT took {time.time() - stt_time} seconds")
+
+                llm_time = time.time()
+                self.full_response = ""
+
+                # Single async function to handle both text streaming and audio generation
+                async def stream_text_to_audio():
+                    # self.prompt = "Perhitungan BPJS"
+                    retrieval_result = await document_retriever.retrieve(query = self.prompt)
+                    contexts = ""
+                    search_results = []
+                    
+                    async for result in ddgs.search(self.prompt, max_results=5):
+                        # self.logger.info(f"Processing SEO Result: {result[:100]}...")
+                        doc = Document(
+                            page_content=result,
+                            metadata={"source": "internet_search", "query": self.prompt}
+                        )
+                        print(doc)
+                        search_results.append(doc)
+            
+                    await document_retriever.add_documents([doc])
+
+                    i = 1
+                    for ctx in retrieval_result.documents:
+                        contexts += f"{i}. {ctx.page_content}" + "\n"
+                    print("Retrieved Contexts :", contexts)
+                    self.messages.append({"role": "user", "content": f""" 
+                                        Dari Konteks yang diberikan (jika diperlukan) :
+                                        {contexts}
+
+                                        Berikan jawaban atas pertanyaan yang diberikan :
+                                        {self.prompt}
+                                          
+                                          """})
+                    
+                    response = self.openai_client.chat.completions.create(
+                        model="gpt-3.5-turbo",
+                        messages=self.messages,
+                        max_tokens=200,
+                        stream=True
+                    )
+                    chunk_size = 1024
+                    no_buffer = 0
+                    text_buffer = ""
+                    
+                    for stream_data in response:
+                        print(stream_data.choices[0].delta.content)
+                        if stream_data.choices[0].finish_reason == "stop":
+                            if text_buffer:  # Yield sisa text
+                                yield text_buffer
+                            break
+                        if stream_data.choices[0].delta.content:
+                            chunk = stream_data.choices[0].delta.content
+                            self.full_response += chunk
+                            text_buffer += chunk
+                            # Generate audio immediately for each text chunk
+                            if re.search(r'[.,?;!]', chunk):
+                                try:
+                                    audio_buffer_gen =  await self.edge_tts.generate_audio_buffer(text_buffer)
+                                    audio_buffer = audio_buffer_gen[0]
+                                    
+                                    audio_buffer.seek(0)
+                                    
+                                    # Convert MP3 to PCM
+                                    audio_segment = AudioSegment.from_file(audio_buffer, format="mp3")
+                                    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / (2 ** 15)
+                                    
+                                    # Handle stereo to mono
+                                    if audio_segment.channels == 2:
+                                        samples = samples.reshape((-1, 2)).mean(axis=1)
+                                    
+                                    # # Resample to 24kHz
+                                    # resampled = librosa.resample(samples, orig_sr=audio_segment.frame_rate, target_sr=24000)
+                                    import torch
+                                    import torchaudio
+                                    
+                                    # Check if CUDA is available
+                                    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+                                    
+                                    # Convert numpy array to torch tensor and move to GPU
+                                    audio_tensor = torch.from_numpy(samples).unsqueeze(0).to(device)  # Add batch dimension and move to GPU
+                                    
+                                    # Create resampler and move to GPU
+                                    resampler = torchaudio.transforms.Resample(
+                                        orig_freq=audio_segment.frame_rate,
+                                        new_freq=24000
+                                    ).to(device)
+                                    
+                                    # Apply resampling on GPU
+                                    resampled_tensor = resampler(audio_tensor)
+                                    
+                                    # Convert back to numpy (move to CPU first)
+                                    resampled = resampled_tensor.squeeze(0).cpu().numpy()
+                                    # Yield audio chunks
+                                    for i in range(0, len(resampled), chunk_size):
+                                        yield (24000, resampled[i:i + chunk_size])
+                                    no_buffer = 0
+                                    text_buffer = ""
+                                except Exception as e:
+                                    logging.error(f"TTS generation failed for chunk: {e}")
+                                    continue
+                                    
+                        # elif stream_data["type"] == "metadata":
+                        #     setup_time = stream_data['data']['setup_time']
+                        #     print(f"\nSetup completed in {setup_time:.2f}s")
+                            
+                        # elif stream_data["type"] == "complete":
+                        #     total_time = stream_data['data']['total_time']
+                        #     print(f"\nTotal time: {total_time:.2f}s")
+                        #     break
+
+                # Run the single async function
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+                
+                try:
+                    async_gen = stream_text_to_audio()
+                    while True:
+                        try:
+                            chunk = loop.run_until_complete(async_gen.__anext__())
+                            yield chunk
+                        except StopAsyncIteration:
+                            break
+                finally:
+                    loop.close()
+
+                self.messages.append({"role": "assistant", "content": self.full_response + " "})
+                logging.info(f"LLM response: {self.full_response}")
+                logging.info(f"LLM took {time.time() - llm_time} seconds")
+
+            except Exception as e:
+                logging.error(f"Error in echo function: {e}")
+                error_audio = np.zeros(24000, dtype=np.float32)
+                yield (24000, error_audio)
+    def reset_conversation(self) -> None:  # Forget the dialogue: history goes back to the system prompt only
+        logging.info("Resetting chat")
+        self.messages = [{"role": "system", "content": self.sys_prompt}]
+        self.full_response = ""  # the last reply is cleared as well
+
+    def create_stream(self):  # Build the Stream that answers with self.echo, store it on self.stream and return it
+        try:
+            async def get_credentials():
+                return await get_cloudflare_turn_credentials_async(hf_token=HF_TOKEN)
+            self.stream = Stream(
+                rtc_configuration=get_credentials,  # passed as a callable; not called here
+                server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),  # called once, right here
+                handler = ReplyOnPause(
+                    self.echo,  # generator invoked by ReplyOnPause
+                    algo_options=AlgoOptions(
+                        audio_chunk_duration=0.5,
+                        started_talking_threshold=0.1,
+                        speech_threshold=0.03
+                    ),
+                    model_options=SileroVadOptions(
+                        threshold=0.90,
+                        min_speech_duration_ms=250,
+                        min_silence_duration_ms=2000,
+                        speech_pad_ms=400,
+                        max_speech_duration_s=15
+                    )
+                ),
+                modality="audio",
+                mode="send-receive"
+            )
+            return self.stream
+        except Exception as e:
+            logging.error(f"Error creating stream: {e}")
+            raise  # logged, then propagated to the caller
+
+    def create_fastapi_app(self):
+        """Create the FastAPI app with open CORS, mount the stream on it, add /reset and /status, and return it."""
+        try:
+            self.app = fastapi.FastAPI()
+            self.app.add_middleware(
+                CORSMiddleware,
+                allow_origins=["*"],  # NOTE(review): every origin is allowed together with credentials
+                allow_credentials=True,
+                allow_methods=["*"],
+                allow_headers=["*"],
+            )
+            if not self.stream:
+                self.create_stream()
+            self.stream.mount(self.app)
+            # GET /reset: clear the conversation; errors are reported in the JSON body, not raised
+            @self.app.get("/reset")
+            async def reset():
+                try:
+                    self.reset_conversation()
+                    return {"status": "success"}
+                except Exception as e:
+                    logging.error(f"Error in reset endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            # GET /status: report the number of stored messages and the last LLM reply
+            @self.app.get("/status")
+            async def status():
+                try:
+                    return {
+                        "status": "running",
+                        "messages_count": len(self.messages),
+                        "last_response": self.full_response
+                    }
+                except Exception as e:
+                    logging.error(f"Error in status endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            return self.app
+        except Exception as e:
+            logging.error(f"Error creating FastAPI app: {e}")
+            raise
+
+    def start_server(self, host: str = "0.0.0.0", port: int = 7860) -> None:  # Run the FastAPI app under uvicorn
+        import uvicorn  # imported only when the server is actually started
+        if not self.app:
+            self.create_fastapi_app()  # build the app on first use
+        logging.info(f"Starting server on {host}:{port}")
+        try:
+            uvicorn.run(self.app, host=host, port=port, log_level="info")
+        except Exception as e:
+            logging.error(f"Error starting server: {e}")
+            raise  # logged, then propagated to the caller
+    def launch_ui(self, browser: bool = True):  # Launch the stream's UI; NOTE: `browser` is accepted but never read
+        try:
+            if not self.stream:
+                self.create_stream()
+            if not self.app:
+                self.create_fastapi_app()
+            logging.info("Launching RTC UI...")
+            self.stream.ui.launch(self.app,  # NOTE(review): app is passed positionally; confirm launch() expects it
+                                  server_name="0.0.0.0",  # host and port are fixed here
+                                  server_port=7860,
+                                  )
+        except Exception as e:
+            logging.error(f"Error launching UI: {e}")
+            raise
+
+    def get_conversation_history(self) -> list:  # Return the chat history as a new list
+        return self.messages.copy()  # shallow copy: the message dicts themselves are shared
+
+    def set_system_prompt(self, new_prompt: str) -> None:  # Replace the system prompt, keeping the rest of the history
+        self.sys_prompt = new_prompt
+        self.messages[0] = {"role": "system", "content": new_prompt}  # index 0 holds the system message
+
+    def get_last_response(self) -> str:  # Text accumulated by the most recent echo() run ("" after a reset)
+        return self.full_response
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/qwen_llm_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/qwen_llm_test.py
index 936ef1bb67521f251537d8fd978b5e28b0287541..76225355d807e4590f9ce8f7c1d47efaa13bd461 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/qwen_llm_test.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/qwen_llm_test.py
@@ -1,14 +1,14 @@
 from rag.retriever.retriever_types import *
-from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+from rag.pipeline.language_model import LM, LMConfig
 
 import warnings
 warnings.filterwarnings("ignore")
 
-async def test_qwen_llm():
+async def test_language_model():
     print(" ===== Testing QWEN LLM ==== ")
-    """Example usage of async QwenLLM"""
+    """Example usage of async LM"""
 
-    config = QwenConfig(
+    config = LMConfig(
         temperature=0.5,
         max_length=512,
         generation_timeout=30
@@ -23,20 +23,20 @@ async def test_qwen_llm():
     )
 
     # Using async context manager
-    async with QwenLLM(config) as llm:
+    async with LM(config) as llm:
           await test_qwen_single_generation(llm)
           await test_qwen_single_rag_generation(llm, contexts)
           await test_qwen_multiple_template_rag_generation(llm, contexts)
           await test_qwen_batch_generation(llm, contexts)
     print(" ===== Testing LLM DONE ==== ")
 
-async def test_qwen_single_generation(llm : QwenLLM):
+async def test_qwen_single_generation(llm : LM):
     print(" * Test Single Generation * ")
     response = await llm.generate("Jelaskan tentang AI")
     print(f"Response: {response}")
     print(" * Test Single Generation Done * ")
 
-async def test_qwen_single_rag_generation(llm : QwenLLM, ctx : RetrievalResult):
+async def test_qwen_single_rag_generation(llm : LM, ctx : RetrievalResult):
     print(" * Test Single RAG Generation * ")
     rag_response = await llm.rag_generate(
             question="Apa itu AI dan machine learning?",
@@ -46,7 +46,7 @@ async def test_qwen_single_rag_generation(llm : QwenLLM, ctx : RetrievalResult):
     print(f"RAG Response: {rag_response}")
     print(" * Test Single RAG Generation Done * ")
 
-async def test_qwen_multiple_template_rag_generation(llm : QwenLLM,ctx : RetrievalResult):
+async def test_qwen_multiple_template_rag_generation(llm : LM,ctx : RetrievalResult):
         print(" * Test Multiple Template Generation * ")
         multi_responses = await llm.multi_template_generate(
              question="Apa itu AI?",
@@ -57,7 +57,7 @@ async def test_qwen_multiple_template_rag_generation(llm : QwenLLM,ctx : Retriev
         print(" * Test Multiple Template Generation Done* ")
 
 
-async def test_qwen_batch_generation(llm : QwenLLM, ctx : RetrievalResult):
+async def test_qwen_batch_generation(llm : LM, ctx : RetrievalResult):
         print(" * Test Batch Generation * ")
         batch_responses = await llm.batch_generate([
              "Jelaskan tentang Python",
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
index 97eeae9bb7706292cc56d80559e62eb51f8bc17d..866d8600acf58b14a7f0075fff1e549e2d103267 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
@@ -1,13 +1,14 @@
 from tests.inference_test import test_inference
-
+from huggingface_hub import login
+login(new_session=False)
 import warnings
 warnings.filterwarnings("ignore")
 import asyncio
 def run_test():
     try:
         # await test_document_retriever()
-        # await test_qwen_llm()
-        asyncio.run(test_inference())
+        # await test_language_model()
+        test_inference()  # NOTE(review): no longer wrapped in asyncio.run(); confirm test_inference is now synchronous
     except Exception as e:
         print(e)
 
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__test__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__test__.py
index d479ab503b7b16a279ee28036164453613737382..71d2681e3c3685a91542e2608036aef88786d9ce 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__test__.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__test__.py
@@ -1,8 +1,3 @@
-
-# from tests.document_retriever_test import test_document_retriever
-# from tests.document_retriever_test import test_document_retriever
-# from tests.qwen_llm_test import test_qwen_llm
-# from tests.inference_test import test_inference
 from tests.rtc_test import test_rtc
 import warnings
 warnings.filterwarnings("ignore")
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/app.log b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/app.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
index 1445848f655cc27ee59d61ee5a09fedc160923fc..ab67cd6148a447c67ff80a387f51cf9a90076a64 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
@@ -1,17 +1,44 @@
-from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+from rag.pipeline.language_model import LM, LMConfig
 from rag.retriever.langchain_retriever import LangChainRetriever
 from rag.inference.inferencer import Inferencer, InferencerConfig
+from rag.agents.customer_service_agent import CSAgent
+from rag.agents.query_maker_agent import QueryMakerAgent
+from langchain_core.documents import Document
+from rag.web_search.duckduckgo_search import DuckDuckGoSearch
+from rag.chat_template import get_chat_template
+from transformers import BitsAndBytesConfig
+import torch
 
-config = QwenConfig(
+import logging
+import sys
+
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(funcName)s() - %(message)s',
+    handlers=[
+        logging.FileHandler('app.log'),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+bnb = BitsAndBytesConfig(
+                            load_in_4bit=True,                      # Enable 4-bit quantization
+                            bnb_4bit_use_double_quant=True,         # Use double quantization
+                            bnb_4bit_quant_type="nf4",              # Use NF4 quantization
+                            bnb_4bit_compute_dtype=torch.bfloat16,  # Compute dtype for 4bit base models
+        )
+
+
+config = LMConfig(
+                model_name = "Qwen/Qwen2.5-1.5B-Instruct",
                 temperature=0.3,
                 max_length=512,
-                generation_timeout=30,
+                generation_timeout=100,
                 repetition_penalty=1.1,
-                max_workers = 1,
-                do_sample = True,
-        )
-    
-llm = QwenLLM(
+                max_workers = 2,
+                quantization_config = bnb
+)
+
+llm = LM(
         config = config
 )
 
@@ -22,29 +49,42 @@ inferencer_config = InferencerConfig(
 )
 
 document_retriever = LangChainRetriever(
-        embedding_model="all-MiniLM-L6-v2",
+        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
         vectorstore_type="chroma",
-        vectorstore_path="./vectorstore",
+        vectorstore_path="vectorstore/",
         use_hybrid_search=True,
         chunk_size=1000,
         chunk_overlap=200
 )
 
-inferencer = Inferencer(
+ddgs = DuckDuckGoSearch()
+
+cs_inferencer = Inferencer(
         model=llm,
         retriever=document_retriever,
+        # search_engine = ddgs,
         reranker=None,
         config=inferencer_config
 )
 
-async def get_response(question):
-    result = await inferencer.infer(question, "rag_response")
-    return result
+query_maker_inferencer = Inferencer(
+        model=llm,
+        config=inferencer_config
+)
+
+cs_agent = CSAgent(
+    inferencer = cs_inferencer,
+    prompt_template = get_chat_template("customer_service")
+)
+
+query_maker_chat_template = get_chat_template("query_maker")
+query_maker_chat_template[1]["content"] = """{question}"""
+
+query_maker_agent = QueryMakerAgent(
+    inferencer = query_maker_inferencer,
+    prompt_template = query_maker_chat_template
+)
+
+
+
 
-async def get_stream_response(question):
-    async for item in inferencer.infer_stream(query = question,
-                                             enable_reranking=False,
-                                             template_type="main_template",
-                                             k=3):
-            print("Stream Response :", item)
-            yield item
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/inferencer.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/inferencer.py
index d159297ffadb5e74655e4dcc4bff50ae902ee480..dd3f1abf385649f151996ed347ccd1419d6bb63f 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/inferencer.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/inferencer.py
@@ -1,6 +1,8 @@
 from rag.retriever.langchain_retriever import LangChainRetriever
-from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+from rag.pipeline.language_model import LM, LMConfig
 from rag.retriever.retriever_types import RetrievalResult
+from rag.web_search.duckduckgo_search import DuckDuckGoSearch
+from langchain_core.documents import Document
 # from rag.pipeline.reranker import BGEM3Reranker
 from typing import List, Union, Dict, Any, Optional, AsyncGenerator
 import asyncio
@@ -29,15 +31,16 @@ class Inferencer:
     """
     
     def __init__(self, 
-                 model: QwenLLM, 
-                 retriever: LangChainRetriever, 
+                 model: LM, 
+                 retriever: LangChainRetriever = None, 
+                 search_engine = None,
                  reranker=None,
                  config: Optional[InferencerConfig] = None):
         """
         Initialize Inferencer
         
         Args:
-            model: QwenLLM instance
+            model: LM instance
             retriever: LangChainRetriever instance
             reranker: Reranker instance (optional)
             config: InferencerConfig (optional)
@@ -45,6 +48,7 @@ class Inferencer:
         self.model = model
         self.retriever = retriever
         self.reranker = reranker
+        self.search_engine = search_engine
         self.config = config or InferencerConfig()
         
         # Setup logging
@@ -85,6 +89,7 @@ class Inferencer:
         try:
             start_time = datetime.now()
             contexts = await self.retriever.retrieve(query, k=k)
+            self.logger.info(f"Retrieved Contexts : {contexts}")
             retrieval_time = (datetime.now() - start_time).total_seconds()
             
             self.logger.info(f"Retrieved {len(contexts.documents) if hasattr(contexts, 'documents') else len(contexts)} contexts in {retrieval_time:.2f}s")
@@ -292,7 +297,7 @@ class Inferencer:
             yield chunk
     
     async def infer(self, 
-                   query: Union[str, List[str]], 
+                   query: str, 
                    response_type: Union[List[str], str] = None,
                    k: Optional[int] = None,
                    enable_reranking: Optional[bool] = None,
@@ -321,8 +326,12 @@ class Inferencer:
         
         try:
             # Step 1: Retrieve contexts
-            retrieved_contexts = await self.retrieve_context(main_query, k=k)
-            
+            if(self.search_engine):
+                await self.retrieve_from_search_engine(query, k = k)
+            if(self.retriever):
+                retrieved_contexts = await self.retrieve_context(main_query, k=k)
+            else:
+                retrieved_contexts  = ""
             # Step 2: Rerank contexts (if enabled)
             enable_rerank = enable_reranking if enable_reranking is not None else self.config.enable_reranking
             if enable_rerank:
@@ -363,7 +372,34 @@ class Inferencer:
         except Exception as e:
             self.logger.error(f"Error during inference: {e}")
             raise
-    
+    async def retrieve_from_search_engine(self, query: str, k: int = 3):
+        """
+        Fetch web search results for `query` and wrap each one in a Document.
+
+        Every result is also added to `self.retriever` when one is configured.
+        `k` caps the number of results; None (what infer()/infer_stream() pass
+        when the caller gave no k) falls back to this method's default of 3.
+
+        Returns the list of Documents; any failure is logged and re-raised.
+        """
+        max_results = 3 if k is None else k
+        search_results = []
+        try:
+            async for result in self.search_engine.search(query, max_results=max_results):
+                self.logger.info(f"Processing SEO Result: {result[:100]}...")
+                doc = Document(
+                    page_content=result,
+                    metadata={"source": "internet_search", "query": query}
+                )
+                search_results.append(doc)
+                # The retriever is optional (None by default in __init__).
+                if self.retriever is not None:
+                    await self.retriever.add_documents([doc])
+            self.logger.info(f"Processed {len(search_results)} search results")
+            return search_results
+        except Exception as e:
+            self.logger.error(f"Error in retrieve_from_search_engine: {e}", exc_info=True)
+            raise
     async def infer_stream(self, 
                           query: str,
                           k: Optional[int] = None,
@@ -389,8 +425,14 @@ class Inferencer:
         
         try:
             # Step 1: Retrieve contexts
-            retrieved_contexts = await self.retrieve_context(query, k=k)
+            if(self.search_engine):
+                await self.retrieve_from_search_engine(query, k = k)
+            if(self.retriever is not None):
+                retrieved_contexts = await self.retrieve_context(query, k=k)
+            else:
+                retrieved_contexts = ""
             
+
             # Step 2: Rerank contexts (if enabled)
             enable_rerank = enable_reranking if enable_reranking is not None else self.config.enable_reranking
             if enable_rerank:
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/qwen_llm.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/qwen_llm.py
index 74cd038efd48629ae7a3f9a68c6b7d54c1d29699..43640c45d2137667c5032d5bdb6d86d5df7d9055 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/qwen_llm.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/qwen_llm.py
@@ -17,7 +17,7 @@ import copy
 @dataclass
 class QwenConfig:
     """Konfigurasi untuk model Qwen 0.5B"""
-    model_name: str = "Qwen/Qwen2.5-0.5B-Instruct"
+    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"
     device: str = "cuda"
     torch_dtype: torch.dtype = torch.float16
     max_length: int = 2048
@@ -286,14 +286,35 @@ class QwenLLM:
 
             formatted_template = []
             for cht in template_data:
-                # print("question for template = ", question)
-               
-                if("{context}" in cht["content"]):
-                    cht["content"] = cht["content"].format(context=formatted_context)
+                    # Create a copy of the content to avoid modifying the original
+                content = cht["content"]
+                
+                # Format both placeholders at once to avoid KeyError
+                if "{context}" in content or "{question}" in content:
+                    try:
+                        content = content.format(
+                            context=formatted_context,
+                            question=question
+                        )
+                    except KeyError as e:
+                        self.logger.error(f"Missing placeholder in template: {e}")
+                        # Fallback: format only available placeholders
+                        if "{context}" in content:
+                            content = content.replace("{context}", formatted_context)
+                        if "{question}" in content:
+                            content = content.replace("{question}", question)
+                
+                # Create new dict with formatted content
+                formatted_chat = {
+                    "role": cht["role"],
+                    "content": content
+                }
+                
+                # Copy other fields if they exist
+                if "description" in cht:
+                    formatted_chat["description"] = cht["description"]
                     
-                if("{question}" in cht["content"]):
-                    cht["content"] = cht["content"].format(question=question)
-                formatted_template.append(cht)
+                formatted_template.append(formatted_chat)
 
             self.logger.info("Formatted Template", formatted_template)
             print("Forrmatted Template", formatted_template)
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/chat_template.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/chat_template.py
index 3ee9c9a8c3c9e22fc388232bbe8040b45903b969..e44ca4247eb37e52399b8003dfb65cd77ff17bd4 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/chat_template.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/chat_template.py
@@ -8,18 +8,20 @@ def RAG_TEMPLATES():
 
             1. Selalu berikan sapaan yang ramah dan profesional
             2. Gunakan HANYA informasi dari knowledge base yang tersedia
-            3. Berikan jawaban yang jelas, mudah dipahami, dan terstruktur semuanya berdasarkan konteks yang diberikan yaitu :
-            {context}
+            3. Berikan jawaban yang jelas, mudah dipahami, dan terstruktur semuanya berdasarkan konteks yang diberikan user.
             4. Jika informasi tidak tersedia, tawarkan alternatif bantuan atau arahkan ke channel yang tepat
             5. Gunakan bahasa yang sopan dan empati terhadap kebutuhan pelanggan
             6. Akhiri dengan penawaran bantuan lebih lanjut
+            
             """,
             "description": "Template dengan system prompt untuk customer service professional"
             },
             {
             "role" : "user",
-            "content" : """
-            Dari konteks yang diberikan context berikan jawaban atas pertanyaan saya yaitu : {question}
+            "content" : """Dari konteks yang diberikan : {context} 
+            
+            berikan jawaban atas pertanyaan saya yaitu : {question}
+
             """
             },
         ],
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/duckduckgo_search.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/duckduckgo_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..29729e28665083b36ecda3b202f673dccfcd1584
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/web_search/duckduckgo_search.py
@@ -0,0 +1,142 @@
+from ddgs import DDGS
+from langchain_community.document_loaders import AsyncChromiumLoader
+from langchain_community.document_transformers import BeautifulSoupTransformer
+import re
+import logging
+from typing import AsyncGenerator, List
+
+class DuckDuckGoSearch:
+    def __init__(self, html_loader: AsyncChromiumLoader = None, html_parser = None):
+        # Fall back to default loader/parser instances when none are provided
+        self.html_loader = html_loader or AsyncChromiumLoader([])
+        self.html_parser = html_parser or BeautifulSoupTransformer()
+        self.logger = logging.getLogger("ddgs_logger")
+    
+    async def get_page(self, urls: List[str]):
+        """Load `urls` with the HTML loader and return the parsed documents.
+
+        Asks the parser for "p" tags only, minus "a" tags; on failure logs and returns [].
+        """
+        try:
+            self.html_loader.urls = urls
+            loaded = await self.html_loader.aload()
+            self.logger.info(f"search engine aload result: {len(loaded)} documents loaded")
+            return self.html_parser.transform_documents(
+                loaded,
+                tags_to_extract=["p"],
+                remove_unwanted_tags=["a"],
+            )
+        except Exception as err:
+            self.logger.error(f"Error loading pages: {err}", exc_info=True)
+            return []
+    
+    def truncate(self, text: str, max_words: int = 400) -> str:
+        """Cap `text` at `max_words` whitespace-separated words.
+
+        Empty or None input yields ""; text within the limit is returned
+        unchanged, otherwise the kept words are space-joined and "..." added.
+        """
+        if not text:
+            return ""
+        tokens = text.split()
+        over_limit = len(tokens) > max_words
+        return " ".join(tokens[:max_words]) + "..." if over_limit else text
+    
+    async def search(self, query: str, max_results: int = 5) -> AsyncGenerator[str, None]:
+        """
+        Search DuckDuckGo for `query` and yield the cleaned text of each result page.
+
+        The DDGS client call is synchronous, so it runs in a worker thread
+        (asyncio.to_thread) instead of blocking the event loop. The result
+        pages are loaded in one batch via get_page(); runs of consecutive
+        newlines are collapsed to one and each text is capped by truncate().
+
+        Args:
+            query: Search phrase passed to DDGS().text().
+            max_results: Upper bound on the number of search hits requested.
+
+        Yields:
+            One string per page that still has content after cleaning.
+
+        Errors are logged, never raised: a failed search ends the generator
+        and a page that fails to process is skipped.
+        """
+        import asyncio  # local import; only needed for to_thread below
+
+        try:
+            self.logger.info(f"Searching for: {query} (max_results: {max_results})")
+            # list() forces the whole blocking lookup to happen in the thread.
+            results = await asyncio.to_thread(
+                lambda: list(DDGS().text(query, max_results=max_results))
+            )
+            urls = [hit["href"] for hit in results if hit.get("href")]
+            self.logger.info(f"Found {len(urls)} URLs to process")
+
+            if not urls:
+                self.logger.warning("No URLs found from search results")
+                return
+
+            for doc in await self.get_page(urls):
+                try:
+                    if not getattr(doc, "page_content", None):
+                        continue
+                    page_text = re.sub(r"\n\n+", "\n", doc.page_content).strip()
+                    if page_text:
+                        yield self.truncate(page_text)
+                except Exception as e:
+                    self.logger.error(f"Error processing document: {e}")
+                    continue
+
+        except Exception as e:
+            self.logger.error(f"Error in search method: {e}", exc_info=True)
+    
+    async def search_with_metadata(self, query: str, max_results: int = 5) -> AsyncGenerator[dict, None]:
+        """
+        Variant of search() that yields a dict per page instead of bare text.
+
+        Each dict has 'content' (cleaned, truncated text), 'url', 'title' and
+        'word_count'. URL and title come from the search hit at the same
+        position as the loaded document (assumes get_page() keeps the order
+        and count of the URLs it was given - TODO confirm); when no hit lines
+        up, 'Unknown' / 'No title' are used.
+
+        The synchronous DDGS lookup runs in a worker thread so the event loop
+        is not blocked. Errors are logged, never raised.
+        """
+        import asyncio  # local import; only needed for to_thread below
+
+        try:
+            # list() forces the whole blocking lookup to happen in the thread.
+            results = await asyncio.to_thread(
+                lambda: list(DDGS().text(query, max_results=max_results))
+            )
+            urls_and_titles = [
+                {'url': hit['href'], 'title': hit.get('title', 'No title')}
+                for hit in results
+                if hit.get('href')
+            ]
+            if not urls_and_titles:
+                return
+
+            docs = await self.get_page([item['url'] for item in urls_and_titles])
+            for i, doc in enumerate(docs):
+                try:
+                    if not getattr(doc, 'page_content', None):
+                        continue
+                    page_text = re.sub(r"\n\n+", "\n", doc.page_content).strip()
+                    if not page_text:
+                        continue
+                    text = self.truncate(page_text)
+                    metadata = urls_and_titles[i] if i < len(urls_and_titles) else {}
+                    yield {
+                        'content': text,
+                        'url': metadata.get('url', 'Unknown'),
+                        'title': metadata.get('title', 'No title'),
+                        'word_count': len(text.split())
+                    }
+                except Exception as e:
+                    self.logger.error(f"Error processing document {i}: {e}")
+                    continue
+
+        except Exception as e:
+            self.logger.error(f"Error in search_with_metadata: {e}", exc_info=True)
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/__init__.py
index 7634032b6dfc365becf41961702955b1822f1bbc..a14fed914c5b8df9d5c9e237fed2cccf536a4d5c 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/__init__.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/__init__.py
@@ -2,11 +2,13 @@ from openai import OpenAI
 from elevenlabs.client import ElevenLabs
 from tts.audio_edge_tts import EdgeTTS
 from config.constant import OPENAI_API_KEY, ELEVENLABS_API_KEY
+# from rtc.rtc_call import RTCHandler
 from rtc.rtc_call import RTCHandler
 from stt.whisper_stt import WhisperSTT
 
-whisper_stt = WhisperSTT("turbo")
+whisper_stt = WhisperSTT(model_size = "base", device = "cuda")
 edge_tts = EdgeTTS("id-ID-ArdiNeural",  "+0%", "+0%")
+openai_client = OpenAI(api_key = OPENAI_API_KEY)
 rtc_handler = RTCHandler(whisper_stt, edge_tts)
 
 def handle_rtc():
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
index b46f28629b236800fd0b0f6ba07297f4d7b85ff5..7604bca5419b74a7cca98aa8e74fe5f4a83ded1c 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
@@ -30,7 +30,7 @@ import threading
 import re
 
 
-from rag import get_stream_response
+from rag import cs_agent
 # Load .env
 load_dotenv()
 logging.basicConfig(level=logging.INFO)
@@ -94,7 +94,7 @@ class RTCHandler:
                     logging.info("STT returned empty string")
                     return
 
-                logging.info(f"STT response: {prompt}")
+                logging.info(f"STT response: {transcription}")
                 self.messages.append({"role": "user", "content": prompt})
                 logging.info(f"STT took {time.time() - stt_time} seconds")
 
@@ -106,7 +106,7 @@ class RTCHandler:
                     chunk_size = 1024
                     no_buffer = 0
                     text_buffer = ""
-                    async for stream_data in get_stream_response(question=prompt):
+                    async for stream_data in cs_agent.get_result(question = prompt):
                         print(stream_data)
                         
                         if stream_data["type"] == "chunk":
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
index 03f53e8e593532d66b546aefd14550f5a23a5d9f..b8ac480e4ee24cf791f17fb003bc58c359c5de79 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
@@ -1,31 +1,94 @@
+
 import whisper
+import torch
 from fastrtc.utils import audio_to_int16
 import io
 import os
 import tempfile
 
 class WhisperSTT:
-    def __init__(self, model_size: str = "base"):
+    def __init__(self, model_size: str = "base", device: str = "auto"):
         """
-        Initialize Whisper STT with specified model size (tiny, base, small, medium, large)
+        Initialize Whisper STT with specified model size and device
+        
+        Args:
+            model_size: Model size (tiny, base, small, medium, large)
+            device: Device to use ("auto", "cuda", "cpu")
         """
+        # Set up cache directory
         cache_dir = os.environ.get('WHISPER_CACHE_DIR', '/tmp/.cache/whisper')
         os.makedirs(cache_dir, exist_ok=True)
-        self.model = whisper.load_model(model_size, download_root=cache_dir)
+        
+        # Determine device
+        if device == "auto":
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+            
+        # Validate CUDA availability if requested
+        if self.device == "cuda" and not torch.cuda.is_available():
+            print("Warning: CUDA requested but not available. Falling back to CPU.")
+            self.device = "cpu"
+        
+        # Load model with device specification
+        print(f"Loading Whisper model '{model_size}' on device: {self.device}")
+        self.model = whisper.load_model(model_size, device=self.device, download_root=cache_dir)
         self.language = "id"  # ISO-639-1 code for Bahasa Indonesia
         
+        # Print GPU info if using CUDA
+        if self.device == "cuda":
+            gpu_name = torch.cuda.get_device_name(0)
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
+            print(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")
 
     def transcribe(self, audio: io.BufferedReader, language: str = "id") -> str:
-        # Simpan audio ke file sementara
+        """
+        Transcribe audio using Whisper
+        
+        Args:
+            audio: Audio file buffer
+            language: Language code (default: "id" for Indonesian)
+            
+        Returns:
+            Transcribed text
+        """
+        # Save audio to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
             tmp.write(audio.read())
             tmp.flush()
             tmp_path = tmp.name
 
         try:
-            result = self.model.transcribe(tmp_path, language=language)
+            # Transcribe with GPU acceleration if available
+            result = self.model.transcribe(
+                tmp_path, 
+                language=language,
+                # Optional: Add fp16 for faster inference on supported GPUs
+                fp16=self.device == "cuda"
+            )
             return result.get("text", "")
         finally:
+            # Clean up temporary file
             os.remove(tmp_path)
-
-
+    
+    def get_device_info(self) -> dict:
+        """
+        Report which device this instance uses and whether CUDA is available.
+
+        Returns:
+            Dict with "device" and "cuda_available"; when running on an
+            available CUDA device it also carries the name and total memory
+            of GPU 0 plus the currently allocated / reserved memory, in GiB.
+        """
+        gib = 1024**3
+        cuda_available = torch.cuda.is_available()
+        info = {"device": self.device, "cuda_available": cuda_available}
+
+        if self.device != "cuda" or not cuda_available:
+            return info
+
+        info["gpu_name"] = torch.cuda.get_device_name(0)
+        info["gpu_memory_gb"] = torch.cuda.get_device_properties(0).total_memory / gib
+        info["gpu_memory_allocated_gb"] = torch.cuda.memory_allocated() / gib
+        info["gpu_memory_reserved_gb"] = torch.cuda.memory_reserved() / gib
+        return info
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/ddgs_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/ddgs_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e28909233fa163cfe1833a56eee0b0a06073f041
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/ddgs_test.py
@@ -0,0 +1,8 @@
+import asyncio
+from rag.web_search import ddgs
+def test_ddgs():
+    """Drain ddgs.search() (an async generator) and print the page texts."""
+    async def _collect():
+        return [page async for page in ddgs.search("Perhitungan uang lembur")]
+    print("*** searching result : **")
+    print(asyncio.run(_collect()))
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/inference_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/inference_test.py
index baee29a8a5904b6dbd89af29ab67a3dff56657ff..bdf49aaf6eebdbfa02b9a1ca792b8abb10a5412d 100644
--- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/inference_test.py
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/inference_test.py
@@ -1,69 +1,15 @@
 import gradio as gr
 import asyncio
-from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+from rag.pipeline.language_model import LM, LMConfig
 from rag.retriever.langchain_retriever import LangChainRetriever
 from rag.inference.inferencer import InferencerConfig, Inferencer
-
-async def test_inference():
+from rag import cs_agent, query_maker_agent
+def test_inference():
     """Main function that sets up and runs the RAG chatbot interface"""
     
     # Initialize RAG components
     print("==== Start Inference Test ===")
     
-    # Setup LLM
-    config = QwenConfig(
-        temperature=0.3,
-        max_length=512,
-        generation_timeout=30,
-        repetition_penalty=1.1,
-        do_sample = True,
-    )
-    
-    llm = QwenLLM(config=config)
-
-    # Setup Document Retriever
-    document_retriever = LangChainRetriever(
-        embedding_model="text-embedding-3-small",
-        vectorstore_type="chroma",
-        vectorstore_path="./vectorstore",
-        use_hybrid_search=True,
-        chunk_size=1000, 
-        chunk_overlap=200
-    )
-
-    # Load initial documents
-    file_paths = [
-        "../documents/bpjs.pdf",
-        "../documents/pph21.pdf",
-        "../documents/lembur.pdf",
-        "../documents/uu13.pdf",
-        "../documents/file.pdf",
-    ]
-    
-    for file_path in file_paths:
-        try:
-            result = await document_retriever.add_document_from_file(file_path)
-            if result.success:
-                print(f"Successfully processed: {result.document_metadata.file_name}")
-                print(f"Chunks created: {result.document_metadata.chunk_count}")
-            else:
-                print(f"Failed to process: {result.error_message}")
-        except Exception as e:
-            print(f"Error processing {file_path}: {e}")
-
-    # Setup Inferencer
-    inferencer_config = InferencerConfig(
-        default_k=2,
-        enable_reranking=False,
-        default_template_types=["system"]
-    )
-    
-    inferencer = Inferencer(
-        model=llm,
-        retriever=document_retriever,
-        reranker=None,
-        config=inferencer_config
-    )
     
     print("RAG system initialized successfully!")
 
@@ -73,16 +19,16 @@ async def test_inference():
             # Create new event loop for this thread
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
-            
+
             async def stream_response():
                 partial_response = ""
-                
-                async for stream_data in inferencer.infer_stream(
-                    query=message,
-                    k=3,
-                    template_type="main_template"
-                ):
-                    print(stream_data)
+                # print("message = ", message)
+                formatted_query = await query_maker_agent.get_result(question = message)
+                print("Formatted Query = ", formatted_query)
+                formatted_query = formatted_query['responses'][0]['rag_response']
+                await cs_agent.load_documents()
+                async for stream_data in cs_agent.get_result(question = formatted_query):
+
                     if stream_data["type"] == "chunk":
                         chunk = stream_data["data"]["chunk"]
                         partial_response += chunk
@@ -96,9 +42,8 @@ async def test_inference():
                         total_time = stream_data['data']['total_time']
                         print(f"\nTotal time: {total_time:.2f}s")
             
-            # Execute async generator
             async_gen = stream_response()
-            
+
             try:
                 while True:
                     result = loop.run_until_complete(async_gen.__anext__())
@@ -121,7 +66,7 @@ async def test_inference():
             asyncio.set_event_loop(loop)
             
             async def add_doc():
-                result = await document_retriever.add_document_from_file(file_path)
+                result = ""
                 return result
             
             result = loop.run_until_complete(add_doc())
@@ -158,8 +103,7 @@ async def test_inference():
     # Membuat interface Gradio
     with gr.Blocks(css=css, title="RAG Chatbot") as demo:
         gr.Markdown("""
-        # 🤖 RAG Chatbot dengan Text Streaming
-        Chatbot berbasis Retrieval-Augmented Generation (RAG) dengan dukungan streaming response.
+        # 🤖 SakuraAI, Virtual Assistant 
         """)
         
         # Status indicator
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.env.example b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.env.example
new file mode 100644
index 0000000000000000000000000000000000000000..8781e7ccb55ccb93ec4f7b16db64fac15585ab1e
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.env.example
@@ -0,0 +1,3 @@
+OPENAI_API_KEY =
+ELEVENLABS_API_KEY =
+HF_TOKEN = 
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..033de49ba796b475016b58afd68e698b8fd1b068
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes
@@ -0,0 +1,37 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+documents/SPISy[[:space:]]SaaS[[:space:]]To[[:space:]]The[[:space:]]Next[[:space:]]Level.pdf filter=lfs diff=lfs merge=lfs -text
+documents/file.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/deploy-to-huggingface.yml b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/deploy-to-huggingface.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b932880aa517cc9195934d8bd6360e4f8b93589b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/deploy-to-huggingface.yml
@@ -0,0 +1,52 @@
+name: Deploy to Huggingface
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  deploy-to-huggingface:
+    runs-on: ubuntu-latest
+
+    steps:
+      # Checkout repository
+      - name: Checkout Repository
+        uses: actions/checkout@v3
+
+      # Setup Git
+      - name: Setup Git for Huggingface
+        run: |
+          git config --global user.email "abdan.hafidz@gmail.com"
+          git config --global user.name "abdanhafidz"
+
+      # Clone Huggingface Space Repository
+      - name: Clone Huggingface Space
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git clone https://huggingface.co/spaces/lifedebugger/cs-ai-sakura-dev space
+
+      # Update Git Remote URL and Pull Latest Changes
+      - name: Update Remote and Pull Changes
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          cd space
+          git remote set-url origin https://lifedebugger:$HF_TOKEN@huggingface.co/spaces/lifedebugger/cs-ai-sakura-dev
+          git pull origin main || echo "No changes to pull"
+
+      # Copy files into the Space clone; exclude ./space itself so the clone is not nested into space/space
+      - name: Copy Files to Space
+        run: |
+          rsync -av --exclude='.git' --exclude='/space/' ./ space/
+
+      # Commit and push to the Space; a failed push must fail the job instead of being reported as "no changes"
+      - name: Commit and Push to Huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          cd space
+          git add .
+          git commit -m "Deploy files from GitHub repository" || echo "No changes to commit"
+          git push origin main
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..db662b545139900faad7660ac0d527296977a42a
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore
@@ -0,0 +1,9 @@
+.venv/
+venv/
+.vscode/
+__pycache__/
+my_vectorstore/
+FlagEmbedding/
+.env
+vectorstore/
+documents/
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/Dockerfile b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..a54879e434bbcf0ee9205f62a2d14f64ac086afa
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/Dockerfile
@@ -0,0 +1,49 @@
+# Use the Python 3.13 base image
+FROM python:3.13
+
+# Add a non-root user for security
+RUN useradd -m -u 1001 appuser
+
+# Set working directory
+WORKDIR /rag_be
+
+# Point the cache directories at a writable location
+ENV HF_HOME=/tmp/.cache/huggingface
+ENV TRANSFORMERS_CACHE=/tmp/.cache/transformers
+ENV TORCH_HOME=/tmp/.cache/torch
+ENV XDG_CACHE_HOME=/tmp/.cache
+ENV TMPDIR=/tmp
+ENV WHISPER_CACHE_DIR=/tmp/.cache/whisper
+
+# Copy requirements and install dependencies
+COPY requirements.txt ./
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Copy the application, owned by appuser
+COPY --chown=appuser:appuser . /rag_be
+
+# Create .env from the Hugging Face build secrets. NOTE(review): this writes secret values into an image layer
+RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=false \
+    --mount=type=secret,id=HF_TOKEN,mode=0444,required=false \
+    --mount=type=secret,id=ELEVENLABS_API_KEY,mode=0444,required=false \
+    echo "OPENAI_API_KEY=$(cat /run/secrets/OPENAI_API_KEY 2>/dev/null || echo '')" >> .env && \
+    echo "HF_TOKEN=$(cat /run/secrets/HF_TOKEN 2>/dev/null || echo '')" >> .env && \
+    echo "ELEVENLABS_API_KEY=$(cat /run/secrets/ELEVENLABS_API_KEY 2>/dev/null || echo '')" >> .env
+    
+RUN ls -l /rag_be/app && whoami && id
+
+# Create the required directories with the appropriate permissions
+RUN mkdir -p /tmp/.cache /tmp/.cache/whisper /tmp/.cache/huggingface /rag_be/vectorstore  /tmp/.cache/transformers /tmp/.cache/torch \
+             /rag_be/app/vectorstore /rag_be/documents  && \
+    chmod -R 777 /tmp/.cache /rag_be/app /rag_be/app/vectorstore /rag_be/vectorstore /rag_be/documents && \
+    chown -R appuser:appuser /tmp/.cache /rag_be/app /rag_be/app/vectorstore /rag_be/vectorstore  /rag_be/documents /rag_be/.env
+
+RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
+# Switch to the non-root user
+USER appuser
+
+# Expose the port for Hugging Face Spaces
+EXPOSE 7860
+
+# Run the application
+CMD ["python", "app/__test__.py"]
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ef267daf6fea185f4ceaf65100ca030d8583a6f5
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
@@ -0,0 +1,31 @@
+---
+title: Cs Ai Sakura Dev
+emoji: 🏢
+colorFrom: indigo
+colorTo: indigo
+sdk: docker
+pinned: false
+---
+
+**Install The Requirements**
+
+1. Create a virtual environment and install the dependencies
+```
+python3 -m venv env
+source env/bin/activate
+pip install -r requirements.txt
+```
+
+2. Set your OPENAI_API_KEY in the `.env` file
+
+3. **TO LAUNCH THE GRADIO UI** Run the command below:
+```
+cd app
+python __test__.py
+```
+
+4. **TO LAUNCH THE API ENDPOINT (SERVER)** Run the command below:
+```
+cd app
+python __server__.py
+```
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/.gradio/certificate.pem b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/.gradio/certificate.pem
new file mode 100644
index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/.gradio/certificate.pem
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97eeae9bb7706292cc56d80559e62eb51f8bc17d
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__chat__.py
@@ -0,0 +1,14 @@
+from tests.inference_test import test_inference
+
+import warnings
+warnings.filterwarnings("ignore")
+import asyncio
+def run_test():  # Runs test_inference() through asyncio.run(); any exception is printed, not re-raised.
+    try:
+        # await test_document_retriever()
+        # await test_qwen_llm()
+        asyncio.run(test_inference())  # NOTE(review): asyncio.run() needs a coroutine, so test_inference must be an async def - confirm
+    except Exception as e:
+        print(e)
+
+run_test()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__server__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__server__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a5bc16920de927edb07987e71ffdda0ad0f2e0a
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__server__.py
@@ -0,0 +1,3 @@
+import rtc
+
+rtc.handle_rtc_server()  # Script entry point; presumably starts the RTC server - see rtc.handle_rtc_server
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__test__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__test__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d479ab503b7b16a279ee28036164453613737382
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/__test__.py
@@ -0,0 +1,19 @@
+
+# from tests.document_retriever_test import test_document_retriever
+# from tests.document_retriever_test import test_document_retriever
+# from tests.qwen_llm_test import test_qwen_llm
+# from tests.inference_test import test_inference
+from tests.rtc_test import test_rtc
+import warnings
+warnings.filterwarnings("ignore")
+import asyncio
+def run_test():  # Runs test_rtc(); any exception is printed, not re-raised.
+    try:
+        # await test_document_retriever()
+        # await test_qwen_llm()
+        # asyncio.run(test_inference())
+        test_rtc()
+    except Exception as e:
+        print(e)
+
+run_test()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/config/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/config/constant.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/config/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a1b8059d50edfe0a90b1e00ebc20c4998dfaee0
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/config/constant.py
@@ -0,0 +1,7 @@
+from dotenv import load_dotenv
+import os
+load_dotenv()  # python-dotenv: load variables from a .env file into the process environment
+
+OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')  # os.getenv returns None when the variable is unset
+ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
+HF_TOKEN = os.getenv("HF_TOKEN")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1445848f655cc27ee59d61ee5a09fedc160923fc
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/__init__.py
@@ -0,0 +1,50 @@
+from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+from rag.retriever.langchain_retriever import LangChainRetriever
+from rag.inference.inferencer import Inferencer, InferencerConfig
+
+config = QwenConfig(  # Module-level objects below are built once, when the rag package is imported.
+                temperature=0.3,
+                max_length=512,
+                generation_timeout=30,
+                repetition_penalty=1.1,
+                max_workers = 1,
+                do_sample = True,
+        )
+    
+llm = QwenLLM(
+        config = config
+)
+
+inferencer_config = InferencerConfig(
+        default_k=5,
+        enable_reranking=False,
+        default_template_types="main_template"  # NOTE(review): InferencerConfig annotates this field as List[str]; a bare str is passed - confirm
+)
+
+document_retriever = LangChainRetriever(
+        embedding_model="all-MiniLM-L6-v2",
+        vectorstore_type="chroma",
+        vectorstore_path="./vectorstore",  # relative path: depends on the process's current working directory
+        use_hybrid_search=True,
+        chunk_size=1000,
+        chunk_overlap=200
+)
+
+inferencer = Inferencer(
+        model=llm,
+        retriever=document_retriever,
+        reranker=None,  # no reranker: Inferencer.rerank_contexts returns the contexts unchanged
+        config=inferencer_config
+)
+
+async def get_response(question):  # Awaits inferencer.infer(question, "rag_response") and returns its result unchanged.
+    result = await inferencer.infer(question, "rag_response")
+    return result
+
+async def get_stream_response(question):  # Async generator: re-yields every item produced by inferencer.infer_stream.
+    async for item in inferencer.infer_stream(query = question,
+                                             enable_reranking=False,
+                                             template_type="main_template",
+                                             k=3):
+            print("Stream Response :", item)  # debug trace of each streamed item
+            yield item
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/inferencer.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d159297ffadb5e74655e4dcc4bff50ae902ee480
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/inference/inferencer.py
@@ -0,0 +1,552 @@
+from rag.retriever.langchain_retriever import LangChainRetriever
+from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+from rag.retriever.retriever_types import RetrievalResult
+# from rag.pipeline.reranker import BGEM3Reranker
+from typing import List, Union, Dict, Any, Optional, AsyncGenerator
+import asyncio
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+
+@dataclass
+class InferencerConfig:
+    """Configuration for Inferencer"""
+    default_k: int = 5  # number of contexts retrieved when the caller passes no k
+    max_contexts: int = 10
+    enable_reranking: bool = False  # rerank_contexts() returns its input unless this is True and a reranker is set
+    reranker_top_k: int = 5  # contexts kept after reranking when no top_k is passed
+    default_template_types: List[str] = None  # None is replaced with the default list in __post_init__
+    enable_logging: bool = True  # False caps the Inferencer logger at ERROR
+    response_timeout: float = 30.0
+    
+    def __post_init__(self):
+        if self.default_template_types is None:
+            self.default_template_types = ["system", "instruction", "friendly"]
+
+class Inferencer:
+    """
+    Advanced RAG Inferencer with support for streaming, reranking, and multiple response types
+    """
+    
+    def __init__(self, 
+                 model: QwenLLM, 
+                 retriever: LangChainRetriever, 
+                 reranker=None,
+                 config: Optional[InferencerConfig] = None):
+        """
+        Store the collaborators and set up logging; the model is loaded later by _ensure_model_loaded()
+        
+        Args:
+            model: QwenLLM instance
+            retriever: LangChainRetriever instance
+            reranker: Reranker instance (optional); rerank_contexts() is a no-op without one
+            config: InferencerConfig (optional); a default InferencerConfig() is used when omitted
+        """
+        self.model = model
+        self.retriever = retriever
+        self.reranker = reranker
+        self.config = config or InferencerConfig()
+        
+        # Logging: basicConfig(INFO) configures the root logger when enabled; otherwise this module's logger is capped at ERROR
+        if self.config.enable_logging:
+            logging.basicConfig(level=logging.INFO)
+            self.logger = logging.getLogger(__name__)
+        else:
+            self.logger = logging.getLogger(__name__)
+            self.logger.setLevel(logging.ERROR)
+        
+        # Set to True by _ensure_model_loaded() after the first successful load
+        self._model_loaded = False
+    
+    async def _ensure_model_loaded(self):
+        """Load the model on first use; later calls are no-ops once the loaded flag is set"""
+        if not self._model_loaded:
+            self.logger.info("Loading model...")
+            await self.model.load_model()
+            self._model_loaded = True
+            self.logger.info("Model loaded successfully")
+    
+    async def retrieve_context(self, 
+                             query: str, 
+                             k: Optional[int] = None) -> RetrievalResult:
+        """
+        Retrieve context documents from the configured retriever and log how long it took
+        
+        Args:
+            query: Search query
+            k: Number of documents to retrieve; config.default_k is used when k is None or 0
+            
+        Returns:
+            RetrievalResult object (any exception from the retriever is logged and re-raised)
+        """
+        k = k or self.config.default_k
+        self.logger.info(f"Retrieving {k} contexts for query: {query[:50]}...")
+        
+        try:
+            start_time = datetime.now()
+            contexts = await self.retriever.retrieve(query, k=k)
+            retrieval_time = (datetime.now() - start_time).total_seconds()
+            
+            self.logger.info(f"Retrieved {len(contexts.documents) if hasattr(contexts, 'documents') else len(contexts)} contexts in {retrieval_time:.2f}s")
+            return contexts
+            
+        except Exception as e:
+            self.logger.error(f"Error during retrieval: {e}")
+            raise
+    
+    async def rerank_contexts(self, 
+                            contexts: RetrievalResult, 
+                            query: str,
+                            top_k: Optional[int] = None) -> RetrievalResult:
+        """
+        Rerank retrieved contexts with the configured reranker
+        
+        Args:
+            contexts: Retrieved contexts
+            query: Original query
+            top_k: Number of top contexts to keep after reranking; config.reranker_top_k when None or 0
+            
+        Returns:
+            Reranked RetrievalResult object, or the input contexts unchanged when reranking is disabled, unavailable, or fails
+        """
+        if not self.reranker or not self.config.enable_reranking:
+            self.logger.info("Reranking disabled or reranker not available")
+            return contexts
+        
+        top_k = top_k or self.config.reranker_top_k
+        self.logger.info(f"Reranking contexts, keeping top {top_k}")
+        
+        try:
+            start_time = datetime.now()
+            reranked_contexts = await self.reranker.rerank(
+                query=query,
+                contexts=contexts,
+                top_k=top_k
+            )
+            rerank_time = (datetime.now() - start_time).total_seconds()
+            
+            self.logger.info(f"Reranking completed in {rerank_time:.2f}s")
+            return reranked_contexts
+            
+        except Exception as e:
+            self.logger.error(f"Error during reranking: {e}")
+            # Best-effort: fall back to the original contexts if reranking fails
+            return contexts
+    
+    async def generate_response(self, 
+                              contexts: RetrievalResult, 
+                              query: Union[str, List[str]], 
+                              response_type: Union[List[str], str] = None,
+                              template_types: Optional[List[str]] = None,
+                              max_new_tokens: Optional[int] = None,
+                              **generation_kwargs) -> List[Dict[str, Any]]:
+        """
+        Generate responses based on contexts and query
+        
+        Args:
+            contexts: Retrieved contexts
+            query: User query or list of queries
+            response_type: Type(s) of response to generate
+            template_types: Template types for multi_response
+            max_new_tokens: Maximum tokens to generate
+            **generation_kwargs: Additional generation parameters
+            
+        Returns:
+            List of response dictionaries
+        """
+        await self._ensure_model_loaded()
+        
+        # Default response types
+        if response_type is None:
+            response_type = ["rag_response"]
+        elif isinstance(response_type, str):
+            response_type = [response_type]
+        
+        # Default template types
+        if template_types is None:
+            template_types = self.config.default_template_types
+        
+        responses = []
+        
+        try:
+            # RAG Response
+            if "rag_response" in response_type:
+                self.logger.info("Generating RAG response...")
+                start_time = datetime.now()
+                
+                if isinstance(query, list):
+                    # Handle multiple queries
+                    rag_responses = {}
+                    for i, q in enumerate(query):
+                        rag_response = await self.model.rag_generate(
+                            question=q,
+                            contexts=contexts,
+                            template_type="friendly",
+                            max_new_tokens=max_new_tokens,
+                            **generation_kwargs
+                        )
+                        rag_responses[f"query_{i}"] = rag_response
+                    responses.append({"rag_response": rag_responses})
+                else:
+                    rag_response = await self.model.rag_generate(
+                        question=query,
+                        contexts=contexts,
+                        template_type="friendly",
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                    responses.append({"rag_response": rag_response})
+                
+                generation_time = (datetime.now() - start_time).total_seconds()
+                self.logger.info(f"RAG response generated in {generation_time:.2f}s")
+            
+            # Multi-template Response
+            if "multi_response" in response_type:
+                self.logger.info("Generating multi-template responses...")
+                start_time = datetime.now()
+                
+                if isinstance(query, list):
+                    multi_responses = {}
+                    for i, q in enumerate(query):
+                        multi_response = await self.model.multi_template_generate(
+                            question=q,
+                            contexts=contexts,
+                            template_types=template_types,
+                            max_new_tokens=max_new_tokens,
+                            **generation_kwargs
+                        )
+                        multi_responses[f"query_{i}"] = multi_response
+                    responses.append({"multi_responses": multi_responses})
+                else:
+                    multi_responses = await self.model.multi_template_generate(
+                        question=query,
+                        contexts=contexts,
+                        template_types=template_types,
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                    responses.append({"multi_responses": multi_responses})
+                
+                generation_time = (datetime.now() - start_time).total_seconds()
+                self.logger.info(f"Multi-template responses generated in {generation_time:.2f}s")
+            
+            # Batch Response (untuk multiple prompts tanpa RAG context)
+            if "batch_response" in response_type:
+                self.logger.info("Generating batch responses...")
+                start_time = datetime.now()
+                
+                if isinstance(query, list):
+                    batch_responses = await self.model.batch_generate(
+                        query, 
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                else:
+                    batch_responses = await self.model.batch_generate(
+                        [query], 
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                
+                responses.append({"batch_responses": batch_responses})
+                
+                generation_time = (datetime.now() - start_time).total_seconds()
+                self.logger.info(f"Batch responses generated in {generation_time:.2f}s")
+            
+            return responses
+            
+        except Exception as e:
+            self.logger.error(f"Error during response generation: {e}")
+            raise
+    
+    async def generate_response_stream(
+        self,
+        contexts: RetrievalResult,
+        query: str,
+        template_type: str = "main_template",
+        max_new_tokens: Optional[int] = None,
+        **generation_kwargs,
+    ) -> AsyncGenerator[str, None]:
+        """
+        Stream a RAG answer for ``query``, relaying the model's chunks as they arrive.
+
+        Args:
+            contexts: Retrieved contexts handed to the model.
+            query: User query, forwarded to the model as ``question``.
+            template_type: Template type to use.
+            max_new_tokens: Maximum tokens to generate.
+            **generation_kwargs: Additional generation parameters.
+
+        Yields:
+            Response chunks, in the order the model produces them.
+        """
+        await self._ensure_model_loaded()
+        self.logger.info(f"Generating streaming RAG response with template: {template_type}")
+
+        model_stream = self.model.rag_generate_stream(
+            question=query, contexts=contexts,
+            template_type=template_type, max_new_tokens=max_new_tokens,
+            **generation_kwargs
+        )
+        async for piece in model_stream:
+            yield piece
+    
+    async def infer(self,
+                   query: Union[str, List[str]],
+                   response_type: Union[List[str], str] = None,
+                   k: Optional[int] = None,
+                   enable_reranking: Optional[bool] = None,
+                   template_types: Optional[List[str]] = None,
+                   max_new_tokens: Optional[int] = None,
+                   **generation_kwargs) -> Dict[str, Any]:
+        """
+        Run retrieval, optional reranking and generation for one request.
+
+        When ``query`` is a list, only its first element drives retrieval and
+        reranking; the complete list is still passed on to generation.
+
+        Args:
+            query: User query or list of queries.
+            response_type: Type(s) of response to generate.
+            k: Number of contexts to retrieve.
+            enable_reranking: Explicit reranking switch; ``None`` falls back to
+                ``self.config.enable_reranking``.
+            template_types: Template types for multi_response.
+            max_new_tokens: Maximum tokens to generate.
+            **generation_kwargs: Additional generation parameters.
+
+        Returns:
+            Dictionary with the query, responses, contexts and metadata.
+        """
+        started_at = datetime.now()
+        # Retrieval and reranking work on a single query string.
+        if isinstance(query, list):
+            main_query = query[0]
+        else:
+            main_query = query
+
+        try:
+            retrieved = await self.retrieve_context(main_query, k=k)
+
+            if enable_reranking is None:
+                enable_rerank = self.config.enable_reranking
+            else:
+                enable_rerank = enable_reranking
+            contexts = retrieved
+            if enable_rerank:
+                contexts = await self.rerank_contexts(retrieved, main_query)
+
+            responses = await self.generate_response(
+                contexts=contexts,
+                query=query,
+                response_type=response_type,
+                template_types=template_types,
+                max_new_tokens=max_new_tokens,
+                **generation_kwargs
+            )
+            total_time = (datetime.now() - started_at).total_seconds()
+            if hasattr(contexts, 'documents'):
+                num_contexts = len(contexts.documents)
+            else:
+                num_contexts = len(contexts)
+            metadata = {
+                "total_time": total_time,
+                "retrieval_enabled": True,
+                "reranking_enabled": enable_rerank,
+                "num_contexts": num_contexts,
+                "response_types": response_type,
+                "timestamp": datetime.now().isoformat()
+            }
+            result = {"query": query, "responses": responses, "contexts": contexts, "metadata": metadata}
+            self.logger.info(f"Inference completed in {total_time:.2f}s")
+            return result
+        except Exception as e:
+            self.logger.error(f"Error during inference: {e}")
+            raise
+    
+    async def infer_stream(self,
+                          query: str,
+                          k: Optional[int] = None,
+                          enable_reranking: Optional[bool] = None,
+                          template_type: str = "main_template",
+                          max_new_tokens: Optional[int] = None,
+                          **generation_kwargs) -> AsyncGenerator[Dict[str, Any], None]:
+        """
+        Run retrieval and optional reranking, then stream the generated answer.
+
+        Args:
+            query: User query.
+            k: Number of contexts to retrieve.
+            enable_reranking: Whether to enable reranking (``None`` uses the config).
+            template_type: Template type to use.
+            max_new_tokens: Maximum tokens to generate.
+            **generation_kwargs: Additional generation parameters.
+
+        Yields:
+            One ``"metadata"`` event, one ``"chunk"`` event per generated piece and
+            a closing ``"complete"`` event. Errors are not raised to the consumer;
+            they are logged and reported as a single ``"error"`` event.
+        """
+        started_at = datetime.now()
+
+        try:
+            retrieved = await self.retrieve_context(query, k=k)
+
+            if enable_reranking is None:
+                enable_rerank = self.config.enable_reranking
+            else:
+                enable_rerank = enable_reranking
+            contexts = retrieved
+            if enable_rerank:
+                contexts = await self.rerank_contexts(retrieved, query)
+
+            # Announce the setup outcome before any text is produced.
+            setup_time = (datetime.now() - started_at).total_seconds()
+            if hasattr(contexts, 'documents'):
+                num_contexts = len(contexts.documents)
+            else:
+                num_contexts = len(contexts)
+            setup_info = {
+                "query": query,
+                "setup_time": setup_time,
+                "num_contexts": num_contexts,
+                "reranking_enabled": enable_rerank,
+                "template_type": template_type
+            }
+            yield {"type": "metadata", "data": setup_info}
+
+            response_start = datetime.now()
+            accumulated_text = ""
+            chunk_stream = self.generate_response_stream(
+                contexts=contexts,
+                query=query,
+                template_type=template_type,
+                max_new_tokens=max_new_tokens,
+                **generation_kwargs
+            )
+            async for piece in chunk_stream:
+                accumulated_text += piece
+                elapsed = (datetime.now() - response_start).total_seconds()
+                yield {
+                    "type": "chunk",
+                    "data": {
+                        "chunk": piece,
+                        "accumulated_text": accumulated_text,
+                        "generation_time": elapsed
+                    }
+                }
+
+            total_time = (datetime.now() - started_at).total_seconds()
+            completion = {
+                "total_time": total_time,
+                "final_response": accumulated_text,
+                "contexts": contexts
+            }
+            yield {"type": "complete", "data": completion}
+
+        except Exception as e:
+            # Report the failure in-band so the consumer sees a final event.
+            self.logger.error(f"Error during streaming inference: {e}")
+            failure = {
+                "error": str(e),
+                "error_time": (datetime.now() - started_at).total_seconds()
+            }
+            yield {"type": "error", "data": failure}
+    
+    async def batch_infer(self,
+                         queries: List[str],
+                         response_type: Union[List[str], str] = None,
+                         k: Optional[int] = None,
+                         enable_reranking: Optional[bool] = None,
+                         **generation_kwargs) -> List[Dict[str, Any]]:
+        """
+        Run ``infer`` concurrently for every query.
+
+        A failing or cancelled query does not abort the batch; it is reported
+        as a failure entry at the same position as its query.
+
+        Args:
+            queries: List of queries.
+            response_type: Type(s) of response to generate.
+            k: Number of contexts to retrieve per query.
+            enable_reranking: Whether to enable reranking.
+            **generation_kwargs: Additional generation parameters.
+
+        Returns:
+            One entry per query, in input order. Successful entries are the
+            ``infer`` result with ``"success": True`` added; failed entries hold
+            ``"query"``, ``"error"`` and ``"success": False``.
+        """
+        self.logger.info(f"Starting batch inference for {len(queries)} queries")
+
+        tasks = [
+            asyncio.create_task(
+                self.infer(
+                    query=query,
+                    response_type=response_type,
+                    k=k,
+                    enable_reranking=enable_reranking,
+                    **generation_kwargs
+                )
+            )
+            for query in queries
+        ]
+
+        # return_exceptions=True keeps one failure from cancelling the rest.
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        processed_results = []
+        for i, (query, result) in enumerate(zip(queries, results)):
+            # Cancellation surfaces as CancelledError, which derives from
+            # BaseException (not Exception) on Python 3.8+.
+            if isinstance(result, BaseException):
+                self.logger.error(f"Error processing query {i}: {result}")
+                processed_results.append({"query": query, "error": str(result), "success": False})
+            else:
+                result["success"] = True
+                processed_results.append(result)
+        return processed_results
+    
+    async def get_available_templates(self) -> List[str]:
+        """Ensure the model is loaded, then return its ``get_available_templates()`` result."""
+        await self._ensure_model_loaded()
+        return self.model.get_available_templates()
+    
+    async def preview_template(self, template_type: str, sample_query: str = "Apa itu AI?") -> str:
+        """Ask the model to preview ``template_type`` with a sample question and a fixed context."""
+        await self._ensure_model_loaded()
+        # The placeholder context is fixed; only the question is caller-controlled.
+        preview_args = {
+            "template_type": template_type,
+            "sample_question": sample_query,
+            "sample_context": "Sample context untuk preview template...",
+        }
+        return self.model.preview_template(**preview_args)
+    
+    async def get_model_info(self) -> Dict[str, Any]:
+        """Collect model details together with inferencer-level settings."""
+        await self._ensure_model_loaded()
+
+        # Filled key by key so both awaited lookups stay in their original order.
+        info: Dict[str, Any] = {}
+        info["model_info"] = await self.model.get_model_info()
+        info["inferencer_config"] = self.config.__dict__
+        info["reranker_available"] = self.reranker is not None
+        info["available_templates"] = await self.get_available_templates()
+        return info
+    
+    async def close(self):
+        """Log shutdown and close the model; the close call is skipped when ``self.model`` is falsy."""
+        self.logger.info("Closing Inferencer...")
+        if self.model:
+            await self.model.close()
+        self.logger.info("Inferencer closed successfully")
+    
+    async def __aenter__(self):
+        """Async context manager entry: ensure the model is loaded and return this instance."""
+        await self._ensure_model_loaded()
+        return self
+    
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit: call ``close()``; returns None, so exceptions are not suppressed."""
+        await self.close()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/preprocessing.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b439abce750b049976b4f68b2094ba665f078eb
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/preprocessing.py
@@ -0,0 +1,780 @@
+import re
+import json
+from typing import List, Dict, Any, Optional, Union
+from dataclasses import dataclass
+import logging
+from datetime import datetime
+import hashlib
+
+# Import the already-existing types
+from typing import List, Dict, Any, Optional, Union
+from dataclasses import dataclass
+from enum import Enum
+from langchain_core.documents import Document
+from rag.retriever.retriever_types import *
+@dataclass
+class PreprocessingConfig:
+    """Configuration for preprocessing"""
+    # Text cleaning options
+    remove_extra_whitespace: bool = True
+    remove_special_chars: bool = False
+    normalize_unicode: bool = True
+    remove_urls: bool = False
+    remove_emails: bool = False
+    
+    # Chunking options
+    enable_chunking: bool = False        # Whether content needs to be chunked again
+    chunk_size: int = 500
+    chunk_overlap: int = 50
+    chunk_method: str = "sentence"       # "sentence", "paragraph", "fixed"
+    
+    # Content filtering
+    min_content_length: int = 20
+    max_content_length: int = 3000
+    filter_empty_content: bool = True
+    filter_duplicate_content: bool = True
+    
+    # Metadata options
+    extract_metadata: bool = True
+    include_retrieval_info: bool = True
+    include_document_info: bool = True
+    include_timestamps: bool = True
+    
+    # Scoring options
+    use_retrieval_scores: bool = True    # Use the scores from the retrieval system
+    normalize_scores: bool = True        # Normalize scores to the 0-1 range
+    min_score_threshold: float = 0.0    # Filter by minimum score
+    score_boost_factor: float = 1.0     # Boost factor applied to scores
+
+class RetrievalPreprocessor:
+    """
+    Preprocessor for RetrievalResult
+    Converts a RetrievalResult into a List[RetrievalResult] that is ready for RAG
+    """
+    
+    def __init__(self, config: Optional[PreprocessingConfig] = None):
+        """
+        Set up configuration, logging and the regexes used for text cleaning.
+
+        Args:
+            config: Preprocessing configuration; a default
+                ``PreprocessingConfig`` is created when omitted.
+        """
+        self.config = config or PreprocessingConfig()
+
+        # NOTE(review): basicConfig configures the root logger as a side effect
+        # of instantiation; kept because existing callers may rely on it.
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
+
+        self.url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
+        # TLD class is letters only; the former ``[A-Z|a-z]`` also accepted "|".
+        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
+        self.special_chars_pattern = re.compile(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/]')
+        self.whitespace_pattern = re.compile(r'\s+')
+        self._seen_content_hashes = set()  # content hashes seen in the current batch
+    
+    def process_retrieval_result(self, retrieval_result: RetrievalResult) -> List[RetrievalResult]:
+        """
+        Turn one retrieval result into a list of RAG-ready contexts.
+
+        Each document is processed on its own; a document whose processing
+        raises is logged and skipped so the rest of the batch still goes through.
+
+        Args:
+            retrieval_result: RetrievalResult from the retrieval system.
+
+        Returns:
+            The processed contexts after post-processing; an empty list when the
+            retrieval result has no documents.
+        """
+        documents = retrieval_result.documents
+        if not documents:
+            self.logger.warning("No documents in retrieval result")
+            return []
+
+        scores = retrieval_result.scores
+        doc_total, score_total = len(documents), len(scores)
+        if doc_total != score_total:
+            self.logger.warning(
+                f"Documents count ({doc_total}) != "
+                f"Scores count ({score_total})"
+            )
+
+        self.logger.info(
+            f"Processing {doc_total} documents from retrieval result for query: '{retrieval_result.query}'"
+        )
+
+        # Duplicate detection is scoped to a single call.
+        self._seen_content_hashes.clear()
+
+        contexts = []
+        for position, doc in enumerate(documents):
+            try:
+                # Documents without a matching score fall back to 0.0.
+                score = scores[position] if position < len(scores) else 0.0
+                contexts.extend(
+                    self._process_single_document(
+                        document=doc,
+                        retrieval_score=score,
+                        document_index=position,
+                        total_documents=len(documents),
+                        retrieval_result=retrieval_result
+                    )
+                )
+            except Exception as e:
+                self.logger.error(f"Error processing document {position}: {e}")
+
+        contexts = self._post_process_contexts(contexts)
+
+        self.logger.info(f"Successfully processed {len(contexts)} contexts from retrieval result")
+
+        return contexts
+    
+    def _process_single_document(self,
+                                document: Document,
+                                retrieval_score: float,
+                                document_index: int,
+                                total_documents: int,
+                                retrieval_result: RetrievalResult) -> List[RetrievalResult]:
+        """
+        Clean, filter and optionally chunk one document into context objects.
+
+        Steps, in order: empty check, cleaning, length limits, score threshold,
+        duplicate detection, then context creation (one per chunk when chunking
+        is enabled).
+
+        The score threshold is checked before duplicate detection so that a
+        rejected document never blocks a later document with the same content.
+
+        Args:
+            document: Langchain Document object.
+            retrieval_score: Score from the retrieval system.
+            document_index: Index of the document within the batch.
+            total_documents: Total number of documents in the batch.
+            retrieval_result: Original retrieval result, used for metadata.
+
+        Returns:
+            The contexts built from the document; an empty list when the
+            document is empty, too short, below the score threshold or a
+            duplicate of content already accepted in this batch.
+        """
+        # Reject documents without visible content.
+        if not document.page_content or not document.page_content.strip():
+            self.logger.warning(f"Empty content in document {document_index}")
+            return []
+
+        # Normalize the text according to the configuration.
+        cleaned_content = self._clean_text(document.page_content)
+        if not cleaned_content:
+            return []
+
+        # Too-short content is dropped; too-long content is truncated, not dropped.
+        if len(cleaned_content) < self.config.min_content_length:
+            self.logger.debug(f"Content too short in document {document_index}: {len(cleaned_content)} chars")
+            return []
+
+        if len(cleaned_content) > self.config.max_content_length:
+            cleaned_content = self._truncate_content(cleaned_content)
+            self.logger.debug(f"Content truncated in document {document_index}")
+
+        # Apply the score threshold before registering the content hash:
+        # otherwise a rejected low-score document would make a later,
+        # acceptable document with the same content look like a duplicate.
+        if self.config.use_retrieval_scores and retrieval_score < self.config.min_score_threshold:
+            self.logger.debug(f"Score too low in document {document_index}: {retrieval_score}")
+            return []
+
+        # Skip content already accepted earlier in this batch.
+        if self.config.filter_duplicate_content:
+            content_hash = hashlib.md5(cleaned_content.encode()).hexdigest()
+            if content_hash in self._seen_content_hashes:
+                self.logger.debug(f"Duplicate content detected in document {document_index}")
+                return []
+            self._seen_content_hashes.add(content_hash)
+
+        # Without chunking the whole cleaned text forms a single context
+        # whose chunk_index is None.
+        if self.config.enable_chunking:
+            chunks = self._chunk_content(cleaned_content)
+            indexed_chunks = list(enumerate(chunks))
+            total_chunks = len(chunks)
+        else:
+            indexed_chunks = [(None, cleaned_content)]
+            total_chunks = 1
+
+        return [
+            self._create_retrieved_context(
+                content=chunk,
+                document=document,
+                retrieval_score=retrieval_score,
+                document_index=document_index,
+                chunk_index=chunk_index,
+                total_chunks=total_chunks,
+                total_documents=total_documents,
+                retrieval_result=retrieval_result
+            )
+            for chunk_index, chunk in indexed_chunks
+        ]
+    
+    def _create_retrieved_context(self,
+                                 content: str,
+                                 document: Document,
+                                 retrieval_score: float,
+                                 document_index: int,
+                                 chunk_index: Optional[int],
+                                 total_chunks: int,
+                                 total_documents: int,
+                                 retrieval_result: RetrievalResult) -> RetrievalResult:
+        """
+        Assemble the context object for one piece of document content.
+
+        Combines the processed score, the extracted source and the metadata
+        built for this content into a ``RetrievalResult``.
+
+        NOTE(review): ``RetrievalResult`` is built here with content/source/score/
+        metadata keywords while other methods read documents/scores/query from
+        it - confirm the type accepts both shapes.
+        """
+        # Dict entries are evaluated top to bottom: score, source, then metadata.
+        fields = {
+            "score": self._process_score(retrieval_score, document_index, total_documents),
+            "source": self._extract_source(document),
+            "metadata": self._build_metadata(
+                document=document,
+                retrieval_result=retrieval_result,
+                document_index=document_index,
+                chunk_index=chunk_index,
+                total_chunks=total_chunks,
+                total_documents=total_documents,
+                content=content
+            ),
+        }
+
+        return RetrievalResult(content=content, **fields)
+    
+    def _clean_text(self, text: str) -> str:
+        """
+        Apply the cleaning steps enabled in the configuration, in a fixed order.
+
+        Order: Unicode NFKC normalization, URL removal, e-mail removal, special
+        character replacement, whitespace collapsing, then a final strip.
+        Falsy input yields "".
+        """
+        if not text:
+            return ""
+
+        settings = self.config
+        result = text
+        if settings.normalize_unicode:
+            import unicodedata
+            result = unicodedata.normalize('NFKC', result)
+
+        # (config flag, pattern attribute, replacement) per regex step, in order.
+        regex_steps = (
+            ("remove_urls", "url_pattern", ''),
+            ("remove_emails", "email_pattern", ''),
+            ("remove_special_chars", "special_chars_pattern", ' '),
+            ("remove_extra_whitespace", "whitespace_pattern", ' '),
+        )
+        for flag, pattern_attr, replacement in regex_steps:
+            if getattr(settings, flag):
+                result = getattr(self, pattern_attr).sub(replacement, result)
+
+        return result.strip()
+    
+    def _truncate_content(self, content: str) -> str:
+        """
+        Shorten over-long content, preferring a sentence or word boundary.
+
+        Content within ``max_content_length`` is returned unchanged. Otherwise the
+        text is cut 20 characters below the limit and trimmed back to the last
+        sentence end (if past 70% of the cut) or the last space (if past 80%,
+        with "..." appended); failing both, "..." is appended to the cut.
+        """
+        limit = self.config.max_content_length
+        if len(content) <= limit:
+            return content
+
+        head = content[:limit - 20]
+        head_len = len(head)
+
+        sentence_end = max(head.rfind(mark) for mark in '.!?')
+        if sentence_end > head_len * 0.7:
+            return head[:sentence_end + 1]
+
+        space_at = head.rfind(' ')
+        if space_at > head_len * 0.8:
+            head = head[:space_at]
+        return head + "..."
+    
+    def _chunk_content(self, content: str) -> List[str]:
+        """Split content with the configured chunk method; short content stays whole."""
+        if len(content) <= self.config.chunk_size:
+            return [content]
+
+        # Unknown chunk methods fall through to "no chunking".
+        method_names = {
+            "sentence": "_chunk_by_sentence",
+            "paragraph": "_chunk_by_paragraph",
+            "fixed": "_chunk_by_fixed_size",
+        }
+        chunker_name = method_names.get(self.config.chunk_method)
+        return getattr(self, chunker_name)(content) if chunker_name else [content]
+    
+    def _chunk_by_sentence(self, text: str) -> List[str]:
+        """
+        Group whole sentences into chunks of roughly ``chunk_size`` characters.
+
+        Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace and
+        keep their terminating punctuation. When ``chunk_overlap`` is positive,
+        each new chunk starts with the last ``chunk_overlap`` characters of the
+        previous one. A single sentence longer than ``chunk_size`` is not split.
+        """
+        # The lookbehind splits on the whitespace only, so the punctuation that
+        # ends a sentence stays attached instead of being consumed by the split.
+        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
+        size, overlap = self.config.chunk_size, self.config.chunk_overlap
+
+        chunks = []
+        current_chunk = ""
+        for sentence in sentences:
+            if current_chunk and len(current_chunk) + len(sentence) > size:
+                chunks.append(current_chunk.strip())
+                # Seed the next chunk with the tail of the one just closed.
+                tail = current_chunk[-overlap:] if overlap > 0 else ""
+                current_chunk = f"{tail} {sentence}" if tail else sentence
+            else:
+                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
+
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+        return chunks
+    
+    def _chunk_by_paragraph(self, text: str) -> List[str]:
+        """
+        Pack paragraphs (separated by a blank line) into chunks of about ``chunk_size``.
+
+        NOTE(review): ``_clean_text`` collapses every whitespace run to one space
+        when ``remove_extra_whitespace`` is on, which removes the blank lines
+        this method splits on - confirm the intended call order.
+        """
+        chunks = []
+        pending = ""
+        for block in text.split('\n\n'):
+            paragraph = block.strip()
+            if not paragraph:
+                continue
+            if pending and len(pending) + len(paragraph) > self.config.chunk_size:
+                chunks.append(pending.strip())
+                pending = paragraph
+            else:
+                pending = pending + "\n\n" + paragraph if pending else paragraph
+        if pending:
+            chunks.append(pending.strip())
+        return chunks
+    
+    def _chunk_by_fixed_size(self, text: str) -> List[str]:
+        """
+        Cut text into ``chunk_size`` windows that overlap by ``chunk_overlap``.
+
+        A non-final window is shortened to its last space when that space lies in
+        its final 20%. The loop always advances (even if the overlap is not
+        smaller than the window) and stops once a window reaches the end of text.
+        """
+        size, overlap = self.config.chunk_size, self.config.chunk_overlap
+        if size <= 0:
+            return [text.strip()] if text.strip() else []  # cannot window; keep text whole
+        chunks, start = [], 0
+        while start < len(text):
+            end = start + size
+            chunk = text[start:end]
+            if end < len(text):
+                last_space = chunk.rfind(' ')
+                if last_space > len(chunk) * 0.8:
+                    chunk, end = chunk[:last_space], start + last_space
+            chunks.append(chunk.strip())
+            if end >= len(text):
+                break  # the tail is covered; stepping back would only repeat it
+            start = end - overlap if end - overlap > start else end  # always advance
+        return [chunk for chunk in chunks if chunk]
+    
+    def _process_score(self, retrieval_score: float, document_index: int, total_documents: int) -> float:
+        """
+        Boost, optionally clamp to [0, 1], and round a retrieval score to 4 places.
+
+        Returns 1.0 when retrieval scores are disabled; the index arguments are unused.
+        """
+        settings = self.config
+        if not settings.use_retrieval_scores:
+            return 1.0
+        boosted = retrieval_score * settings.score_boost_factor
+        if settings.normalize_scores:
+            boosted = max(0.0, min(1.0, boosted))
+        return round(boosted, 4)
+    
+    def _extract_source(self, document: Document) -> str:
+        """
+        Return the first non-empty source-like metadata value as a string.
+
+        Keys are tried in a fixed priority order; "unknown_source" is returned
+        when the document has no metadata or none of the keys holds a value.
+        """
+        metadata = document.metadata or {}
+        candidate_keys = ('source', 'file_name', 'filename', 'title', 'file_path', 'path')
+        values = (metadata[key] for key in candidate_keys if key in metadata)
+        found = next((value for value in values if value), None)
+        # `found` is always truthy when set, so None reliably means "no match".
+        return "unknown_source" if found is None else str(found)
+    
+    def _build_metadata(self,
+                       document: Document,
+                       retrieval_result: RetrievalResult,
+                       document_index: int,
+                       chunk_index: Optional[int],
+                       total_chunks: int,
+                       total_documents: int,
+                       content: str) -> Dict[str, Any]:
+        """Build the metadata dict for one processed context; empty when ``extract_metadata`` is off."""
+        metadata: Dict[str, Any] = {}
+        # Metadata extraction disabled: nothing to assemble
+        if not self.config.extract_metadata:
+            return metadata
+
+        # Original document metadata plus the document's position in the result set
+        if document.metadata and self.config.include_document_info:
+            metadata.update({
+                "original_metadata": document.metadata,
+                "document_index": document_index,
+                "total_documents": total_documents
+            })
+
+        # Chunking info, only when a chunk index was supplied
+        if chunk_index is not None:
+            metadata.update({
+                "chunk_index": chunk_index,
+                "total_chunks": total_chunks,
+                "is_chunked": total_chunks > 1
+            })
+
+        # Retrieval info copied from the originating retrieval result
+        if self.config.include_retrieval_info:
+            metadata.update({
+                "retrieval_query": retrieval_result.query,
+                "retrieval_time": retrieval_result.retrieval_time,
+                "retrieval_metadata": retrieval_result.metadata
+            })
+
+        # Processing timestamp and a summary of the active processor settings
+        if self.config.include_timestamps:
+            metadata.update({
+                "processed_at": datetime.now().isoformat(),
+                "processor_config": {
+                    "chunking_enabled": self.config.enable_chunking,
+                    "chunk_method": self.config.chunk_method if self.config.enable_chunking else None,
+                    "cleaning_enabled": any([
+                        self.config.remove_extra_whitespace,
+                        self.config.remove_special_chars,
+                        self.config.normalize_unicode,
+                        self.config.remove_urls,
+                        self.config.remove_emails
+                    ])
+                }
+            })
+
+        # Content statistics. re.split leaves empty strings around terminators (e.g. after
+        # a trailing "."), so only non-blank pieces are counted as sentences.
+        word_count = len(content.split())
+        sentence_count = max(1, sum(1 for part in re.split(r'[.!?]+', content) if part.strip()))
+        metadata["content_stats"] = {
+            "character_count": len(content),
+            "word_count": word_count,
+            "sentence_count": sentence_count,
+            "avg_words_per_sentence": round(word_count / sentence_count, 1)
+        }
+
+        return metadata
+    
+    def _post_process_contexts(self, contexts: List[RetrievalResult]) -> List[RetrievalResult]:
+        """Order the final contexts by score and drop blank ones when configured."""
+        if not contexts:
+            return contexts
+
+        cfg = self.config
+        if cfg.use_retrieval_scores:
+            # In-place stable sort, highest score first; a missing score counts as 0.0
+            contexts.sort(key=lambda item: item.score or 0.0, reverse=True)
+
+        # Blank contexts are removed only when filter_empty_content is set; the
+        # content is not even inspected otherwise
+        keep_blank = not cfg.filter_empty_content
+        return [
+            item for item in contexts
+            if keep_blank or item.content.strip()
+        ]
+    
+    def get_processing_stats(self, contexts: List[RetrievalResult]) -> Dict[str, Any]:
+        """Summarise contexts: counts, averages, chunking share and score/source/length statistics."""
+        from statistics import median  # stdlib; function-scope so the file's imports stay untouched
+
+        if not contexts:
+            return {"total_contexts": 0}
+
+        total_contexts = len(contexts)
+        lengths = [len(ctx.content) for ctx in contexts]
+        total_words = sum(len(ctx.content.split()) for ctx in contexts)
+        total_chars = sum(lengths)
+
+        # Contexts without a score are left out of the score statistics
+        scores = [ctx.score for ctx in contexts if ctx.score is not None]
+
+        sources = {}
+        for ctx in contexts:
+            if ctx.source:
+                sources[ctx.source] = sources.get(ctx.source, 0) + 1
+
+        chunked_contexts = sum(1 for ctx in contexts
+                             if ctx.metadata and ctx.metadata.get("is_chunked", False))
+
+        stats = {
+            "total_contexts": total_contexts,
+            "total_words": total_words,
+            "total_characters": total_chars,
+            "avg_words_per_context": round(total_words / total_contexts, 1),
+            "avg_chars_per_context": round(total_chars / total_contexts, 1),
+            "chunked_contexts": chunked_contexts,
+            "chunking_percentage": round((chunked_contexts / total_contexts) * 100, 1)
+        }
+
+        if scores:
+            # A true median: the two middle values are averaged for an even count
+            stats["score_stats"] = {
+                "min_score": min(scores),
+                "max_score": max(scores),
+                "avg_score": round(sum(scores) / len(scores), 4),
+                "median_score": round(median(scores), 4)
+            }
+
+        if sources:
+            stats["source_distribution"] = sources
+            stats["unique_sources"] = len(sources)
+
+        stats["content_length_stats"] = {
+            "min_length": min(lengths),
+            "max_length": max(lengths),
+            "avg_length": round(sum(lengths) / len(lengths), 1)
+        }
+
+        return stats
+    
+    def batch_process_retrieval_results(self, 
+                                       retrieval_results: List[RetrievalResult]) -> List[RetrievalResult]:
+        """
+        Process several RetrievalResult objects in one call
+        
+        Args:
+            retrieval_results: List of RetrievalResult objects
+            
+        Returns:
+            Combined List[RetrievalResult] after the final post-processing pass
+        """
+        if not retrieval_results:
+            return []
+        
+        self.logger.info(f"Batch processing {len(retrieval_results)} retrieval results")
+        
+        all_contexts = []
+        
+        for i, result in enumerate(retrieval_results):
+            try:
+                contexts = self.process_retrieval_result(result)
+                
+                # Tag every context with the index and query of the result it came from
+                for ctx in contexts:
+                    if ctx.metadata:
+                        ctx.metadata["batch_index"] = i
+                        ctx.metadata["batch_query"] = result.query
+                    else:
+                        ctx.metadata = {
+                            "batch_index": i,
+                            "batch_query": result.query
+                        }
+                
+                all_contexts.extend(contexts)
+                
+            except Exception as e:  # best effort: a failing result is logged and skipped
+                self.logger.error(f"Error processing retrieval result {i}: {e}")
+                continue
+        
+        # Final post-processing for the whole batch
+        all_contexts = self._post_process_contexts(all_contexts)
+        
+        self.logger.info(f"Batch processing completed: {len(all_contexts)} total contexts")
+        
+        return all_contexts
+    
+    def filter_contexts_by_query_relevance(self, 
+                                          contexts: List[RetrievalResult],
+                                          query: str,
+                                          min_relevance_score: float = 0.5) -> List[RetrievalResult]:
+        """
+        Filter contexts by keyword overlap with the query
+
+        Relevance is the fraction of distinct lowercase, whitespace-split query words
+        that also occur in the context content. Kept contexts get the metadata keys
+        ``query_relevance_score`` and ``matched_query_words`` (sorted for stable output).
+
+        Args:
+            contexts: List of RetrievalResult
+            query: Original query string
+            min_relevance_score: Minimum relevance score threshold
+
+        Returns:
+            Filtered List[RetrievalResult], most relevant first; the input is returned
+            unchanged when ``contexts`` or ``query`` is empty
+        """
+        if not contexts or not query:
+            return contexts
+
+        query_words = set(query.lower().split())
+        filtered_contexts = []
+
+        for ctx in contexts:
+            # Compute the overlap once and reuse it for both the score and the metadata
+            matched = query_words.intersection(ctx.content.lower().split())
+            relevance_score = len(matched) / len(query_words) if query_words else 0.0
+            if relevance_score >= min_relevance_score:
+                relevance_info = {
+                    "query_relevance_score": round(relevance_score, 3),
+                    "matched_query_words": sorted(matched)
+                }
+                if ctx.metadata:
+                    ctx.metadata.update(relevance_info)
+                else:
+                    ctx.metadata = relevance_info
+                filtered_contexts.append(ctx)
+
+        # Stable sort: contexts with equal relevance keep their input order
+        filtered_contexts.sort(
+            key=lambda x: x.metadata.get("query_relevance_score", 0.0), 
+            reverse=True
+        )
+
+        self.logger.info(
+            f"Filtered {len(contexts)} contexts to {len(filtered_contexts)} "
+            f"based on query relevance (min_score: {min_relevance_score})"
+        )
+
+        return filtered_contexts
+    
+    def deduplicate_contexts(self, 
+                           contexts: List[RetrievalResult],
+                           similarity_threshold: float = 0.8) -> List[RetrievalResult]:
+        """
+        Drop contexts whose content is near-identical to one that was already kept
+
+        Similarity is difflib's SequenceMatcher ratio on the content strings. Within a
+        duplicate pair the context with the strictly higher score (None counts as 0.0)
+        survives, in the list position of the one kept first.
+
+        Args:
+            contexts: List of RetrievalResult
+            similarity_threshold: Threshold for considering contexts as duplicates
+
+        Returns:
+            Deduplicated List[RetrievalResult]
+        """
+        if not contexts:
+            return contexts
+
+        # difflib is only needed by this method
+        from difflib import SequenceMatcher
+
+        # Survivors so far, in first-seen order
+        kept = []
+
+        for candidate in contexts:
+            # Index of the first survivor that is similar enough, if there is one
+            match_pos = None
+            for pos, survivor in enumerate(kept):
+                matcher = SequenceMatcher(None, candidate.content, survivor.content)
+                ratio = matcher.ratio()
+                if ratio >= similarity_threshold:
+                    match_pos = pos
+                    break
+
+            if match_pos is None:
+                kept.append(candidate)
+            elif (candidate.score or 0.0) > (kept[match_pos].score or 0.0):
+                # The better-scored duplicate takes over the survivor's slot
+                kept[match_pos] = candidate
+
+        # Report how many contexts were collapsed
+        self.logger.info(
+            f"Deduplicated {len(contexts)} contexts to {len(kept)} "
+            f"(similarity_threshold: {similarity_threshold})"
+        )
+
+        return kept
+    
+    def merge_processing_results(self, 
+                               processing_results: List[ProcessingResult]) -> List[RetrievalResult]:
+        """
+        Merge multiple ProcessingResult objects into one flat RetrievalResult list
+
+        Failed results and results without chunks are skipped, and so are chunks whose
+        cleaned content is empty or shorter than ``config.min_content_length``.
+
+        Args:
+            processing_results: List of ProcessingResult objects
+
+        Returns:
+            List[RetrievalResult], one per kept chunk, each with score 1.0
+        """
+        if not processing_results:
+            return []
+
+        all_contexts = []
+
+        for i, proc_result in enumerate(processing_results):
+            if not proc_result.success:
+                self.logger.warning(f"Skipping failed processing result {i}: {proc_result.error_message}")
+                continue
+
+            if not proc_result.chunks:
+                continue
+
+            # Snapshot the attributes: storing __dict__ itself would alias the live object, so
+            # editing a context's metadata would rewrite document_metadata (None becomes {})
+            doc_meta = dict(getattr(proc_result.document_metadata, "__dict__", None) or {})
+
+            # Convert Document chunks to RetrievalResult
+            for j, chunk in enumerate(proc_result.chunks):
+                # Clean first so that rejected chunks cost no metadata work
+                cleaned_content = self._clean_text(chunk.page_content)
+                if not cleaned_content or len(cleaned_content) < self.config.min_content_length:
+                    continue
+
+                metadata = {
+                    "document_metadata": dict(doc_meta),
+                    "chunk_index": j,
+                    "total_chunks": len(proc_result.chunks),
+                    "processing_result_index": i,
+                    "processed_at": datetime.now().isoformat()
+                }
+
+                # Include original chunk metadata
+                if chunk.metadata:
+                    metadata["original_chunk_metadata"] = chunk.metadata
+
+                context = RetrievalResult(
+                    content=cleaned_content,
+                    source=self._extract_source(chunk),
+                    score=1.0,  # Default score for processing results
+                    metadata=metadata
+                )
+                all_contexts.append(context)
+
+        self.logger.info(f"Merged {len(processing_results)} processing results into {len(all_contexts)} contexts")
+
+        return all_contexts
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/qwen_llm.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/qwen_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..74cd038efd48629ae7a3f9a68c6b7d54c1d29699
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/qwen_llm.py
@@ -0,0 +1,921 @@
+import torch
+import asyncio
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer, BitsAndBytesConfig
+import torch
+from typing import Optional, Dict, Any, List, Union, Callable, Awaitable, AsyncGenerator
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from threading import Thread
+from rag.prompt_tuner.chat_template import RAG_TEMPLATES
+from rag.retriever.retriever_types import RetrievalResult
+from langchain_core.documents import Document
+import copy
+
+@dataclass
+class QwenConfig:
+    """Configuration for the Qwen 0.5B model"""
+    model_name: str = "Qwen/Qwen2.5-0.5B-Instruct"
+    device: str = "cuda"  # passed as device_map when the model is loaded
+    torch_dtype: torch.dtype = torch.float16
+    max_length: int = 2048  # GenerationConfig max_length; also the max_new_tokens fallback when streaming
+    temperature: float = 0.7
+    top_p: float = 0.8
+    top_k: int = 50
+    do_sample: bool = True
+    pad_token_id: Optional[int] = None  # None -> the tokenizer's eos_token_id is used
+    eos_token_id: Optional[int] = None  # None -> the tokenizer's eos_token_id is used
+    # RAG-specific configs
+    max_context_length: int = 1500  # character budget handed to _truncate_context
+    context_separator: str = "\n---\n"
+    instruction_template: str = "system"  # "system", "instruction", "custom"
+    # Async-specific configs
+    max_workers: int = 2  # size of the ThreadPoolExecutor created by QwenLLM
+    generation_timeout: float = 30
+    repetition_penalty: float = 1.0
+    # Streaming-specific configs
+    stream_timeout: float = 10  # timeout for a stream chunk (passed to TextIteratorStreamer)
+    skip_prompt: bool = True     # skip the prompt in the streaming output
+
+class QwenLLM:
+    """
+    Async LLM Qwen 0.5B dengan interface yang mudah digunakan
+    Termasuk prompt formatting khusus untuk RAG (Retrieval-Augmented Generation)
+    Dan support untuk text streaming
+    """
+    
+    def __init__(self, config: Optional[QwenConfig] = None, rag_templates = None):
+        """
+        Initialise QwenLLM (the model itself is loaded later by load_model)
+        
+        Args:
+            config: Model configuration (optional, a default QwenConfig is used when None)
+            rag_templates: Prompt templates keyed by name (optional, RAG_TEMPLATES() when None)
+        """
+        self.config = config or QwenConfig()
+        self.config.quantization_config = BitsAndBytesConfig(
+                            load_in_4bit=True,                      # Enable 4-bit quantization
+                            bnb_4bit_use_double_quant=True,         # Use double quantization
+                            bnb_4bit_quant_type="nf4",              # Use NF4 quantization
+                            bnb_4bit_compute_dtype=torch.bfloat16,  # Compute dtype for 4bit base models
+                            )
+        self.tokenizer : AutoTokenizer = None
+        self.model = None
+        self.generation_config = None
+        self.is_loaded = False
+        self.executor = ThreadPoolExecutor(max_workers=self.config.max_workers)
+        self._lock = asyncio.Lock()
+        # Setup logging
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
+        # Built per instance: a call in the signature default runs once at import and is shared
+        self.rag_templates = rag_templates if rag_templates is not None else RAG_TEMPLATES()
+    
+    async def load_model(self) -> None:
+        """Load the tokenizer and the model in the thread pool; a no-op when already loaded"""
+        async with self._lock:
+            if self.is_loaded:
+                self.logger.info("Model already loaded")
+                return
+            
+            try:
+                self.logger.info(f"Loading model: {self.config.model_name}")
+                loop = asyncio.get_running_loop()  # the loop that is running this coroutine
+                # Load the tokenizer in the thread pool
+                self.tokenizer = await loop.run_in_executor(
+                    self.executor,
+                    lambda: AutoTokenizer.from_pretrained(
+                        self.config.model_name,
+                        trust_remote_code=True,
+                        torch_dtype="auto",
+                        device_map="auto"
+                    )
+                )
+                
+                # Load the model in the thread pool
+                self.model = await loop.run_in_executor(
+                    self.executor,
+                    lambda: AutoModelForCausalLM.from_pretrained(
+                        self.config.model_name,
+                        quantization_config=self.config.quantization_config,
+                        torch_dtype=self.config.torch_dtype,
+                        device_map=self.config.device,
+                        trust_remote_code=True
+                    )
+                )
+                
+                # Setup generation config; missing pad/eos ids fall back to the tokenizer's eos id
+                self.generation_config = GenerationConfig(
+                    max_length=self.config.max_length,
+                    temperature=self.config.temperature,
+                    top_p=self.config.top_p,
+                    top_k=self.config.top_k,
+                    do_sample=self.config.do_sample,
+                    pad_token_id=self.config.pad_token_id or self.tokenizer.eos_token_id,
+                    eos_token_id=self.config.eos_token_id or self.tokenizer.eos_token_id,
+                    repetition_penalty = self.config.repetition_penalty,
+                )
+                
+                self.is_loaded = True
+                self.logger.info("Model loaded successfully!")
+                
+            except Exception as e:
+                self.logger.error(f"Error loading model: {e}")
+                raise
+    
+    def get_available_templates(self) -> List[str]:
+        """
+        List the names of the registered RAG templates
+        
+        Returns:
+            Template names, in the iteration order of ``self.rag_templates``
+        """
+        return [*self.rag_templates.keys()]
+    
+    def preview_template(self, template_type: str, sample_question: str = "Apa itu AI?", 
+                        sample_context: str = "Artificial Intelligence adalah teknologi...") -> str:
+        """
+        Preview a template rendered with sample data
+        
+        Args:
+            template_type: Template type to preview
+            sample_question: Sample question
+            sample_context: Sample context
+            
+        Returns:
+            Rendered message contents joined by newlines, or a notice for an unknown template
+        """
+        if template_type not in self.rag_templates:
+            return f"Template '{template_type}' tidak tersedia. Available: {self.get_available_templates()}"
+        
+        template_data = copy.deepcopy(self.rag_templates[template_type])
+        # format_rag_prompt iterates a template as a list of chat messages; a bare message dict is wrapped
+        messages = [template_data] if isinstance(template_data, dict) else template_data
+        return "\n".join(
+            msg["content"].format(context=sample_context, question=sample_question)
+            for msg in messages
+        )
+    
+    def _format_context(self, contexts: Union[List[str], RetrievalResult], numbering: bool = True) -> str:
+        """
+        Format retrieved contexts into one string, one "[Dokumen ...]" section per context
+
+        A RetrievalResult whose ``scores`` is missing or shorter than ``documents`` is
+        rendered without the score suffix instead of raising IndexError.
+
+        Args:
+            contexts: RetrievalResult (documents + scores) or a list of strings / other objects
+            numbering: Whether to add document numbering (and the score, when there is one)
+
+        Returns:
+            Sections joined by ``config.context_separator``; "" for empty input
+        """
+        if not contexts:
+            return ""
+
+        formatted_contexts = []
+        if isinstance(contexts, RetrievalResult):
+            # scores may be empty or shorter than documents (format_rag_prompt passes []
+            # when the result carries no scores), so every lookup is guarded
+            scores = contexts.scores or []
+            for i, ctx in enumerate(contexts.documents, 1):
+                header = f"[Dokumen {i}" if numbering else "[Dokumen"
+                score = scores[i - 1] if i <= len(scores) else None
+                # As before, a falsy score (None or 0) is left out of the header
+                if numbering and score:
+                    header += f" (Skor: {score:.3f})"
+                formatted_contexts.append(f"{header}]\n{ctx.page_content}")
+        else:
+            for i, ctx in enumerate(contexts, 1):
+                header = f"[Dokumen {i}]" if numbering else "[Dokumen]"
+                # str() is a no-op for strings and the fallback rendering for other objects
+                formatted_contexts.append(f"{header}\n{str(ctx)}")
+
+        return self.config.context_separator.join(formatted_contexts)
+    
+    def _truncate_context(self, context: str, max_length: int) -> str:
+        """
+        Truncate the context so the result never exceeds max_length characters
+        
+        Args:
+            context: Context string
+            max_length: Maximum length in characters
+            
+        Returns:
+            The context itself when it fits, otherwise a prefix plus a truncation marker
+        """
+        marker = "\n\n[... Context dipotong karena terlalu panjang ...]"
+        if len(context) <= max_length:
+            return context
+        # Budget the marker's real length; without room for it cut hard (never a negative slice end)
+        keep = max_length - len(marker)
+        return context[:keep] + marker if keep > 0 else context[:max(max_length, 0)]
+
+    async def format_rag_prompt(self, 
+                                question: str, 
+                                contexts: Union[List[str], RetrievalResult],
+                                template_type: Optional[str] = None,
+                                custom_template: Optional[str] = None,
+                                include_metadata: bool = True,
+                                context_numbering: bool = True,
+                                max_contexts: Optional[int] = None) -> Union[str, List[Dict[str, str]]]:
+        """
+        Build the RAG prompt from a question and the retrieved contexts (async)
+
+        The contexts are limited to max_contexts, rendered with _format_context and
+        truncated to config.max_context_length before they are put into the template.
+
+        Args:
+            question: User question
+            contexts: List of contexts or a RetrievalResult
+            template_type: Name of a template in self.rag_templates
+            custom_template: Format string with {context} / {question}; takes precedence
+            include_metadata: Kept for compatibility; metadata is not added to the prompt
+            context_numbering: Whether the context sections are numbered
+            max_contexts: Maximum number of contexts to use (None or 0 keeps all)
+
+        Returns:
+            A string when custom_template is used, otherwise a list of chat messages
+        """
+
+        def _format_sync():
+            # Handle RetrievalResult explicitly
+            if isinstance(contexts, RetrievalResult):
+                docs = contexts.documents
+                if max_contexts:
+                    docs = docs[:max_contexts]
+                processed_contexts = RetrievalResult(
+                    documents=docs,
+                    scores=contexts.scores[:len(docs)] if contexts.scores else [],
+                    query=contexts.query,
+                    retrieval_time=contexts.retrieval_time,
+                    metadata=contexts.metadata
+                )
+            else:
+                # contexts is assumed to be a plain list (list[str] or list[Document])
+                processed_contexts = contexts[:max_contexts] if max_contexts and len(contexts) > max_contexts else contexts
+
+            # Render the contexts into a single string
+            formatted_context = self._format_context(processed_contexts, context_numbering)
+
+            # Truncate when the text exceeds the configured limit
+            return self._truncate_context(
+                formatted_context,
+                self.config.max_context_length
+            )
+
+        # Run _format_sync in the thread pool
+        formatted_context = await asyncio.get_running_loop().run_in_executor(
+            self.executor, _format_sync
+        )
+
+        # NOTE(review): an empty template_type only resets config.instruction_template;
+        # the lookup below still uses template_type itself - confirm the intent
+        if template_type == "":
+            self.config.instruction_template = "system"
+
+        # A custom template, when given, wins over the registered ones
+        if custom_template:
+            return custom_template.format(
+                context=formatted_context,
+                question=question
+            )
+
+        if template_type in self.rag_templates:
+            # Work on a copy so the registered template keeps its placeholders
+            template_data = copy.deepcopy(self.rag_templates[template_type])
+            formatted_template = []
+            for cht in template_data:
+                content = cht["content"]
+                # One format call fills both fields: formatting twice raised KeyError for a
+                # message holding both and re-parsed braces coming from the context text
+                if "{context}" in content or "{question}" in content:
+                    cht["content"] = content.format(context=formatted_context, question=question)
+                formatted_template.append(cht)
+
+            # Lazy %-style argument; the old call passed an argument without a placeholder
+            self.logger.info("Formatted Template: %s", formatted_template)
+            return formatted_template
+
+        # Fallback default template
+        return [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": question}
+        ]
+
+    async def generate_stream(self, 
+                             prompt: Union[str, List[Dict]], 
+                             max_new_tokens: Optional[int] = None,
+                             temperature: Optional[float] = None,
+                             top_p: Optional[float] = None,
+                             **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Generate text from a prompt as an async stream
+
+        Args:
+            prompt: Chat messages (the tokenizer's chat template is applied) or text that
+                already went through the chat template (tokenized as-is)
+            max_new_tokens: Maximum number of new tokens to generate
+            temperature: Temperature for generation (overrides config)
+            top_p: Top-p for generation (overrides config)
+            **kwargs: Additional generation parameters
+
+        Yields:
+            Generated text chunks
+        """
+        await self._check_model_loaded()
+        loop = asyncio.get_running_loop()
+
+        streamer = TextIteratorStreamer(
+            self.tokenizer, timeout=self.config.stream_timeout,
+            skip_prompt=self.config.skip_prompt, skip_special_tokens=True
+        )
+
+        def _generate_sync():
+            try:
+                if isinstance(prompt, str):
+                    # Rendered text (as chat_stream passes) already holds the template's
+                    # special tokens, so tokenize it without adding them again
+                    inputs = self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False).input_ids
+                else:
+                    inputs = self.tokenizer.apply_chat_template(prompt, add_generation_prompt=True, return_tensors="pt")
+
+                # Override the generation config when needed
+                gen_config = self.generation_config
+                if any([max_new_tokens, temperature, top_p]):
+                    gen_config = GenerationConfig(
+                        max_new_tokens=max_new_tokens or self.config.max_length,
+                        temperature=temperature or self.config.temperature,
+                        top_p=top_p or self.config.top_p,
+                        top_k=self.config.top_k,
+                        do_sample=self.config.do_sample,
+                        pad_token_id=self.config.pad_token_id or self.tokenizer.eos_token_id,
+                        eos_token_id=self.config.eos_token_id or self.tokenizer.eos_token_id,
+                        repetition_penalty=self.config.repetition_penalty,
+                        **kwargs
+                    )
+
+                # load_model already placed the model; only the inputs follow it, instead
+                # of re-sending the model to a hard-coded "cuda" on every call
+                input_ids = inputs.to(self.model.device)
+
+                generation_kwargs = {
+                    "input_ids": input_ids,
+                    "generation_config": gen_config,
+                    "streamer": streamer,
+                    **kwargs
+                }
+
+                # generate() feeds the streamer from its own thread
+                thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+                thread.start()
+                return thread
+
+            except Exception as e:
+                self.logger.error(f"Error during stream generation setup: {e}")
+                raise
+
+        # Setup generation thread
+        generation_thread = await loop.run_in_executor(self.executor, _generate_sync)
+
+        # next() blocks until text arrives, so every pull runs in the executor to keep
+        # the event loop responsive; the sentinel stands in for StopIteration
+        token_iter = iter(streamer)
+        exhausted = object()
+        try:
+            while True:
+                token = await loop.run_in_executor(self.executor, next, token_iter, exhausted)
+                if token is exhausted:
+                    break
+                if token:  # Skip empty tokens
+                    yield token
+
+            # Wait for generation thread to finish
+            await loop.run_in_executor(self.executor, generation_thread.join)
+
+        except Exception as e:
+            self.logger.error(f"Error during streaming: {e}")
+            # Make sure thread is cleaned up
+            if generation_thread.is_alive():
+                generation_thread.join(timeout=1.0)
+            raise
+
+    async def rag_generate_stream(self,
+                                 question: str,
+                                 contexts: Union[List[str], RetrievalResult],
+                                 template_type: Optional[str] = None,
+                                 max_new_tokens: Optional[int] = None,
+                                 temperature: Optional[float] = None,
+                                 **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Stream a RAG answer: build the prompt from question + contexts, then generate
+        
+        Args:
+            question: User question
+            contexts: List of retrieved contexts
+            template_type: Template type used for formatting
+            max_new_tokens: Maximum number of new tokens to generate
+            temperature: Temperature for generation (0.3 is used when None)
+            **kwargs: Additional generation parameters
+            
+        Yields:
+            Generated answer chunks
+        """
+        await self._check_model_loaded()
+
+        # RAG answers default to a lower temperature (more factual)
+        if temperature is None:
+            temperature = 0.3
+
+        rag_prompt = await self.format_rag_prompt(question, contexts, template_type)
+
+        token_stream = self.generate_stream(
+            prompt=rag_prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            **kwargs
+        )
+        async for piece in token_stream:
+            yield piece
+
+    async def chat_stream(self, 
+                         messages: List[Dict[str, str]], 
+                         max_new_tokens: Optional[int] = None,
+                         **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Chat dengan format conversation secara streaming async
+        
+        Args:
+            messages: List of messages dengan format [{"role": "user", "content": "..."}]
+            max_new_tokens: Maximum token baru yang akan di-generate
+            **kwargs: Parameter tambahan untuk generation
+            
+        Yields:
+            Response text chunks
+        """
+        await self._check_model_loaded()
+        
+        def _format_chat():
+            try:
+                # Format messages untuk chat
+                formatted_prompt = self.tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+                return formatted_prompt
+                
+            except Exception as e:
+                self.logger.error(f"Error during chat formatting: {e}")
+                raise
+        
+        # Format chat template dalam thread pool
+        formatted_prompt = await asyncio.get_event_loop().run_in_executor(
+            self.executor, _format_chat
+        )
+        
+        async for chunk in self.generate_stream(
+            formatted_prompt, 
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        ):
+            yield chunk
+
+    async def rag_chat_stream(self,
+                             messages: List[Dict[str, str]],
+                             contexts: Union[List[str], RetrievalResult],
+                             template_type: Optional[str] = None,
+                             max_new_tokens: Optional[int] = None,
+                             **kwargs) -> AsyncGenerator[str, None]:
+        """
+        RAG Chat dengan format conversation secara streaming async
+        
+        Args:
+            messages: List of messages dengan format [{"role": "user", "content": "..."}]
+            contexts: List of retrieved contexts
+            template_type: Template type untuk formatting
+            max_new_tokens: Maximum token baru yang akan di-generate
+            **kwargs: Parameter tambahan untuk generation
+            
+        Yields:
+            Response text chunks
+        """
+        await self._check_model_loaded()
+        
+        # Ambil last user message sebagai question
+        user_messages = [msg for msg in messages if msg.get("role") == "user"]
+        if not user_messages:
+            raise ValueError("No user message found in conversation")
+        
+        last_question = user_messages[-1]["content"]
+        
+        # Generate RAG response secara streaming
+        async for chunk in self.rag_generate_stream(
+            question=last_question,
+            contexts=contexts,
+            template_type=template_type,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        ):
+            yield chunk
+
+    # Utility method untuk collect full response dari stream
+    async def collect_stream(self, stream_generator: AsyncGenerator[str, None]) -> str:
+        """
+        Collect semua chunks dari stream generator menjadi full text
+        
+        Args:
+            stream_generator: AsyncGenerator yang menghasilkan text chunks
+            
+        Returns:
+            Complete generated text
+        """
+        chunks = []
+        async for chunk in stream_generator:
+            chunks.append(chunk)
+        return "".join(chunks)
+    
+    async def multi_template_generate(self,
+                                    question: str,
+                                    contexts: Union[List[str], RetrievalResult],
+                                    template_types: List[str],
+                                    max_new_tokens: Optional[int] = None,
+                                    **kwargs) -> Dict[str, str]:
+        """
+        Generate jawaban menggunakan multiple templates secara concurrent
+        
+        Args:
+            question: User question
+            contexts: List of retrieved contexts
+            template_types: List of template types to use
+            max_new_tokens: Maximum token baru yang akan di-generate
+            **kwargs: Parameter tambahan untuk generation
+            
+        Returns:
+            Dictionary dengan template_type sebagai key dan response sebagai value
+        """
+        await self._check_model_loaded()
+        
+        # Create tasks untuk concurrent generation
+        tasks = []
+        for template_type in template_types:
+            task = asyncio.create_task(
+                self._generate_single_template(
+                    question, contexts, template_type, max_new_tokens, **kwargs
+                )
+            )
+            tasks.append((template_type, task))
+        
+        # Wait for all tasks
+        results = {}
+        for template_type, task in tasks:
+            try:
+                response = await task
+                results[template_type] = response
+            except Exception as e:
+                self.logger.error(f"Error generating with template {template_type}: {e}")
+                results[template_type] = f"Error: {str(e)}"
+        
+        return results
+    
+    async def _generate_single_template(self,
+                                      question: str,
+                                      contexts: Union[List[str], RetrievalResult],
+                                      template_type: str,
+                                      max_new_tokens: Optional[int] = None,
+                                      **kwargs) -> str:
+        """Helper method untuk single template generation"""
+        return await self.rag_generate(
+            question=question,
+            contexts=contexts,
+            template_type=template_type,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+    
+    async def rag_generate(self,
+                          question: str,
+                          contexts: Union[List[str], RetrievalResult],
+                          template_type: Optional[str] = None,
+                          max_new_tokens: Optional[int] = None,
+                          temperature: Optional[float] = None,
+                          **kwargs) -> str:
+        """
+        Generate jawaban untuk RAG secara async
+        
+        Args:
+            question: User question
+            contexts: List of retrieved contexts
+            template_type: Template type untuk formatting
+            max_new_tokens: Maximum token baru yang akan di-generate
+            temperature: Temperature untuk generation
+            **kwargs: Parameter tambahan untuk generation
+            
+        Returns:
+            Generated answer
+        """
+        await self._check_model_loaded()
+        
+        # Format prompt
+        prompt = await self.format_rag_prompt(question, contexts, template_type)
+        
+        # Generate dengan temperature yang lebih rendah untuk RAG (lebih faktual)
+        temp = temperature if temperature is not None else 0.3
+        
+        return await self.generate(
+            prompt=prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temp,
+            **kwargs
+        )
+    
+    async def rag_chat(self,
+                      messages: List[Dict[str, str]],
+                      contexts: Union[List[str], RetrievalResult],
+                      template_type: Optional[str] = None,
+                      max_new_tokens: Optional[int] = None,
+                      **kwargs) -> str:
+        """
+        RAG Chat dengan format conversation secara async
+        
+        Args:
+            messages: List of messages dengan format [{"role": "user", "content": "..."}]
+            contexts: List of retrieved contexts
+            template_type: Template type untuk formatting
+            max_new_tokens: Maximum token baru yang akan di-generate
+            **kwargs: Parameter tambahan untuk generation
+            
+        Returns:
+            Response text
+        """
+        await self._check_model_loaded()
+        
+        # Ambil last user message sebagai question
+        user_messages = [msg for msg in messages if msg.get("role") == "user"]
+        if not user_messages:
+            raise ValueError("No user message found in conversation")
+        
+        last_question = user_messages[-1]["content"]
+        
+        # Generate RAG response
+        return await self.rag_generate(
+            question=last_question,
+            contexts=contexts,
+            template_type=template_type,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+    
+    async def _check_model_loaded(self) -> None:
+        """Cek apakah model sudah di-load secara async"""
+        if not self.is_loaded:
+            raise RuntimeError("Model belum di-load. Panggil await load_model() terlebih dahulu.")
+    
+    async def generate(self, 
+                      prompt: str, 
+                      max_new_tokens: Optional[int] = None,
+                      temperature: Optional[float] = None,
+                      top_p: Optional[float] = None,
+                      **kwargs) -> str:
+        """
+        Generate text dari prompt secara async
+        
+        Args:
+            prompt: Input text prompt
+            max_new_tokens: Maximum token baru yang akan di-generate
+            temperature: Temperature untuk generation (override config)
+            top_p: Top-p untuk generation (override config)
+            **kwargs: Parameter tambahan untuk generation
+            
+        Returns:
+            Generated text
+        """
+        await self._check_model_loaded()
+        
+        def _generate_sync():
+            try:
+                # Tokenize input
+                inputs = self.tokenizer(prompt, return_tensors="pt")
+                
+                # Override generation config jika diperlukan
+                gen_config = self.generation_config
+                if any([max_new_tokens, temperature, top_p]):
+                    gen_config = GenerationConfig(
+                        max_new_tokens=max_new_tokens or self.config.max_length,
+                        temperature=temperature or self.config.temperature,
+                        top_p=top_p or self.config.top_p,
+                        top_k=self.config.top_k,
+                        do_sample=self.config.do_sample,
+                        pad_token_id=self.config.pad_token_id or self.tokenizer.eos_token_id,
+                        eos_token_id=self.config.eos_token_id or self.tokenizer.eos_token_id,
+                        repetition_penalty = self.config.repetition_penalty,
+                        **kwargs
+                    )
+                
+                # Generate
+                with torch.no_grad():
+                    
+                    self.model.to("cuda")
+                    input_ids = inputs.input_ids.to("cuda")
+
+                    outputs = self.model.generate(
+                        input_ids,
+                        generation_config=gen_config,
+                        **kwargs
+                    )
+                
+                # Decode output
+                generated_text = self.tokenizer.decode(
+                    outputs[0], 
+                    skip_special_tokens=True
+                )
+                
+                # Remove input prompt dari output
+                if generated_text.startswith(prompt):
+                    generated_text = generated_text[len(prompt):].strip()
+                
+                return generated_text
+                
+            except Exception as e:
+                self.logger.error(f"Error during generation: {e}")
+                raise
+        
+        # Run generation in thread pool dengan timeout
+        try:
+            result = await asyncio.wait_for(
+                asyncio.get_event_loop().run_in_executor(self.executor, _generate_sync),
+                timeout=self.config.generation_timeout
+            )
+            return result
+        except asyncio.TimeoutError:
+            self.logger.error(f"Generation timeout after {self.config.generation_timeout} seconds")
+            raise TimeoutError(f"Generation timeout after {self.config.generation_timeout} seconds")
+    
+    async def chat(self, 
+                  messages: List[Dict[str, str]], 
+                  max_new_tokens: Optional[int] = None,
+                  **kwargs) -> str:
+        """
+        Chat dengan format conversation secara async
+        
+        Args:
+            messages: List of messages dengan format [{"role": "user", "content": "..."}]
+            max_new_tokens: Maximum token baru yang akan di-generate
+            **kwargs: Parameter tambahan untuk generation
+            
+        Returns:
+            Response text
+        """
+        await self._check_model_loaded()
+        
+        def _format_chat():
+            try:
+                # Format messages untuk chat
+                formatted_prompt = self.tokenizer.apply_chat_template(
+                    messages,
+                    return_tensors="pt"
+                )
+                return formatted_prompt
+                
+            except Exception as e:
+                self.logger.error(f"Error during chat formatting: {e}")
+                raise
+        
+        # Format chat template dalam thread pool
+        formatted_prompt = await asyncio.get_event_loop().run_in_executor(
+            self.executor, _format_chat
+        )
+        
+        return await self.generate(
+            formatted_prompt, 
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+    
+    async def update_config(self, **kwargs) -> None:
+        """
+        Update konfigurasi model secara async
+        
+        Args:
+            **kwargs: Parameter konfigurasi yang akan diupdate
+        """
+        async with self._lock:
+            for key, value in kwargs.items():
+                if hasattr(self.config, key):
+                    setattr(self.config, key, value)
+                    self.logger.info(f"Updated {key} to {value}")
+                else:
+                    self.logger.warning(f"Unknown config parameter: {key}")
+            
+            # Update generation config jika model sudah loaded
+            if self.is_loaded:
+                self.generation_config = GenerationConfig(
+                    max_length=self.config.max_length,
+                    temperature=self.config.temperature,
+                    top_p=self.config.top_p,
+                    top_k=self.config.top_k,
+                    do_sample=self.config.do_sample,
+                    pad_token_id=self.config.pad_token_id or self.tokenizer.eos_token_id,
+                    eos_token_id=self.config.eos_token_id or self.tokenizer.eos_token_id,
+                    repetition_penalty = self.config.repetition_penalty,
+
+                )
+    
+    async def get_model_info(self) -> Dict[str, Any]:
+        """
+        Dapatkan informasi model secara async
+        
+        Returns:
+            Dictionary dengan informasi model
+        """
+        info = {
+            "model_name": self.config.model_name,
+            "is_loaded": self.is_loaded,
+            "config": self.config.__dict__
+        }
+        
+        if self.is_loaded:
+            # Get model info dalam thread pool
+            def _get_info():
+                return {
+                    "vocab_size": self.tokenizer.vocab_size,
+                    "model_parameters": sum(p.numel() for p in self.model.parameters()),
+                    "device": str(next(self.model.parameters()).device)
+                }
+            
+            model_info = await asyncio.get_event_loop().run_in_executor(
+                self.executor, _get_info
+            )
+            info.update(model_info)
+        
+        return info
+    
+    async def batch_generate(self, 
+                           prompts: List[str], 
+                           max_new_tokens: Optional[int] = None,
+                           **kwargs) -> List[str]:
+        """
+        Generate multiple prompts secara batch dan concurrent
+        
+        Args:
+            prompts: List of prompts to generate
+            max_new_tokens: Maximum token baru yang akan di-generate
+            **kwargs: Parameter tambahan untuk generation
+            
+        Returns:
+            List of generated texts
+        """
+        await self._check_model_loaded()
+        
+        # Create tasks untuk concurrent generation
+        tasks = [
+            asyncio.create_task(
+                self.generate(prompt, max_new_tokens=max_new_tokens, **kwargs)
+            )
+            for prompt in prompts
+        ]
+        
+        # Wait for all tasks
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        
+        # Process results
+        processed_results = []
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                self.logger.error(f"Error generating prompt {i}: {result}")
+                processed_results.append(f"Error: {str(result)}")
+            else:
+                processed_results.append(result)
+        
+        return processed_results
+    
+    async def close(self) -> None:
+        """
+        Cleanup resources secara async
+        """
+        self.logger.info("Closing QwenLLM...")
+        
+        # Shutdown executor
+        self.executor.shutdown(wait=True)
+        
+        # Clear GPU memory
+        if hasattr(self, 'model') and self.model is not None:
+            del self.model
+        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
+            del self.tokenizer
+        
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        
+        self.is_loaded = False
+        self.logger.info("QwenLLM closed successfully")
+    
+    async def __aenter__(self):
+        """Async context manager entry: load the model, then return this instance."""
+        await self.load_model()
+        return self
+    
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit: call close(); returns None, so exceptions propagate."""
+        await self.close()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/reranker.temp b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/reranker.temp
new file mode 100644
index 0000000000000000000000000000000000000000..33b36f8739c986ee8ca84aebacff8d016095b381
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/pipeline/reranker.temp
@@ -0,0 +1,391 @@
+from typing import List, Dict, Any, Optional, Tuple
+import numpy as np
+import time
+from langchain_core.documents import Document
+from rag.FlagEmbedding import BGEM3FlagModel
+
+class BGEM3Reranker:
+    """BGE-M3 based reranker with support for dense, sparse, and multi-vector scoring"""
+    
+    def __init__(self, 
+                 model_name: str = 'BAAI/bge-m3',
+                 use_fp16: bool = True,
+                 weights: Dict[str, float] = None):
+        """
+        Initialize BGE-M3 reranker
+        
+        Args:
+            model_name: Model name/path for BGE-M3
+            use_fp16: Use FP16 for faster computation
+            weights: Weights for different scoring methods
+                    {'dense': 1.0, 'sparse': 1.0, 'colbert': 1.0}
+        """
+        self.model = BGEM3FlagModel(model_name, use_fp16=use_fp16)
+        self.weights = weights or {'dense': 1.0, 'sparse': 1.0, 'colbert': 0.0}
+        
+    def _extract_text_from_documents(self, documents: List[Document]) -> List[str]:
+        """Extract text content from LangChain documents"""
+        return [doc.page_content for doc in documents]
+    
+    def _compute_dense_scores(self, query_embedding: np.ndarray, doc_embeddings: np.ndarray) -> np.ndarray:
+        """Compute dense (semantic) similarity scores using matrix multiplication"""
+        # BGE-M3 embeddings are already normalized, so we can use direct matrix multiplication
+        # query_embedding shape: (embedding_dim,)
+        # doc_embeddings shape: (num_docs, embedding_dim)
+        scores = doc_embeddings @ query_embedding  # Direct matrix multiplication
+        return scores
+    
+    def _compute_sparse_scores(self, query_sparse: Dict, doc_sparse_list: List[Dict]) -> List[float]:
+        """Compute sparse (lexical) similarity scores"""
+        scores = []
+        for doc_sparse in doc_sparse_list:
+            score = self.model.compute_lexical_matching_score(query_sparse, doc_sparse)
+            scores.append(score)
+        return scores
+    
+    def _compute_colbert_scores(self, query_colbert: np.ndarray, doc_colbert_list: List[np.ndarray]) -> List[float]:
+        """Compute ColBERT multi-vector interaction scores using BGE-M3's native method"""
+        scores = []
+        for doc_colbert in doc_colbert_list:
+            # Use BGE-M3's native ColBERT scoring method
+            score = self.model.colbert_score(query_colbert, doc_colbert)
+            scores.append(float(score))
+        return scores
+    
+    def rerank(self, retrieval_result: RetrievalResult, top_k: Optional[int] = None) -> RetrievalResult:
+        """
+        Rerank documents using BGE-M3 multi-vector scoring
+        
+        Args:
+            retrieval_result: Original retrieval result
+            top_k: Number of top documents to return (None = return all)
+            
+        Returns:
+            RetrievalResult: Reranked retrieval result
+        """
+        start_time = time.time()
+        
+        if not retrieval_result.query:
+            raise ValueError("Query is required for reranking")
+        
+        if not retrieval_result.documents:
+            return retrieval_result
+        
+        # Extract texts
+        query_text = retrieval_result.query
+        doc_texts = self._extract_text_from_documents(retrieval_result.documents)
+        
+        # Encode query and documents
+        query_output = self.model.encode(
+            [query_text], 
+            return_dense=self.weights['dense'] > 0,
+            return_sparse=self.weights['sparse'] > 0,
+            return_colbert_vecs=self.weights['colbert'] > 0
+        )
+        
+        doc_output = self.model.encode(
+            doc_texts,
+            return_dense=self.weights['dense'] > 0,
+            return_sparse=self.weights['sparse'] > 0,
+            return_colbert_vecs=self.weights['colbert'] > 0
+        )
+        
+        # Compute individual scores
+        final_scores = np.zeros(len(doc_texts))
+        score_components = {}
+        
+        # Dense scores
+        if self.weights['dense'] > 0:
+            dense_scores = self._compute_dense_scores(
+                query_output['dense_vecs'][0],
+                doc_output['dense_vecs']
+            )
+            final_scores += self.weights['dense'] * dense_scores
+            score_components['dense'] = dense_scores.tolist()
+        
+        # Sparse scores
+        if self.weights['sparse'] > 0:
+            sparse_scores = self._compute_sparse_scores(
+                query_output['lexical_weights'][0],
+                doc_output['lexical_weights']
+            )
+            final_scores += self.weights['sparse'] * np.array(sparse_scores)
+            score_components['sparse'] = sparse_scores
+        
+        # ColBERT scores
+        if self.weights['colbert'] > 0:
+            colbert_scores = self._compute_colbert_scores(
+                query_output['colbert_vecs'][0],
+                doc_output['colbert_vecs']
+            )
+            final_scores += self.weights['colbert'] * np.array(colbert_scores)
+            score_components['colbert'] = colbert_scores
+        
+        # Sort by scores (descending)
+        sorted_indices = np.argsort(final_scores)[::-1]
+        
+        # Apply top_k filtering
+        if top_k:
+            sorted_indices = sorted_indices[:top_k]
+        
+        # Reorder documents and scores
+        reranked_documents = [retrieval_result.documents[i] for i in sorted_indices]
+        reranked_scores = [float(final_scores[i]) for i in sorted_indices]
+        
+        # Create metadata with score components
+        rerank_metadata = {
+            'reranker': 'BGE-M3',
+            'weights': self.weights,
+            'score_components': {
+                component: [scores[i] for i in sorted_indices] 
+                for component, scores in score_components.items()
+            },
+            'original_scores': [retrieval_result.scores[i] for i in sorted_indices] if retrieval_result.scores else None,
+            'rerank_time': time.time() - start_time
+        }
+        
+        # Merge with existing metadata
+        final_metadata = retrieval_result.metadata.copy() if retrieval_result.metadata else {}
+        final_metadata.update(rerank_metadata)
+        
+        return RetrievalResult(
+            documents=reranked_documents,
+            scores=reranked_scores,
+            query=retrieval_result.query,
+            retrieval_time=retrieval_result.retrieval_time,
+            metadata=final_metadata
+        )
+    
+    def rerank_with_scores(self, 
+                          query: str, 
+                          documents: List[Document], 
+                          original_scores: Optional[List[float]] = None,
+                          top_k: Optional[int] = None) -> RetrievalResult:
+        """
+        Convenience method to rerank documents directly
+        
+        Args:
+            query: Query string
+            documents: List of documents to rerank
+            original_scores: Original retrieval scores (optional)
+            top_k: Number of top documents to return
+            
+        Returns:
+            RetrievalResult: Reranked result
+        """
+        retrieval_result = RetrievalResult(
+            documents=documents,
+            scores=original_scores or [0.0] * len(documents),
+            query=query
+        )
+        
+        return self.rerank(retrieval_result, top_k=top_k)
+    
+    def rerank_dense_only(self, retrieval_result: RetrievalResult, 
+                         batch_size: int = 12, 
+                         max_length: int = 8192,
+                         top_k: Optional[int] = None) -> RetrievalResult:
+        """
+        Fast reranking using only dense embeddings (optimized for speed)
+        
+        Args:
+            retrieval_result: Original retrieval result
+            batch_size: Batch size for encoding
+            max_length: Maximum sequence length
+            top_k: Number of top documents to return
+            
+        Returns:
+            RetrievalResult: Reranked result using only dense similarity
+        """
+        start_time = time.time()
+        
+        if not retrieval_result.query:
+            raise ValueError("Query is required for reranking")
+        
+        if not retrieval_result.documents:
+            return retrieval_result
+        
+        # Extract texts
+        query_text = retrieval_result.query
+        doc_texts = self._extract_text_from_documents(retrieval_result.documents)
+        
+        # Encode query
+        query_embedding = self.model.encode(
+            [query_text], 
+            batch_size=batch_size,
+            max_length=max_length
+        )['dense_vecs'][0]
+        
+        # Encode documents
+        doc_embeddings = self.model.encode(
+            doc_texts,
+            batch_size=batch_size,
+            max_length=max_length
+        )['dense_vecs']
+        
+        # Compute similarity scores using matrix multiplication
+        similarity_scores = doc_embeddings @ query_embedding
+        
+        # Sort by scores (descending)
+        sorted_indices = np.argsort(similarity_scores)[::-1]
+        
+        # Apply top_k filtering
+        if top_k:
+            sorted_indices = sorted_indices[:top_k]
+        
+        # Reorder documents and scores
+        reranked_documents = [retrieval_result.documents[i] for i in sorted_indices]
+        reranked_scores = [float(similarity_scores[i]) for i in sorted_indices]
+        
+        # Create metadata
+        rerank_metadata = {
+            'reranker': 'BGE-M3-Dense',
+            'method': 'dense_only',
+            'batch_size': batch_size,
+            'max_length': max_length,
+            'original_scores': [retrieval_result.scores[i] for i in sorted_indices] if retrieval_result.scores else None,
+            'rerank_time': time.time() - start_time
+        }
+        
+        # Merge with existing metadata
+        final_metadata = retrieval_result.metadata.copy() if retrieval_result.metadata else {}
+        final_metadata.update(rerank_metadata)
+        
+        return RetrievalResult(
+            documents=reranked_documents,
+            scores=reranked_scores,
+            query=retrieval_result.query,
+            retrieval_time=retrieval_result.retrieval_time,
+            metadata=final_metadata
+        )
+    
+    def rerank_colbert_only(self, retrieval_result: RetrievalResult,
+                           top_k: Optional[int] = None) -> RetrievalResult:
+        """
+        Rerank documents by ColBERT (multi-vector) score alone.
+
+        Args:
+            retrieval_result: Result to rerank; its ``query`` must be non-empty.
+            top_k: Keep only this many best documents; falsy values keep all.
+
+        Returns:
+            RetrievalResult: Documents and ColBERT scores in descending score
+            order, or the input unchanged when it holds no documents.
+
+        Raises:
+            ValueError: If ``retrieval_result.query`` is empty or missing.
+        """
+        started = time.time()
+
+        query_text = retrieval_result.query
+        if not query_text:
+            raise ValueError("Query is required for reranking")
+
+        candidates = retrieval_result.documents
+        if not candidates:
+            return retrieval_result
+
+        # Encode the query and the document texts into ColBERT vectors.
+        doc_texts = self._extract_text_from_documents(candidates)
+        query_output = self.model.encode([query_text], return_colbert_vecs=True)
+        doc_output = self.model.encode(doc_texts, return_colbert_vecs=True)
+        query_vecs = query_output['colbert_vecs'][0]
+
+        # One ColBERT score per document, computed by the model's own scorer.
+        raw_scores = np.array([
+            float(self.model.colbert_score(query_vecs, doc_vecs))
+            for doc_vecs in doc_output['colbert_vecs']
+        ])
+
+        # Indices from best to worst score, optionally truncated to top_k.
+        order = np.argsort(raw_scores)[::-1]
+        if top_k:
+            order = order[:top_k]
+
+        ranked_docs = [candidates[idx] for idx in order]
+        ranked_scores = [float(raw_scores[idx]) for idx in order]
+
+        previous_scores = None
+        if retrieval_result.scores:
+            previous_scores = [retrieval_result.scores[idx] for idx in order]
+
+        rerank_info = {
+            'reranker': 'BGE-M3-ColBERT',
+            'method': 'colbert_only',
+            'original_scores': previous_scores,
+            'rerank_time': time.time() - started,
+        }
+
+        # Rerank info overrides same-named keys of the incoming metadata.
+        merged_metadata = retrieval_result.metadata.copy() if retrieval_result.metadata else {}
+        merged_metadata.update(rerank_info)
+
+        return RetrievalResult(
+            documents=ranked_docs,
+            scores=ranked_scores,
+            query=retrieval_result.query,
+            retrieval_time=retrieval_result.retrieval_time,
+            metadata=merged_metadata,
+        )
+
+
+# Usage example
+def example_usage():
+    """Demonstrate the three reranking modes of ``BGEM3Reranker`` on toy data.
+
+    Builds four sample documents, reranks them with the weighted multi-vector,
+    dense-only and ColBERT-only methods (top 3 each), and prints the ranked
+    documents plus the ``rerank_time`` stored in each result's metadata.
+    """
+
+    def show(result, label, width):
+        # Print rank, score and a truncated content preview per document.
+        for rank, (doc, score) in enumerate(zip(result.documents, result.scores), start=1):
+            print(f"\n{rank}. {label}: {score:.4f}")
+            print(f"   Content: {doc.page_content[:width]}...")
+
+    # Dense similarity dominates, sparse (lexical) matching contributes less,
+    # and ColBERT is switched off here for faster computation.
+    reranker = BGEM3Reranker(weights={'dense': 1.0, 'sparse': 0.3, 'colbert': 0.0})
+
+    # Toy corpus: one text per sample document.
+    sample_texts = [
+        "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
+        "BM25 is a bag-of-words retrieval function that ranks documents based on query terms.",
+        "Dense retrieval uses neural embeddings to find semantically similar documents.",
+        "Sparse retrieval methods like TF-IDF focus on exact term matching.",
+    ]
+    # Wrap the texts in a RetrievalResult with made-up first-stage scores.
+    retrieval_result = RetrievalResult(
+        documents=[Document(page_content=text) for text in sample_texts],
+        scores=[0.8, 0.6, 0.7, 0.5],  # Original retrieval scores
+        query="What is BGE M3?",
+        retrieval_time=0.1,
+    )
+
+    # 1) Weighted multi-vector reranking.
+    print("=== Multi-vector Reranking (Dense + Sparse) ===")
+    reranked_result = reranker.rerank(retrieval_result, top_k=3)
+    print(f"Query: {reranked_result.query}")
+    print(f"Reranked {len(reranked_result.documents)} documents:")
+    show(reranked_result, "Score", 100)
+
+    # 2) Dense-only reranking.
+    print("\n=== Dense-only Reranking (Fast) ===")
+    dense_result = reranker.rerank_dense_only(retrieval_result, top_k=3)
+    show(dense_result, "Dense Score", 80)
+
+    # 3) ColBERT-only reranking.
+    print("\n=== ColBERT-only Reranking (Precise) ===")
+    colbert_result = reranker.rerank_colbert_only(retrieval_result, top_k=3)
+    show(colbert_result, "ColBERT Score", 80)
+
+    # Timings come from the 'rerank_time' metadata entry of each result.
+    print("\nPerformance comparison:")
+    timings = (
+        ("Multi-vector", reranked_result),
+        ("Dense-only", dense_result),
+        ("ColBERT-only", colbert_result),
+    )
+    for title, result in timings:
+        print(f"{title} time: {result.metadata['rerank_time']:.4f}s")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/chat_template.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/chat_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ee9c9a8c3c9e22fc388232bbe8040b45903b969
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/prompt_tuner/chat_template.py
@@ -0,0 +1,26 @@
+# Updated RAG templates with a better structure
+def RAG_TEMPLATES():
+    """Return the RAG chat prompt templates.
+
+    ``main_template`` holds a system message ({context}) and a user message ({question}).
+    """
+    system_content = """Anda adalah Customer Service yang ramah dan profesional, dapat berbahasa Indonesia dengan baik dan benar. Tugas Anda adalah membantu pelanggan dengan informasi yang akurat berdasarkan knowledge base perusahaan. Ikuti pedoman berikut:
+
+            1. Selalu berikan sapaan yang ramah dan profesional
+            2. Gunakan HANYA informasi dari knowledge base yang tersedia
+            3. Berikan jawaban yang jelas, mudah dipahami, dan terstruktur semuanya berdasarkan konteks yang diberikan yaitu :
+            {context}
+            4. Jika informasi tidak tersedia, tawarkan alternatif bantuan atau arahkan ke channel yang tepat
+            5. Gunakan bahasa yang sopan dan empati terhadap kebutuhan pelanggan
+            6. Akhiri dengan penawaran bantuan lebih lanjut
+            """
+    user_content = """
+            Dari konteks yang diberikan context berikan jawaban atas pertanyaan saya yaitu : {question}
+            """
+    system_message = {
+        "role": "system",
+        "content": system_content,
+        "description": "Template dengan system prompt untuk customer service professional",
+    }
+    user_message = {"role": "user", "content": user_content}
+    return {"main_template": [system_message, user_message]}
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/base_retriever.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/base_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..10627dc41ef2d73995f43870f8b1ed9dd1eb1ad3
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/base_retriever.py
@@ -0,0 +1,180 @@
+from rag.retriever.retriever_types import (
+    DocumentType, 
+    RetrievalResult, 
+)
+from typing import List
+from abc import ABC, abstractmethod
+
+from langchain_community.document_loaders import (
+    PyMuPDFLoader,
+    Docx2txtLoader,
+    UnstructuredPowerPointLoader,
+    TextLoader
+)
+
+
+from rag.retriever.document_loader import BaseDocumentLoader
+from langchain_core.documents import Document
+
+
+import asyncio
+import logging
+from pathlib import Path
+
+import hashlib
+import os
+
+
+logging.basicConfig(level=logging.INFO)  # NOTE(review): configures the root logger as an import side effect
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class BaseRetriever(ABC):
+    """Interface that every retriever backend implements.
+
+    All three operations are coroutines and must be overridden by subclasses.
+    """
+
+    @abstractmethod
+    async def add_documents(self, documents: List[Document]) -> bool:
+        """Add ``documents`` to the retriever and report success as a bool."""
+
+    @abstractmethod
+    async def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
+        """Return a ``RetrievalResult`` for ``query``; ``k`` defaults to 5."""
+
+    @abstractmethod
+    async def delete_documents(self, document_ids: List[str]) -> bool:
+        """Delete the documents with the given IDs and report success as a bool."""
+
+# ===== DOCUMENT LOADERS =====
+
+class MultiFormatDocumentLoader(BaseDocumentLoader):
+    """Load PDF, DOCX, PPT, PPTX and TXT files as LangChain documents.
+
+    The format is picked from the file extension. The blocking LangChain
+    loaders run in the event loop's default executor, and every returned
+    document is tagged with the path, name, type, size and MD5 hash of its
+    source file.
+    """
+
+    def __init__(self):
+        # Dispatch table: document type -> coroutine that loads that type.
+        self.loaders = {
+            DocumentType.PDF: self._load_pdf,
+            DocumentType.DOCX: self._load_docx,
+            DocumentType.PPT: self._load_ppt,
+            DocumentType.PPTX: self._load_pptx,
+            DocumentType.TXT: self._load_txt
+        }
+
+    async def load_document(self, file_path: str) -> List[Document]:
+        """Load one file and attach file-level metadata to each document.
+
+        Returns:
+            The documents produced by the format-specific loader.
+
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist.
+            ValueError: If the file extension is not supported.
+            Exception: If the underlying loader fails.
+        """
+        try:
+            file_path = Path(file_path)
+            if not file_path.exists():
+                raise FileNotFoundError(f"File not found: {file_path}")
+
+            doc_type = self._get_document_type(file_path)
+            loader_func = self.loaders.get(doc_type)
+            if not loader_func:
+                raise ValueError(f"Unsupported file type: {doc_type}")
+
+            logger.info(f"Loading {doc_type} document: {file_path}")
+            documents = await loader_func(str(file_path))
+
+            if documents:
+                # Size and hash describe the file, not one document, so they
+                # are computed once instead of re-reading the file per document.
+                file_metadata = {
+                    "file_path": str(file_path),
+                    "file_name": file_path.name,
+                    "file_type": doc_type.value,
+                    "file_size": file_path.stat().st_size,
+                    "file_hash": self._calculate_file_hash(file_path)
+                }
+                for doc in documents:
+                    doc.metadata.update(file_metadata)
+
+            return documents
+
+        except Exception as e:
+            # Log with the path for context, then let the caller handle it.
+            logger.error(f"Error loading document {file_path}: {str(e)}")
+            raise
+
+    def get_supported_extensions(self) -> List[str]:
+        """Return the file extensions (with leading dot) this loader accepts."""
+        return [".pdf", ".docx", ".ppt", ".pptx", ".txt"]
+
+    def _get_document_type(self, file_path: Path) -> DocumentType:
+        """Map the file extension (case-insensitive) to a ``DocumentType``.
+
+        Raises:
+            ValueError: If the extension is not supported.
+        """
+        extension = file_path.suffix.lower()
+        mapping = {
+            ".pdf": DocumentType.PDF,
+            ".docx": DocumentType.DOCX,
+            ".ppt": DocumentType.PPT,
+            ".pptx": DocumentType.PPTX,
+            ".txt": DocumentType.TXT
+        }
+
+        doc_type = mapping.get(extension)
+        if not doc_type:
+            raise ValueError(f"Unsupported file extension: {extension}")
+        return doc_type
+
+    def _calculate_file_hash(self, file_path: Path) -> str:
+        """Return the hex MD5 digest of the file, read in 4096-byte chunks."""
+        hash_md5 = hashlib.md5()
+        with open(file_path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+
+    async def _run_loader(self, loader_cls, file_path: str, label: str) -> List[Document]:
+        """Build ``loader_cls(file_path)`` and run its blocking ``load`` off-loop.
+
+        Raises:
+            Exception: ``"Error loading <label>: ..."``, chained to the original
+                error so the root cause stays visible in the traceback.
+        """
+        try:
+            loader = loader_cls(file_path)
+            # get_running_loop() is the supported call inside a coroutine.
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(None, loader.load)
+        except Exception as e:
+            raise Exception(f"Error loading {label}: {str(e)}") from e
+
+    async def _load_pdf(self, file_path: str) -> List[Document]:
+        """Load PDF document"""
+        return await self._run_loader(PyMuPDFLoader, file_path, "PDF")
+
+    async def _load_docx(self, file_path: str) -> List[Document]:
+        """Load DOCX document"""
+        return await self._run_loader(Docx2txtLoader, file_path, "DOCX")
+
+    async def _load_ppt(self, file_path: str) -> List[Document]:
+        """Load PPT document"""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPT")
+
+    async def _load_pptx(self, file_path: str) -> List[Document]:
+        """Load PPTX document"""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPTX")
+
+    async def _load_txt(self, file_path: str) -> List[Document]:
+        """Load TXT document"""
+        return await self._run_loader(TextLoader, file_path, "TXT")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_loader.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..af371daefcf29622bd7aab9a5089e6b382a9eea4
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_loader.py
@@ -0,0 +1,168 @@
+from rag.retriever.retriever_types import (
+    DocumentType, 
+    RetrievalResult, 
+)
+from abc import ABC, abstractmethod
+from typing import List
+from langchain_core.documents import Document
+from pathlib import Path
+
+import logging
+
+from langchain_community.document_loaders import (
+    PyMuPDFLoader,
+    Docx2txtLoader,
+    UnstructuredPowerPointLoader,
+    TextLoader
+)
+import asyncio
+import hashlib
+
+logging.basicConfig(level=logging.INFO)  # NOTE(review): configures the root logger as an import side effect
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class BaseDocumentLoader(ABC):
+    """Interface for loaders that read a file into LangChain documents."""
+
+    @abstractmethod
+    async def load_document(self, file_path: str) -> List[Document]:
+        """Read the file at ``file_path`` and return its documents."""
+        ...
+
+    @abstractmethod
+    def get_supported_extensions(self) -> List[str]:
+        """Return the file extensions this loader can handle."""
+        ...
+
+
+
+class MultiFormatDocumentLoader(BaseDocumentLoader):
+    """Load PDF, DOCX, PPT, PPTX and TXT files as LangChain documents.
+
+    The format is picked from the file extension. The blocking LangChain
+    loaders run in the event loop's default executor, and every returned
+    document is tagged with the path, name, type, size and MD5 hash of its
+    source file.
+    """
+
+    def __init__(self):
+        # Dispatch table: document type -> coroutine that loads that type.
+        self.loaders = {
+            DocumentType.PDF: self._load_pdf,
+            DocumentType.DOCX: self._load_docx,
+            DocumentType.PPT: self._load_ppt,
+            DocumentType.PPTX: self._load_pptx,
+            DocumentType.TXT: self._load_txt
+        }
+
+    async def load_document(self, file_path: str) -> List[Document]:
+        """Load one file and attach file-level metadata to each document.
+
+        Returns:
+            The documents produced by the format-specific loader.
+
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist.
+            ValueError: If the file extension is not supported.
+            Exception: If the underlying loader fails.
+        """
+        try:
+            file_path = Path(file_path)
+            if not file_path.exists():
+                raise FileNotFoundError(f"File not found: {file_path}")
+
+            doc_type = self._get_document_type(file_path)
+            loader_func = self.loaders.get(doc_type)
+            if not loader_func:
+                raise ValueError(f"Unsupported file type: {doc_type}")
+
+            logger.info(f"Loading {doc_type} document: {file_path}")
+            documents = await loader_func(str(file_path))
+
+            if documents:
+                # Size and hash describe the file, not one document, so they
+                # are computed once instead of re-reading the file per document.
+                file_metadata = {
+                    "file_path": str(file_path),
+                    "file_name": file_path.name,
+                    "file_type": doc_type.value,
+                    "file_size": file_path.stat().st_size,
+                    "file_hash": self._calculate_file_hash(file_path)
+                }
+                for doc in documents:
+                    doc.metadata.update(file_metadata)
+
+            return documents
+
+        except Exception as e:
+            # Log with the path for context, then let the caller handle it.
+            logger.error(f"Error loading document {file_path}: {str(e)}")
+            raise
+
+    def get_supported_extensions(self) -> List[str]:
+        """Return the file extensions (with leading dot) this loader accepts."""
+        return [".pdf", ".docx", ".ppt", ".pptx", ".txt"]
+
+    def _get_document_type(self, file_path: Path) -> DocumentType:
+        """Map the file extension (case-insensitive) to a ``DocumentType``.
+
+        Raises:
+            ValueError: If the extension is not supported.
+        """
+        extension = file_path.suffix.lower()
+        mapping = {
+            ".pdf": DocumentType.PDF,
+            ".docx": DocumentType.DOCX,
+            ".ppt": DocumentType.PPT,
+            ".pptx": DocumentType.PPTX,
+            ".txt": DocumentType.TXT
+        }
+
+        doc_type = mapping.get(extension)
+        if not doc_type:
+            raise ValueError(f"Unsupported file extension: {extension}")
+        return doc_type
+
+    def _calculate_file_hash(self, file_path: Path) -> str:
+        """Return the hex MD5 digest of the file, read in 4096-byte chunks."""
+        hash_md5 = hashlib.md5()
+        with open(file_path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+
+    async def _run_loader(self, loader_cls, file_path: str, label: str) -> List[Document]:
+        """Build ``loader_cls(file_path)`` and run its blocking ``load`` off-loop.
+
+        Raises:
+            Exception: ``"Error loading <label>: ..."``, chained to the original
+                error so the root cause stays visible in the traceback.
+        """
+        try:
+            loader = loader_cls(file_path)
+            # get_running_loop() is the supported call inside a coroutine.
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(None, loader.load)
+        except Exception as e:
+            raise Exception(f"Error loading {label}: {str(e)}") from e
+
+    async def _load_pdf(self, file_path: str) -> List[Document]:
+        """Load PDF document"""
+        return await self._run_loader(PyMuPDFLoader, file_path, "PDF")
+
+    async def _load_docx(self, file_path: str) -> List[Document]:
+        """Load DOCX document"""
+        return await self._run_loader(Docx2txtLoader, file_path, "DOCX")
+
+    async def _load_ppt(self, file_path: str) -> List[Document]:
+        """Load PPT document"""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPT")
+
+    async def _load_pptx(self, file_path: str) -> List[Document]:
+        """Load PPTX document"""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPTX")
+
+    async def _load_txt(self, file_path: str) -> List[Document]:
+        """Load TXT document"""
+        return await self._run_loader(TextLoader, file_path, "TXT")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_processor.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f7fc12d3e1adab606dae3c82631ef7233f8db8b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/document_processor.py
@@ -0,0 +1,55 @@
+from typing import List, Dict, Any, Optional, Union
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+import asyncio
+import logging
+
+from langchain_core.documents import Document
+logging.basicConfig(level=logging.INFO)  # NOTE(review): configures the root logger as an import side effect
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class DocumentProcessor:
+    """Split documents into overlapping chunks and tag each chunk with metadata."""
+
+    def __init__(self,
+                 chunk_size: int = 1000,
+                 chunk_overlap: int = 200,
+                 separators: Optional[List[str]] = None):
+        """
+        Args:
+            chunk_size: Maximum chunk length, measured with ``len``.
+            chunk_overlap: Overlap between consecutive chunks.
+            separators: Split points tried in order; paragraph, line, space, character by default.
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        if separators is None:
+            separators = ["\n\n", "\n", " ", ""]
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=separators,
+            length_function=len
+        )
+
+    async def process_documents(self, documents: List[Document]) -> List[Document]:
+        """Split ``documents`` into chunks and add per-chunk metadata.
+
+        Each chunk gets ``chunk_id`` (index over all chunks of this call),
+        ``chunk_size`` (characters) and ``processed_at`` (UTC ISO-8601 time).
+        """
+        from datetime import datetime, timezone
+        try:
+            logger.info(f"Processing {len(documents)} documents")
+            loop = asyncio.get_running_loop()
+            chunks = await loop.run_in_executor(None, self.text_splitter.split_documents, documents)
+            # loop.time() is a monotonic clock, not a date; use wall-clock UTC.
+            processed_at = datetime.now(timezone.utc).isoformat()
+            for i, chunk in enumerate(chunks):
+                chunk.metadata.update({"chunk_id": i, "chunk_size": len(chunk.page_content),
+                                       "processed_at": processed_at})
+            logger.info(f"Created {len(chunks)} chunks from {len(documents)} documents")
+            return chunks
+        except Exception as e:
+            logger.error(f"Error processing documents: {str(e)}")
+            raise
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d9f3719dc160e0aa462994b108322fa1c8ea00
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/langchain_retriever.py
@@ -0,0 +1,225 @@
+from rag.retriever.base_retriever import BaseRetriever
+
+# Embeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_openai import OpenAIEmbeddings
+
+# Vector stores
+from langchain_community.vectorstores import Chroma, FAISS, Pinecone
+
+# Retriever base
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_community.retrievers import BM25Retriever
+from langchain.retrievers import ContextualCompressionRetriever
+
+from typing import Dict, Optional, List
+from rag.retriever.document_loader import MultiFormatDocumentLoader
+from rag.retriever.document_processor import DocumentProcessor
+from rag.retriever.retriever_types import ProcessingResult, ProcessingStatus, RetrievalResult, DocumentMetadata
+
+import asyncio
+from pathlib import Path
+import logging
+from langchain_core.documents import Document
+
+logging.basicConfig(level=logging.INFO)  # NOTE(review): configures the root logger as an import side effect
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+class LangChainRetriever(BaseRetriever):
+    """LangChain-based retriever with multiple format support.
+
+    Files are loaded, chunked and embedded into a Chroma or FAISS vector store.
+    With ``use_hybrid_search`` the vector retriever is combined with a BM25
+    retriever built from every chunk added through this instance.
+    """
+
+    def __init__(self, embedding_model: str = "text-embedding-3-small",
+                 vectorstore_type: str = "chroma",
+                 vectorstore_path: Optional[str] = None,
+                 use_hybrid_search: bool = True, **kwargs):
+        self.embedding_model = embedding_model
+        self.vectorstore_type = vectorstore_type
+        self.vectorstore_path = vectorstore_path or "./vectorstore"
+        self.use_hybrid_search = use_hybrid_search
+
+        self.document_loader = MultiFormatDocumentLoader()
+        self.document_processor = DocumentProcessor(**kwargs)  # extra kwargs configure chunking
+        self.embeddings = self._initialize_embeddings()
+        self.vectorstore = self._initialize_vectorstore()
+        self.bm25_retriever = None  # built after the first documents are added
+        self._bm25_corpus: List[Document] = []  # every chunk added so far
+        self.retriever = self._initialize_retriever()
+        self.processed_documents: Dict[str, DocumentMetadata] = {}
+
+        logger.info(f"LangChainRetriever initialized with {vectorstore_type} vectorstore")
+
+    def _initialize_embeddings(self):
+        """Create the embedding client; fall back to ``all-MiniLM-L6-v2`` on error."""
+        try:
+            if self.embedding_model.startswith("text-embedding"):
+                return OpenAIEmbeddings(model=self.embedding_model)
+            return HuggingFaceEmbeddings(model_name=self.embedding_model)
+        except Exception as e:
+            logger.error(f"Error initializing embeddings: {str(e)}")
+            return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    def _initialize_vectorstore(self):
+        """Create the vector store selected by ``vectorstore_type``."""
+        try:
+            store_type = self.vectorstore_type.lower()
+            if store_type == "chroma":
+                return Chroma(persist_directory=self.vectorstore_path,
+                              embedding_function=self.embeddings)
+            if store_type == "faiss":
+                # NOTE(review): FAISS.__init__ does not appear to take index_path,
+                # so this branch probably always ends in the fallback below.
+                return FAISS(embedding_function=self.embeddings,
+                             index_path=self.vectorstore_path)
+            raise ValueError(f"Unsupported vectorstore type: {self.vectorstore_type}")
+        except Exception as e:
+            logger.error(f"Error initializing vectorstore: {str(e)}")
+            # NOTE(review): from_documents() on an empty list likely fails too.
+            return FAISS.from_documents([], self.embeddings)
+
+    def _initialize_retriever(self):
+        """Return a retriever over the vector store (10 hits per query)."""
+        try:
+            return VectorStoreRetriever(vectorstore=self.vectorstore, search_kwargs={"k": 10})
+        except Exception as e:
+            logger.error(f"Error initializing retriever: {str(e)}")
+            return VectorStoreRetriever(vectorstore=self.vectorstore)
+
+    async def add_document_from_file(self, file_path: str) -> ProcessingResult:
+        """Load, chunk and index one file, recording its metadata.
+
+        Never raises: a missing file or a failing step yields a result with
+        ``success=False`` and ``error_message`` set.
+        """
+        from datetime import datetime, timezone
+
+        doc_metadata = None
+        try:
+            file_path = Path(file_path)
+            if not file_path.exists():
+                return ProcessingResult(success=False, document_metadata=None, chunks=[],
+                                        error_message=f"File not found: {file_path}")
+
+            doc_metadata = DocumentMetadata(
+                file_path=str(file_path), file_name=file_path.name,
+                file_type=self.document_loader._get_document_type(file_path),
+                file_size=file_path.stat().st_size,
+                file_hash=self.document_loader._calculate_file_hash(file_path),
+                # Wall-clock UTC: the event-loop clock is monotonic, not a date.
+                created_at=datetime.now(timezone.utc).isoformat(),
+                processing_status=ProcessingStatus.PROCESSING
+            )
+            documents = await self.document_loader.load_document(str(file_path))
+            chunks = await self.document_processor.process_documents(documents)
+            # add_documents() signals failure by returning False, not by raising.
+            if not await self.add_documents(chunks):
+                raise RuntimeError("chunks could not be added to the vector store")
+
+            doc_metadata.chunk_count = len(chunks)
+            doc_metadata.processing_status = ProcessingStatus.COMPLETED
+            doc_metadata.processed_at = datetime.now(timezone.utc).isoformat()
+            self.processed_documents[doc_metadata.file_hash] = doc_metadata
+            logger.info(f"Successfully processed {file_path}: {len(chunks)} chunks")
+            return ProcessingResult(success=True, document_metadata=doc_metadata, chunks=chunks)
+        except Exception as e:
+            error_msg = f"Error processing document {file_path}: {str(e)}"
+            logger.error(error_msg)
+            if doc_metadata is not None:
+                doc_metadata.processing_status = ProcessingStatus.ERROR
+                doc_metadata.error_message = error_msg
+            return ProcessingResult(success=False, document_metadata=doc_metadata,
+                                    chunks=[], error_message=error_msg)
+
+    async def add_documents(self, documents: List[Document]) -> bool:
+        """Index ``documents``; True on success or empty input, False on any error."""
+        try:
+            if not documents:
+                return True
+
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, self.vectorstore.add_documents, documents)
+            if self.use_hybrid_search:
+                await self._update_bm25_retriever(documents)
+            logger.info(f"Added {len(documents)} documents to vector store")
+            return True
+        except Exception as e:
+            logger.error(f"Error adding documents: {str(e)}")
+            return False
+
+    async def _update_bm25_retriever(self, documents: List[Document]):
+        """Rebuild BM25 over all added chunks and pair it with vector search.
+
+        Errors are logged and leave the current retriever in place.
+        """
+        try:
+            from langchain.retrievers import EnsembleRetriever
+            # Index everything seen so far; using only the newest batch would
+            # drop the chunks of previously added files from lexical search.
+            self._bm25_corpus.extend(documents)
+            self.bm25_retriever = BM25Retriever.from_documents(self._bm25_corpus)
+            # ContextualCompressionRetriever requires a real compressor, so the
+            # lexical and vector results are merged with an ensemble instead.
+            self.retriever = EnsembleRetriever(
+                retrievers=[self.bm25_retriever, self._initialize_retriever()],
+                weights=[0.5, 0.5])
+        except Exception as e:
+            logger.error(f"Error updating BM25 retriever: {str(e)}")
+
+    async def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
+        """Retrieve up to ``k`` documents for ``query``.
+
+        NOTE: scores are rank-based placeholders (0.9, 0.8, ...), not real
+        similarities; the underlying retrievers also cap how many hits exist.
+        """
+        try:
+            import time
+            start_time = time.time()
+            logger.info(f"Retrieving documents for query: '{query}'")
+            retrieved_docs = await asyncio.get_running_loop().run_in_executor(
+                None, self.retriever.get_relevant_documents, query
+            )
+            retrieved_docs = retrieved_docs[:k]
+            # Clamp so ranks beyond the tenth never get a negative score.
+            scores = [max(0.0, 0.9 - (i * 0.1)) for i in range(len(retrieved_docs))]
+            retrieval_time = time.time() - start_time
+            logger.info(f"Retrieved {len(retrieved_docs)} documents in {retrieval_time:.2f}s")
+            return RetrievalResult(
+                documents=retrieved_docs, scores=scores, query=query,
+                retrieval_time=retrieval_time,
+                metadata={"vectorstore_type": self.vectorstore_type,
+                          "embedding_model": self.embedding_model,
+                          "hybrid_search": self.use_hybrid_search}
+            )
+        except Exception as e:
+            logger.error(f"Error retrieving documents: {str(e)}")
+            raise
+
+    async def delete_documents(self, document_ids: List[str]) -> bool:
+        """Delete documents by ID; False if unsupported by the store or on error."""
+        try:
+            if not hasattr(self.vectorstore, 'delete'):
+                logger.warning("Vector store does not support deletion")
+                return False
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, self.vectorstore.delete, document_ids)
+            logger.info(f"Deleted {len(document_ids)} documents")
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting documents: {str(e)}")
+            return False
+
+    def get_document_metadata(self, file_hash: str) -> Optional[DocumentMetadata]:
+        """Return metadata of a processed file by its hash, or None."""
+        return self.processed_documents.get(file_hash)
+
+    def list_processed_documents(self) -> List[DocumentMetadata]:
+        """Return metadata of every successfully processed file."""
+        return list(self.processed_documents.values())
+
+    def get_supported_formats(self) -> List[str]:
+        """Return the file extensions the document loader accepts."""
+        return self.document_loader.get_supported_extensions()
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/retriever_types.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/retriever_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e59b8fc155e29dfcf474130d0c95cd9af385e9
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rag/retriever/retriever_types.py
@@ -0,0 +1,52 @@
+
+from typing import List, Dict, Any, Optional, Union
+
+
+from dataclasses import dataclass
+from enum import Enum
+
+from langchain_core.documents import Document
+
+class DocumentType(str, Enum):
+    """Document type identifiers.
+
+    Members subclass ``str``, so each one compares equal to its lowercase
+    string value (e.g. ``DocumentType.PDF == "pdf"``).
+    """
+    PDF = "pdf"
+    DOCX = "docx"
+    PPT = "ppt"
+    PPTX = "pptx"
+    TXT = "txt"
+
+class ProcessingStatus(str, Enum):
+    """Processing states for a document; members are also plain ``str`` values."""
+    PENDING = "pending"  # default state of a new DocumentMetadata
+    PROCESSING = "processing"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+@dataclass
+class DocumentMetadata:
+    """Descriptive record for one source document and its processing state."""
+    # Required fields: they have no defaults and must be passed to the constructor.
+    file_path: str
+    file_name: str
+    file_type: DocumentType
+    file_size: int  # presumably a size in bytes — TODO confirm against the code that fills it
+    file_hash: str  # NOTE(review): hash algorithm is not fixed here; confirm with the producer
+    created_at: str  # timestamp kept as a string; the format is not defined in this module
+    # Optional fields describing the processing outcome.
+    processed_at: Optional[str] = None
+    chunk_count: int = 0
+    processing_status: ProcessingStatus = ProcessingStatus.PENDING
+    error_message: Optional[str] = None
+
+@dataclass
+class RetrievalResult:
+    """Outcome of one retrieval call: the documents found plus bookkeeping."""
+    documents: List[Document]
+    scores: List[float]  # presumably one score per entry in ``documents`` — TODO confirm
+    query: Optional[str] = None
+    retrieval_time: Optional[float] = None  # unit is not fixed here; confirm with the producer
+    # Free-form extra information supplied by whoever builds the result.
+    metadata: Optional[Dict[str, Any]] = None
+
+@dataclass
+class ProcessingResult:
+    """Outcome of processing one document."""
+    success: bool
+    document_metadata: DocumentMetadata
+    chunks: List[Document]
+    # Only this field has a default; it stays None unless the caller sets it.
+    error_message: Optional[str] = None
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/requirements.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2253fcd604f5ec1b2b1d9fda22e9577496afb1c0
Binary files /dev/null and b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/requirements.txt differ
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/.gradio/certificate.pem b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/.gradio/certificate.pem
new file mode 100644
index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/.gradio/certificate.pem
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7634032b6dfc365becf41961702955b1822f1bbc
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/__init__.py
@@ -0,0 +1,16 @@
+from openai import OpenAI
+from elevenlabs.client import ElevenLabs
+from tts.audio_edge_tts import EdgeTTS
+from config.constant import OPENAI_API_KEY, ELEVENLABS_API_KEY
+from rtc.rtc_call import RTCHandler
+from stt.whisper_stt import WhisperSTT
+
+# Module-level singletons: these are constructed as a side effect of importing
+# this package, in this order (the handler needs both engines).
+whisper_stt = WhisperSTT("turbo")  # "turbo" is passed as the Whisper model size
+# Voice name followed by two "+0%" settings (presumably rate/volume or pitch
+# offsets — TODO confirm against EdgeTTS's constructor).
+edge_tts = EdgeTTS("id-ID-ArdiNeural",  "+0%", "+0%")
+rtc_handler = RTCHandler(whisper_stt, edge_tts)
+
+def handle_rtc():
+    """Launch the RTC UI of the shared ``rtc_handler`` (delegates to ``launch_ui``)."""
+    rtc_handler.launch_ui()
+
+def handle_rtc_server():
+    """Start the API server of the shared ``rtc_handler`` with ``start_server``'s defaults."""
+    rtc_handler.start_server()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/call_entity.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/call_entity.py
new file mode 100644
index 0000000000000000000000000000000000000000..05975fde53930169c190a7edd5c4f820c05bcd31
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/call_entity.py
@@ -0,0 +1,4 @@
+class Call:
+    """Plain record pairing a call id with an in-use flag."""
+
+    def __init__(self, call_id : int, is_used : bool):
+        """Store ``call_id`` as ``Id`` and ``is_used`` as ``IsUsed``."""
+        # Attribute names are capitalised; callers read them as Call.Id / Call.IsUsed.
+        self.Id = call_id
+        self.IsUsed = is_used
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
new file mode 100644
index 0000000000000000000000000000000000000000..b46f28629b236800fd0b0f6ba07297f4d7b85ff5
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/rtc_call.py
@@ -0,0 +1,304 @@
+import fastapi
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials
+from fastrtc.utils import audio_to_int16
+from openai import OpenAI
+from elevenlabs.client import ElevenLabs
+from dotenv import load_dotenv
+from tts.audio_edge_tts import EdgeTTS
+import logging
+import time
+import platform
+import socket
+import os
+import numpy as np
+import io
+import wave
+import asyncio
+import librosa
+from pydub import AudioSegment
+from stt.whisper_stt import WhisperSTT
+from collections import deque
+import torch
+import torchaudio.transforms as T
+import asyncio
+import concurrent.futures
+import threading
+from config.constant import HF_TOKEN
+import threading
+import re
+
+
+from rag import get_stream_response
+# Import-time setup: load variables from a .env file (if python-dotenv finds
+# one) and configure the root logger at INFO level.
+load_dotenv()
+logging.basicConfig(level=logging.INFO)
+
+
+class RTCHandler:
+    def __init__(self, whisper_stt: WhisperSTT, edge_tts : EdgeTTS):
+        """Initialize the RTC handler with a Whisper STT engine and an EdgeTTS engine.
+
+        Args:
+            whisper_stt: Speech-to-text engine; ``echo`` calls its ``transcribe`` method.
+            edge_tts: Text-to-speech engine; ``echo`` awaits its
+                ``generate_audio_buffer`` method.
+        """
+        self.whisper_stt = whisper_stt
+        self.edge_tts = edge_tts
+        # System prompt (Indonesian): a polite but relaxed customer-service persona.
+        self.sys_prompt = """
+        
+        Kamu adalah customer service yang berbahasa Indonesia dengan baik sopan, santun, tapi santai pembawaannya.
+        Kamu bisa menjelaskan sesuatu secara baik dan membimbing customer dalam menghadapi masalah yang ada!
+        
+        """
+
+        # Conversation history, seeded with the system prompt.
+        self.messages = [{"role": "system", "content": self.sys_prompt}]
+        # Text of the reply most recently generated by echo().
+        self.full_response = ""
+        # Both are created on demand by create_stream() / create_fastapi_app().
+        self.stream = None
+        self.app = None
+
+        # Only has an effect when running on Windows.
+        self._setup_webrtc_ip()
+
+    def _setup_webrtc_ip(self):
+        """Setup WebRTC IP for Windows"""
+        if platform.system() == 'Windows':
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            try:
+                s.connect(('8.8.8.8', 80))
+                local_ip = s.getsockname()[0]
+            except Exception:
+                local_ip = '127.0.0.1'
+            finally:
+                s.close()
+            os.environ['WEBRTC_IP'] = local_ip
+
+    def audio_to_bytes(self, audio_tuple, sample_rate=24000) -> io.BytesIO:
+        """Pack an audio tuple into an in-memory WAV file.
+
+        Args:
+            audio_tuple: ``(sample_rate, samples)`` pair; it is handed whole to
+                ``audio_to_int16`` to obtain 16-bit samples.
+            sample_rate: Not used by the body; the WAV frame rate is taken
+                from ``audio_tuple``.
+
+        Returns:
+            A ``BytesIO`` rewound to position 0 with ``name`` set to
+            ``"audio.wav"``.
+        """
+        sr, audio_data = audio_tuple  # audio_data is not used below
+        audio_int16 = audio_to_int16(audio_tuple)
+
+        buffer = io.BytesIO()
+        with wave.open(buffer, "wb") as wf:
+            wf.setnchannels(1)  # header declares a single channel
+            wf.setsampwidth(2)  # 2 bytes per sample = 16-bit
+            wf.setframerate(sr)
+            wf.writeframes(audio_int16.tobytes())
+        buffer.seek(0)
+        # presumably so consumers that inspect the file name treat it as WAV — TODO confirm
+        buffer.name = "audio.wav"
+        return buffer
+    def echo(self, audio):
+            """Process audio input and generate audio response - Optimized version"""
+            try:
+                stt_time = time.time()
+                logging.info("Performing STT")
+
+                transcription = self.whisper_stt.transcribe(self.audio_to_bytes(audio))
+                prompt = transcription
+                if prompt == "":
+                    logging.info("STT returned empty string")
+                    return
+
+                logging.info(f"STT response: {prompt}")
+                self.messages.append({"role": "user", "content": prompt})
+                logging.info(f"STT took {time.time() - stt_time} seconds")
+
+                llm_time = time.time()
+                self.full_response = ""
+
+                # Single async function to handle both text streaming and audio generation
+                async def stream_text_to_audio():
+                    chunk_size = 1024
+                    no_buffer = 0
+                    text_buffer = ""
+                    async for stream_data in get_stream_response(question=prompt):
+                        print(stream_data)
+                        
+                        if stream_data["type"] == "chunk":
+                            chunk = stream_data["data"]["chunk"]
+                            self.full_response += chunk
+                            text_buffer += chunk
+                            # Generate audio immediately for each text chunk
+                            if re.search(r'[.,?;!]', chunk):
+                                try:
+                                    audio_buffer_gen =  await self.edge_tts.generate_audio_buffer(text_buffer)
+                                    audio_buffer = audio_buffer_gen[0]
+                                    
+                                    audio_buffer.seek(0)
+                                    
+                                    # Convert MP3 to PCM
+                                    audio_segment = AudioSegment.from_file(audio_buffer, format="mp3")
+                                    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / (2 ** 15)
+                                    
+                                    # Handle stereo to mono
+                                    if audio_segment.channels == 2:
+                                        samples = samples.reshape((-1, 2)).mean(axis=1)
+                                    
+                                    # # Resample to 24kHz
+                                    # resampled = librosa.resample(samples, orig_sr=audio_segment.frame_rate, target_sr=24000)
+                                    import torch
+                                    import torchaudio
+                                    
+                                    # Check if CUDA is available
+                                    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+                                    
+                                    # Convert numpy array to torch tensor and move to GPU
+                                    audio_tensor = torch.from_numpy(samples).unsqueeze(0).to(device)  # Add batch dimension and move to GPU
+                                    
+                                    # Create resampler and move to GPU
+                                    resampler = torchaudio.transforms.Resample(
+                                        orig_freq=audio_segment.frame_rate,
+                                        new_freq=24000
+                                    ).to(device)
+                                    
+                                    # Apply resampling on GPU
+                                    resampled_tensor = resampler(audio_tensor)
+                                    
+                                    # Convert back to numpy (move to CPU first)
+                                    resampled = resampled_tensor.squeeze(0).cpu().numpy()
+                                    # Yield audio chunks
+                                    for i in range(0, len(resampled), chunk_size):
+                                        yield (24000, resampled[i:i + chunk_size])
+                                    no_buffer = 0
+                                    text_buffer = ""
+                                except Exception as e:
+                                    logging.error(f"TTS generation failed for chunk: {e}")
+                                    continue
+                                    
+                        elif stream_data["type"] == "metadata":
+                            setup_time = stream_data['data']['setup_time']
+                            print(f"\nSetup completed in {setup_time:.2f}s")
+                            
+                        elif stream_data["type"] == "complete":
+                            total_time = stream_data['data']['total_time']
+                            print(f"\nTotal time: {total_time:.2f}s")
+                            break
+
+                # Run the single async function
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+                
+                try:
+                    async_gen = stream_text_to_audio()
+                    while True:
+                        try:
+                            chunk = loop.run_until_complete(async_gen.__anext__())
+                            yield chunk
+                        except StopAsyncIteration:
+                            break
+                finally:
+                    loop.close()
+
+                self.messages.append({"role": "assistant", "content": self.full_response + " "})
+                logging.info(f"LLM response: {self.full_response}")
+                logging.info(f"LLM took {time.time() - llm_time} seconds")
+
+            except Exception as e:
+                logging.error(f"Error in echo function: {e}")
+                error_audio = np.zeros(24000, dtype=np.float32)
+                yield (24000, error_audio)
+    def reset_conversation(self):
+        logging.info("Resetting chat")
+        self.messages = [{"role": "system", "content": self.sys_prompt}]
+        self.full_response = ""
+
+    def create_stream(self):
+        """Build the fastrtc ``Stream`` whose handler is ``self.echo``.
+
+        The stream is stored on ``self.stream`` and returned. Any exception
+        raised while building it is logged and re-raised.
+        """
+        try:
+            # Passed as a callable, so credentials are requested (with
+            # HF_TOKEN) only when the stream invokes it.
+            async def get_credentials():
+                return await get_cloudflare_turn_credentials_async(hf_token=HF_TOKEN)
+            self.stream = Stream(
+                rtc_configuration=get_credentials,
+                # Called once, right here; its return value is what gets passed.
+                # NOTE(review): ttl unit is presumably seconds — confirm.
+                server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
+                handler = ReplyOnPause(
+                    self.echo,
+                    algo_options=AlgoOptions(
+                        audio_chunk_duration=0.5,
+                        started_talking_threshold=0.1,
+                        speech_threshold=0.03
+                    ),
+                    # Voice-activity-detection settings for the Silero model.
+                    model_options=SileroVadOptions(
+                        threshold=0.90,
+                        min_speech_duration_ms=250,
+                        min_silence_duration_ms=2000,
+                        speech_pad_ms=400,
+                        max_speech_duration_s=15
+                    )
+                ),
+                modality="audio",
+                mode="send-receive"
+            )
+            return self.stream
+        except Exception as e:
+            logging.error(f"Error creating stream: {e}")
+            raise
+
+    def create_fastapi_app(self):
+        """Create the FastAPI app, mount the stream on it and add helper routes.
+
+        The app is stored on ``self.app`` and returned. The stream is created
+        first if it does not exist yet. Any exception raised during setup is
+        logged and re-raised.
+        """
+        try:
+            self.app = fastapi.FastAPI()
+            # NOTE(review): wildcard origins combined with allow_credentials=True
+            # is a very permissive CORS policy — confirm this is intended.
+            self.app.add_middleware(
+                CORSMiddleware,
+                allow_origins=["*"],
+                allow_credentials=True,
+                allow_methods=["*"],
+                allow_headers=["*"],
+            )
+
+            if not self.stream:
+                self.create_stream()
+            self.stream.mount(self.app)
+
+            # NOTE(review): this GET route changes state (it clears the history
+            # held on this handler instance).
+            @self.app.get("/reset")
+            async def reset():
+                """Clear the conversation; errors are reported in the JSON body."""
+                try:
+                    self.reset_conversation()
+                    return {"status": "success"}
+                except Exception as e:
+                    logging.error(f"Error in reset endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            @self.app.get("/status")
+            async def status():
+                """Report the history length and the last generated reply."""
+                try:
+                    return {
+                        "status": "running",
+                        "messages_count": len(self.messages),
+                        "last_response": self.full_response
+                    }
+                except Exception as e:
+                    logging.error(f"Error in status endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            return self.app
+        except Exception as e:
+            logging.error(f"Error creating FastAPI app: {e}")
+            raise
+
+    def start_server(self, host: str = "0.0.0.0", port: int = 7860):
+        """Serve ``self.app`` with uvicorn, creating the app first if needed.
+
+        Args:
+            host: Interface to bind to.
+            port: TCP port to listen on.
+
+        Raises:
+            Exception: Whatever ``uvicorn.run`` raises, after it has been logged.
+        """
+        import uvicorn  # imported here rather than at module level
+        if not self.app:
+            self.create_fastapi_app()
+        logging.info(f"Starting server on {host}:{port}")
+        try:
+            uvicorn.run(self.app, host=host, port=port, log_level="info")
+        except Exception as e:
+            logging.error(f"Error starting server: {e}")
+            raise
+    def launch_ui(self, browser: bool = True):
+        """Launch the stream's UI on 0.0.0.0:7860, creating stream and app if needed.
+
+        Args:
+            browser: Not used by the body.
+
+        Raises:
+            Exception: Any error raised during setup or launch, after it has
+                been logged.
+        """
+        try:
+            if not self.stream:
+                self.create_stream()
+            if not self.app:
+                self.create_fastapi_app()
+            logging.info("Launching RTC UI...")
+            # NOTE(review): self.app is passed as the first positional argument;
+            # confirm that ui.launch() expects an app there (if ui is a Gradio
+            # Blocks, the first positional parameter is presumably `inline`).
+            self.stream.ui.launch(self.app,
+                                  server_name="0.0.0.0",
+                                  server_port=7860,
+                                  )
+        except Exception as e:
+            logging.error(f"Error launching UI: {e}")
+            raise
+
+    def get_conversation_history(self):
+        """Return a shallow copy of ``self.messages`` (the message dicts themselves are shared)."""
+        return self.messages.copy()
+
+    def set_system_prompt(self, new_prompt: str):
+        self.sys_prompt = new_prompt
+        self.messages[0] = {"role": "system", "content": new_prompt}
+
+    def get_last_response(self):
+        """Return ``self.full_response``, the reply text accumulated by the latest ``echo`` run."""
+        return self.full_response
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/temporary_rtc_call.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/temporary_rtc_call.py
new file mode 100644
index 0000000000000000000000000000000000000000..712293836a6f5d8a720fd5f6e2dc84f3568f9556
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/temporary_rtc_call.py
@@ -0,0 +1,292 @@
+import fastapi
+from fastapi.responses import FileResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions
+from fastrtc.utils import audio_to_int16, audio_to_float32
+from openai import OpenAI
+from elevenlabs.client import ElevenLabs
+import logging
+import time
+import numpy as np
+import io
+import wave
+import platform
+import os
+import socket
+from dotenv import load_dotenv
+
+# Import-time setup: load environment variables from a .env file (if
+# python-dotenv finds one) ...
+load_dotenv()
+
+# ... and configure the root logger at INFO level.
+logging.basicConfig(level=logging.INFO)
+
+
+class RTCHandler:
+    def __init__(self, openai_client: OpenAI, elevenlabs_client: ElevenLabs):
+        """Initialize the RTC handler with OpenAI and ElevenLabs clients.
+
+        Args:
+            openai_client: Client that ``echo`` uses for transcription, chat
+                completion and speech synthesis.
+            elevenlabs_client: Stored on the instance; no method of this class
+                reads it.
+        """
+        # Initialize clients
+        self.openai_client = openai_client
+        self.elevenlabs_client = elevenlabs_client
+
+        # System prompt (Indonesian): a polite but relaxed customer-service persona.
+        self.sys_prompt = """
+Kamu adalah customer service yang berbahasa Indonesia dengan baik sopan, santun, tapi santai pembawaannya.
+Kamu bisa menjelaskan sesuatu secara baik dan membimbing customer dalam menghadapi masalah yang ada!
+"""
+
+        # Message history, seeded with the system prompt
+        self.messages = [{"role": "system", "content": self.sys_prompt}]
+
+        # Text of the reply most recently generated by echo()
+        self.full_response = ""
+
+        # Stream and app are created on demand
+        self.stream = None
+        self.app = None
+
+        # Export WEBRTC_IP (only has an effect on Windows)
+        self._setup_webrtc_ip()
+    
+    def _setup_webrtc_ip(self):
+        """Setup WebRTC IP for Windows"""
+        if platform.system() == 'Windows':
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            try:
+                s.connect(('8.8.8.8', 80))
+                local_ip = s.getsockname()[0]
+            except Exception:
+                local_ip = '127.0.0.1'
+            finally:
+                s.close()
+            
+            os.environ['WEBRTC_IP'] = local_ip
+
+    def audio_to_bytes(self, audio_tuple, sample_rate=24000) -> io.BytesIO:
+        """
+        Convert a (sample_rate, np.ndarray) tuple into an in-memory WAV buffer.
+
+        Args:
+            audio_tuple: ``(sample_rate, samples)`` pair; it is handed whole to
+                ``audio_to_int16`` to obtain 16-bit samples.
+            sample_rate: Not used by the body; the WAV frame rate is taken
+                from ``audio_tuple``.
+
+        Returns:
+            A ``BytesIO`` rewound to position 0 with ``name`` set to
+            ``"audio.wav"``.
+        """
+        sr, audio_data = audio_tuple  # Unpack sample rate and data from the tuple (data is unused below)
+
+        # Convert the audio to 16-bit PCM (input presumably float32 in [-1, 1] — TODO confirm)
+        audio_int16 = audio_to_int16(audio_tuple)
+
+        buffer = io.BytesIO()
+        with wave.open(buffer, "wb") as wf:
+            wf.setnchannels(1)  # header declares a single channel
+            wf.setsampwidth(2)  # 2 bytes = 16-bit
+            wf.setframerate(sr)
+            wf.writeframes(audio_int16.tobytes())
+
+        buffer.seek(0)
+        # presumably so the upload is recognised as a WAV file — TODO confirm
+        buffer.name = "audio.wav"
+        return buffer
+    
+    def echo(self, audio):
+        """Process audio input and generate audio response"""
+        try:
+            stt_time = time.time()
+            logging.info("Performing STT")
+
+            # Speech to Text using OpenAI Whisper
+            transcription = self.openai_client.audio.transcriptions.create(
+                model="whisper-1",
+                file=self.audio_to_bytes(audio),
+                language="id"
+            )
+            
+            prompt = transcription.text
+            if prompt == "":
+                logging.info("STT returned empty string")
+                return
+            
+            logging.info(f"STT response: {prompt}")
+            self.messages.append({"role": "user", "content": prompt})
+            logging.info(f"STT took {time.time() - stt_time} seconds")
+
+            llm_time = time.time()
+            
+            # Kumpulkan teks dalam chunks yang lebih besar untuk TTS
+            def collect_text_chunks():
+                self.full_response = ""
+                text_buffer = ""
+                chunk_size = 50  # Kumpulkan ~50 karakter sebelum generate TTS
+                
+                response = self.openai_client.chat.completions.create(
+                    model="gpt-3.5-turbo",
+                    messages=self.messages,
+                    max_tokens=200,
+                    stream=True
+                )
+                
+                for chunk in response:
+                    if chunk.choices[0].finish_reason == "stop":
+                        if text_buffer:  # Yield sisa text
+                            yield text_buffer
+                        break
+                    if chunk.choices[0].delta.content:
+                        content = chunk.choices[0].delta.content
+                        self.full_response += content
+                        text_buffer += content
+                        
+                        # Yield ketika buffer cukup besar atau ada tanda baca
+                        if len(text_buffer) >= chunk_size or content in '.!?':
+                            yield text_buffer
+                            text_buffer = ""
+            
+            # Generate TTS untuk setiap chunk text
+            for text_chunk in collect_text_chunks():
+                if text_chunk.strip():
+                    try:
+                        audio_response = self.openai_client.audio.speech.create(
+                            model="tts-1",
+                            voice="nova",
+                            input=text_chunk.strip(),
+                            response_format="pcm",
+                            speed=1.0
+                        )
+                        
+                        audio_bytes = audio_response.content
+                        audio_array = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
+                        
+                        # Yield dalam chunks kecil
+                        chunk_size = 1024
+                        for i in range(0, len(audio_array), chunk_size):
+                            chunk = audio_array[i:i + chunk_size]
+                            yield (24000, chunk)
+                            
+                    except Exception as e:
+                        logging.error(f"TTS generation failed for chunk: {e}")
+                        continue
+
+            # Store assistant response
+            self.messages.append({"role": "assistant", "content": self.full_response + " "})
+            logging.info(f"LLM response: {self.full_response}")
+            logging.info(f"LLM took {time.time() - llm_time} seconds")
+
+        except Exception as e:
+            logging.error(f"Error in echo function: {e}")
+            error_audio = np.zeros(24000, dtype=np.float32)  # 1 second silence
+            yield (24000, error_audio)
+
+    def reset_conversation(self):
+        """Reset conversation history"""
+        logging.info("Resetting chat")
+        self.messages = [{"role": "system", "content": self.sys_prompt}]
+        self.full_response = ""
+    
+    def create_stream(self):
+        """Build the fastrtc ``Stream`` whose handler is ``self.echo``.
+
+        The stream is stored on ``self.stream`` and returned. Any exception
+        raised while building it is logged and re-raised.
+        """
+        try:
+            self.stream = Stream(
+                ReplyOnPause(
+                    self.echo,
+                    algo_options=AlgoOptions(
+                        audio_chunk_duration=0.5,
+                        started_talking_threshold=0.1,
+                        speech_threshold=0.03
+                    ),
+                    # Voice-activity-detection settings for the Silero model.
+                    model_options=SileroVadOptions(
+                        threshold=0.75,
+                        min_speech_duration_ms=250,
+                        min_silence_duration_ms=1500,
+                        speech_pad_ms=400,
+                        max_speech_duration_s=15
+                    )
+                ),
+                modality="audio",
+                mode="send-receive"
+            )
+            return self.stream
+        except Exception as e:
+            logging.error(f"Error creating stream: {e}")
+            raise
+    
+    def create_fastapi_app(self):
+        """Create the FastAPI application with CORS, the mounted stream and helper routes.
+
+        The app is stored on ``self.app`` and returned. The stream is created
+        first if it does not exist yet. Any exception raised during setup is
+        logged and re-raised.
+        """
+        try:
+            self.app = fastapi.FastAPI()
+
+            # Add CORS middleware
+            # NOTE(review): wildcard origins combined with allow_credentials=True
+            # is a very permissive CORS policy — confirm this is intended.
+            self.app.add_middleware(
+                CORSMiddleware,
+                allow_origins=["*"],
+                allow_credentials=True,
+                allow_methods=["*"],
+                allow_headers=["*"],
+            )
+
+            # Create and mount stream
+            if not self.stream:
+                self.create_stream()
+
+            self.stream.mount(self.app)
+
+            # Add reset endpoint
+            # NOTE(review): this GET route changes state (it clears the history
+            # held on this handler instance).
+            @self.app.get("/reset")
+            async def reset():
+                """Clear the conversation; errors are reported in the JSON body."""
+                try:
+                    self.reset_conversation()
+                    return {"status": "success"}
+                except Exception as e:
+                    logging.error(f"Error in reset endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            # Add status endpoint
+            @self.app.get("/status")
+            async def status():
+                """Report the history length and the last generated reply."""
+                try:
+                    messages_count = len(self.messages)
+                    return {
+                        "status": "running",
+                        "messages_count": messages_count,
+                        "last_response": self.full_response
+                    }
+                except Exception as e:
+                    logging.error(f"Error in status endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            return self.app
+
+        except Exception as e:
+            logging.error(f"Error creating FastAPI app: {e}")
+            raise
+    
+    def start_server(self, host: str = "0.0.0.0", port: int = 8000):
+        """Serve ``self.app`` with uvicorn, creating the app first if needed.
+
+        Args:
+            host: Interface to bind to.
+            port: TCP port to listen on.
+
+        Raises:
+            Exception: Whatever ``uvicorn.run`` raises, after it has been logged.
+        """
+        import uvicorn  # imported here rather than at module level
+
+        if not self.app:
+            self.create_fastapi_app()
+
+        logging.info(f"Starting server on {host}:{port}")
+        try:
+            uvicorn.run(self.app, host=host, port=port, log_level="info")
+        except Exception as e:
+            logging.error(f"Error starting server: {e}")
+            raise
+    
+    def launch_ui(self, browser: bool = True):
+        """Launch the RTC UI, creating stream and app first if needed.
+
+        Args:
+            browser: Forwarded to ``ui.launch`` as ``inbrowser``.
+
+        Raises:
+            Exception: Any error raised during setup or launch, after it has
+                been logged.
+        """
+        try:
+            if not self.stream:
+                self.create_stream()
+            if not self.app:
+                self.create_fastapi_app()
+            logging.info("Launching RTC UI...")
+            # NOTE(review): self.app is passed as the first positional argument;
+            # confirm that ui.launch() expects an app there (if ui is a Gradio
+            # Blocks, the first positional parameter is presumably `inline`).
+            self.stream.ui.launch(self.app, inbrowser=browser)
+        except Exception as e:
+            logging.error(f"Error launching UI: {e}")
+            raise
+    
+    def get_conversation_history(self):
+        """Return a shallow copy of ``self.messages`` (the message dicts themselves are shared)."""
+        return self.messages.copy()
+    
+    def set_system_prompt(self, new_prompt: str):
+        """Update system prompt"""
+        self.sys_prompt = new_prompt
+        self.messages[0] = {"role": "system", "content": new_prompt}
+    
+    def get_last_response(self):
+        """Return ``self.full_response``, the reply text accumulated by the latest ``echo`` run."""
+        return self.full_response
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/temporaryinit.temp b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/temporaryinit.temp
new file mode 100644
index 0000000000000000000000000000000000000000..6d3d421e9abeef06ce7eb5e7d97899dcd9ddd89d
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/rtc/temporaryinit.temp
@@ -0,0 +1,18 @@
+from rtc.openai_stt import OpenAISTTModel
+from rtc.openai_tts import OpenAITTSModel
+from rtc.rtc_call import RTCHandler
+from config.constant import OPENAI_API_KEY
+stt_model = OpenAISTTModel(
+            api_key=OPENAI_API_KEY,
+            model="whisper-1",
+            language="en",  # Language passed to the STT model; set to your preferred language
+            response_format="text")
+tts_model = OpenAITTSModel(
+            api_key=OPENAI_API_KEY,
+            model="tts-1",  # Use "tts-1-hd" for higher quality
+            voice="alloy",  # Choose one of: alloy, echo, fable, onyx, nova, shimmer
+            response_format="mp3",
+            speed=1.0)
+def handle_rtc():
+    """Build an RTCHandler around the module-level STT/TTS models and start its stream."""
+    RTCHandler(stt_model, tts_model).start_stream()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
new file mode 100644
index 0000000000000000000000000000000000000000..03f53e8e593532d66b546aefd14550f5a23a5d9f
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/stt/whisper_stt.py
@@ -0,0 +1,31 @@
+import whisper
+from fastrtc.utils import audio_to_int16
+import io
+import os
+import tempfile
+
+class WhisperSTT:
+    """Speech-to-text wrapper around a locally loaded Whisper model."""
+    def __init__(self, model_size: str = "base"):
+        """Load the Whisper model of the given size (tiny, base, small, medium, large) into the cache dir."""
+        cache_dir = os.environ.get('WHISPER_CACHE_DIR', '/tmp/.cache/whisper')
+        os.makedirs(cache_dir, exist_ok=True)
+        self.model = whisper.load_model(model_size, download_root=cache_dir)
+        self.language = "id"  # ISO-639-1 code for Bahasa Indonesia
+
+    def transcribe(self, audio: io.BufferedReader, language: str = "id") -> str:
+        """Transcribe *audio* by spooling its bytes to a temporary .wav file and running Whisper on it.
+
+        Returns the "text" field of the Whisper result, or "" when it is absent.
+        """
+        # delete=False: the file is reopened by path after the handle is closed, then removed in `finally`.
+        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        try:
+            with tmp:  # closing the handle flushes the written bytes to disk
+                tmp.write(audio.read())
+            result = self.model.transcribe(tmp.name, language=language)
+            return result.get("text", "")
+        finally:
+            os.remove(tmp.name)  # runs even if reading/writing the audio failed, so no temp file leaks
+
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/document_retriever_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/document_retriever_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..07c75863b4628d0d59d2e3a3daf2e199a81fd450
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/document_retriever_test.py
@@ -0,0 +1,44 @@
+from rag.retriever.langchain_retriever import LangChainRetriever
+
+
+async def test_document_retriever():
+    """Example usage of LangChainRetriever: index a sample PDF, run one query, and print the hits."""
+    print(" ===== Testing document retriever ==== ")
+    # Initialize retriever
+    retriever = LangChainRetriever(
+        embedding_model="text-embedding-3-small",
+        vectorstore_type="chroma",
+        vectorstore_path="./my_vectorstore",
+        use_hybrid_search=True,
+        chunk_size=1000,
+        chunk_overlap=200
+    )
+    
+    # Add documents from files
+    file_paths = [
+        "../documents/file.pdf",
+    ]
+    
+    for file_path in file_paths:
+        result = await retriever.add_document_from_file(file_path)
+        if result.success:
+            print(f"Successfully processed: {result.document_metadata.file_name}")
+            print(f"Chunks created: {result.document_metadata.chunk_count}")
+        else:
+            print(f"Failed to process: {result.error_message}")
+    
+    # Query documents
+    query = "Recurrent neural network (RNN) is"
+    result = await retriever.retrieve(query, k=5)
+    
+    print(f"\nQuery: {result.query}")
+    print(f"Found {len(result.documents)} relevant documents")
+    print(f"Retrieval time: {result.retrieval_time:.2f}s")
+    
+    for i, doc in enumerate(result.documents):
+        print(f"\nDocument {i+1}:")
+        print(f"Score: {result.scores[i]:.3f}")
+        print(f"Content: {doc.page_content[:200]}...")
+        print(f"Metadata: {doc.metadata}")
+
+    print(" ===== Testing document retriever DONE ==== ")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/inference_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/inference_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..baee29a8a5904b6dbd89af29ab67a3dff56657ff
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/inference_test.py
@@ -0,0 +1,327 @@
+import gradio as gr
+import asyncio
+from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+from rag.retriever.langchain_retriever import LangChainRetriever
+from rag.inference.inferencer import InferencerConfig, Inferencer
+
+async def test_inference():
+    """Main function that sets up and runs the RAG chatbot interface"""
+    
+    # Initialize RAG components
+    print("==== Start Inference Test ===")
+    
+    # Setup LLM
+    config = QwenConfig(
+        temperature=0.3,
+        max_length=512,
+        generation_timeout=30,
+        repetition_penalty=1.1,
+        do_sample = True,
+    )
+    
+    llm = QwenLLM(config=config)
+
+    # Setup Document Retriever
+    document_retriever = LangChainRetriever(
+        embedding_model="text-embedding-3-small",
+        vectorstore_type="chroma",
+        vectorstore_path="./vectorstore",
+        use_hybrid_search=True,
+        chunk_size=1000, 
+        chunk_overlap=200
+    )
+
+    # Load initial documents
+    file_paths = [
+        "../documents/bpjs.pdf",
+        "../documents/pph21.pdf",
+        "../documents/lembur.pdf",
+        "../documents/uu13.pdf",
+        "../documents/file.pdf",
+    ]
+    
+    for file_path in file_paths:
+        try:
+            result = await document_retriever.add_document_from_file(file_path)
+            if result.success:
+                print(f"Successfully processed: {result.document_metadata.file_name}")
+                print(f"Chunks created: {result.document_metadata.chunk_count}")
+            else:
+                print(f"Failed to process: {result.error_message}")
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+
+    # Setup Inferencer
+    inferencer_config = InferencerConfig(
+        default_k=2,
+        enable_reranking=False,
+        default_template_types=["system"]
+    )
+    
+    inferencer = Inferencer(
+        model=llm,
+        retriever=document_retriever,
+        reranker=None,
+        config=inferencer_config
+    )
+    
+    print("RAG system initialized successfully!")
+
+    def chatbot_response(message, history):
+        """Stream the RAG inferencer's answer, yielding the accumulated text after each chunk."""
+        try:
+            # Create new event loop for this thread
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            
+            async def stream_response():
+                partial_response = ""
+                
+                async for stream_data in inferencer.infer_stream(
+                    query=message,
+                    k=3,
+                    template_type="main_template"
+                ):
+                    print(stream_data)
+                    if stream_data["type"] == "chunk":
+                        chunk = stream_data["data"]["chunk"]
+                        partial_response += chunk
+                        yield partial_response
+                        
+                    elif stream_data["type"] == "metadata":
+                        setup_time = stream_data['data']['setup_time']
+                        print(f"\nSetup completed in {setup_time:.2f}s")
+                        
+                    elif stream_data["type"] == "complete":
+                        total_time = stream_data['data']['total_time']
+                        print(f"\nTotal time: {total_time:.2f}s")
+            
+            # Drive the async generator one step at a time so this sync generator can yield each partial
+            async_gen = stream_response()
+            
+            try:
+                while True:
+                    result = loop.run_until_complete(async_gen.__anext__())
+                    yield result
+            except StopAsyncIteration:
+                pass
+            finally:
+                loop.close()
+                
+        except Exception as e:
+            yield f"❌ Error: {str(e)}"
+
+    def add_document_to_vectorstore(file_path):
+        """Index the given file into the vectorstore and return a status string for the UI.
+
+        This handler is a plain (sync) function, so the async retriever call is run on a
+        private event loop; every failure is reported as text instead of being raised.
+        """
+        if not file_path:
+            return "⚠️ Please select a file first."
+
+        try:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                result = loop.run_until_complete(document_retriever.add_document_from_file(file_path))
+            finally:
+                loop.close()  # close the loop even when indexing raises, so it is not leaked
+
+            if result.success:
+                return f"✅ Successfully added: {result.document_metadata.file_name} ({result.document_metadata.chunk_count} chunks)"
+            return f"❌ Failed to add document: {result.error_message}"
+
+        except Exception as e:
+            return f"❌ Error adding document: {str(e)}"
+
+    def clear_chat():
+        """Reset the chat: return an empty history list and an empty message string."""
+        return [], ""
+
+    # CSS for styling
+    css = """
+    .gradio-container {
+        max-width: 900px !important;
+        margin: auto !important;
+    }
+    .chat-message {
+        padding: 10px;
+        margin: 5px;
+        border-radius: 10px;
+    }
+    #chatbot {
+        height: 500px;
+    }
+    """
+
+    # Build the Gradio interface
+    with gr.Blocks(css=css, title="RAG Chatbot") as demo:
+        gr.Markdown("""
+        # 🤖 RAG Chatbot dengan Text Streaming
+        Chatbot berbasis Retrieval-Augmented Generation (RAG) dengan dukungan streaming response.
+        """)
+        
+        # Status indicator
+        with gr.Row():
+            status_text = gr.Textbox(
+                value="✅ RAG System Ready",
+                label="System Status",
+                interactive=False,
+                container=True
+            )
+        
+        with gr.Row():
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(
+                    elem_id="chatbot",
+                    show_label=False,
+                    container=True,
+                    bubble_full_width=False,
+                    show_copy_button=True,
+                    layout="panel"
+                )
+                
+                with gr.Row():
+                    msg = gr.Textbox(
+                        placeholder="Tanyakan sesuatu tentang dokumen Anda...",
+                        show_label=False,
+                        scale=4,
+                        container=False,
+                        lines=1,
+                        max_lines=3,
+                        autofocus=True
+                    )
+                    send_btn = gr.Button("Kirim", variant="primary", scale=1)
+                
+                with gr.Row():
+                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+                    stop_btn = gr.Button("⏹️ Stop", variant="stop", visible=False)
+            
+            # Document management panel
+            with gr.Column(scale=1):
+                gr.Markdown("### 📚 Document Management")
+                
+                with gr.Group():
+                    file_upload = gr.File(
+                        label="Upload Document",
+                        file_types=[".pdf", ".txt", ".docx"],
+                        type="filepath"
+                    )
+                    upload_btn = gr.Button("Add to Knowledge Base", variant="secondary")
+                    upload_status = gr.Textbox(
+                        label="Upload Status",
+                        interactive=False,
+                        lines=3
+                    )
+                
+                gr.Markdown("""
+                ### ⚙️ RAG Settings
+                - **K**: 3 (documents retrieved)
+                - **Template**: Friendly
+                - **Reranking**: Disabled
+                - **Vectorstore**: ChromaDB
+                """)
+        
+        # State tracking whether a response is currently being generated
+        is_generating = gr.State(False)
+        
+        # Event handlers for chat
+        def user_message(message, history, generating):
+            """Append a non-blank message to history and enter the generating state; otherwise return the inputs unchanged with the buttons reset."""
+            if message.strip() and not generating:
+                history.append([message, None])
+                return "", history, True, gr.update(visible=True), gr.update(interactive=False)
+            return message, history, generating, gr.update(visible=False), gr.update(interactive=True)
+        
+        def bot_message_stream(history, generating):
+            """Stream the bot reply into the last history entry, then clear the generating flag and restore the buttons."""
+            if history and history[-1][1] is None and generating:
+                user_msg = history[-1][0]
+                
+                for partial_response in chatbot_response(user_msg, history):
+                    history[-1][1] = partial_response
+                    yield history, True, gr.update(visible=True), gr.update(interactive=False)
+                
+                yield history, False, gr.update(visible=False), gr.update(interactive=True)
+            else:
+                yield history, generating, gr.update(visible=False), gr.update(interactive=True)
+        
+        def stop_generation():
+            """Return the post-stop UI state: not generating, stop button hidden, send button re-enabled."""
+            return False, gr.update(visible=False), gr.update(interactive=True)
+        
+        # Bind events for submitting a message from the textbox
+        submit_event = msg.submit(
+            user_message,
+            inputs=[msg, chatbot, is_generating],
+            outputs=[msg, chatbot, is_generating, stop_btn, send_btn]
+        ).then(
+            bot_message_stream,
+            inputs=[chatbot, is_generating],
+            outputs=[chatbot, is_generating, stop_btn, send_btn]
+        )
+        
+        # Bind events for the send button
+        send_event = send_btn.click(
+            user_message,
+            inputs=[msg, chatbot, is_generating],
+            outputs=[msg, chatbot, is_generating, stop_btn, send_btn]
+        ).then(
+            bot_message_stream,
+            inputs=[chatbot, is_generating],
+            outputs=[chatbot, is_generating, stop_btn, send_btn]
+        )
+        
+        # Clear chat event
+        clear_btn.click(
+            clear_chat,
+            outputs=[chatbot, msg]
+        ).then(
+            lambda: (False, gr.update(visible=False), gr.update(interactive=True)),
+            outputs=[is_generating, stop_btn, send_btn]
+        )
+        
+        # Stop generation event
+        stop_btn.click(
+            stop_generation,
+            outputs=[is_generating, stop_btn, send_btn],
+            cancels=[submit_event, send_event]
+        )
+        
+        # Document upload event
+        upload_btn.click(
+            add_document_to_vectorstore,
+            inputs=[file_upload],
+            outputs=[upload_status]
+        )
+        
+        # Info panel
+        with gr.Accordion("ℹ️ Info Penggunaan", open=False):
+            gr.Markdown("""
+            ### Cara Menggunakan:
+            1. **Chat**: Ketik pertanyaan tentang dokumen yang sudah dimuat
+            2. **Upload**: Tambahkan dokumen baru ke knowledge base
+            3. **Stream**: Response akan muncul secara streaming
+            4. **Stop**: Gunakan tombol stop untuk menghentikan generasi
+            
+            ### Dokumen yang Dimuat:
+            - file2.pdf (dari folder documents)
+            - Dokumen tambahan yang Anda upload
+            
+            ### Teknologi yang Digunakan:
+            - **LLM**: Qwen dengan streaming
+            - **Embedding**: text-embedding-3-small
+            - **Vectorstore**: ChromaDB
+            - **Search**: Hybrid search (dense + sparse)
+            """)
+
+    # Launch the interface
+    print("Launching Gradio interface...")
+    demo.launch(
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7861,
+        show_error=True,
+        show_api=False
+    )
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/qwen_llm_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/qwen_llm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..936ef1bb67521f251537d8fd978b5e28b0287541
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/qwen_llm_test.py
@@ -0,0 +1,69 @@
+from rag.retriever.retriever_types import *
+from rag.pipeline.qwen_llm import QwenLLM, QwenConfig
+
+import warnings
+warnings.filterwarnings("ignore")
+
+async def test_qwen_llm():
+    """Example usage of async QwenLLM: run each generation scenario on one shared model instance."""
+    print(" ===== Testing QWEN LLM ==== ")
+
+    config = QwenConfig(
+        temperature=0.5,
+        max_length=512,
+        generation_timeout=30
+    )
+
+    contexts = RetrievalResult(
+        documents=[
+            Document(page_content="AI adalah teknologi yang memungkinkan mesin untuk belajar dan beradaptasi."),
+            Document(page_content="Machine learning adalah subset dari AI yang fokus pada pembelajaran otomatis."),
+        ],
+        scores=[0.95, 0.1]
+    )
+
+    # Using async context manager
+    async with QwenLLM(config) as llm:
+        await test_qwen_single_generation(llm)
+        await test_qwen_single_rag_generation(llm, contexts)
+        await test_qwen_multiple_template_rag_generation(llm, contexts)
+        await test_qwen_batch_generation(llm, contexts)
+    print(" ===== Testing LLM DONE ==== ")
+
+async def test_qwen_single_generation(llm : QwenLLM):
+    print(" * Test Single Generation * ")
+    response = await llm.generate("Jelaskan tentang AI")  # Indonesian prompt: "Explain about AI"
+    print(f"Response: {response}")
+    print(" * Test Single Generation Done * ")
+
+async def test_qwen_single_rag_generation(llm: QwenLLM, ctx: RetrievalResult):
+    """Ask one question through rag_generate with the "system" template and print the answer."""
+    print(" * Test Single RAG Generation * ")
+    answer = await llm.rag_generate(
+        question="Apa itu AI dan machine learning?",
+        contexts=ctx, template_type="system",
+    )
+    print(f"RAG Response: {answer}")
+    print(" * Test Single RAG Generation Done * ")
+
+async def test_qwen_multiple_template_rag_generation(llm: QwenLLM, ctx: RetrievalResult):
+    """Answer one question with both the "system" and "instruction" templates and print the result."""
+    print(" * Test Multiple Template Generation * ")
+    responses = await llm.multi_template_generate(
+        question="Apa itu AI?",
+        contexts=ctx, template_types=["system", "instruction"],
+    )
+    print(f"Multi-template responses: {responses}")
+    print(" * Test Multiple Template Generation Done* ")
+
+
+async def test_qwen_batch_generation(llm: QwenLLM, ctx: RetrievalResult):
+    """Send three prompts through batch_generate and print the responses (ctx is accepted but unused)."""
+    print(" * Test Batch Generation * ")
+    replies = await llm.batch_generate([
+        "Jelaskan tentang Python", "Apa itu machine learning?",
+        "Bagaimana cara kerja neural network?",
+    ])
+    print(f"Batch responses: {replies}")
+    print(" * Test Batch Generation Done * ")
+    
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/rtc_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/rtc_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..48ed38f21121e018954c538a91a256110c51d676
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tests/rtc_test.py
@@ -0,0 +1,8 @@
+import rtc
+
+predicting_thread = dict(status = False)
+
+
+def test_rtc():
+    """Smoke test: hand control to the rtc package's handle_rtc() entry point."""
+    rtc.handle_rtc()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/audio_edge_tts.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/audio_edge_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f13412885295c84efe9596e6b21f76a38008f6
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/app/tts/audio_edge_tts.py
@@ -0,0 +1,93 @@
+
+import edge_tts
+import asyncio
+from typing import AsyncGenerator, Optional, Tuple
+import io
+
+class EdgeTTS:
+    """Text-to-speech wrapper around edge_tts.
+
+    The voice and prosody settings (rate, volume, pitch) are stored at construction
+    and applied to every request made through this object.
+    """
+
+    def __init__(self, voice_short_name: str, rate: str = "+0%", volume: str = "+0%", pitch: str = "+0Hz"):
+        """Remember the voice and the prosody strings to pass to edge_tts."""
+        self.voice_short_name = voice_short_name
+        self.rate_str = rate
+        self.volume_str = volume
+        self.pitch_str = pitch
+
+    def _communicate(self, text: str) -> "edge_tts.Communicate":
+        """Build the edge_tts request with ALL configured prosody settings (rate, volume, pitch)."""
+        return edge_tts.Communicate(
+            text,
+            self.voice_short_name,
+            rate=self.rate_str,
+            volume=self.volume_str,
+            pitch=self.pitch_str,
+        )
+
+    async def generate_audio_stream(self, text: str) -> AsyncGenerator[bytes, None]:
+        """
+        Generate audio stream as bytes chunks for FastRTC integration
+        Returns: AsyncGenerator yielding audio bytes chunks
+        Raises: ValueError if the text is blank or no voice is configured
+        """
+        if not text.strip():
+            raise ValueError("Please enter text to convert.")
+        if not self.voice_short_name:
+            raise ValueError("Please select a voice.")
+
+        # Stream audio chunks, skipping the non-audio events in the stream
+        async for chunk in self._communicate(text).stream():
+            if chunk["type"] == "audio":
+                yield chunk["data"]
+
+    async def generate_audio_buffer(self, text: str) -> Tuple[Optional[io.BytesIO], Optional[str]]:
+        """
+        Generate complete audio as bytes buffer for immediate use
+        Returns: (audio_buffer, None) on success, (None, error_message) on failure;
+        the buffer is rewound to position 0 so it can be read directly
+        """
+        try:
+            if not text.strip():
+                return None, "Please enter text to convert."
+            if not self.voice_short_name:
+                return None, "Please select a voice."
+
+            # Collect all audio chunks into a buffer
+            audio_buffer = io.BytesIO()
+            async for chunk in self._communicate(text).stream():
+                if chunk["type"] == "audio":
+                    audio_buffer.write(chunk["data"])
+
+            # Rewind: after the writes the position is at the end, so read() would return b""
+            audio_buffer.seek(0)
+            return audio_buffer, None
+
+        except Exception as e:
+            return None, f"Error generating audio: {str(e)}"
+
+    async def generate_audio_with_callback(self, text: str, callback_func):
+        """
+        Generate audio and call callback function for each chunk
+        Useful for real-time streaming to FastRTC
+        callback_func(chunk_bytes, None) per audio chunk; callback_func(None, message) on error
+        """
+        if not text.strip():
+            callback_func(None, "Please enter text to convert.")
+            return
+        if not self.voice_short_name:
+            callback_func(None, "Please select a voice.")
+            return
+
+        try:
+            async for chunk in self._communicate(text).stream():
+                if chunk["type"] == "audio":
+                    # Call callback with audio chunk
+                    callback_func(chunk["data"], None)
+
+        except Exception as e:
+            callback_func(None, f"Error: {str(e)}")
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..91376f170761f6b9e53e31477ba756909a7a6604
Binary files /dev/null and b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt differ
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/.gradio/certificate.pem b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/.gradio/certificate.pem
new file mode 100644
index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/.gradio/certificate.pem
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__chat__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__chat__.py
new file mode 100644
index 0000000000000000000000000000000000000000..10ce03b862781097f7593f756e3913ec3ad2e214
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__chat__.py
@@ -0,0 +1,14 @@
+import asyncio
+from tests.inference_test import test_inference
+from huggingface_hub import login
+login(new_session=False)
+import warnings
+warnings.filterwarnings("ignore")
+def run_test():
+    """Drive the async test_inference() entry point to completion, printing any error."""
+    try:
+        asyncio.run(test_inference())
+    except Exception as e:
+        print(e)
+
+run_test()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__server__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__server__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a5bc16920de927edb07987e71ffdda0ad0f2e0a
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__server__.py
@@ -0,0 +1,3 @@
+import rtc
+
+rtc.handle_rtc_server()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__test__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__test__.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d2681e3c3685a91542e2608036aef88786d9ce
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__test__.py
@@ -0,0 +1,14 @@
+from tests.rtc_test import test_rtc
+import warnings
+warnings.filterwarnings("ignore")
+import asyncio
+def run_test():
+    try:
+        # Only the RTC test is active. The calls below are disabled; the `await` forms cannot
+        # run as-is in this sync function, and only test_rtc is imported by this module:
+        #   await test_document_retriever() / await test_qwen_llm() / asyncio.run(test_inference())
+        test_rtc()
+    except Exception as e:
+        print(e)
+
+run_test()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/config/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/config/constant.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/config/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a1b8059d50edfe0a90b1e00ebc20c4998dfaee0
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/config/constant.py
@@ -0,0 +1,7 @@
+from dotenv import load_dotenv
+import os
+load_dotenv()
+
+OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')
+ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
+HF_TOKEN = os.getenv("HF_TOKEN")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..210adf5ca4086414a4cf11e348e031d639deec18
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/__init__.py
@@ -0,0 +1,2 @@
+from src.internal.chatbot.chatbot_handler import RAGChatbot
+from src.internal.chatbot.ui_chatbot import ChatbotUI
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/chatbot_handler.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/chatbot_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..50dc9d6a5b4ed51da743f22a25160d243a05b732
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/chatbot_handler.py
@@ -0,0 +1,122 @@
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Dict, Any, Optional, List, Tuple
+import asyncio
+import gradio as gr
+from dataclasses import dataclass
+from enum import Enum
+from src.internal.rag.agents.query_maker_agent import QueryMakerAgent
+from src.internal.rag.agents.customer_service_agent import CSAgent
+
+class RAGChatbot:
+    """Main RAG Chatbot class"""
+    
+    def __init__(self, query_maker_agent :QueryMakerAgent,  cs_agent: CSAgent,  title: str = "RAG Chatbot"):
+        self.rag_agent = cs_agent
+        self.query_maker_agent = query_maker_agent
+        self.title = title
+        self.css = self._get_default_css()
+
+        async def document_loader() -> AsyncGenerator[str, None]:
+            await self.rag_agent.load_documents()
+            yield "done"
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        async_gen = document_loader()
+        
+        try:
+            while True:
+                loop.run_until_complete(async_gen.__anext__())
+        except StopAsyncIteration:
+            pass
+        finally:
+            loop.close()
+
+    def _get_default_css(self) -> str:
+        """Get default CSS styling"""
+        return """
+        .gradio-container {
+            max-width: 900px !important;
+            margin: auto !important;
+        }
+        .chat-message {
+            padding: 10px;
+            margin: 5px;
+            border-radius: 10px;
+        }
+        #chatbot {
+            height: 500px;
+        }
+        """
+    
+    async def _stream_response(self, message: str) -> AsyncGenerator[str, None]:
+        """Internal method untuk streaming response"""
+        try:
+            partial_response = ""
+            
+            
+            async for stream_data in self.rag_agent.get_result(question=message):
+                if stream_data["type"] == "chunk":
+                    chunk = stream_data["data"]["chunk"]
+                    partial_response += chunk
+                    yield partial_response
+                    
+                elif stream_data["type"] == "metadata":
+                    setup_time = stream_data['data']['setup_time']
+                    print(f"\nSetup completed in {setup_time:.2f}s")
+                    
+                elif stream_data["type"] == "complete":
+                    total_time = stream_data['data']['total_time']
+                    print(f"\nTotal time: {total_time:.2f}s")
+                    
+        except Exception as e:
+            yield f"❌ Error: {str(e)}"
+    
+    def _chatbot_response(self, message: str, history: List[Tuple[str, str]]):
+        """Generate chatbot response with proper async handling"""
+        try:
+            # Create new event loop for this thread
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            
+            async_gen = self._stream_response(message)
+            
+            try:
+                while True:
+                    result = loop.run_until_complete(async_gen.__anext__())
+                    yield result
+            except StopAsyncIteration:
+                pass
+            finally:
+                loop.close()
+                
+        except Exception as e:
+            yield f"❌ Error: {str(e)}"
+    
+    def _clear_chat(self) -> Tuple[List, str]:
+        """Clear chat history"""
+        return [], ""
+    
+    def _user_message(self, message: str, history: List, generating: bool) -> Tuple[str, List, bool, gr.update, gr.update]:
+        """Handle user message input"""
+        if message.strip() and not generating:
+            history.append([message, None])
+            return "", history, True, gr.update(visible=True), gr.update(interactive=False)
+        return message, history, generating, gr.update(visible=False), gr.update(interactive=True)
+    
+    def _bot_message_stream(self, history: List, generating: bool):
+        """Handle streaming bot response"""
+        if history and history[-1][1] is None and generating:
+            user_msg = history[-1][0]
+            
+            for partial_response in self._chatbot_response(user_msg, history):
+                history[-1][1] = partial_response
+                yield history, True, gr.update(visible=True), gr.update(interactive=False)
+            
+            yield history, False, gr.update(visible=False), gr.update(interactive=True)
+        else:
+            yield history, generating, gr.update(visible=False), gr.update(interactive=True)
+    
+    def _stop_generation(self) -> Tuple[bool, gr.update, gr.update]:
+        """Stop the generation process"""
+        return False, gr.update(visible=False), gr.update(interactive=True)
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/ui_chatbot.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/ui_chatbot.py
new file mode 100644
index 0000000000000000000000000000000000000000..73647b13f0e5b2361892afe4ec788d05146505c6
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/chatbot/ui_chatbot.py
@@ -0,0 +1,171 @@
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Dict, Any, Optional, List, Tuple
+import asyncio
+import gradio as gr
+from dataclasses import dataclass
+from enum import Enum
+from src.internal.chatbot.chatbot_handler import RAGChatbot
+class ChatbotInterface(ABC):
+    """Abstract interface untuk chatbot UI"""
+    
+    @abstractmethod
+    def create_interface(self) -> gr.Blocks:
+        """Create the Gradio interface"""
+        pass
+    
+    @abstractmethod
+    def launch(self, **kwargs) -> None:
+        """Launch the interface"""
+        pass
+
+class ChatbotUI(ChatbotInterface):
+    """Gradio implementation of RAG chatbot interface"""
+    
+    def __init__(self, chatbot: RAGChatbot):
+        self.chatbot = chatbot
+        self.demo = None
+    
+    def create_interface(self) -> gr.Blocks:
+        """Create the Gradio interface"""
+        with gr.Blocks(css=self.chatbot.css, title=self.chatbot.title) as demo:
+            # Header
+            gr.Markdown(f"""
+            # 🤖 {self.chatbot.title}
+            ### Intelligent Document Assistant
+            """)
+            
+            # Status indicator
+            with gr.Row():
+                status_text = gr.Textbox(
+                    value="✅ RAG System Ready",
+                    label="System Status",
+                    interactive=False,
+                    container=True
+                )
+            
+            # Main chat interface
+            with gr.Row():
+                with gr.Column(scale=3):
+                    chatbot_ui = gr.Chatbot(
+                        elem_id="chatbot",
+                        show_label=False,
+                        container=True,
+                        bubble_full_width=False,
+                        show_copy_button=True,
+                        layout="panel"
+                    )
+                    
+                    # Message input
+                    with gr.Row():
+                        msg = gr.Textbox(
+                            placeholder="Ask anything about your documents...",
+                            show_label=False,
+                            scale=4,
+                            container=False,
+                            lines=1,
+                            max_lines=3,
+                            autofocus=True
+                        )
+                        send_btn = gr.Button("Send", variant="primary", scale=1)
+                    
+                    # Control buttons
+                    with gr.Row():
+                        clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+                        stop_btn = gr.Button("⏹️ Stop", variant="stop", visible=False)
+                
+                # Settings panel
+                with gr.Column(scale=1):
+                    gr.Markdown("### ⚙️ RAG Settings")
+                    with gr.Group():
+                        gr.Markdown("""
+                        - **K**: 3 (documents retrieved)
+                        - **Template**: Friendly
+                        - **Reranking**: Disabled
+                        - **Vectorstore**: ChromaDB
+                        - **Streaming**: Enabled
+                        """)
+            
+            # State management
+            is_generating = gr.State(False)
+            
+            # Event bindings
+            self._bind_events(msg, chatbot_ui, send_btn, clear_btn, stop_btn, is_generating)
+            
+            # Info panel
+            with gr.Accordion("ℹ️ Usage Information", open=False):
+                gr.Markdown("""
+                ### How to Use:
+                1. **Chat**: Type questions about your documents
+                2. **Stream**: Responses appear in real-time
+                3. **Stop**: Use stop button to halt generation
+                4. **Clear**: Reset conversation history
+                
+                ### Technology Stack:
+                - **LLM**: Advanced language model with streaming
+                - **Embedding**: High-quality text embeddings
+                - **Vectorstore**: ChromaDB for efficient retrieval
+                - **Search**: Hybrid search (dense + sparse)
+                """)
+        
+        self.demo = demo
+        return demo
+    
+    def _bind_events(self, msg, chatbot_ui, send_btn, clear_btn, stop_btn, is_generating):
+        """Bind all event handlers"""
+        # Submit message events
+        submit_event = msg.submit(
+            self.chatbot._user_message,
+            inputs=[msg, chatbot_ui, is_generating],
+            outputs=[msg, chatbot_ui, is_generating, stop_btn, send_btn]
+        ).then(
+            self.chatbot._bot_message_stream,
+            inputs=[chatbot_ui, is_generating],
+            outputs=[chatbot_ui, is_generating, stop_btn, send_btn]
+        )
+        
+        send_event = send_btn.click(
+            self.chatbot._user_message,
+            inputs=[msg, chatbot_ui, is_generating],
+            outputs=[msg, chatbot_ui, is_generating, stop_btn, send_btn]
+        ).then(
+            self.chatbot._bot_message_stream,
+            inputs=[chatbot_ui, is_generating],
+            outputs=[chatbot_ui, is_generating, stop_btn, send_btn]
+        )
+        
+        # Clear chat event
+        clear_btn.click(
+            self.chatbot._clear_chat,
+            outputs=[chatbot_ui, msg]
+        ).then(
+            lambda: (False, gr.update(visible=False), gr.update(interactive=True)),
+            outputs=[is_generating, stop_btn, send_btn]
+        )
+        
+        # Stop generation event
+        stop_btn.click(
+            self.chatbot._stop_generation,
+            outputs=[is_generating, stop_btn, send_btn],
+            cancels=[submit_event, send_event]
+        )
+    
+    def launch(self, port = 7861, **kwargs) -> None:
+        """Launch the Gradio interface"""
+        if self.demo is None:
+            self.create_interface()
+        
+        # Default launch parameters
+        default_params = {
+            'share': False,
+            'server_name': "0.0.0.0",
+            'server_port': port,
+            'show_error': True,
+            'show_api': False
+        }
+        
+        # Merge with user parameters
+        launch_params = {**default_params, **kwargs}
+        
+        print("Launching Gradio interface...")
+        self.demo.launch(**launch_params)
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..63a2c31063c7bb5e6bcafad7c04ca5e59f0679b7
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/__init__.py
@@ -0,0 +1,14 @@
+from src.internal.rag.pipeline.language_model import LanguageModel, LanguageModelConfig
+from src.internal.rag.retriever.langchain_retriever import LangChainRetriever
+from src.internal.rag.inference.inferencer import Inferencer, InferencerConfig
+from src.internal.rag.agents.customer_service_agent import CSAgent
+from src.internal.rag.agents.query_maker_agent import QueryMakerAgent
+from langchain_core.documents import Document
+from src.internal.rag.web_search.duckduckgo_search import DuckDuckGoSearch
+from src.internal.rag.chat_template.prompt_template import get_chat_template
+
+
+
+
+
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/agents.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/agents.py
new file mode 100644
index 0000000000000000000000000000000000000000..575077d70359a7e88d9f1cf2bfc42a3290cf61da
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/agents.py
@@ -0,0 +1,16 @@
+
+from src.internal.rag.inference.inferencer import Inferencer
+from abc import ABC, abstractmethod
+class Agent(ABC):
+    def __init__(self, inferencer:Inferencer, prompt_template = [
+        {
+            "role" : "system",
+            "content":"You are an agent that doing some specic task"
+        }
+    ]):
+        self.inferencer = inferencer
+        self.inferencer.model.prompt_template = prompt_template
+        self.prompt = prompt_template
+    @abstractmethod
+    async def get_result(self):
+        pass
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/customer_service_agent.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7b06efe5bac5b299e0d07020ddf4af90b325b78
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
@@ -0,0 +1,33 @@
+from src.internal.rag.agents.agents import Agent
+from src.internal.rag.inference.inferencer import Inferencer
+
+class CSAgent(Agent):
+    def __init__(self, inferencer : Inferencer , prompt_template):
+        super().__init__(inferencer, prompt_template)
+        self.inferencer = inferencer
+        self.prompt_template = prompt_template
+        self.file_paths = [
+            "data/documents/bpjs.pdf",
+            # "../documents/pph21.pdf",
+            # "../documents/lembur.pdf",
+            # "../documents/uu13.pdf",
+            "data/documents/file.pdf",
+        ]
+    async def load_documents(self):
+        for file_path in self.file_paths:
+            await self.add_doc(file_path)
+        
+    async def add_doc(self, file_path):
+        result = await self.inferencer.retriever.add_document_from_file(file_path)
+        if result.success:
+                print(f"Successfully processed: {result.document_metadata.file_name}")
+                print(f"Chunks created: {result.document_metadata.chunk_count}")
+        else:
+                print(f"Failed to process: {result.error_message}")
+
+    async def get_result(self, question):
+        self.inferencer.model.prompt_template = self.prompt_template
+        async for item in self.inferencer.infer_stream(query = question,
+                                    enable_reranking=False,
+                                    k=3):
+                yield item
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/gpt_customer_service_agent.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/gpt_customer_service_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07cb9487bcf878fe775a28c7607732934e3a44b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/gpt_customer_service_agent.py
@@ -0,0 +1,12 @@
+from src.internal.rag.agents.agents import Agent
+from src.internal.rag.inference.inferencer import Inferencer
+
+class GPTCSAgent(Agent):
+    def __init__(self, inferencer : Inferencer , prompt_template):
+        super().__init__(inferencer, prompt_template)
+        self.inferencer = inferencer
+        self.prompt_template = prompt_template
+    async def get_result(self, question : str):
+        self.inferencer.model.prompt_template = self.prompt_template
+        print("Question received :", question)
+        return await self.inferencer.infer(query = question)
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/query_maker_agent.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/query_maker_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e30964542549dc0730f9989cec95eb53284dd4b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/query_maker_agent.py
@@ -0,0 +1,12 @@
+from src.internal.rag.agents.agents import Agent
+from src.internal.rag.inference.inferencer import Inferencer
+
+class QueryMakerAgent(Agent):
+    def __init__(self, inferencer : Inferencer , prompt_template):
+        super().__init__(inferencer, prompt_template)
+        self.inferencer = inferencer
+        self.prompt_template = prompt_template
+    async def get_result(self, question : str):
+        self.inferencer.model.prompt_template = self.prompt_template
+        print("Question received :", question)
+        return await self.inferencer.infer(query = question)
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/chat_template/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/chat_template/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/chat_template/prompt_template.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/chat_template/prompt_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b335f9e11f3b32095ae8ac2d876bab71d754f49
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/chat_template/prompt_template.py
@@ -0,0 +1,26 @@
+def read_template_txt(file_path):
+    """Baca file txt biasa"""
+    with open(f"data/prompt_template/{file_path}.txt", 'r', encoding='utf-8') as f:
+        return f.read()
+def get_chat_template(file_name):
+    sys_prompt = read_template_txt(file_name)
+    return [
+        {
+            "role" : "system",
+            "content" : f"""
+            {sys_prompt}
+            """
+        },
+        {
+            "role" : "user",
+            "content" : """ 
+            
+            Please answer properly:  
+            {question} 
+
+            From given context :
+            {context}
+            """
+        }
+    ]
+    
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc727bc1e123e39878fc992c9d360b1c4616b7a9
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer.py
@@ -0,0 +1,473 @@
+from src.internal.rag.retriever.langchain_retriever import LangChainRetriever
+from src.internal.rag.pipeline.language_model import LanguageModel, LanguageModelConfig
+from src.internal.rag.retriever.retriever_types import RetrievalResult
+from typing import List, Union, Dict, Any, Optional, AsyncGenerator
+import asyncio
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+
+@dataclass
+class InferencerConfig:
+    """Configuration for Inferencer."""
+    default_k: int = 5  # documents retrieved when the caller passes no k
+    max_contexts: int = 10  # not read in the visible code — TODO confirm use
+    enable_reranking: bool = False  # default for the optional reranking step
+    reranker_top_k: int = 5  # contexts kept after reranking when top_k is not given
+    enable_logging: bool = True  # False limits the inferencer's logger to ERROR
+    response_timeout: float = 30.0  # presumably seconds; not read in the visible code
+
+class Inferencer:
+    def __init__(self, 
+                 model: LanguageModel, 
+                 retriever: LangChainRetriever = None, 
+                 search_engine = None,
+                 reranker=None,
+                 config: Optional[InferencerConfig] = None):
+        self.model = model
+        self.retriever = retriever
+        self.reranker = reranker
+        self.search_engine = search_engine
+        self.config = config or InferencerConfig()
+        
+        # Setup logging
+        if self.config.enable_logging:
+            logging.basicConfig(level=logging.INFO)
+            self.logger = logging.getLogger(__name__)
+        else:
+            self.logger = logging.getLogger(__name__)
+            self.logger.setLevel(logging.ERROR)
+        
+        # Model loading flag
+        self._model_loaded = False
+    
+    async def _ensure_model_loaded(self):
+        """Pastikan model sudah diload (hanya sekali)"""
+        if not self._model_loaded:
+            self.logger.info("Loading model...")
+            await self.model.load_model()
+            self._model_loaded = True
+            self.logger.info("Model loaded successfully")
+    
+    async def retrieve_context(self, 
+                             query: str, 
+                             k: Optional[int] = None) -> RetrievalResult:
+        """
+        Retrieve context documents
+        
+        Args:
+            query: Search query
+            k: Number of documents to retrieve
+            
+        Returns:
+            RetrievalResult object
+        """
+        k = k or self.config.default_k
+        self.logger.info(f"Retrieving {k} contexts for query: {query[:50]}...")
+        
+        try:
+            start_time = datetime.now()
+            contexts = await self.retriever.retrieve(query, k=k)
+            self.logger.info(f"Retrieved Contexts : {contexts}")
+            retrieval_time = (datetime.now() - start_time).total_seconds()
+            
+            self.logger.info(f"Retrieved {len(contexts.documents) if hasattr(contexts, 'documents') else len(contexts)} contexts in {retrieval_time:.2f}s")
+            return contexts
+            
+        except Exception as e:
+            self.logger.error(f"Error during retrieval: {e}")
+            raise
+    
+    async def rerank_contexts(self, 
+                            contexts: RetrievalResult, 
+                            query: str,
+                            top_k: Optional[int] = None) -> RetrievalResult:
+        """
+        Rerank retrieved contexts
+        
+        Args:
+            contexts: Retrieved contexts
+            query: Original query
+            top_k: Number of top contexts to keep after reranking
+            
+        Returns:
+            Reranked RetrievalResult object
+        """
+        if not self.reranker or not self.config.enable_reranking:
+            self.logger.info("Reranking disabled or reranker not available")
+            return contexts
+        
+        top_k = top_k or self.config.reranker_top_k
+        self.logger.info(f"Reranking contexts, keeping top {top_k}")
+        
+        try:
+            start_time = datetime.now()
+            reranked_contexts = await self.reranker.rerank(
+                query=query,
+                contexts=contexts,
+                top_k=top_k
+            )
+            rerank_time = (datetime.now() - start_time).total_seconds()
+            
+            self.logger.info(f"Reranking completed in {rerank_time:.2f}s")
+            return reranked_contexts
+            
+        except Exception as e:
+            self.logger.error(f"Error during reranking: {e}")
+            # Return original contexts if reranking fails
+            return contexts
+    
+    async def generate_response(self, 
+                              contexts: RetrievalResult, 
+                              query: Union[str, List[str]], 
+                              response_type: Union[List[str], str] = None,
+                              max_new_tokens: Optional[int] = None,
+                              **generation_kwargs) -> List[Dict[str, Any]]:
+
+        await self._ensure_model_loaded()
+        
+        # Default response types
+        if response_type is None:
+            response_type = ["rag_response"]
+        elif isinstance(response_type, str):
+            response_type = [response_type]
+
+        
+        responses = []
+        
+        try:
+            # RAG Response
+            if "rag_response" in response_type:
+                self.logger.info("Generating RAG response...")
+                start_time = datetime.now()
+                
+                if isinstance(query, list):
+                    # Handle multiple queries
+                    rag_responses = {}
+                    for i, q in enumerate(query):
+                        rag_response = await self.model.rag_generate(
+                            question=q,
+                            contexts=contexts,
+                            max_new_tokens=max_new_tokens,
+                            **generation_kwargs
+                        )
+                        rag_responses[f"query_{i}"] = rag_response
+                    responses.append({"rag_response": rag_responses})
+                else:
+                    rag_response = await self.model.rag_generate(
+                        question=query,
+                        contexts=contexts,
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                    responses.append({"rag_response": rag_response})
+                
+                generation_time = (datetime.now() - start_time).total_seconds()
+                self.logger.info(f"RAG response generated in {generation_time:.2f}s")
+            
+            # Multi-template Response
+            if "multi_response" in response_type:
+                self.logger.info("Generating multi-template responses...")
+                start_time = datetime.now()
+                
+                if isinstance(query, list):
+                    multi_responses = {}
+                    for i, q in enumerate(query):
+                        multi_response = await self.model.multi_template_generate(
+                            question=q,
+                            contexts=contexts,
+                            max_new_tokens=max_new_tokens,
+                            **generation_kwargs
+                        )
+                        multi_responses[f"query_{i}"] = multi_response
+                    responses.append({"multi_responses": multi_responses})
+                else:
+                    multi_responses = await self.model.multi_template_generate(
+                        question=query,
+                        contexts=contexts,
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                    responses.append({"multi_responses": multi_responses})
+                
+                generation_time = (datetime.now() - start_time).total_seconds()
+                self.logger.info(f"Multi-template responses generated in {generation_time:.2f}s")
+            
+        
+            if "batch_response" in response_type:
+                self.logger.info("Generating batch responses...")
+                start_time = datetime.now()
+                
+                if isinstance(query, list):
+                    batch_responses = await self.model.batch_generate(
+                        query, 
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                else:
+                    batch_responses = await self.model.batch_generate(
+                        [query], 
+                        max_new_tokens=max_new_tokens,
+                        **generation_kwargs
+                    )
+                
+                responses.append({"batch_responses": batch_responses})
+                
+                generation_time = (datetime.now() - start_time).total_seconds()
+                self.logger.info(f"Batch responses generated in {generation_time:.2f}s")
+            
+            return responses
+            
+        except Exception as e:
+            self.logger.error(f"Error during response generation: {e}")
+            raise
+    
+    async def generate_response_stream(self,
+                                     contexts: RetrievalResult,
+                                     query: str,
+                                     max_new_tokens: Optional[int] = None,
+                                     **generation_kwargs) -> AsyncGenerator[str, None]:
+        """
+        Stream the answer for ``query`` piece by piece.
+
+        Makes sure the model is loaded, then relays every chunk produced by
+        ``self.model.rag_generate_stream`` unchanged.
+
+        Args:
+            contexts: Retrieved (optionally reranked) contexts handed to the model.
+            query: The user question.
+            max_new_tokens: Optional cap forwarded to the model.
+            **generation_kwargs: Extra keyword arguments forwarded to the model.
+
+        Yields:
+            Text chunks in the order the model emits them.
+        """
+        await self._ensure_model_loaded()
+
+        token_stream = self.model.rag_generate_stream(
+            question=query,
+            contexts=contexts,
+            max_new_tokens=max_new_tokens,
+            **generation_kwargs
+        )
+        async for piece in token_stream:
+            yield piece
+    
+    async def infer(self,
+                   query: Union[str, List[str]],
+                   response_type: Union[List[str], str] = None,
+                   k: Optional[int] = None,
+                   enable_reranking: Optional[bool] = None,
+                   max_new_tokens: Optional[int] = None,
+                   **generation_kwargs) -> Dict[str, Any]:
+        """
+        Run the full pipeline: optional web search, retrieval, optional
+        reranking and response generation.
+
+        Args:
+            query: A single question or a list of questions. When a list is
+                given, its first element drives search, retrieval and
+                reranking, while the whole list is passed to generation.
+            response_type: Response type(s) forwarded to ``generate_response``.
+            k: Number of results to request from search and retrieval.
+            enable_reranking: Overrides ``self.config.enable_reranking`` when
+                not ``None``.
+            max_new_tokens: Optional generation cap.
+            **generation_kwargs: Extra keyword arguments for generation.
+
+        Returns:
+            Dict with the keys ``query``, ``responses``, ``contexts`` and
+            ``metadata``.
+
+        Raises:
+            Exception: Any pipeline error is logged and re-raised unchanged.
+        """
+        start_time = datetime.now()
+
+        main_query = query[0] if isinstance(query, list) else query
+        # Same presence test as infer_stream, so both entry points agree.
+        has_retriever = self.retriever is not None
+
+        try:
+            if self.search_engine:
+                # The search engine expects one string, so use the main query
+                # rather than the (possibly list-valued) raw argument.
+                await self.retrieve_from_search_engine(main_query, k=k)
+
+            if has_retriever:
+                retrieved_contexts = await self.retrieve_context(main_query, k=k)
+            else:
+                retrieved_contexts = ""
+
+            enable_rerank = enable_reranking if enable_reranking is not None else self.config.enable_reranking
+            if enable_rerank:
+                contexts = await self.rerank_contexts(retrieved_contexts, main_query)
+            else:
+                contexts = retrieved_contexts
+
+            # Step 3: Generate responses
+            responses = await self.generate_response(
+                contexts=contexts,
+                query=query,
+                response_type=response_type,
+                max_new_tokens=max_new_tokens,
+                **generation_kwargs
+            )
+
+            total_time = (datetime.now() - start_time).total_seconds()
+
+            # Prepare result
+            result = {
+                "query": query,
+                "responses": responses,
+                "contexts": contexts,
+                "metadata": {
+                    "total_time": total_time,
+                    # Report what actually happened instead of a constant True.
+                    "retrieval_enabled": has_retriever,
+                    "reranking_enabled": enable_rerank,
+                    "num_contexts": len(contexts.documents) if hasattr(contexts, 'documents') else len(contexts),
+                    "response_types": response_type,
+                    "timestamp": datetime.now().isoformat()
+                }
+            }
+
+            self.logger.info(f"Inference completed in {total_time:.2f}s")
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error during inference: {e}")
+            raise
+    async def retrieve_from_search_engine(self, query: str, k: Optional[int] = 3):
+        """
+        Run a web search and turn each result into a ``Document``.
+
+        Results are processed one by one as the search engine yields them.
+        Each one is also added to ``self.retriever`` when a retriever is
+        configured.
+
+        Args:
+            query: Search query text.
+            k: Maximum number of results to request. ``None`` (which the
+                inference entry points pass by default) falls back to 3.
+
+        Returns:
+            List of the ``Document`` objects built from the search results.
+
+        Raises:
+            Exception: Any search or indexing error is logged and re-raised.
+        """
+        from langchain_core.documents import Document
+
+        # Callers forward their own optional ``k``; never hand None to the engine.
+        max_results = 3 if k is None else k
+        search_results = []
+
+        try:
+            # Process results one by one as they come
+            async for result in self.search_engine.search(query, max_results=max_results):
+                self.logger.info(f"Processing SEO Result: {result[:100]}...")
+
+                doc = Document(
+                    page_content=result,
+                    metadata={"source": "internet_search", "query": query}
+                )
+                search_results.append(doc)
+
+                # Index immediately, but only if there is a retriever to index into.
+                if self.retriever is not None:
+                    await self.retriever.add_documents([doc])
+
+            self.logger.info(f"Processed {len(search_results)} search results")
+            return search_results
+
+        except Exception as e:
+            self.logger.error(f"Error in retrieve_from_search_engine: {e}", exc_info=True)
+            raise
+    async def infer_stream(self,
+                          query: str,
+                          k: Optional[int] = None,
+                          enable_reranking: Optional[bool] = None,
+                          max_new_tokens: Optional[int] = None,
+                          **generation_kwargs) -> AsyncGenerator[Dict[str, Any], None]:
+        """
+        Streaming variant of the inference pipeline.
+
+        Yields event dicts with a ``type`` and a ``data`` payload:
+        one ``"metadata"`` event after retrieval/reranking, one ``"chunk"``
+        event per generated piece (with the text accumulated so far), and a
+        final ``"complete"`` event. Any exception is logged and reported as a
+        single ``"error"`` event instead of being raised.
+        """
+        started_at = datetime.now()
+
+        try:
+            if self.search_engine:
+                await self.retrieve_from_search_engine(query, k=k)
+
+            retrieved = ""
+            if self.retriever is not None:
+                retrieved = await self.retrieve_context(query, k=k)
+
+            # An explicit argument wins over the configured default.
+            use_rerank = self.config.enable_reranking if enable_reranking is None else enable_reranking
+
+            contexts = retrieved
+            if use_rerank:
+                contexts = await self.rerank_contexts(retrieved, query)
+
+            setup_time = (datetime.now() - started_at).total_seconds()
+            context_count = len(contexts.documents) if hasattr(contexts, 'documents') else len(contexts)
+            yield {
+                "type": "metadata",
+                "data": {
+                    "query": query,
+                    "setup_time": setup_time,
+                    "num_contexts": context_count,
+                    "reranking_enabled": use_rerank,
+                }
+            }
+
+            generation_started = datetime.now()
+            text_so_far = ""
+
+            piece_stream = self.generate_response_stream(
+                contexts=contexts,
+                query=query,
+                max_new_tokens=max_new_tokens,
+                **generation_kwargs
+            )
+            async for piece in piece_stream:
+                text_so_far += piece
+                elapsed = (datetime.now() - generation_started).total_seconds()
+                yield {
+                    "type": "chunk",
+                    "data": {
+                        "chunk": piece,
+                        "accumulated_text": text_so_far,
+                        "generation_time": elapsed
+                    }
+                }
+
+            yield {
+                "type": "complete",
+                "data": {
+                    "total_time": (datetime.now() - started_at).total_seconds(),
+                    "final_response": text_so_far,
+                    "contexts": contexts
+                }
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error during streaming inference: {e}")
+            yield {
+                "type": "error",
+                "data": {
+                    "error": str(e),
+                    "error_time": (datetime.now() - started_at).total_seconds()
+                }
+            }
+    
+    async def batch_infer(self,
+                         queries: List[str],
+                         response_type: Union[List[str], str] = None,
+                         k: Optional[int] = None,
+                         enable_reranking: Optional[bool] = None,
+                         **generation_kwargs) -> List[Dict[str, Any]]:
+        """
+        Run ``infer`` concurrently for every query.
+
+        Returns one dict per query, in input order. Successful results are the
+        ``infer`` output with ``"success": True`` added. Failed ones are
+        ``{"query", "error", "success": False}`` and the error is logged.
+        """
+        self.logger.info(f"Starting batch inference for {len(queries)} queries")
+
+        pending = []
+        for single_query in queries:
+            job = self.infer(
+                query=single_query,
+                response_type=response_type,
+                k=k,
+                enable_reranking=enable_reranking,
+                **generation_kwargs
+            )
+            pending.append(asyncio.create_task(job))
+
+        # Exceptions are collected as values so one failure cannot abort the batch.
+        outcomes = await asyncio.gather(*pending, return_exceptions=True)
+
+        processed_results = []
+        for index, outcome in enumerate(outcomes):
+            if not isinstance(outcome, Exception):
+                outcome["success"] = True
+                processed_results.append(outcome)
+                continue
+
+            self.logger.error(f"Error processing query {index}: {outcome}")
+            processed_results.append({
+                "query": queries[index],
+                "error": str(outcome),
+                "success": False
+            })
+
+        return processed_results
+    
+    async def get_available_templates(self) -> List[str]:
+        """Return the template types reported by the model, loading it first if needed."""
+        await self._ensure_model_loaded()
+        templates = self.model.get_available_templates()
+        return templates
+
+    async def get_model_info(self) -> Dict[str, Any]:
+        """
+        Collect a summary of the model and this inferencer.
+
+        Returns:
+            Dict with ``model_info`` (from the model), ``inferencer_config``
+            (the config object's ``__dict__``), ``reranker_available`` and
+            ``available_templates``.
+        """
+        await self._ensure_model_loaded()
+
+        summary = {"model_info": await self.model.get_model_info()}
+        summary["inferencer_config"] = self.config.__dict__
+        summary["reranker_available"] = self.reranker is not None
+        summary["available_templates"] = await self.get_available_templates()
+        return summary
+    
+    async def close(self):
+        """Release resources by closing the underlying model, if there is one."""
+        self.logger.info("Closing Inferencer...")
+        model = self.model
+        if model:
+            await model.close()
+        self.logger.info("Inferencer closed successfully")
+    
+    async def __aenter__(self):
+        """Enter ``async with``: make sure the model is loaded and return this instance."""
+        await self._ensure_model_loaded()
+        inferencer = self
+        return inferencer
+    
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Leave ``async with``: close the inferencer; exceptions are not suppressed."""
+        await self.close()
+        return None
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/language_model.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/language_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b11ac9ee510bbd8aaa646229eba213c78a7622f
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/language_model.py
@@ -0,0 +1,630 @@
+import torch
+import asyncio
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer, BitsAndBytesConfig
+import torch
+from typing import Optional, Dict, Any, List, Union, Callable, Awaitable, AsyncGenerator
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from threading import Thread
+from src.internal.rag.retriever.retriever_types import RetrievalResult
+from langchain_core.documents import Document
+import copy
+
+@dataclass
+class LanguageModelConfig:
+    """Settings for loading the language model and generating text with it."""
+
+    # --- Model loading ---
+    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"
+    device: str = "cuda"  # passed as ``device_map`` when the model is loaded
+    torch_dtype: torch.dtype = torch.float16
+
+    # --- Default generation parameters ---
+    max_length: int = 2048
+    temperature: float = 0.7
+    top_p: float = 0.8
+    top_k: int = 50
+    do_sample: bool = True
+    # Forwarded to ``from_pretrained``; ``Any`` because the concrete config
+    # class depends on the quantization backend.
+    quantization_config: Optional[Any] = None
+    # When None, the tokenizer's EOS token id is used for both of these.
+    pad_token_id: Optional[int] = None
+    eos_token_id: Optional[int] = None
+
+    # --- RAG prompt formatting ---
+    max_context_length: int = 1500  # measured in characters, not tokens
+    context_separator: str = "\n---\n"  # joins the formatted documents
+    instruction_template: str = "system"  # NOTE(review): not read in this module
+
+    # --- Execution ---
+    max_workers: int = 2  # size of the thread pool used for blocking calls
+    generation_timeout: float = 30  # seconds allowed for a non-streaming generate()
+    repetition_penalty: float = 1.0
+
+    # --- Streaming ---
+    stream_timeout: float = 100  # timeout handed to TextIteratorStreamer
+    skip_prompt: bool = True  # do not echo the prompt in streamed output
+
+class LanguageModel:
+    """
+    Async LLM Qwen 0.5B dengan interface yang mudah digunakan
+    Termasuk prompt formatting khusus untuk RAG (Retrieval-Augmented Generation)
+    Dan support untuk text streaming
+    """
+    
+    def __init__(self, config: Optional[LanguageModelConfig] = None,
+                 prompt_template: Optional[List[Dict]] = None):
+        """
+        Initialise the language model wrapper (the model itself is loaded
+        later by ``load_model``).
+
+        Args:
+            config: Model configuration; defaults are used when None.
+            prompt_template: Chat messages (dicts with ``role`` and
+                ``content``) whose content may contain ``{context}`` and
+                ``{question}`` placeholders. When None, a minimal
+                system + user template is used.
+        """
+        self.config = config if config is not None else LanguageModelConfig()
+
+        self.tokenizer: Optional[AutoTokenizer] = None
+        self.model = None
+        self.generation_config = None
+        self.is_loaded = False
+        self.executor = ThreadPoolExecutor(max_workers=self.config.max_workers)
+        self._lock = asyncio.Lock()
+
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
+
+        # Build the default per instance: a list literal in the signature
+        # would be one shared object mutated across all instances.
+        if prompt_template is None:
+            prompt_template = [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "{question}"}
+            ]
+        self.prompt_template = prompt_template
+    
+    async def load_model(self) -> None:
+        """
+        Load the tokenizer and model without blocking the event loop.
+
+        Safe to call repeatedly: the lock serialises callers and the load is
+        skipped once ``is_loaded`` is set.
+
+        Raises:
+            Exception: Any loading error is logged and re-raised.
+        """
+        async with self._lock:
+            if self.is_loaded:
+                self.logger.info("Model already loaded")
+                return
+
+            try:
+                self.logger.info(f"Loading model: {self.config.model_name}")
+                loop = asyncio.get_running_loop()
+
+                # dtype/device placement are model options; a tokenizer has
+                # no weights, so they are not passed here.
+                self.tokenizer = await loop.run_in_executor(
+                    self.executor,
+                    lambda: AutoTokenizer.from_pretrained(
+                        self.config.model_name,
+                        trust_remote_code=True,
+                    )
+                )
+
+                self.model = await loop.run_in_executor(
+                    self.executor,
+                    lambda: AutoModelForCausalLM.from_pretrained(
+                        self.config.model_name,
+                        quantization_config=self.config.quantization_config,
+                        torch_dtype=self.config.torch_dtype,
+                        device_map=self.config.device,
+                        trust_remote_code=True
+                    )
+                )
+
+                # ``is not None`` rather than ``or``: token id 0 is valid and
+                # must not be replaced by the EOS id.
+                fallback_id = self.tokenizer.eos_token_id
+                pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else fallback_id
+                eos_token_id = self.config.eos_token_id if self.config.eos_token_id is not None else fallback_id
+
+                self.generation_config = GenerationConfig(
+                    max_length=self.config.max_length,
+                    temperature=self.config.temperature,
+                    top_p=self.config.top_p,
+                    top_k=self.config.top_k,
+                    do_sample=self.config.do_sample,
+                    pad_token_id=pad_token_id,
+                    eos_token_id=eos_token_id,
+                    repetition_penalty=self.config.repetition_penalty,
+                )
+
+                self.is_loaded = True
+                self.logger.info("Model loaded successfully!")
+
+            except Exception as e:
+                self.logger.error(f"Error loading model: {e}")
+                raise
+    
+    def _format_context(self, contexts: Union[List[str], RetrievalResult], numbering: bool = True) -> str:
+        """
+        Render retrieved contexts as one string.
+
+        Each context becomes a ``[Dokumen N]`` header (``[Dokumen]`` when
+        ``numbering`` is False) followed by its text. Entries are joined with
+        ``config.context_separator``. For a ``RetrievalResult`` with
+        numbering enabled, the header also carries the score when one exists.
+
+        Args:
+            contexts: A ``RetrievalResult`` or a list of strings / objects
+                (non-strings are rendered with ``str()``).
+            numbering: Whether to number the documents.
+
+        Returns:
+            The formatted context string, or ``""`` when ``contexts`` is falsy.
+        """
+        if not contexts:
+            return ""
+
+        # Full document text can be large, so keep it out of INFO logs.
+        self.logger.debug(f"Context : {contexts}")
+        self.logger.info(f"Is RetrievalResult Contexts =  {isinstance(contexts, RetrievalResult)}")
+
+        formatted_contexts = []
+        if isinstance(contexts, RetrievalResult):
+            # Scores may be missing or shorter than the document list, so
+            # never index into them blindly.
+            scores = contexts.scores or []
+            for i, ctx in enumerate(contexts.documents, 1):
+                if numbering:
+                    score = scores[i - 1] if i <= len(scores) else None
+                    header = f"[Dokumen {i}"
+                    # ``is not None`` so that a legitimate score of 0 is shown.
+                    if score is not None:
+                        header += f" (Skor: {score:.3f})"
+                    header += "]"
+                else:
+                    header = "[Dokumen]"
+                formatted_contexts.append(f"{header}\n{ctx.page_content}")
+        else:
+            for i, ctx in enumerate(contexts, 1):
+                header = f"[Dokumen {i}]" if numbering else "[Dokumen]"
+                formatted_contexts.append(f"{header}\n{str(ctx)}")
+
+        return self.config.context_separator.join(formatted_contexts)
+    
+    def _truncate_context(self, context: str, max_length: int) -> str:
+        """
+        Shorten ``context`` so the result never exceeds ``max_length`` characters.
+
+        Args:
+            context: Context string.
+            max_length: Maximum length in characters.
+
+        Returns:
+            ``context`` unchanged when it already fits. Otherwise a prefix
+            followed by a truncation marker, or just a hard-cut prefix when
+            ``max_length`` is too small to hold the marker.
+        """
+        if len(context) <= max_length:
+            return context
+
+        marker = "\n\n[... Context dipotong karena terlalu panjang ...]"
+        # Reserve exactly the marker's length instead of a guessed constant,
+        # so the total stays within the limit.
+        keep = max_length - len(marker)
+        if keep <= 0:
+            # No room for the marker; a negative slice bound would keep most
+            # of the text instead of shortening it.
+            return context[:max(max_length, 0)]
+
+        return context[:keep] + marker
+
+    async def format_rag_prompt(self,
+                                question: str,
+                                contexts: Union[List[str], RetrievalResult],
+                                custom_template: Optional[str] = None,
+                                include_metadata: bool = True,
+                                context_numbering: bool = True,
+                                max_contexts: Optional[int] = None) -> Union[str, List[Dict]]:
+        """
+        Build the RAG prompt for ``question`` from the retrieved ``contexts``.
+
+        Args:
+            question: The user question.
+            contexts: A ``RetrievalResult`` or a list of context strings.
+            custom_template: Optional ``str.format`` template using
+                ``{context}`` and ``{question}``; when given, a single
+                formatted string is returned.
+            include_metadata: Accepted for compatibility; currently has no
+                effect on the produced prompt.
+            context_numbering: Whether documents are numbered in the context.
+            max_contexts: Optional limit on the number of contexts used.
+
+        Returns:
+            A string when ``custom_template`` is given, otherwise a list of
+            chat messages (dicts with ``role`` and ``content``).
+        """
+
+        def _format_sync() -> str:
+            # Limit the number of contexts, then format and truncate them.
+            if isinstance(contexts, RetrievalResult):
+                docs = contexts.documents
+                if max_contexts:
+                    docs = docs[:max_contexts]
+                processed_contexts = RetrievalResult(
+                    documents=docs,
+                    scores=contexts.scores[:len(docs)] if contexts.scores else [],
+                    query=contexts.query,
+                    retrieval_time=contexts.retrieval_time,
+                    metadata=contexts.metadata
+                )
+            else:
+                processed_contexts = contexts[:max_contexts] if max_contexts and len(contexts) > max_contexts else contexts
+
+            formatted = self._format_context(processed_contexts, context_numbering)
+            return self._truncate_context(formatted, self.config.max_context_length)
+
+        formatted_context = await asyncio.get_running_loop().run_in_executor(
+            self.executor, _format_sync
+        )
+        self.logger.info(f"Formatted Context {formatted_context}")
+
+        if custom_template:
+            return custom_template.format(
+                context=formatted_context,
+                question=question
+            )
+
+        if not self.prompt_template:
+            # No template configured: plain question without any context.
+            return [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": question}
+            ]
+
+        self.logger.debug(f"question {question}")
+        # Work on a copy so the stored template keeps its placeholders.
+        template_data = copy.deepcopy(self.prompt_template)
+        self.logger.debug(f"rag template = {template_data}")
+
+        formatted_template = []
+        for cht in template_data:
+            content = cht["content"]
+            if "{context}" in content or "{question}" in content:
+                try:
+                    content = content.format(
+                        context=formatted_context,
+                        question=question
+                    )
+                except (KeyError, IndexError, ValueError) as e:
+                    # str.format fails on unknown placeholders (KeyError),
+                    # positional ones (IndexError) and stray braces such as
+                    # embedded JSON (ValueError); substitute only our two
+                    # placeholders literally instead.
+                    self.logger.error(f"Missing placeholder in template: {e}")
+                    if "{context}" in content:
+                        content = content.replace("{context}", formatted_context)
+                    if "{question}" in content:
+                        content = content.replace("{question}", question)
+
+            formatted_chat = {
+                "role": cht["role"],
+                "content": content
+            }
+            if "description" in cht:
+                formatted_chat["description"] = cht["description"]
+
+            formatted_template.append(formatted_chat)
+
+        return formatted_template
+
+    async def generate_stream(self,
+                             prompt: List[Dict],
+                             max_new_tokens: Optional[int] = None,
+                             temperature: Optional[float] = None,
+                             top_p: Optional[float] = None,
+                             **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Generate text for a chat ``prompt`` and yield it as it is produced.
+
+        Generation runs in a background thread that feeds a
+        ``TextIteratorStreamer``. Tokens are pulled from the streamer in a
+        worker thread so the event loop is never blocked while waiting.
+
+        Args:
+            prompt: Chat messages (dicts with ``role`` and ``content``).
+            max_new_tokens: Optional cap on generated tokens.
+            temperature: Optional sampling temperature override.
+            top_p: Optional nucleus-sampling override.
+            **kwargs: Extra keyword arguments forwarded to ``model.generate``.
+
+        Yields:
+            Non-empty decoded text pieces.
+
+        Raises:
+            RuntimeError: If the model has not been loaded.
+            Exception: Errors from prompt setup, from the generation thread,
+                or a streamer timeout are logged and re-raised.
+        """
+        await self._check_model_loaded()
+
+        streamer = TextIteratorStreamer(
+            self.tokenizer,
+            timeout=self.config.stream_timeout,
+            skip_prompt=self.config.skip_prompt,
+            skip_special_tokens=True
+        )
+
+        # Filled by the generation thread if model.generate raises.
+        generation_errors = []
+
+        def _run_generation(**generation_kwargs):
+            try:
+                self.model.generate(**generation_kwargs)
+            except Exception as exc:
+                generation_errors.append(exc)
+                # Unblock the consumer right away instead of letting it wait
+                # for the streamer timeout.
+                streamer.end()
+
+        def _generate_sync():
+            try:
+                inputs = self.tokenizer.apply_chat_template(
+                    prompt,
+                    add_generation_prompt=True,
+                    return_tensors="pt"
+                )
+
+                gen_config = self.generation_config
+                if any([max_new_tokens, temperature, top_p]):
+                    # ``is not None`` so a configured token id of 0 is kept.
+                    fallback_id = self.tokenizer.eos_token_id
+                    gen_config = GenerationConfig(
+                        max_new_tokens=max_new_tokens or self.config.max_length,
+                        temperature=temperature or self.config.temperature,
+                        top_p=top_p or self.config.top_p,
+                        top_k=self.config.top_k,
+                        do_sample=self.config.do_sample,
+                        pad_token_id=self.config.pad_token_id if self.config.pad_token_id is not None else fallback_id,
+                        eos_token_id=self.config.eos_token_id if self.config.eos_token_id is not None else fallback_id,
+                        repetition_penalty=self.config.repetition_penalty,
+                    )
+
+                # The model was placed at load time; follow it rather than
+                # assuming CUDA (and never call .to() on a quantized model).
+                input_ids = inputs.to(self.model.device)
+
+                # Extra kwargs go to generate() only: it applies them on top
+                # of the generation config, and passing them twice could
+                # collide with the explicit config arguments above.
+                generation_kwargs = {
+                    "input_ids": input_ids,
+                    "generation_config": gen_config,
+                    "streamer": streamer,
+                    **kwargs
+                }
+
+                thread = Thread(target=_run_generation, kwargs=generation_kwargs)
+                thread.start()
+
+                return thread
+
+            except Exception as e:
+                self.logger.error(f"Error during stream generation setup: {e}")
+                raise
+
+        loop = asyncio.get_running_loop()
+        generation_thread = await loop.run_in_executor(
+            self.executor, _generate_sync
+        )
+
+        # StopIteration cannot cross an executor future, so end-of-stream is
+        # signalled with a sentinel default for next().
+        end_of_stream = object()
+        token_iterator = iter(streamer)
+        try:
+            while True:
+                token = await loop.run_in_executor(
+                    None, next, token_iterator, end_of_stream
+                )
+                if token is end_of_stream:
+                    break
+                if token:
+                    yield token
+
+            await loop.run_in_executor(None, generation_thread.join)
+
+            if generation_errors:
+                raise generation_errors[0]
+
+        except Exception as e:
+            self.logger.error(f"Error during streaming: {e}")
+
+            if generation_thread.is_alive():
+                generation_thread.join(timeout=1.0)
+            raise
+
+    async def rag_generate_stream(self,
+                                 question: str,
+                                 contexts: Union[List[str], RetrievalResult],
+                                 max_new_tokens: Optional[int] = None,
+                                 temperature: Optional[float] = None,
+                                 **kwargs) -> AsyncGenerator[str, None]:
+        """
+        Stream a RAG answer: build the prompt from ``question`` and
+        ``contexts``, then relay the chunks of ``generate_stream``.
+
+        When ``temperature`` is None, 0.3 is used.
+        """
+        await self._check_model_loaded()
+
+        messages = await self.format_rag_prompt(question, contexts)
+        effective_temperature = 0.3 if temperature is None else temperature
+
+        token_stream = self.generate_stream(
+            prompt=messages,
+            max_new_tokens=max_new_tokens,
+            temperature=effective_temperature,
+            **kwargs
+        )
+        async for piece in token_stream:
+            yield piece
+
+
+    async def rag_generate(self,
+                          question: str,
+                          contexts: Union[List[str], RetrievalResult],
+                          max_new_tokens: Optional[int] = None,
+                          temperature: Optional[float] = None,
+                          **kwargs) -> str:
+        """
+        Produce a complete RAG answer: build the prompt from ``question`` and
+        ``contexts`` and return the text generated by ``generate``.
+
+        When ``temperature`` is None, 0.3 is used.
+        """
+        await self._check_model_loaded()
+
+        messages = await self.format_rag_prompt(question, contexts)
+        effective_temperature = 0.3 if temperature is None else temperature
+
+        answer = await self.generate(
+            prompt=messages,
+            max_new_tokens=max_new_tokens,
+            temperature=effective_temperature,
+            **kwargs
+        )
+        return answer
+    
+    async def _check_model_loaded(self) -> None:
+        """Raise ``RuntimeError`` unless the model has already been loaded."""
+        if self.is_loaded:
+            return
+        raise RuntimeError("Model belum di-load. Panggil await load_model() terlebih dahulu.")
+    
+    async def generate(self,
+                      prompt: Union[List[Dict], str],
+                      max_new_tokens: Optional[int] = None,
+                      temperature: Optional[float] = None,
+                      top_p: Optional[float] = None,
+                      **kwargs) -> str:
+        """
+        Generate text for ``prompt`` without blocking the event loop.
+
+        Args:
+            prompt: Chat messages (dicts with ``role`` and ``content``) or a
+                plain string, which is sent as a single user message.
+            max_new_tokens: Maximum number of new tokens to generate.
+            temperature: Sampling temperature (overrides the config).
+            top_p: Nucleus-sampling value (overrides the config).
+            **kwargs: Extra keyword arguments forwarded to ``model.generate``.
+
+        Returns:
+            The generated text, without the prompt.
+
+        Raises:
+            RuntimeError: If the model has not been loaded.
+            TimeoutError: If generation exceeds ``config.generation_timeout``.
+                The worker thread cannot be interrupted and keeps running
+                until the model call returns.
+        """
+        await self._check_model_loaded()
+
+        # apply_chat_template needs a message list; callers such as
+        # batch_generate pass bare strings.
+        if isinstance(prompt, str):
+            messages = [{"role": "user", "content": prompt}]
+        else:
+            messages = prompt
+
+        def _generate_sync():
+            try:
+                inputs = self.tokenizer.apply_chat_template(
+                    messages,
+                    add_generation_prompt=True,
+                    return_tensors="pt"
+                )
+
+                gen_config = self.generation_config
+                if any([max_new_tokens, temperature, top_p]):
+                    # ``is not None`` so a configured token id of 0 is kept.
+                    fallback_id = self.tokenizer.eos_token_id
+                    gen_config = GenerationConfig(
+                        max_new_tokens=max_new_tokens or self.config.max_length,
+                        temperature=temperature or self.config.temperature,
+                        top_p=top_p or self.config.top_p,
+                        top_k=self.config.top_k,
+                        do_sample=self.config.do_sample,
+                        pad_token_id=self.config.pad_token_id if self.config.pad_token_id is not None else fallback_id,
+                        eos_token_id=self.config.eos_token_id if self.config.eos_token_id is not None else fallback_id,
+                        repetition_penalty=self.config.repetition_penalty,
+                    )
+
+                with torch.no_grad():
+                    # The model was placed at load time; follow it rather
+                    # than assuming CUDA (and never call .to() on a
+                    # quantized model).
+                    input_ids = inputs.to(self.model.device)
+                    prompt_length = input_ids.shape[-1]
+                    # Extra kwargs go to generate() only: it applies them on
+                    # top of the generation config.
+                    outputs = self.model.generate(
+                        input_ids,
+                        generation_config=gen_config,
+                        **kwargs
+                    )
+
+                # Drop the prompt tokens so only the continuation is decoded.
+                generated_text = self.tokenizer.decode(
+                    outputs[0][prompt_length:],
+                    skip_special_tokens=True
+                )
+
+                self.logger.debug(f"Generated Text {generated_text}")
+
+                return generated_text
+
+            except Exception as e:
+                self.logger.error(f"Error during generation: {e}")
+                raise
+
+        try:
+            result = await asyncio.wait_for(
+                asyncio.get_running_loop().run_in_executor(self.executor, _generate_sync),
+                timeout=self.config.generation_timeout
+            )
+            return result
+        except asyncio.TimeoutError:
+            self.logger.error(f"Generation timeout after {self.config.generation_timeout} seconds")
+            raise TimeoutError(f"Generation timeout after {self.config.generation_timeout} seconds")
+
+    async def update_config(self, **kwargs) -> None:
+        """
+        Update configuration fields and rebuild the default generation config.
+
+        Unknown keys are ignored with a warning. Only the generation config
+        is rebuilt here; the thread pool and the loaded model are not
+        recreated.
+
+        Args:
+            **kwargs: Configuration fields to update.
+        """
+        async with self._lock:
+            for key, value in kwargs.items():
+                if hasattr(self.config, key):
+                    setattr(self.config, key, value)
+                    self.logger.info(f"Updated {key} to {value}")
+                else:
+                    self.logger.warning(f"Unknown config parameter: {key}")
+
+            if self.is_loaded:
+                # ``is not None`` rather than ``or``: token id 0 is valid and
+                # must not be replaced by the EOS id.
+                fallback_id = self.tokenizer.eos_token_id
+                pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else fallback_id
+                eos_token_id = self.config.eos_token_id if self.config.eos_token_id is not None else fallback_id
+
+                self.generation_config = GenerationConfig(
+                    max_length=self.config.max_length,
+                    temperature=self.config.temperature,
+                    top_p=self.config.top_p,
+                    top_k=self.config.top_k,
+                    do_sample=self.config.do_sample,
+                    pad_token_id=pad_token_id,
+                    eos_token_id=eos_token_id,
+                    repetition_penalty=self.config.repetition_penalty,
+                )
+    
+    async def get_model_info(self) -> Dict[str, Any]:
+        """
+        Describe the model.
+
+        Returns:
+            Dict with ``model_name``, ``is_loaded`` and ``config`` (the config
+            object's ``__dict__``). Once the model is loaded it also contains
+            ``vocab_size``, ``model_parameters`` and ``device``.
+        """
+        summary = {
+            "model_name": self.config.model_name,
+            "is_loaded": self.is_loaded,
+            "config": self.config.__dict__
+        }
+
+        if not self.is_loaded:
+            return summary
+
+        def _collect_runtime_details():
+            # Runs in the worker pool: walking all parameters can be slow.
+            details = {"vocab_size": self.tokenizer.vocab_size}
+            details["model_parameters"] = sum(p.numel() for p in self.model.parameters())
+            details["device"] = str(next(self.model.parameters()).device)
+            return details
+
+        runtime_details = await asyncio.get_event_loop().run_in_executor(
+            self.executor, _collect_runtime_details
+        )
+        summary.update(runtime_details)
+        return summary
+    
+    async def batch_generate(self,
+                           prompts: List[str],
+                           max_new_tokens: Optional[int] = None,
+                           **kwargs) -> List[str]:
+        """
+        Generate for several prompts concurrently.
+
+        Args:
+            prompts: Prompts to generate for.
+            max_new_tokens: Maximum number of new tokens per prompt.
+            **kwargs: Extra keyword arguments forwarded to ``generate``.
+
+        Returns:
+            One string per prompt, in input order. A failed prompt is logged
+            and yields ``"Error: <message>"`` instead of raising.
+        """
+        await self._check_model_loaded()
+
+        pending = []
+        for single_prompt in prompts:
+            job = self.generate(single_prompt, max_new_tokens=max_new_tokens, **kwargs)
+            pending.append(asyncio.create_task(job))
+
+        # Exceptions are collected as values so one failure cannot abort the batch.
+        outcomes = await asyncio.gather(*pending, return_exceptions=True)
+
+        processed_results = []
+        for index, outcome in enumerate(outcomes):
+            if not isinstance(outcome, Exception):
+                processed_results.append(outcome)
+                continue
+            self.logger.error(f"Error generating prompt {index}: {outcome}")
+            processed_results.append(f"Error: {str(outcome)}")
+
+        return processed_results
+    
+    async def close(self) -> None:
+        """
+        Release resources: shut down the thread pool, drop the model and
+        tokenizer attributes, empty the CUDA cache when CUDA is available,
+        and mark the model as not loaded.
+        """
+        self.logger.info("Closing LM...")
+
+        self.executor.shutdown(wait=True)
+
+        # Remove the attributes entirely (not just set to None), model first.
+        for attr_name in ("model", "tokenizer"):
+            if getattr(self, attr_name, None) is not None:
+                delattr(self, attr_name)
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        self.is_loaded = False
+        self.logger.info("LM closed successfully")
+    
+    async def __aenter__(self):
+        """Enter ``async with``: load the model and return this instance."""
+        await self.load_model()
+        language_model = self
+        return language_model
+    
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Leave ``async with``: close the model; exceptions are not suppressed."""
+        await self.close()
+        return None
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/base_retriever.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/base_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..61b8f1c3077297be68669255d4c671b2be539b95
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/base_retriever.py
@@ -0,0 +1,177 @@
+from src.internal.rag.retriever.retriever_types import (
+    DocumentType, 
+    RetrievalResult, 
+)
+from typing import List
+from abc import ABC, abstractmethod
+from langchain_community.document_loaders import (
+    PyMuPDFLoader,
+    Docx2txtLoader,
+    UnstructuredPowerPointLoader,
+    TextLoader
+)
+
+from src.internal.rag.retriever.document_loader import BaseDocumentLoader
+from langchain_core.documents import Document
+
+import asyncio
+import logging
+from pathlib import Path
+
+import hashlib
+import os
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class BaseRetriever(ABC):
+    """Interface that every retriever backend has to implement.
+
+    All three operations are coroutines and must be overridden by concrete
+    subclasses.
+    """
+
+    @abstractmethod
+    async def add_documents(self, documents: List[Document]) -> bool:
+        """Store ``documents`` in the retriever and return a ``bool`` outcome."""
+        ...
+
+    @abstractmethod
+    async def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
+        """Look up documents relevant to ``query``; ``k`` defaults to 5."""
+        ...
+
+    @abstractmethod
+    async def delete_documents(self, document_ids: List[str]) -> bool:
+        """Remove the documents named by ``document_ids`` and return a ``bool`` outcome."""
+        ...
+
+
+
+class MultiFormatDocumentLoader(BaseDocumentLoader):
+    """Load PDF, DOCX, PPT, PPTX and TXT files as LangChain ``Document`` objects.
+
+    The file extension selects one of the LangChain community loaders. Every
+    document that comes back is tagged with the path, name, type, size and MD5
+    hash of the file it was read from.
+
+    NOTE(review): a class with this name is also defined in
+    ``src.internal.rag.retriever.document_loader``, which is the module
+    ``langchain_retriever`` imports it from - consider keeping a single copy.
+    """
+
+    # Single source of truth for the supported extensions and their types.
+    _EXTENSION_TYPES = {
+        ".pdf": DocumentType.PDF,
+        ".docx": DocumentType.DOCX,
+        ".ppt": DocumentType.PPT,
+        ".pptx": DocumentType.PPTX,
+        ".txt": DocumentType.TXT,
+    }
+
+    def __init__(self):
+        # Dispatch table: document type -> coroutine that loads that format.
+        self.loaders = {
+            DocumentType.PDF: self._load_pdf,
+            DocumentType.DOCX: self._load_docx,
+            DocumentType.PPT: self._load_ppt,
+            DocumentType.PPTX: self._load_pptx,
+            DocumentType.TXT: self._load_txt,
+        }
+
+    async def load_document(self, file_path: str) -> List[Document]:
+        """Load the file at ``file_path`` with the loader matching its extension.
+
+        Returns:
+            The loaded documents, each with ``file_path``, ``file_name``,
+            ``file_type``, ``file_size`` and ``file_hash`` added to its metadata.
+
+        Raises:
+            FileNotFoundError: if the path does not exist.
+            ValueError: if the extension is not supported.
+            Exception: if the underlying loader fails.
+
+        Every failure is logged before it is re-raised.
+        """
+        try:
+            file_path = Path(file_path)
+            if not file_path.exists():
+                raise FileNotFoundError(f"File not found: {file_path}")
+
+            doc_type = self._get_document_type(file_path)
+
+            loader_func = self.loaders.get(doc_type)
+            if not loader_func:
+                raise ValueError(f"Unsupported file type: {doc_type}")
+
+            logger.info(f"Loading {doc_type} document: {file_path}")
+            documents = await loader_func(str(file_path))
+
+            # These values describe the file, not an individual document, so
+            # stat and hash the file once instead of once per returned document.
+            file_metadata = {
+                "file_path": str(file_path),
+                "file_name": file_path.name,
+                "file_type": doc_type.value,
+                "file_size": file_path.stat().st_size,
+                "file_hash": self._calculate_file_hash(file_path),
+            }
+            for doc in documents:
+                doc.metadata.update(file_metadata)
+
+            return documents
+
+        except Exception as e:
+            logger.error(f"Error loading document {file_path}: {str(e)}")
+            raise
+
+    def get_supported_extensions(self) -> List[str]:
+        """Return the supported extensions, each with its leading dot."""
+        return list(self._EXTENSION_TYPES)
+
+    def _get_document_type(self, file_path: Path) -> DocumentType:
+        """Map the (case-insensitive) extension of ``file_path`` to a ``DocumentType``.
+
+        Raises:
+            ValueError: if the extension is not one of the supported ones.
+        """
+        extension = file_path.suffix.lower()
+        doc_type = self._EXTENSION_TYPES.get(extension)
+        if not doc_type:
+            raise ValueError(f"Unsupported file extension: {extension}")
+        return doc_type
+
+    def _calculate_file_hash(self, file_path: Path) -> str:
+        """Return the MD5 hex digest of the file, read in 4 KiB chunks."""
+        hash_md5 = hashlib.md5()
+        with open(file_path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+
+    async def _run_loader(self, loader_cls, file_path: str, label: str) -> List[Document]:
+        """Build ``loader_cls(file_path)`` and run its ``load`` in the default executor.
+
+        Any failure, including one raised while constructing the loader, is
+        re-raised as ``Exception("Error loading <label>: ...")`` with the
+        original exception chained as its cause.
+        """
+        try:
+            loader = loader_cls(file_path)
+            return await asyncio.get_running_loop().run_in_executor(
+                None, loader.load
+            )
+        except Exception as e:
+            raise Exception(f"Error loading {label}: {str(e)}") from e
+
+    async def _load_pdf(self, file_path: str) -> List[Document]:
+        """Load a PDF document with ``PyMuPDFLoader``."""
+        return await self._run_loader(PyMuPDFLoader, file_path, "PDF")
+
+    async def _load_docx(self, file_path: str) -> List[Document]:
+        """Load a DOCX document with ``Docx2txtLoader``."""
+        return await self._run_loader(Docx2txtLoader, file_path, "DOCX")
+
+    async def _load_ppt(self, file_path: str) -> List[Document]:
+        """Load a PPT document with ``UnstructuredPowerPointLoader``."""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPT")
+
+    async def _load_pptx(self, file_path: str) -> List[Document]:
+        """Load a PPTX document with ``UnstructuredPowerPointLoader``."""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPTX")
+
+    async def _load_txt(self, file_path: str) -> List[Document]:
+        """Load a TXT document with ``TextLoader``."""
+        return await self._run_loader(TextLoader, file_path, "TXT")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/document_loader.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/document_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4b04f24fc075edae8a2ed69f0ff567c562ad61
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/document_loader.py
@@ -0,0 +1,168 @@
+from src.internal.rag.retriever.retriever_types import (
+    DocumentType, 
+    RetrievalResult, 
+)
+from abc import ABC, abstractmethod
+from typing import List
+from langchain_core.documents import Document
+from pathlib import Path
+
+import logging
+
+from langchain_community.document_loaders import (
+    PyMuPDFLoader,
+    Docx2txtLoader,
+    UnstructuredPowerPointLoader,
+    TextLoader
+)
+import asyncio
+import hashlib
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class BaseDocumentLoader(ABC):
+    """Interface for classes that turn a file on disk into documents.
+
+    Concrete loaders must override both methods.
+    """
+
+    @abstractmethod
+    async def load_document(self, file_path: str) -> List[Document]:
+        """Read the file at ``file_path`` and return its documents."""
+        ...
+
+    @abstractmethod
+    def get_supported_extensions(self) -> List[str]:
+        """Return the file extensions this loader can handle."""
+        ...
+
+
+
+class MultiFormatDocumentLoader(BaseDocumentLoader):
+    """Load PDF, DOCX, PPT, PPTX and TXT files as LangChain ``Document`` objects.
+
+    The file extension selects one of the LangChain community loaders. Every
+    document that comes back is tagged with the path, name, type, size and MD5
+    hash of the file it was read from.
+    """
+
+    # Single source of truth for the supported extensions and their types.
+    _EXTENSION_TYPES = {
+        ".pdf": DocumentType.PDF,
+        ".docx": DocumentType.DOCX,
+        ".ppt": DocumentType.PPT,
+        ".pptx": DocumentType.PPTX,
+        ".txt": DocumentType.TXT,
+    }
+
+    def __init__(self):
+        # Dispatch table: document type -> coroutine that loads that format.
+        self.loaders = {
+            DocumentType.PDF: self._load_pdf,
+            DocumentType.DOCX: self._load_docx,
+            DocumentType.PPT: self._load_ppt,
+            DocumentType.PPTX: self._load_pptx,
+            DocumentType.TXT: self._load_txt,
+        }
+
+    async def load_document(self, file_path: str) -> List[Document]:
+        """Load the file at ``file_path`` with the loader matching its extension.
+
+        Returns:
+            The loaded documents, each with ``file_path``, ``file_name``,
+            ``file_type``, ``file_size`` and ``file_hash`` added to its metadata.
+
+        Raises:
+            FileNotFoundError: if the path does not exist.
+            ValueError: if the extension is not supported.
+            Exception: if the underlying loader fails.
+
+        Every failure is logged before it is re-raised.
+        """
+        try:
+            file_path = Path(file_path)
+            if not file_path.exists():
+                raise FileNotFoundError(f"File not found: {file_path}")
+
+            doc_type = self._get_document_type(file_path)
+
+            loader_func = self.loaders.get(doc_type)
+            if not loader_func:
+                raise ValueError(f"Unsupported file type: {doc_type}")
+
+            logger.info(f"Loading {doc_type} document: {file_path}")
+            documents = await loader_func(str(file_path))
+
+            # These values describe the file, not an individual document, so
+            # stat and hash the file once instead of once per returned document.
+            file_metadata = {
+                "file_path": str(file_path),
+                "file_name": file_path.name,
+                "file_type": doc_type.value,
+                "file_size": file_path.stat().st_size,
+                "file_hash": self._calculate_file_hash(file_path),
+            }
+            for doc in documents:
+                doc.metadata.update(file_metadata)
+
+            return documents
+
+        except Exception as e:
+            logger.error(f"Error loading document {file_path}: {str(e)}")
+            raise
+
+    def get_supported_extensions(self) -> List[str]:
+        """Return the supported extensions, each with its leading dot."""
+        return list(self._EXTENSION_TYPES)
+
+    def _get_document_type(self, file_path: Path) -> DocumentType:
+        """Map the (case-insensitive) extension of ``file_path`` to a ``DocumentType``.
+
+        Raises:
+            ValueError: if the extension is not one of the supported ones.
+        """
+        extension = file_path.suffix.lower()
+        doc_type = self._EXTENSION_TYPES.get(extension)
+        if not doc_type:
+            raise ValueError(f"Unsupported file extension: {extension}")
+        return doc_type
+
+    def _calculate_file_hash(self, file_path: Path) -> str:
+        """Return the MD5 hex digest of the file, read in 4 KiB chunks."""
+        hash_md5 = hashlib.md5()
+        with open(file_path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+
+    async def _run_loader(self, loader_cls, file_path: str, label: str) -> List[Document]:
+        """Build ``loader_cls(file_path)`` and run its ``load`` in the default executor.
+
+        Any failure, including one raised while constructing the loader, is
+        re-raised as ``Exception("Error loading <label>: ...")`` with the
+        original exception chained as its cause.
+        """
+        try:
+            loader = loader_cls(file_path)
+            return await asyncio.get_running_loop().run_in_executor(
+                None, loader.load
+            )
+        except Exception as e:
+            raise Exception(f"Error loading {label}: {str(e)}") from e
+
+    async def _load_pdf(self, file_path: str) -> List[Document]:
+        """Load a PDF document with ``PyMuPDFLoader``."""
+        return await self._run_loader(PyMuPDFLoader, file_path, "PDF")
+
+    async def _load_docx(self, file_path: str) -> List[Document]:
+        """Load a DOCX document with ``Docx2txtLoader``."""
+        return await self._run_loader(Docx2txtLoader, file_path, "DOCX")
+
+    async def _load_ppt(self, file_path: str) -> List[Document]:
+        """Load a PPT document with ``UnstructuredPowerPointLoader``."""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPT")
+
+    async def _load_pptx(self, file_path: str) -> List[Document]:
+        """Load a PPTX document with ``UnstructuredPowerPointLoader``."""
+        return await self._run_loader(UnstructuredPowerPointLoader, file_path, "PPTX")
+
+    async def _load_txt(self, file_path: str) -> List[Document]:
+        """Load a TXT document with ``TextLoader``."""
+        return await self._run_loader(TextLoader, file_path, "TXT")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/document_processor.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/document_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7596d32e8ea3c72685b0ced26eb711503e59656b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/document_processor.py
@@ -0,0 +1,55 @@
+from typing import List, Dict, Any, Optional, Union
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+import asyncio
+import logging
+
+from langchain_core.documents import Document
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class DocumentProcessor:
+    """Split documents into overlapping text chunks and tag every chunk."""
+
+    def __init__(self,
+                 chunk_size: int = 1000,
+                 chunk_overlap: int = 200,
+                 separators: Optional[List[str]] = None):
+        """Configure the underlying ``RecursiveCharacterTextSplitter``.
+
+        Args:
+            chunk_size: passed to the splitter; lengths are measured with ``len``.
+            chunk_overlap: passed to the splitter.
+            separators: separators handed to the splitter; defaults to blank
+                line, newline, space and finally the empty string.
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+        if separators is None:
+            separators = ["\n\n", "\n", " ", ""]
+
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=separators,
+            length_function=len
+        )
+
+    async def process_documents(self, documents: List[Document]) -> List[Document]:
+        """Split ``documents`` into chunks and annotate each chunk's metadata.
+
+        Every chunk receives ``chunk_id`` (its position in the returned list),
+        ``chunk_size`` (length of its text) and ``processed_at`` (Unix time of
+        this call, as a string).
+
+        Raises:
+            Exception: whatever the splitter raises, after it has been logged.
+        """
+        # Function-scope import: this module has no file-level ``time`` import.
+        import time
+
+        try:
+            logger.info(f"Processing {len(documents)} documents")
+
+            # The splitter is synchronous, so run it off the event loop.
+            chunks = await asyncio.get_running_loop().run_in_executor(
+                None, self.text_splitter.split_documents, documents
+            )
+
+            # Wall-clock time, read once so all chunks of one call agree. The
+            # event loop's own clock is monotonic with an arbitrary origin and
+            # therefore cannot serve as a timestamp.
+            processed_at = str(time.time())
+            for i, chunk in enumerate(chunks):
+                chunk.metadata.update({
+                    "chunk_id": i,
+                    "chunk_size": len(chunk.page_content),
+                    "processed_at": processed_at
+                })
+
+            logger.info(f"Created {len(chunks)} chunks from {len(documents)} documents")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing documents: {str(e)}")
+            raise
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/langchain_retriever.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/langchain_retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ec2e66d671bb66e557f0d42a9a86652ab5305c
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/langchain_retriever.py
@@ -0,0 +1,235 @@
+from src.internal.rag.retriever.base_retriever import BaseRetriever
+
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_openai import OpenAIEmbeddings
+
+from langchain_community.vectorstores import Chroma, FAISS, Pinecone
+from langchain.retrievers import EnsembleRetriever
+
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_community.retrievers import BM25Retriever
+
+from typing import Dict, Optional, List
+from src.internal.rag.retriever.document_loader import MultiFormatDocumentLoader
+from src.internal.rag.retriever.document_processor import DocumentProcessor
+from src.internal.rag.retriever.retriever_types import ProcessingResult, ProcessingStatus, RetrievalResult, DocumentMetadata
+
+import asyncio
+from pathlib import Path
+import logging
+from langchain_core.documents import Document
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class LangChainRetriever(BaseRetriever):
+    """LangChain-based retriever with multiple format support.
+
+    Files are loaded with ``MultiFormatDocumentLoader``, chunked by
+    ``DocumentProcessor`` and stored in a Chroma or FAISS vector store. When
+    ``use_hybrid_search`` is enabled, vector search is combined with BM25
+    keyword search as soon as documents have been added through this instance.
+    """
+
+    def __init__(self,
+                 embedding_model: str = "text-embedding-3-small",
+                 vectorstore_type: str = "chroma",
+                 vectorstore_path: Optional[str] = None,
+                 use_hybrid_search: bool = True,
+                 **kwargs):
+        """Build the loader, processor, embeddings, vector store and retriever.
+
+        Args:
+            embedding_model: names starting with ``text-embedding`` select
+                ``OpenAIEmbeddings``; anything else is treated as a
+                HuggingFace model name.
+            vectorstore_type: ``"chroma"`` or ``"faiss"`` (case-insensitive).
+            vectorstore_path: storage location; defaults to ``./vectorstore``.
+            use_hybrid_search: combine vector search with BM25.
+            **kwargs: forwarded unchanged to ``DocumentProcessor``.
+        """
+        self.embedding_model = embedding_model
+        self.vectorstore_type = vectorstore_type
+        self.vectorstore_path = vectorstore_path or "./vectorstore"
+        self.use_hybrid_search = use_hybrid_search
+
+        # Every chunk added through this instance. BM25 is rebuilt from this
+        # list, so it has to outlive individual add_documents() calls.
+        self._bm25_documents: List[Document] = []
+        self.bm25_retriever = None
+
+        self.document_loader = MultiFormatDocumentLoader()
+        self.document_processor = DocumentProcessor(**kwargs)
+        self.embeddings = self._initialize_embeddings()
+        self.vectorstore = self._initialize_vectorstore()
+        self.retriever = self._initialize_retriever()
+
+        # Metadata of successfully processed files, keyed by file hash.
+        self.processed_documents: Dict[str, DocumentMetadata] = {}
+
+        logger.info(f"LangChainRetriever initialized with {vectorstore_type} vectorstore")
+
+    def _initialize_embeddings(self):
+        """Create the embedding backend named by ``self.embedding_model``.
+
+        On any error the failure is logged and a HuggingFace
+        ``all-MiniLM-L6-v2`` model is returned instead.
+        """
+        try:
+            if self.embedding_model.startswith("text-embedding"):
+                return OpenAIEmbeddings(model=self.embedding_model)
+            else:
+                return HuggingFaceEmbeddings(model_name=self.embedding_model)
+        except Exception as e:
+            logger.error(f"Error initializing embeddings: {str(e)}")
+            return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    def _initialize_vectorstore(self):
+        """Create the vector store selected by ``self.vectorstore_type``.
+
+        An unsupported type, or any construction error, is logged and followed
+        by an attempt to build an in-memory FAISS store.
+        """
+        # NOTE(review): the FAISS constructor call and the empty
+        # ``FAISS.from_documents([])`` fallback look unlikely to succeed with
+        # current langchain_community releases - confirm against the installed
+        # version before relying on the "faiss" option.
+        try:
+            if self.vectorstore_type.lower() == "chroma":
+                return Chroma(
+                    persist_directory=self.vectorstore_path,
+                    embedding_function=self.embeddings
+                )
+            elif self.vectorstore_type.lower() == "faiss":
+                return FAISS(
+                    embedding_function=self.embeddings,
+                    index_path=self.vectorstore_path
+                )
+            else:
+                raise ValueError(f"Unsupported vectorstore type: {self.vectorstore_type}")
+        except Exception as e:
+            logger.error(f"Error initializing vectorstore: {str(e)}")
+            return FAISS.from_documents([], self.embeddings)
+
+    def _build_vector_retriever(self) -> VectorStoreRetriever:
+        """Return a retriever over ``self.vectorstore`` that fetches 10 candidates."""
+        return VectorStoreRetriever(
+            vectorstore=self.vectorstore,
+            search_kwargs={"k": 10}
+        )
+
+    def _initialize_retriever(self):
+        """Return the initial retriever, which is always vector-only.
+
+        BM25 needs documents to index, so hybrid search is switched on by
+        ``_update_bm25_retriever`` the first time documents are added.
+        """
+        try:
+            return self._build_vector_retriever()
+        except Exception as e:
+            logger.error(f"Error initializing retriever: {str(e)}")
+            return VectorStoreRetriever(vectorstore=self.vectorstore)
+
+    async def add_document_from_file(self, file_path: str) -> ProcessingResult:
+        """Load, chunk and index one file.
+
+        Never raises: every failure is logged and reported through a
+        ``ProcessingResult`` with ``success=False``. ``document_metadata`` is
+        ``None`` when the failure happened before the metadata could be built.
+        """
+        # Function-scope import, as in retrieve(): no file-level ``time`` import.
+        import time
+
+        doc_metadata = None
+        try:
+            file_path = Path(file_path)
+            if not file_path.exists():
+                return ProcessingResult(
+                    success=False,
+                    document_metadata=None,
+                    chunks=[],
+                    error_message=f"File not found: {file_path}"
+                )
+
+            doc_metadata = DocumentMetadata(
+                file_path=str(file_path),
+                file_name=file_path.name,
+                file_type=self.document_loader._get_document_type(file_path),
+                file_size=file_path.stat().st_size,
+                file_hash=self.document_loader._calculate_file_hash(file_path),
+                # Wall-clock time; the event loop clock is monotonic only.
+                created_at=str(time.time()),
+                processing_status=ProcessingStatus.PROCESSING
+            )
+
+            documents = await self.document_loader.load_document(str(file_path))
+            chunks = await self.document_processor.process_documents(documents)
+
+            # add_documents() reports failure through its return value, so a
+            # False here must not be recorded as a completed document.
+            if not await self.add_documents(chunks):
+                raise RuntimeError("Failed to add document chunks to the vector store")
+
+            doc_metadata.chunk_count = len(chunks)
+            doc_metadata.processing_status = ProcessingStatus.COMPLETED
+            doc_metadata.processed_at = str(time.time())
+            self.processed_documents[doc_metadata.file_hash] = doc_metadata
+
+            logger.info(f"Successfully processed {file_path}: {len(chunks)} chunks")
+
+            return ProcessingResult(
+                success=True,
+                document_metadata=doc_metadata,
+                chunks=chunks
+            )
+
+        except Exception as e:
+            error_msg = f"Error processing document {file_path}: {str(e)}"
+            logger.error(error_msg)
+
+            if doc_metadata is not None:
+                # Do not leave the record stuck in PROCESSING.
+                doc_metadata.processing_status = ProcessingStatus.ERROR
+                doc_metadata.error_message = error_msg
+
+            return ProcessingResult(
+                success=False,
+                document_metadata=doc_metadata,
+                chunks=[],
+                error_message=error_msg
+            )
+
+    async def add_documents(self, documents: List[Document]) -> bool:
+        """Add ``documents`` to the vector store (and to BM25 in hybrid mode).
+
+        Returns:
+            ``True`` on success or when ``documents`` is empty; ``False`` if
+            adding failed (the error is logged, not raised).
+        """
+        try:
+            if not documents:
+                return True
+
+            await asyncio.get_running_loop().run_in_executor(
+                None, self.vectorstore.add_documents, documents
+            )
+
+            if self.use_hybrid_search:
+                await self._update_bm25_retriever(documents)
+
+            logger.info(f"Added {len(documents)} documents to vector store")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error adding documents: {str(e)}")
+            return False
+
+    async def _update_bm25_retriever(self, documents: List[Document]):
+        """Rebuild BM25 over all documents added so far and enable hybrid search.
+
+        If building BM25 or the ensemble fails, the error is logged and
+        ``self.retriever`` falls back to vector-only search.
+        """
+        try:
+            # Index the accumulated corpus, not just this batch; otherwise each
+            # call would drop previously added documents from keyword search.
+            # Only documents added through this instance are covered.
+            self._bm25_documents.extend(documents)
+            self.bm25_retriever = BM25Retriever.from_documents(self._bm25_documents)
+            self.bm25_retriever.k = 10
+
+            self.retriever = EnsembleRetriever(
+                retrievers=[self._build_vector_retriever(), self.bm25_retriever],
+                weights=[0.5, 0.5]
+            )
+
+        except Exception as e:
+            logger.error(f"Error updating BM25 retriever: {str(e)}")
+            self.retriever = self._build_vector_retriever()
+
+    async def retrieve(self, query: str, k: int = 5) -> RetrievalResult:
+        """Return up to ``k`` documents for ``query``.
+
+        The scores are placeholders derived from rank only (0.9 for the first
+        hit, decreasing by 0.1 per position, never below 0); they are not
+        similarity values from the vector store.
+
+        Raises:
+            Exception: whatever the underlying retriever raises, after logging.
+        """
+        try:
+            import time
+            start_time = time.perf_counter()
+            logger.info(f"Retrieving documents for query: '{query}'")
+
+            # ``invoke`` replaces the deprecated ``get_relevant_documents``.
+            retrieved_docs = await asyncio.get_running_loop().run_in_executor(
+                None, self.retriever.invoke, query
+            )
+            # max() keeps a negative k from slicing from the end of the list.
+            retrieved_docs = retrieved_docs[:max(k, 0)]
+
+            scores = [max(0.0, 0.9 - (i * 0.1)) for i in range(len(retrieved_docs))]
+
+            retrieval_time = time.perf_counter() - start_time
+
+            logger.info(f"Retrieved {len(retrieved_docs)} documents in {retrieval_time:.2f}s")
+
+            return RetrievalResult(
+                documents=retrieved_docs,
+                scores=scores,
+                query=query,
+                retrieval_time=retrieval_time,
+                metadata={
+                    "vectorstore_type": self.vectorstore_type,
+                    "embedding_model": self.embedding_model,
+                    "hybrid_search": self.use_hybrid_search
+                }
+            )
+
+        except Exception as e:
+            logger.error(f"Error retrieving documents: {str(e)}")
+            raise
+
+    async def delete_documents(self, document_ids: List[str]) -> bool:
+        """Delete the given ids from the vector store.
+
+        Returns:
+            ``True`` if the store's ``delete`` ran without raising; ``False``
+            if the store has no ``delete`` method or the call failed.
+
+        The in-memory BM25 corpus is not pruned by this call.
+        """
+        try:
+            if not hasattr(self.vectorstore, 'delete'):
+                logger.warning("Vector store does not support deleting documents")
+                return False
+
+            await asyncio.get_running_loop().run_in_executor(
+                None, self.vectorstore.delete, document_ids
+            )
+            logger.info(f"Deleted {len(document_ids)} documents")
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting documents: {str(e)}")
+            return False
+
+    def get_document_metadata(self, file_hash: str) -> Optional[DocumentMetadata]:
+        """Return the metadata recorded for ``file_hash``, or ``None`` if unknown."""
+        return self.processed_documents.get(file_hash)
+
+    def list_processed_documents(self) -> List[DocumentMetadata]:
+        """Return the metadata of every successfully processed file."""
+        return list(self.processed_documents.values())
+
+    def get_supported_formats(self) -> List[str]:
+        """Return the file extensions accepted by the document loader."""
+        return self.document_loader.get_supported_extensions()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/retriever_types.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/retriever_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e59b8fc155e29dfcf474130d0c95cd9af385e9
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/retriever/retriever_types.py
@@ -0,0 +1,52 @@
+
+from typing import List, Dict, Any, Optional, Union
+
+
+from dataclasses import dataclass
+from enum import Enum
+
+from langchain_core.documents import Document
+
+class DocumentType(str, Enum):
+    """Supported document formats; each value is the extension without a dot."""
+    PDF = "pdf"
+    DOCX = "docx"
+    PPT = "ppt"
+    PPTX = "pptx"
+    TXT = "txt"
+
+class ProcessingStatus(str, Enum):
+    """Processing states recorded in ``DocumentMetadata.processing_status``."""
+    PENDING = "pending"
+    PROCESSING = "processing"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+@dataclass
+class DocumentMetadata:
+    """Description of one source file and the state of its processing."""
+    # Identity of the source file.
+    file_path: str
+    file_name: str
+    file_type: DocumentType
+    file_size: int
+    file_hash: str
+    # Timestamps are stored as strings supplied by the caller.
+    created_at: str
+    processed_at: Optional[str] = None
+    # Number of chunks produced from the file; 0 until it is set.
+    chunk_count: int = 0
+    processing_status: ProcessingStatus = ProcessingStatus.PENDING
+    error_message: Optional[str] = None
+
+@dataclass
+class RetrievalResult:
+    """Outcome of one retrieval call: the documents found and their scores."""
+    documents: List[Document]
+    # One score per entry of ``documents``, in the same order.
+    scores: List[float]
+    query: Optional[str] = None
+    # Time the retrieval took, as measured by the retriever.
+    retrieval_time: Optional[float] = None
+    # Free-form details about how the retrieval was performed.
+    metadata: Optional[Dict[str, Any]] = None
+
+@dataclass
+class ProcessingResult:
+    """Outcome of processing one document file."""
+    success: bool
+    # None when processing failed before any metadata could be built
+    # (for example when the file does not exist).
+    document_metadata: Optional[DocumentMetadata]
+    chunks: List[Document]
+    error_message: Optional[str] = None
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/web_search/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/web_search/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/web_search/duckduckgo_search.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/web_search/duckduckgo_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..92fafbbad37b79d55d26db135289d4afa121443a
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rag/web_search/duckduckgo_search.py
@@ -0,0 +1,91 @@
+import logging
+import re
+from typing import AsyncGenerator, List, Optional
+
+from ddgs import DDGS
+from langchain_community.document_loaders import AsyncChromiumLoader
+from langchain_community.document_transformers import BeautifulSoupTransformer
+
+class DuckDuckGoSearch:
+    """Search DuckDuckGo and stream the paragraph text of the result pages."""
+
+    def __init__(
+        self,
+        html_loader: Optional[AsyncChromiumLoader] = None,
+        html_parser: Optional[BeautifulSoupTransformer] = None,
+    ):
+        """Store the page loader and HTML parser, creating defaults if omitted."""
+        self.html_loader = html_loader or AsyncChromiumLoader([])
+        self.html_parser = html_parser or BeautifulSoupTransformer()
+        self.logger = logging.getLogger("ddgs_logger")
+
+    async def get_page(self, urls: List[str]):
+        """Fetch ``urls`` and return one transformed document per loaded page.
+
+        Only ``<p>`` content is extracted and ``<a>`` tags are removed. Any
+        error is logged and an empty list is returned.
+        """
+        try:
+            # The loader instance is reused; its URL list is replaced per call.
+            self.html_loader.urls = urls
+            html_docs = await self.html_loader.aload()
+            self.logger.info(f"Loaded {len(html_docs)} documents")
+            docs_transformed = self.html_parser.transform_documents(
+                html_docs, tags_to_extract=["p"], remove_unwanted_tags=["a"]
+            )
+            return docs_transformed
+        except Exception as e:
+            self.logger.error(f"Error loading pages: {e}", exc_info=True)
+            return []
+
+    def truncate(self, text: str, max_words: int = 400) -> str:
+        """Return the first ``max_words`` words of ``text``.
+
+        ``"..."`` is appended when words were dropped; empty input gives ``""``.
+        """
+        if not text:
+            return ""
+        words = text.split()
+        truncated = " ".join(words[:max_words])
+        return truncated + "..." if len(words) > max_words else truncated
+
+    @staticmethod
+    def _query_ddg(query: str, max_results: int) -> list:
+        """Run the blocking DuckDuckGo text query; a ``None`` result becomes ``[]``."""
+        return list(DDGS().text(query, max_results=max_results) or [])
+
+    async def _find_results(self, query: str, max_results: int) -> list:
+        """Run ``_query_ddg`` in a worker thread so the event loop is not blocked."""
+        # Function-scope import: this module has no file-level asyncio import.
+        import asyncio
+
+        return await asyncio.to_thread(self._query_ddg, query, max_results)
+
+    def _clean_page_text(self, doc) -> str:
+        """Return the truncated, whitespace-normalised text of ``doc`` ("" if empty)."""
+        page_text = getattr(doc, 'page_content', '').strip()
+        if not page_text:
+            return ""
+        # Collapse runs of blank lines into single newlines before truncating.
+        return self.truncate(re.sub(r"\n\n+", "\n", page_text))
+
+    async def search(self, query: str, max_results: int = 5) -> AsyncGenerator[str, None]:
+        """Search for ``query`` and yield the text of each result page in turn.
+
+        Pages without text are skipped. Errors are logged rather than raised,
+        so a failed search simply yields nothing.
+        """
+        try:
+            self.logger.info(f"Searching for: {query} (max_results: {max_results})")
+            results = await self._find_results(query, max_results)
+            urls = [r.get('href') for r in results if r.get('href')]
+            self.logger.info(f"Found {len(urls)} URLs to process")
+            if not urls:
+                self.logger.warning("No URLs found from search results")
+                return
+            docs = await self.get_page(urls)
+            for doc in docs:
+                try:
+                    text = self._clean_page_text(doc)
+                    if text:
+                        yield text
+                except Exception as e:
+                    self.logger.error(f"Error processing document: {e}")
+        except Exception as e:
+            self.logger.error(f"Error in search method: {e}", exc_info=True)
+
+    async def search_with_metadata(self, query: str, max_results: int = 5) -> AsyncGenerator[dict, None]:
+        """Search for ``query`` and yield one dict per result page that has text.
+
+        Each dict has the keys ``content``, ``url``, ``title`` and
+        ``word_count``. Errors are logged rather than raised.
+        """
+        try:
+            results = await self._find_results(query, max_results)
+            urls_and_titles = [
+                {'url': r.get('href'), 'title': r.get('title', 'No title')}
+                for r in results if r.get('href')
+            ]
+            if not urls_and_titles:
+                return
+            urls = [item['url'] for item in urls_and_titles]
+            docs = await self.get_page(urls)
+
+            # First search hit per URL, used to pair a page with its title.
+            hits_by_url = {}
+            for item in urls_and_titles:
+                hits_by_url.setdefault(item['url'], item)
+
+            for i, doc in enumerate(docs):
+                try:
+                    text = self._clean_page_text(doc)
+                    if not text:
+                        continue
+                    # Prefer the URL the page itself reports, so a missing or
+                    # reordered page cannot shift titles onto the wrong
+                    # content. Presumably the loader stores it under
+                    # metadata["source"] - verify; otherwise fall back to
+                    # pairing by position.
+                    source = (getattr(doc, 'metadata', None) or {}).get('source')
+                    metadata = hits_by_url.get(source)
+                    if metadata is None:
+                        metadata = urls_and_titles[i] if i < len(urls_and_titles) else {}
+                    yield {
+                        'content': text,
+                        'url': metadata.get('url', 'Unknown'),
+                        'title': metadata.get('title', 'No title'),
+                        'word_count': len(text.split())
+                    }
+                except Exception as e:
+                    self.logger.error(f"Error processing document {i}: {e}")
+        except Exception as e:
+            self.logger.error(f"Error in search_with_metadata: {e}", exc_info=True)
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/.gradio/certificate.pem b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/.gradio/certificate.pem
new file mode 100644
index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/.gradio/certificate.pem
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c757976a38f119a50ae4cc323f43231f2bb6556b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/__init__.py
@@ -0,0 +1,3 @@
+
+# from rtc.rtc_call import RTCHandler
+from src.internal.rtc.rtc_call import RTCHandler
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5deff744fd053e12ebdb84edae18cd81ce024ce
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call.py
@@ -0,0 +1,230 @@
+import fastapi
+from fastapi.middleware.cors import CORSMiddleware
+from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials
+from src.utils.audio_helper import audio_to_bytes, resample_audio
+from dotenv import load_dotenv
+
+from src.internal.tts.audio_edge_tts import EdgeTTS
+from src.internal.stt.whisper_stt import WhisperSTT
+from src.internal.rag.agents.customer_service_agent import CSAgent
+
+import logging
+import time
+import platform
+import socket
+import os
+import numpy as np
+import asyncio
+import asyncio
+from src.config.constant import HF_TOKEN
+import re
+
+# Runs at import time: read a .env file (python-dotenv) before anything below
+# looks at the environment.
+load_dotenv()
+# Configure the root logger at INFO; this module logs through the root logger.
+logging.basicConfig(level=logging.INFO)
+
+class RTCHandler:
+    """Voice-call handler: Whisper STT -> customer-service agent -> Edge TTS.
+
+    ``echo`` is handed to fastrtc's ``ReplyOnPause`` in ``create_stream``; the
+    other methods wrap the resulting stream in a FastAPI app or launch its UI.
+    """
+
+    def __init__(self, cs_agent: CSAgent, whisper_stt: WhisperSTT, edge_tts : EdgeTTS):
+        """Store the collaborators and start with an empty conversation.
+
+        Args:
+            cs_agent: Agent whose ``get_result`` streams the answer.
+            whisper_stt: Speech-to-text engine used by ``echo``.
+            edge_tts: Text-to-speech engine used by ``echo``.
+        """
+        self.cs_agent = cs_agent
+        self.whisper_stt = whisper_stt
+        self.edge_tts = edge_tts
+        # echo(), reset_conversation(), /status and get_conversation_history()
+        # all read these two attributes, so they must exist before the first
+        # call. No system prompt is configured here; assign ``sys_prompt`` and
+        # call reset_conversation() to seed the history with one.
+        self.sys_prompt = ""
+        self.messages = self._initial_messages()
+        self.full_response = ""
+        self.stream = None
+        self.app = None
+
+        self._setup_webrtc_ip()
+
+    def _initial_messages(self):
+        """Return a fresh history: the system prompt if one is set, else empty."""
+        if self.sys_prompt:
+            return [{"role": "system", "content": self.sys_prompt}]
+        return []
+
+    def _setup_webrtc_ip(self):
+        """On Windows, export the local outbound IPv4 address as ``WEBRTC_IP``.
+
+        The UDP ``connect`` is only used to learn which local address the OS
+        would route from; ``127.0.0.1`` is used when that lookup fails.
+        """
+        if platform.system() == 'Windows':
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            try:
+                s.connect(('8.8.8.8', 80))
+                local_ip = s.getsockname()[0]
+            except Exception:
+                local_ip = '127.0.0.1'
+            finally:
+                s.close()
+            os.environ['WEBRTC_IP'] = local_ip
+
+    def echo(self, audio):
+        """Answer one caller utterance with streamed speech.
+
+        Args:
+            audio: Audio for the utterance as delivered by fastrtc; it is
+                passed straight to ``audio_to_bytes``.
+
+        Yields:
+            ``(24000, samples)`` tuples of at most 1024 samples each. Nothing
+            is yielded for an empty transcription. On any error a single
+            one-second block of silence is yielded instead of raising.
+        """
+        try:
+            stt_time = time.time()
+            logging.info("Performing STT")
+
+            transcription = self.whisper_stt.transcribe(audio_to_bytes(audio))
+            prompt = transcription
+            # A whitespace-only transcription carries no question either.
+            if not prompt or not prompt.strip():
+                logging.info("STT returned empty string")
+                return
+
+            logging.info(f"STT response: {transcription}")
+            self.messages.append({"role": "user", "content": prompt})
+            logging.info(f"STT took {time.time() - stt_time} seconds")
+
+            llm_time = time.time()
+
+            self.full_response = ""
+            # NOTE(review): this appends the whole history list as ONE item on
+            # every turn — confirm that is what prompt_template expects.
+            self.cs_agent.inferencer.model.prompt_template.append(self.messages)
+
+            async def synthesize(text):
+                """Return resampled audio for ``text``, or None when TTS fails."""
+                try:
+                    audio_buffer, error = await self.edge_tts.generate_audio_buffer(text)
+                    if audio_buffer is None:
+                        # The TTS wrapper reports failures as (None, message).
+                        logging.error(f"TTS generation failed for chunk: {error}")
+                        return None
+                    return resample_audio(audio_buffer)
+                except Exception as e:
+                    logging.error(f"TTS generation failed for chunk: {e}")
+                    return None
+
+            def frames(samples, chunk_size=1024):
+                """Slice ``samples`` into fixed-size frames tagged with 24000 Hz."""
+                for i in range(0, len(samples), chunk_size):
+                    yield (24000, samples[i:i + chunk_size])
+
+            async def stream_text_to_audio():
+                text_buffer = ""
+                async for stream_data in self.cs_agent.get_result(question = prompt):
+                    if stream_data["type"] == "chunk":
+                        chunk = stream_data["data"]["chunk"]
+                        self.full_response += chunk
+                        text_buffer += chunk
+
+                        # Speak as soon as punctuation closes a phrase so the
+                        # caller does not wait for the whole answer.
+                        if re.search(r'[.,?;!]', chunk):
+                            resampled = await synthesize(text_buffer)
+                            if resampled is None:
+                                # Keep the text; it is retried together with
+                                # the next phrase.
+                                continue
+                            for frame in frames(resampled):
+                                yield frame
+                            text_buffer = ""
+
+                    elif stream_data["type"] == "metadata":
+                        setup_time = stream_data['data']['setup_time']
+                        print(f"\nSetup completed in {setup_time:.2f}s")
+
+                    elif stream_data["type"] == "complete":
+                        total_time = stream_data['data']['total_time']
+                        print(f"\nTotal time: {total_time:.2f}s")
+                        break
+
+                # Voice whatever followed the last punctuation mark; without
+                # this the tail of the answer would never be spoken.
+                if text_buffer.strip():
+                    resampled = await synthesize(text_buffer)
+                    if resampled is not None:
+                        for frame in frames(resampled):
+                            yield frame
+
+            # echo is a plain generator, so the async pipeline is driven step
+            # by step on a private event loop.
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+            async_gen = stream_text_to_audio()
+            try:
+                while True:
+                    try:
+                        chunk = loop.run_until_complete(async_gen.__anext__())
+                    except StopAsyncIteration:
+                        break
+                    yield chunk
+            finally:
+                # Finalize the async generator even when the consumer stops
+                # early, then release the loop.
+                try:
+                    loop.run_until_complete(async_gen.aclose())
+                finally:
+                    loop.close()
+
+            self.messages.append({"role": "assistant", "content": self.full_response + " "})
+            logging.info(f"LLM response: {self.full_response}")
+            logging.info(f"LLM took {time.time() - llm_time} seconds")
+
+        except Exception as e:
+            logging.error(f"Error in echo function: {e}")
+            # One second of silence at 24 kHz keeps the audio stream well-formed.
+            error_audio = np.zeros(24000, dtype=np.float32)
+            yield (24000, error_audio)
+
+    def reset_conversation(self):
+        """Drop the conversation history and the last response."""
+        logging.info("Resetting chat")
+        self.messages = self._initial_messages()
+        self.full_response = ""
+
+    def create_stream(self):
+        """Build, store and return the fastrtc ``Stream`` that serves ``echo``.
+
+        Raises:
+            Exception: whatever stream construction raises, after logging it.
+        """
+        try:
+            async def get_credentials():
+                return await get_cloudflare_turn_credentials_async(hf_token=HF_TOKEN)
+            self.stream = Stream(
+                rtc_configuration=get_credentials,
+                server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
+                handler = ReplyOnPause(
+                    self.echo,
+                    algo_options=AlgoOptions(
+                        audio_chunk_duration=0.5,
+                        started_talking_threshold=0.1,
+                        speech_threshold=0.03
+                    ),
+                    model_options=SileroVadOptions(
+                        threshold=0.90,
+                        min_speech_duration_ms=250,
+                        min_silence_duration_ms=2000,
+                        speech_pad_ms=400,
+                        max_speech_duration_s=15
+                    )
+                ),
+                modality="audio",
+                mode="send-receive"
+            )
+            return self.stream
+        except Exception as e:
+            logging.error(f"Error creating stream: {e}")
+            raise
+
+    def create_fastapi_app(self):
+        """Build the FastAPI app, mount the stream and add ``/reset`` and ``/status``.
+
+        Returns:
+            The created app, which is also stored on ``self.app``.
+
+        Raises:
+            Exception: whatever app or stream construction raises, after logging it.
+        """
+        try:
+            self.app = fastapi.FastAPI()
+            # NOTE(review): wildcard origins combined with credentials is very
+            # permissive — confirm this is intended outside development.
+            self.app.add_middleware(
+                CORSMiddleware,
+                allow_origins=["*"],
+                allow_credentials=True,
+                allow_methods=["*"],
+                allow_headers=["*"],
+            )
+
+            if not self.stream:
+                self.create_stream()
+            self.stream.mount(self.app)
+
+            @self.app.get("/reset")
+            async def reset():
+                try:
+                    self.reset_conversation()
+                    return {"status": "success"}
+                except Exception as e:
+                    logging.error(f"Error in reset endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            @self.app.get("/status")
+            async def status():
+                try:
+                    return {
+                        "status": "running",
+                        "messages_count": len(self.messages),
+                        "last_response": self.full_response
+                    }
+                except Exception as e:
+                    logging.error(f"Error in status endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            return self.app
+        except Exception as e:
+            logging.error(f"Error creating FastAPI app: {e}")
+            raise
+
+    def start_server(self, host: str = "0.0.0.0", port: int = 7862):
+        """Serve the FastAPI app with uvicorn, creating the app first if needed."""
+        import uvicorn
+        if not self.app:
+            self.create_fastapi_app()
+        logging.info(f"Starting server on {host}:{port}")
+        try:
+            uvicorn.run(self.app, host=host, port=port, log_level="info")
+        except Exception as e:
+            logging.error(f"Error starting server: {e}")
+            raise
+
+    def launch_ui(self, browser: bool = True, port = 7860):
+        """Launch the stream's built-in UI on ``port``, bound to all interfaces.
+
+        NOTE(review): ``browser`` is accepted but never used — confirm whether
+        it was meant to be forwarded to ``launch``.
+        """
+        try:
+            if not self.stream:
+                self.create_stream()
+            if not self.app:
+                self.create_fastapi_app()
+            logging.info("Launching RTC UI...")
+            self.stream.ui.launch(self.app,
+                                  server_name="0.0.0.0",
+                                  server_port=port,
+                                  )
+        except Exception as e:
+            logging.error(f"Error launching UI: {e}")
+            raise
+
+    def get_conversation_history(self):
+        """Return a shallow copy of the message history."""
+        return self.messages.copy()
+
+    def get_last_response(self):
+        """Return the text of the most recent (possibly partial) answer."""
+        return self.full_response
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call_gpt.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call_gpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c02a168e17a10a063d9ce36a5a62f370a359e6d
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call_gpt.py
@@ -0,0 +1,289 @@
+import fastapi
+from fastapi.middleware.cors import CORSMiddleware
+from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials
+from fastrtc.utils import audio_to_int16
+from openai import OpenAI
+from elevenlabs.client import ElevenLabs
+from dotenv import load_dotenv
+from tts.audio_edge_tts import EdgeTTS
+from rag import document_retriever, ddgs
+import logging
+import time
+import platform
+import socket
+import os
+import numpy as np
+import io
+import wave
+import asyncio
+import librosa
+from pydub import AudioSegment
+from collections import deque
+import torch
+import torchaudio.transforms as T
+import concurrent.futures
+import threading
+from config.constant import HF_TOKEN
+import re
+from langchain_core.documents import Document
+import torchaudio
+# Runs at import time: read a .env file (python-dotenv) before anything below
+# looks at the environment.
+load_dotenv()
+# Configure the root logger at INFO; this module logs through the root logger.
+logging.basicConfig(level=logging.INFO)
+
+class RTCHandler:
+    """Voice-call handler: OpenAI Whisper -> retrieval-augmented chat model -> Edge TTS.
+
+    ``echo`` is handed to fastrtc's ``ReplyOnPause`` in ``create_stream``; the
+    other methods wrap the resulting stream in a FastAPI app or launch its UI.
+    """
+
+    def __init__(self, openai_client: OpenAI, whisper_stt=None, edge_tts: EdgeTTS=None):
+        """Store the collaborators and seed the history with the system prompt.
+
+        Args:
+            openai_client: Client used for both transcription and chat completion.
+            whisper_stt: Stored on the instance; ``echo`` does not use it.
+            edge_tts: Text-to-speech engine used by ``echo``.
+        """
+        self.whisper_stt = whisper_stt
+        self.edge_tts = edge_tts
+        self.prompt = ""
+        self.sys_prompt = (
+            "Kamu adalah customer service yang berbahasa Indonesia dengan baik sopan, santun, tapi santai pembawaannya.\n"
+            "Kamu bisa menjelaskan sesuatu secara baik dan membimbing customer dalam menghadapi masalah yang ada!\n"
+            "Kamu akan menjawab customer dengan media call /telepon jadi anda harus memberikan respon seperlunya saja\n"
+            "Tidak kepanjanngan, dan sangat jelas, Tidak lebih dari 50 kata."
+        )
+        self.openai_client = openai_client
+        self.messages = [{"role": "system", "content": self.sys_prompt}]
+        self.full_response = ""
+        self.stream = None
+        self.app = None
+        self._setup_webrtc_ip()
+
+    def _setup_webrtc_ip(self):
+        """On Windows, export the local outbound IPv4 address as ``WEBRTC_IP``.
+
+        The UDP ``connect`` is only used to learn which local address the OS
+        would route from; ``127.0.0.1`` is used when that lookup fails.
+        """
+        if platform.system() == 'Windows':
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            try:
+                s.connect(('8.8.8.8', 80))
+                local_ip = s.getsockname()[0]
+            except Exception:
+                local_ip = '127.0.0.1'
+            finally:
+                s.close()
+            os.environ['WEBRTC_IP'] = local_ip
+
+    def audio_to_bytes(self, audio_tuple, sample_rate=24000) -> io.BytesIO:
+        """Encode a ``(sample_rate, samples)`` tuple as an in-memory 16-bit mono WAV.
+
+        Args:
+            audio_tuple: ``(sample_rate, samples)`` pair; the rate is written
+                into the WAV header.
+            sample_rate: Unused; kept so existing callers keep working.
+
+        Returns:
+            A rewound ``BytesIO`` with ``name`` set to ``"audio.wav"``.
+        """
+        sr, _ = audio_tuple
+        audio_int16 = audio_to_int16(audio_tuple)
+        buffer = io.BytesIO()
+        with wave.open(buffer, "wb") as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)
+            wf.setframerate(sr)
+            wf.writeframes(audio_int16.tobytes())
+        buffer.seek(0)
+        # Presumably the upload API infers the format from the name — TODO confirm.
+        buffer.name = "audio.wav"
+        return buffer
+
+    async def _text_to_samples(self, text):
+        """Synthesize ``text`` and return float32 samples at 24 kHz, or None on failure.
+
+        The TTS output is decoded as MP3, stereo is averaged into one channel
+        and the result is resampled to 24000 Hz. Failures are logged.
+        """
+        try:
+            audio_buffer, error = await self.edge_tts.generate_audio_buffer(text)
+            if audio_buffer is None:
+                # The TTS wrapper reports failures as (None, message).
+                logging.error(f"TTS generation failed for chunk: {error}")
+                return None
+            audio_buffer.seek(0)
+            audio_segment = AudioSegment.from_file(audio_buffer, format="mp3")
+            # Scale by 2**15 — assumes 16-bit decoded samples; TODO confirm.
+            samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / (2 ** 15)
+            if audio_segment.channels == 2:
+                samples = samples.reshape((-1, 2)).mean(axis=1)
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            audio_tensor = torch.from_numpy(samples).unsqueeze(0).to(device)
+            resampler = torchaudio.transforms.Resample(
+                orig_freq=audio_segment.frame_rate,
+                new_freq=24000
+            ).to(device)
+            resampled_tensor = resampler(audio_tensor)
+            return resampled_tensor.squeeze(0).cpu().numpy()
+        except Exception as e:
+            logging.error(f"TTS generation failed for chunk: {e}")
+            return None
+
+    def echo(self, audio):
+        """Answer one caller utterance with streamed speech.
+
+        Args:
+            audio: ``(sample_rate, samples)`` tuple for the utterance.
+
+        Yields:
+            ``(24000, samples)`` tuples of at most 1024 samples each. Nothing
+            is yielded for an empty transcription. On any error a single
+            one-second block of silence is yielded instead of raising.
+        """
+        try:
+            stt_time = time.time()
+            logging.info("Performing STT")
+            transcription = self.openai_client.audio.transcriptions.create(
+                model="whisper-1",
+                file=self.audio_to_bytes(audio),
+                language="id"
+            )
+            self.prompt = transcription.text
+            # A whitespace-only transcription carries no question either.
+            if not self.prompt or not self.prompt.strip():
+                logging.info("STT returned empty string")
+                return
+            logging.info(f"STT response: {transcription}")
+            logging.info(f"STT took {time.time() - stt_time} seconds")
+
+            llm_time = time.time()
+            self.full_response = ""
+
+            def frames(samples, chunk_size=1024):
+                """Slice ``samples`` into fixed-size frames tagged with 24000 Hz."""
+                for i in range(0, len(samples), chunk_size):
+                    yield (24000, samples[i:i + chunk_size])
+
+            async def stream_text_to_audio():
+                retrieval_result = await document_retriever.retrieve(query=self.prompt)
+                contexts = ""
+                search_results = []
+                async for result in ddgs.search(self.prompt, max_results=5):
+                    doc = Document(
+                        page_content=result,
+                        metadata={"source": "internet_search", "query": self.prompt}
+                    )
+                    search_results.append(doc)
+                # NOTE(review): the web results are indexed after retrieval has
+                # already run, so they cannot appear in this turn's contexts —
+                # confirm this ordering is intended.
+                await document_retriever.add_documents(search_results)
+                for i, ctx in enumerate(retrieval_result.documents, 1):
+                    contexts += f"{i}. {ctx.page_content}\n"
+                self.messages.append({
+                    "role": "user",
+                    "content": (
+                        f"Dari Konteks yang diberikan (jika diperlukan) :\n{contexts}\n"
+                        f"Berikan jawaban atas pertanyaan yang diberikan :\n{self.prompt}"
+                    )
+                })
+                response = self.openai_client.chat.completions.create(
+                    model="gpt-3.5-turbo",
+                    messages=self.messages,
+                    max_tokens=200,
+                    stream=True
+                )
+                text_buffer = ""
+                for stream_data in response:
+                    # Skip chunks that carry no choices at all.
+                    if not stream_data.choices:
+                        continue
+                    choice = stream_data.choices[0]
+                    delta = choice.delta.content
+                    if delta:
+                        self.full_response += delta
+                        text_buffer += delta
+                        # Speak as soon as punctuation closes a phrase so the
+                        # caller does not wait for the whole answer.
+                        if re.search(r'[.,?;!]', delta):
+                            samples = await self._text_to_samples(text_buffer)
+                            # On failure the text is kept and retried together
+                            # with the next phrase.
+                            if samples is not None:
+                                for frame in frames(samples):
+                                    yield frame
+                                text_buffer = ""
+                    # Any finish reason ends the answer, not only "stop".
+                    if choice.finish_reason is not None:
+                        break
+
+                # Voice whatever followed the last punctuation mark. Only audio
+                # tuples may be yielded, so the text is synthesized first.
+                if text_buffer.strip():
+                    samples = await self._text_to_samples(text_buffer)
+                    if samples is not None:
+                        for frame in frames(samples):
+                            yield frame
+
+            # echo is a plain generator, so the async pipeline is driven step
+            # by step on a private event loop.
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            async_gen = stream_text_to_audio()
+            try:
+                while True:
+                    try:
+                        chunk = loop.run_until_complete(async_gen.__anext__())
+                    except StopAsyncIteration:
+                        break
+                    yield chunk
+            finally:
+                # Finalize the async generator even when the consumer stops
+                # early, then release the loop.
+                try:
+                    loop.run_until_complete(async_gen.aclose())
+                finally:
+                    loop.close()
+
+            self.messages.append({"role": "assistant", "content": self.full_response + " "})
+            logging.info(f"LLM response: {self.full_response}")
+            logging.info(f"LLM took {time.time() - llm_time} seconds")
+
+        except Exception as e:
+            logging.error(f"Error in echo function: {e}")
+            # One second of silence at 24 kHz keeps the audio stream well-formed.
+            error_audio = np.zeros(24000, dtype=np.float32)
+            yield (24000, error_audio)
+
+    def reset_conversation(self):
+        """Reset the history to just the system prompt and clear the last response."""
+        logging.info("Resetting chat")
+        self.messages = [{"role": "system", "content": self.sys_prompt}]
+        self.full_response = ""
+
+    def create_stream(self):
+        """Build, store and return the fastrtc ``Stream`` that serves ``echo``.
+
+        Raises:
+            Exception: whatever stream construction raises, after logging it.
+        """
+        try:
+            async def get_credentials():
+                return await get_cloudflare_turn_credentials_async(hf_token=HF_TOKEN)
+            self.stream = Stream(
+                rtc_configuration=get_credentials,
+                server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
+                handler=ReplyOnPause(
+                    self.echo,
+                    algo_options=AlgoOptions(
+                        audio_chunk_duration=0.5,
+                        started_talking_threshold=0.1,
+                        speech_threshold=0.03
+                    ),
+                    model_options=SileroVadOptions(
+                        threshold=0.90,
+                        min_speech_duration_ms=250,
+                        min_silence_duration_ms=2000,
+                        speech_pad_ms=400,
+                        max_speech_duration_s=15
+                    )
+                ),
+                modality="audio",
+                mode="send-receive"
+            )
+            return self.stream
+        except Exception as e:
+            logging.error(f"Error creating stream: {e}")
+            raise
+
+    def create_fastapi_app(self):
+        """Build the FastAPI app, mount the stream and add ``/reset`` and ``/status``.
+
+        Returns:
+            The created app, which is also stored on ``self.app``.
+
+        Raises:
+            Exception: whatever app or stream construction raises, after logging it.
+        """
+        try:
+            self.app = fastapi.FastAPI()
+            # NOTE(review): wildcard origins combined with credentials is very
+            # permissive — confirm this is intended outside development.
+            self.app.add_middleware(
+                CORSMiddleware,
+                allow_origins=["*"],
+                allow_credentials=True,
+                allow_methods=["*"],
+                allow_headers=["*"],
+            )
+            if not self.stream:
+                self.create_stream()
+            self.stream.mount(self.app)
+
+            @self.app.get("/reset")
+            async def reset():
+                try:
+                    self.reset_conversation()
+                    return {"status": "success"}
+                except Exception as e:
+                    logging.error(f"Error in reset endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            @self.app.get("/status")
+            async def status():
+                try:
+                    return {
+                        "status": "running",
+                        "messages_count": len(self.messages),
+                        "last_response": self.full_response
+                    }
+                except Exception as e:
+                    logging.error(f"Error in status endpoint: {e}")
+                    return {"status": "error", "message": str(e)}
+
+            return self.app
+        except Exception as e:
+            logging.error(f"Error creating FastAPI app: {e}")
+            raise
+
+    def start_server(self, host: str = "0.0.0.0", port: int = 7860):
+        """Serve the FastAPI app with uvicorn, creating the app first if needed."""
+        import uvicorn
+        if not self.app:
+            self.create_fastapi_app()
+        logging.info(f"Starting server on {host}:{port}")
+        try:
+            uvicorn.run(self.app, host=host, port=port, log_level="info")
+        except Exception as e:
+            logging.error(f"Error starting server: {e}")
+            raise
+
+    def launch_ui(self, browser: bool = True, port: int = 7860):
+        """Launch the stream's built-in UI on ``port``, bound to all interfaces.
+
+        Args:
+            browser: Accepted but never used. NOTE(review): confirm whether it
+                was meant to be forwarded to ``launch``.
+            port: Port for the UI server; defaults to the previously fixed 7860.
+        """
+        try:
+            if not self.stream:
+                self.create_stream()
+            if not self.app:
+                self.create_fastapi_app()
+            logging.info("Launching RTC UI...")
+            self.stream.ui.launch(
+                self.app,
+                server_name="0.0.0.0",
+                server_port=port,
+            )
+        except Exception as e:
+            logging.error(f"Error launching UI: {e}")
+            raise
+
+    def get_conversation_history(self):
+        """Return a shallow copy of the message history."""
+        return self.messages.copy()
+
+    def set_system_prompt(self, new_prompt: str):
+        """Replace the system prompt, both stored and in the first history slot."""
+        self.sys_prompt = new_prompt
+        self.messages[0] = {"role": "system", "content": new_prompt}
+
+    def get_last_response(self):
+        """Return the text of the most recent (possibly partial) answer."""
+        return self.full_response
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/stt/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/stt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e70e10f7b1601fc3ce285279d75b37501eeba1
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/stt/__init__.py
@@ -0,0 +1 @@
+from src.internal.stt.whisper_stt import WhisperSTT
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/stt/whisper_stt.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/stt/whisper_stt.py
new file mode 100644
index 0000000000000000000000000000000000000000..6313b8c777794ca375869f6f98eb60ae9f120add
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/stt/whisper_stt.py
@@ -0,0 +1,69 @@
+
+import whisper
+import torch
+from fastrtc.utils import audio_to_int16
+import io
+import os
+import tempfile
+
+class WhisperSTT:
+    """Speech-to-text backed by a locally loaded Whisper model."""
+
+    def __init__(self, model_size: str = "base", device: str = "auto"):
+        """Load the Whisper model.
+
+        Args:
+            model_size: Model name passed to ``whisper.load_model``.
+            device: ``"auto"`` picks CUDA when available and CPU otherwise;
+                an explicit ``"cuda"`` falls back to CPU when CUDA is missing.
+        """
+        # Weights are downloaded into WHISPER_CACHE_DIR (default under /tmp).
+        cache_dir = os.environ.get('WHISPER_CACHE_DIR', '/tmp/.cache/whisper')
+        os.makedirs(cache_dir, exist_ok=True)
+
+        if device == "auto":
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+
+        if self.device == "cuda" and not torch.cuda.is_available():
+            print("Warning: CUDA requested but not available. Falling back to CPU.")
+            self.device = "cpu"
+
+        print(f"Loading Whisper model '{model_size}' on device: {self.device}")
+        self.model = whisper.load_model(model_size, device=self.device, download_root=cache_dir)
+        self.language = "id"
+
+        if self.device == "cuda":
+            gpu_name = torch.cuda.get_device_name(0)
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
+            print(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")
+
+    def transcribe(self, audio: io.BufferedReader, language: str = "id") -> str:
+        """Transcribe WAV data read from ``audio`` and return the recognised text.
+
+        Args:
+            audio: Readable binary object whose ``read()`` returns the WAV bytes.
+            language: Language code passed to the model.
+
+        Returns:
+            The ``"text"`` entry of the model result, or ``""`` when absent.
+        """
+        # The model is given a file path, so the bytes are spooled to a named
+        # temp file that outlives its handle (delete=False) and is removed here.
+        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        tmp_path = tmp.name
+        try:
+            # Writing inside the try block means a failed read or write can no
+            # longer leave the temp file behind.
+            with tmp:
+                tmp.write(audio.read())
+            result = self.model.transcribe(
+                tmp_path,
+                language=language,
+                fp16=self.device == "cuda"
+            )
+            return result.get("text", "")
+        finally:
+            os.remove(tmp_path)
+
+    def get_device_info(self) -> dict:
+        """Return the active device and CUDA availability, plus GPU stats on CUDA."""
+        info = {
+            "device": self.device,
+            "cuda_available": torch.cuda.is_available()
+        }
+
+        if self.device == "cuda" and torch.cuda.is_available():
+            info.update({
+                "gpu_name": torch.cuda.get_device_name(0),
+                "gpu_memory_gb": torch.cuda.get_device_properties(0).total_memory / 1024**3,
+                "gpu_memory_allocated_gb": torch.cuda.memory_allocated() / 1024**3,
+                "gpu_memory_reserved_gb": torch.cuda.memory_reserved() / 1024**3
+            })
+
+        return info
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/tts/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4845345ee3ea6dc2337a79e3d1e454aed465d1f6
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/tts/__init__.py
@@ -0,0 +1 @@
+from src.internal.tts.audio_edge_tts import EdgeTTS
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/tts/audio_edge_tts.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/tts/audio_edge_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..003f26c7416dbd504b619e054bc73749dd76b53f
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/internal/tts/audio_edge_tts.py
@@ -0,0 +1,93 @@
+
+import edge_tts
+import asyncio
+from typing import AsyncGenerator, Optional, Tuple
+import io
+
+class EdgeTTS:
+    """Wrapper around ``edge_tts.Communicate`` with fixed voice settings."""
+
+    def __init__(self, voice_short_name: str,  rate: str = "+0%", volume: str = "+0%", pitch: str = "+0Hz",):
+        """Remember the voice and prosody settings applied to every request."""
+        self.voice_short_name = voice_short_name
+        self.rate_str = rate
+        self.volume_str = volume
+        self.pitch_str = pitch
+
+    def _communicate(self, text: str):
+        """Build the edge-tts request so every entry point uses the same settings."""
+        return edge_tts.Communicate(
+            text,
+            self.voice_short_name,
+            rate=self.rate_str,
+            volume=self.volume_str,
+            pitch=self.pitch_str
+        )
+
+    async def generate_audio_stream(self, text: str) -> AsyncGenerator[bytes, None]:
+        """Yield the synthesized audio for ``text`` as raw byte chunks.
+
+        Raises:
+            ValueError: on first iteration, when ``text`` is blank or no voice
+                is configured.
+        """
+        if not text.strip():
+            raise ValueError("Please enter text to convert.")
+        if not self.voice_short_name:
+            raise ValueError("Please select a voice.")
+
+        communicate = self._communicate(text)
+
+        async for chunk in communicate.stream():
+            if chunk["type"] == "audio":
+                yield chunk["data"]
+
+    async def generate_audio_buffer(self, text: str) -> Tuple[Optional[io.BytesIO], Optional[str]]:
+        """Synthesize ``text`` into one in-memory buffer.
+
+        Returns:
+            ``(buffer, None)`` on success, with the buffer rewound to its
+            start; ``(None, message)`` when the input is invalid or synthesis
+            fails. This method never raises.
+        """
+        try:
+            if not text.strip():
+                return None, "Please enter text to convert."
+            if not self.voice_short_name:
+                return None, "Please select a voice."
+
+            communicate = self._communicate(text)
+
+            audio_buffer = io.BytesIO()
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    audio_buffer.write(chunk["data"])
+
+            # Rewind so callers can read() without seeking first.
+            audio_buffer.seek(0)
+            return audio_buffer, None
+
+        except Exception as e:
+            return None, f"Error generating audio: {str(e)}"
+
+    async def generate_audio_with_callback(self, text: str, callback_func):
+        """Synthesize ``text`` and pass each audio chunk to ``callback_func``.
+
+        The callback is called as ``callback_func(data, None)`` for every audio
+        chunk and as ``callback_func(None, message)`` on invalid input or error.
+        """
+        if not text.strip():
+            callback_func(None, "Please enter text to convert.")
+            return
+        if not self.voice_short_name:
+            callback_func(None, "Please select a voice.")
+            return
+
+        try:
+            communicate = self._communicate(text)
+
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    callback_func(chunk["data"], None)
+
+        except Exception as e:
+            callback_func(None, f"Error: {str(e)}")
+
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..66db118d66e6a6f01eeb18cf0b6dde2ecb80235b
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/__init__.py
@@ -0,0 +1 @@
+from src.provider.app_provider import AppProvider
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/app_provider.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/app_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6d9a96ea9eed7cb29c32356caaea92f9395dc8e
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/app_provider.py
@@ -0,0 +1,25 @@
+from src.provider.rag_agents_provider import RAGAgentsProvider
+from src.provider.chatbot_provider import ChatBotProvider
+from src.provider.stt_provider import STTProvider
+from src.provider.tts_provider import TTSProvider
+from src.provider.rtc_provider import RTCProvider
+
+class AppProvider:
+    """Builds each provider once and hands them out through the provide_* accessors."""
+    def __init__(self):
+        agents = RAGAgentsProvider()
+        chatbot = ChatBotProvider(agents)
+        stt, tts = STTProvider(), TTSProvider()
+        self.rtc_provider = RTCProvider(agents, stt, tts)
+        self.agents_provider, self.chatbot_provider = agents, chatbot
+        self.stt_provider, self.tts_provider = stt, tts
+    def provide_rag_agents(self) -> RAGAgentsProvider:
+        return self.agents_provider
+    def provide_chatbot(self) -> ChatBotProvider:
+        return self.chatbot_provider
+    def provide_stt(self) -> STTProvider:
+        return self.stt_provider
+    def provide_tts(self) -> TTSProvider:
+        return self.tts_provider
+    def provide_rtc(self) -> RTCProvider:
+        return self.rtc_provider
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/chatbot_provider.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/chatbot_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..dabe100e476e544a265d3755ee60f9b5990248dc
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/chatbot_provider.py
@@ -0,0 +1,15 @@
+from src.internal.chatbot import ChatbotUI, RAGChatbot
+from src.provider.rag_agents_provider import RAGAgentsProvider
+class ChatBotProvider:
+    """Wires a RAGChatbot handler and its ChatbotUI from the agents of a RAGAgentsProvider."""
+    def __init__(self, rag_agent_provider : RAGAgentsProvider):
+        handler = RAGChatbot(
+            rag_agent_provider.provide_query_maker_agent(),
+            rag_agent_provider.provide_cs_agent(),
+        )
+        self.chatbot_handler = handler
+        self.chatbot_ui = ChatbotUI(handler)
+    def provide_chatbot_ui(self) -> ChatbotUI:
+        return self.chatbot_ui
+    def provide_chatbot_handler(self) -> RAGChatbot:
+        return self.chatbot_handler
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/rag_agents_provider.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/rag_agents_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..66619e40c83f1d8c234858688fd629c614006973
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/rag_agents_provider.py
@@ -0,0 +1,84 @@
+
+from transformers import BitsAndBytesConfig
+import torch
+from src.internal.rag import (
+    LanguageModel, 
+    LanguageModelConfig, 
+    LangChainRetriever, 
+    Inferencer, 
+    InferencerConfig,
+    CSAgent,
+    QueryMakerAgent,
+    DuckDuckGoSearch,
+    get_chat_template
+)
+
+class RAGAgentsProvider:
+    """Builds one quantized language model, a retriever and two inferencers, and exposes the two agents built on them."""
+    def __init__(self):
+        self.bnb = BitsAndBytesConfig(  # 4-bit NF4 weights, double quantization, bfloat16 compute
+                                    load_in_4bit=True,                    
+                                    bnb_4bit_use_double_quant=True,         
+                                    bnb_4bit_quant_type="nf4",             
+                                    bnb_4bit_compute_dtype=torch.bfloat16,
+                )
+        self.config = LanguageModelConfig(
+                        model_name = "Qwen/Qwen2.5-1.5B-Instruct",
+                        temperature=0.3,
+                        max_length=512,
+                        generation_timeout=999999,  # NOTE(review): unit not visible here; looks like "effectively no timeout" — confirm
+                        repetition_penalty=1.1,
+                        max_workers = 1,
+                        quantization_config = self.bnb
+        )
+        self.llm = LanguageModel(
+                config = self.config
+        )
+        # One InferencerConfig instance is shared by both inferencers below.
+        self.inferencer_config = InferencerConfig(
+                default_k=5,
+                enable_reranking=False,
+        )
+        # Retriever over a Chroma vectorstore stored under data/vectorstore/.
+        self.document_retriever = LangChainRetriever(
+                embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+                vectorstore_type="chroma",
+                vectorstore_path="data/vectorstore/",
+                use_hybrid_search=True,
+                chunk_size=3000,
+                chunk_overlap=200
+        )
+        # The web-search client is created but not handed to any inferencer (see the disabled argument below).
+        self.ddgs = DuckDuckGoSearch()
+        self.cs_inferencer = Inferencer(
+                model=self.llm,
+                retriever=self.document_retriever,
+                # search_engine = ddgs,
+                reranker=None,
+                config=self.inferencer_config
+        )
+        # The query-maker inferencer is given the same model but no retriever.
+        self.query_maker_inferencer = Inferencer(
+                model=self.llm,
+                config=self.inferencer_config
+        )
+        # NOTE(review): both inferencers share self.llm; if agents store their template on the model, the later one may win — confirm.
+        self.cs_agent = CSAgent(
+            inferencer = self.cs_inferencer,
+            prompt_template = get_chat_template("customer_service")
+        )
+        # The second message (index 1) of the query_maker template is overwritten in place with a bare {question} placeholder.
+        self.query_maker_chat_template = get_chat_template("query_maker")
+        self.query_maker_chat_template[1]["content"] = """{question}"""
+
+        self.query_maker_agent = QueryMakerAgent(
+            inferencer = self.query_maker_inferencer,
+            prompt_template = self.query_maker_chat_template
+        )
+    
+    def provide_query_maker_agent(self) -> QueryMakerAgent:
+        return self.query_maker_agent
+    
+    def provide_cs_agent(self) -> CSAgent:
+        return self.cs_agent
+
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/rtc_provider.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/rtc_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4c2883889c10cd7d82e23cdf25cc8a1a271198a
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/rtc_provider.py
@@ -0,0 +1,20 @@
+
+from src.provider.rag_agents_provider import RAGAgentsProvider
+from src.provider.stt_provider import STTProvider
+from src.provider.tts_provider import TTSProvider
+from src.internal.rtc import RTCHandler
+class RTCProvider:
+    """Assembles the RTCHandler from the customer-service agent, the Whisper STT and the Edge TTS."""
+    def __init__(self, rag_agent_provider : RAGAgentsProvider, stt_provider : STTProvider, tts_provider : TTSProvider  ):
+        # Keep the source providers alongside the handler built from them.
+        self.tts_provider = tts_provider
+        self.stt_provider = stt_provider
+        self.rag_agent_provider = rag_agent_provider
+        self.rtc_handler = RTCHandler(
+            rag_agent_provider.provide_cs_agent(),
+            stt_provider.provide_whisper_stt(),
+            tts_provider.provide_edge_tts(),
+        )
+
+    def provide_rtc_handler(self) -> RTCHandler:
+        return self.rtc_handler
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/stt_provider.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/stt_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ee1021f787393267d3ac29714f787de1a0128d
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/stt_provider.py
@@ -0,0 +1,6 @@
+from src.internal.stt.whisper_stt import WhisperSTT
+class STTProvider:  # owns the single WhisperSTT instance used by the app
+    def __init__(self):
+        self.whisper_stt = WhisperSTT(model_size="base", device="cuda")  # device is fixed to CUDA
+    def provide_whisper_stt(self) -> WhisperSTT:
+        return self.whisper_stt
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/tts_provider.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/tts_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..84e955a8084be37aedefd86473ed163d4f4cf208
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/provider/tts_provider.py
@@ -0,0 +1,6 @@
+from src.internal.tts.audio_edge_tts import EdgeTTS
+class TTSProvider:  # owns the single EdgeTTS instance used by the app
+    def __init__(self):
+        self.edge_tts = EdgeTTS("id-ID-ArdiNeural", "+0%", "+0%")  # voice name plus two "+0%" settings
+    def provide_edge_tts(self) -> EdgeTTS:
+        return self.edge_tts
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/ddgs_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/ddgs_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e45f54468fc70c585f779eea53bfa52c8caa16f0
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/ddgs_test.py
@@ -0,0 +1,7 @@
+from src.internal.rag import DuckDuckGoSearch
+def test_ddgs():
+    """Smoke test: run one fixed DuckDuckGo query and print whatever comes back."""
+    # The query is hard-coded rather than read from input(), so this runs unattended.
+    search_engine = DuckDuckGoSearch()
+    print("*** searching result : **")
+    print(search_engine.search("Perhitungan uang lembur"))
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/document_retriever_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/document_retriever_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..455ddf068258659ad3ad742b164ffc8995324ae9
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/document_retriever_test.py
@@ -0,0 +1,44 @@
+from src.internal.rag import LangChainRetriever
+
+
+async def test_document_retriever():
+    """Example usage of LangChainRetriever: ingest the listed files, run one query, print the hits."""
+    print(" ===== Testing document retriever ==== ")
+    # Initialize retriever
+    retriever = LangChainRetriever(
+        embedding_model="text-embedding-3-small",
+        vectorstore_type="chroma",
+        vectorstore_path="./my_vectorstore",
+        use_hybrid_search=True,
+        chunk_size=1000,
+        chunk_overlap=200
+    )
+    
+    # Add documents from files
+    file_paths = [
+        "../documents/file.pdf",
+    ]
+    
+    for file_path in file_paths:
+        ingest = await retriever.add_document_from_file(file_path)
+        if ingest.success:
+            print(f"Successfully processed: {ingest.document_metadata.file_name}")
+            print(f"Chunks created: {ingest.document_metadata.chunk_count}")
+        else:
+            print(f"Failed to process: {ingest.error_message}")
+    
+    # Query documents
+    query = "Recurrent neural network (RNN) is"
+    retrieval = await retriever.retrieve(query, k=5)
+    
+    print(f"\nQuery: {retrieval.query}")
+    print(f"Found {len(retrieval.documents)} relevant documents")
+    print(f"Retrieval time: {retrieval.retrieval_time:.2f}s")
+    
+    for i, doc in enumerate(retrieval.documents):
+        print(f"\nDocument {i+1}:")
+        print(f"Score: {retrieval.scores[i]:.3f}")  # scores are read by the same index as documents
+        print(f"Content: {doc.page_content[:200]}...")
+        print(f"Metadata: {doc.metadata}")
+
+    print(" ===== Testing document retriever DONE ==== ")
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/inference_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/inference_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8437e1bb4f100e99f99b139372949e89134462a8
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/inference_test.py
@@ -0,0 +1,271 @@
+import gradio as gr
+import asyncio
+from rag.pipeline.language_model import LM, LMConfig
+from rag.retriever.langchain_retriever import LangChainRetriever
+from rag.inference.inferencer import InferencerConfig, Inferencer
+from rag import cs_agent, query_maker_agent
+def test_inference():
+    """Build the Gradio RAG chatbot UI (chat panel plus document panel) and launch it on port 7861."""
+    
+    # No RAG components are built here; cs_agent comes from the module-level import.
+    print("==== Start Inference Test ===")
+    
+    
+    print("RAG system initialized successfully!")
+
+    def chatbot_response(message, history):
+        """Stream the agent's answer for `message`, yielding the accumulated text after each chunk."""
+        try:
+            # A fresh event loop per call lets this sync generator drive the async agent.
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+            async def stream_response():
+                partial_response = ""
+                # print("message = ", message)
+                # formatted_query = await query_maker_agent.get_result(question = message)
+                # print("Formatted Query = ", formatted_query)
+                # formatted_query = formatted_query['responses'][0]['rag_response']
+                await cs_agent.load_documents()
+                async for stream_data in cs_agent.get_result(question = message):
+                    # Three event types are handled: "chunk" (text), "metadata" and "complete" (timings).
+                    if stream_data["type"] == "chunk":
+                        chunk = stream_data["data"]["chunk"]
+                        partial_response += chunk
+                        yield partial_response
+                        
+                    elif stream_data["type"] == "metadata":
+                        setup_time = stream_data['data']['setup_time']
+                        print(f"\nSetup completed in {setup_time:.2f}s")
+                        
+                    elif stream_data["type"] == "complete":
+                        total_time = stream_data['data']['total_time']
+                        print(f"\nTotal time: {total_time:.2f}s")
+            
+            async_gen = stream_response()
+            # Pull items from the async generator one at a time and re-yield them synchronously.
+            try:
+                while True:
+                    result = loop.run_until_complete(async_gen.__anext__())
+                    yield result
+            except StopAsyncIteration:
+                pass
+            finally:
+                loop.close()
+                
+        except Exception as e:
+            yield f"❌ Error: {str(e)}"
+
+    def add_document_to_vectorstore(file_path):
+        """Upload handler for the document panel; returns a status string for the UI."""
+        if not file_path:
+            return "⚠️ Please select a file first."
+        
+        try:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            # NOTE(review): add_doc is a stub returning "" — nothing is added to any vectorstore.
+            async def add_doc():
+                result = ""
+                return result
+            
+            result = loop.run_until_complete(add_doc())
+            loop.close()
+            # NOTE(review): `result` is a str here, so `.success` raises AttributeError (caught below).
+            if result.success:
+                return f"✅ Successfully added: {result.document_metadata.file_name} ({result.document_metadata.chunk_count} chunks)"
+            else:
+                return f"❌ Failed to add document: {result.error_message}"
+                
+        except Exception as e:
+            return f"❌ Error adding document: {str(e)}"
+
+    def clear_chat():
+        """Reset the chat: return an empty history and an empty message box value."""
+        return [], ""
+
+    # CSS for styling
+    css = """
+    .gradio-container {
+        max-width: 900px !important;
+        margin: auto !important;
+    }
+    .chat-message {
+        padding: 10px;
+        margin: 5px;
+        border-radius: 10px;
+    }
+    #chatbot {
+        height: 500px;
+    }
+    """
+
+    # Build the Gradio interface
+    with gr.Blocks(css=css, title="RAG Chatbot") as demo:
+        gr.Markdown("""
+        # 🤖 SakuraAI, Virtual Assistant 
+        """)
+        
+        # Status indicator (static text; nothing in this function updates it)
+        with gr.Row():
+            status_text = gr.Textbox(
+                value="✅ RAG System Ready",
+                label="System Status",
+                interactive=False,
+                container=True
+            )
+        
+        with gr.Row():
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(
+                    elem_id="chatbot",
+                    show_label=False,
+                    container=True,
+                    bubble_full_width=False,
+                    show_copy_button=True,
+                    layout="panel"
+                )
+                
+                with gr.Row():
+                    msg = gr.Textbox(
+                        placeholder="Tanyakan sesuatu tentang dokumen Anda...",
+                        show_label=False,
+                        scale=4,
+                        container=False,
+                        lines=1,
+                        max_lines=3,
+                        autofocus=True
+                    )
+                    send_btn = gr.Button("Kirim", variant="primary", scale=1)
+                
+                with gr.Row():
+                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+                    stop_btn = gr.Button("⏹️ Stop", variant="stop", visible=False)
+            
+            # Document management panel
+            with gr.Column(scale=1):
+                gr.Markdown("### 📚 Document Management")
+                
+                with gr.Group():
+                    file_upload = gr.File(
+                        label="Upload Document",
+                        file_types=[".pdf", ".txt", ".docx"],
+                        type="filepath"
+                    )
+                    upload_btn = gr.Button("Add to Knowledge Base", variant="secondary")
+                    upload_status = gr.Textbox(
+                        label="Upload Status",
+                        interactive=False,
+                        lines=3
+                    )
+                
+                gr.Markdown("""
+                ### ⚙️ RAG Settings
+                - **K**: 3 (documents retrieved)
+                - **Template**: Friendly
+                - **Reranking**: Disabled
+                - **Vectorstore**: ChromaDB
+                """)
+        
+        # State flag tracking whether a response is currently being generated
+        is_generating = gr.State(False)
+        
+        # Event handlers for chat
+        def user_message(message, history, generating):
+            """Handle user message"""
+            if message.strip() and not generating:
+                history.append([message, None])
+                return "", history, True, gr.update(visible=True), gr.update(interactive=False)
+            return message, history, generating, gr.update(visible=False), gr.update(interactive=True)
+        
+        def bot_message_stream(history, generating):
+            """Handle streaming bot response"""
+            if history and history[-1][1] is None and generating:
+                user_msg = history[-1][0]
+                
+                for partial_response in chatbot_response(user_msg, history):
+                    history[-1][1] = partial_response
+                    yield history, True, gr.update(visible=True), gr.update(interactive=False)
+                
+                yield history, False, gr.update(visible=False), gr.update(interactive=True)
+            else:
+                yield history, generating, gr.update(visible=False), gr.update(interactive=True)
+        
+        def stop_generation():
+            """Stop the generation process"""
+            return False, gr.update(visible=False), gr.update(interactive=True)
+        
+        # Bind events for message submit
+        submit_event = msg.submit(
+            user_message,
+            inputs=[msg, chatbot, is_generating],
+            outputs=[msg, chatbot, is_generating, stop_btn, send_btn]
+        ).then(
+            bot_message_stream,
+            inputs=[chatbot, is_generating],
+            outputs=[chatbot, is_generating, stop_btn, send_btn]
+        )
+        
+        # Bind events for the send button
+        send_event = send_btn.click(
+            user_message,
+            inputs=[msg, chatbot, is_generating],
+            outputs=[msg, chatbot, is_generating, stop_btn, send_btn]
+        ).then(
+            bot_message_stream,
+            inputs=[chatbot, is_generating],
+            outputs=[chatbot, is_generating, stop_btn, send_btn]
+        )
+        
+        # Clear chat event
+        clear_btn.click(
+            clear_chat,
+            outputs=[chatbot, msg]
+        ).then(
+            lambda: (False, gr.update(visible=False), gr.update(interactive=True)),
+            outputs=[is_generating, stop_btn, send_btn]
+        )
+        
+        # Stop generation event
+        stop_btn.click(
+            stop_generation,
+            outputs=[is_generating, stop_btn, send_btn],
+            cancels=[submit_event, send_event]
+        )
+        
+        # Document upload event
+        upload_btn.click(
+            add_document_to_vectorstore,
+            inputs=[file_upload],
+            outputs=[upload_status]
+        )
+        
+        # Info panel
+        with gr.Accordion("ℹ️ Info Penggunaan", open=False):
+            gr.Markdown("""
+            ### Cara Menggunakan:
+            1. **Chat**: Ketik pertanyaan tentang dokumen yang sudah dimuat
+            2. **Upload**: Tambahkan dokumen baru ke knowledge base
+            3. **Stream**: Response akan muncul secara streaming
+            4. **Stop**: Gunakan tombol stop untuk menghentikan generasi
+            
+            ### Dokumen yang Dimuat:
+            - file2.pdf (dari folder documents)
+            - Dokumen tambahan yang Anda upload
+            
+            ### Teknologi yang Digunakan:
+            - **LLM**: Qwen dengan streaming
+            - **Embedding**: text-embedding-3-small
+            - **Vectorstore**: ChromaDB
+            - **Search**: Hybrid search (dense + sparse)
+            """)
+
+    # Launch the interface
+    print("Launching Gradio interface...")
+    demo.launch(
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7861,
+        show_error=True,
+        show_api=False
+    )
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/qwen_llm_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/qwen_llm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..76225355d807e4590f9ce8f7c1d47efaa13bd461
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/qwen_llm_test.py
@@ -0,0 +1,69 @@
+from rag.retriever.retriever_types import *
+from rag.pipeline.language_model import LM, LMConfig
+
+import warnings
+warnings.filterwarnings("ignore")
+
+async def test_language_model():
+    """Example usage of async LM: run every generation smoke test against one shared model."""
+    print(" ===== Testing QWEN LLM ==== ")
+
+    config = LMConfig(
+        temperature=0.5,
+        max_length=512,
+        generation_timeout=30
+    )
+    
+    contexts = RetrievalResult(
+                documents =  [
+                    Document(page_content = "AI adalah teknologi yang memungkinkan mesin untuk belajar dan beradaptasi."),
+                    Document(page_content ="Machine learning adalah subset dari AI yang fokus pada pembelajaran otomatis.")
+                    ],
+                scores=[0.95, 0.1]
+    )
+
+    # One LM instance, opened as an async context manager, is reused by all four sub-tests.
+    async with LM(config) as llm:
+        await test_qwen_single_generation(llm)
+        await test_qwen_single_rag_generation(llm, contexts)
+        await test_qwen_multiple_template_rag_generation(llm, contexts)
+        await test_qwen_batch_generation(llm, contexts)
+    print(" ===== Testing LLM DONE ==== ")
+
+async def test_qwen_single_generation(llm : LM):
+    """Smoke test: one plain `generate` call, printed between two banner lines."""
+    print(" * Test Single Generation * ")
+    print("Response: {}".format(await llm.generate("Jelaskan tentang AI")))
+    print(" * Test Single Generation Done * ")
+
+async def test_qwen_single_rag_generation(llm : LM, ctx : RetrievalResult):
+    """Smoke test: one `rag_generate` call over `ctx` with the "system" template, printed."""
+    print(" * Test Single RAG Generation * ")
+    request = {
+        "question": "Apa itu AI dan machine learning?", "contexts": ctx, "template_type": "system",
+    }
+    answer = await llm.rag_generate(**request)
+    print(f"RAG Response: {answer}")
+    print(" * Test Single RAG Generation Done * ")
+
+async def test_qwen_multiple_template_rag_generation(llm : LM,ctx : RetrievalResult):
+    """Smoke test: one `multi_template_generate` call over `ctx` with two template types, printed."""
+    print(" * Test Multiple Template Generation * ")
+    request = {
+        "question": "Apa itu AI?", "contexts": ctx, "template_types": ["system", "instruction"],
+    }
+    answers = await llm.multi_template_generate(**request)
+    print(f"Multi-template responses: {answers}")
+    print(" * Test Multiple Template Generation Done* ")
+
+
+async def test_qwen_batch_generation(llm : LM, ctx : RetrievalResult):
+    """Smoke test: send three prompts through `batch_generate` and print the result (`ctx` is not used)."""
+    print(" * Test Batch Generation * ")
+    prompts = [
+        "Jelaskan tentang Python", "Apa itu machine learning?", "Bagaimana cara kerja neural network?",
+    ]
+    outputs = await llm.batch_generate(prompts)
+    print(f"Batch responses: {outputs}")
+    print(" * Test Batch Generation Done * ")
+    
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/rtc_test.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/rtc_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..48ed38f21121e018954c538a91a256110c51d676
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/tests/rtc_test.py
@@ -0,0 +1,8 @@
+# import inspect
+import rtc
+# import rag
+
+predicting_thread = {"status": False}  # module-level flag; nothing in this file reads it
+
+def test_rtc():  # delegates entirely to the rtc module's entry point
+    rtc.handle_rtc()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/utils/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/utils/audio_helper.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/utils/audio_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..d93784fe88ff4008f30aed5418136e1f910f1d90
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/utils/audio_helper.py
@@ -0,0 +1,45 @@
+import io
+from fastrtc.utils import audio_to_int16
+from pydub import AudioSegment
+import wave
+import torch
+import torchaudio
+import numpy as np
+
+def audio_to_bytes(self, audio_tuple, sample_rate=24000) -> io.BytesIO:  # `self` and `sample_rate` are unused, kept for callers
+    """Encode a `(sample_rate, samples)` tuple as an in-memory mono 16-bit WAV named "audio.wav"."""
+    rate, _ = audio_tuple  # the header uses the rate from the tuple, not the `sample_rate` argument
+    pcm = audio_to_int16(audio_tuple)
+    buffer = io.BytesIO()
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)
+        wav_file.setframerate(rate)
+        wav_file.writeframes(pcm.tobytes())
+    buffer.seek(0)  # rewind so the consumer reads from the start
+    buffer.name = "audio.wav"
+    return buffer
+
+def resample_audio(audio_buffer:io.BytesIO):
+    """Decode an MP3 buffer and return its samples as a float32 numpy array at 24 kHz.
+
+    Two-channel audio is averaged down to one channel before resampling.
+    """
+    audio_buffer.seek(0)
+    segment = AudioSegment.from_file(audio_buffer, format="mp3")
+
+    # Integer samples are divided by 2**15, i.e. treated as 16-bit PCM.
+    raw = np.array(segment.get_array_of_samples())
+    pcm = raw.astype(np.float32) / (2 ** 15)
+    if segment.channels == 2:
+        # Consecutive sample pairs are averaged (assumes interleaved left/right).
+        pcm = pcm.reshape((-1, 2)).mean(axis=1)
+
+    # Run the resampler on CUDA when it is available, otherwise on the CPU.
+    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    waveform = torch.from_numpy(pcm).unsqueeze(0).to(target)
+    to_24k = torchaudio.transforms.Resample(
+        orig_freq=segment.frame_rate, new_freq=24000
+    ).to(target)
+
+    return to_24k(waveform).squeeze(0).cpu().numpy()
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/src/config/__init__.py b/space/space/space/space/space/space/space/space/space/src/config/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cace997b849e612f0abeeb4d9da1026a2c5d276e 100644
--- a/space/space/space/space/space/space/space/space/space/src/config/__init__.py
+++ b/space/space/space/space/space/space/space/space/space/src/config/__init__.py
@@ -0,0 +1,5 @@
+from src.config.constant import (
+    OPENAI_API_KEY,
+    ELEVENLABS_API_KEY,
+    HF_TOKEN,
+)
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/__init__.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/__init__.py
index 63a2c31063c7bb5e6bcafad7c04ca5e59f0679b7..1f4e0b48e4a1deb489ce1c9329978d810d05f91d 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/rag/__init__.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/__init__.py
@@ -1,12 +1,14 @@
 from src.internal.rag.pipeline.language_model import LanguageModel, LanguageModelConfig
 from src.internal.rag.retriever.langchain_retriever import LangChainRetriever
 from src.internal.rag.inference.inferencer import Inferencer, InferencerConfig
+from src.internal.rag.agents.base_agents import Agent
 from src.internal.rag.agents.customer_service_agent import CSAgent
+from src.internal.rag.agents.gpt_customer_service_agent import GPTCSAgent
 from src.internal.rag.agents.query_maker_agent import QueryMakerAgent
 from langchain_core.documents import Document
 from src.internal.rag.web_search.duckduckgo_search import DuckDuckGoSearch
 from src.internal.rag.chat_template.prompt_template import get_chat_template
-
+from src.internal.rag.inference.inferencer_types import chunk_response, meta_data_response, complete_response, error_response
 
 
 
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/base_agents.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/base_agents.py
new file mode 100644
index 0000000000000000000000000000000000000000..669b7bf46f1c7ff7b4d68a496962d6813360d301
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/base_agents.py
@@ -0,0 +1,18 @@
+
+from src.internal.rag.inference.inferencer import Inferencer
+from abc import ABC, abstractmethod
+from typing import List, Dict, Union
+class Agent(ABC):
+    """Abstract base for agents: keeps a prompt template and, when given, an inferencer."""
+    def __init__(self, inferencer: Union[Inferencer, None] = None, prompt_template=None):
+        if prompt_template is None:
+            # Build the default per instance: a list literal in the signature is created once
+            # and would be shared by every agent (and model) that relied on it.
+            prompt_template = [{"role": "system", "content": "You are an agent that doing some specic task"}]
+        if inferencer is not None:
+            self.inferencer = inferencer
+            self.inferencer.model.prompt_template = prompt_template
+        self.prompt = prompt_template
+    @abstractmethod
+    async def get_result(self, chat_memory : List[Dict] = [], enable_search : bool = False, question : str = ""):
+        """Produce the answer for ``question``; every concrete agent must override this."""
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/customer_service_agent.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
index b7b06efe5bac5b299e0d07020ddf4af90b325b78..5c7789af288e8b0d85a1b5fe5811360ffa946437 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
@@ -1,6 +1,6 @@
-from src.internal.rag.agents.agents import Agent
+from src.internal.rag.agents.base_agents import Agent
 from src.internal.rag.inference.inferencer import Inferencer
-
+from typing import Dict, List
 class CSAgent(Agent):
     def __init__(self, inferencer : Inferencer , prompt_template):
         super().__init__(inferencer, prompt_template)
@@ -25,9 +25,7 @@ class CSAgent(Agent):
         else:
                 print(f"Failed to process: {result.error_message}")
 
-    async def get_result(self, question):
+    async def get_result(self, chat_memory : List[Dict] = [], enable_search : bool = False, question : str = ""):
         self.inferencer.model.prompt_template = self.prompt_template
-        async for item in self.inferencer.infer_stream(query = question,
-                                    enable_reranking=False,
-                                    k=3):
+        async for item in self.inferencer.infer_stream(enable_search = enable_search, chat_memory = chat_memory, query = question, enable_reranking=False, k=3):  # enable_search is a required infer_stream argument
                 yield item
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/gpt_customer_service_agent.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/gpt_customer_service_agent.py
index d07cb9487bcf878fe775a28c7607732934e3a44b..82ccbf999359f75458fb2d265992eb898079b5f9 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/gpt_customer_service_agent.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/gpt_customer_service_agent.py
@@ -1,12 +1,114 @@
-from src.internal.rag.agents.agents import Agent
+import logging
+from datetime import datetime
+from src.internal.rag.agents.base_agents import Agent
 from src.internal.rag.inference.inferencer import Inferencer
+from src.internal.rag.retriever.langchain_retriever import LangChainRetriever
+from src.internal.rag.web_search.duckduckgo_search import DDGS
+from typing import List, Dict
+from langchain_core.documents import Document
+import copy
+from src.internal.rag.inference.inferencer_types import (
+    chunk_response, error_response, complete_response, meta_data_response
+)
+from openai import OpenAI
 
 class GPTCSAgent(Agent):
-    def __init__(self, inferencer : Inferencer , prompt_template):
-        super().__init__(inferencer, prompt_template)
-        self.inferencer = inferencer
+    """Customer-service agent that answers through the OpenAI chat API using retrieved context."""
+    def __init__(self, ddgs : DDGS, retriever : LangChainRetriever, openai_client : OpenAI, prompt_template : List[Dict]):
+        super().__init__(None, prompt_template)
+        self.openai_client = openai_client
+        self.retriever = retriever
         self.prompt_template = prompt_template
-    async def get_result(self, question : str):
-        self.inferencer.model.prompt_template = self.prompt_template
-        print("Question received :", question)
-        return await self.inferencer.infer(query = question)
+        self.ddgs = ddgs
+        self.full_response = ""
+
+    @staticmethod
+    def _format_contexts(documents) -> str:
+        """Render documents as numbered lines built from each ``page_content``."""
+        return "".join(f"{i}. {doc.page_content}\n" for i, doc in enumerate(documents, 1))
+
+    async def retrieve_context(self, query  : str):
+        """Retrieve documents for ``query`` and return them as numbered text."""
+        retrieval_result = await self.retriever.retrieve(query=query)
+        return self._format_contexts(retrieval_result.documents)
+
+    async def web_search(self, query : str):
+        """Search the web for ``query``, add the hits to the retriever and return them."""
+        search_results = []
+        async for result in self.ddgs.search(query, max_results=5):
+            # Record the query that produced the hit, not the prompt template.
+            metadata = {"source": "internet_search", "query": query}
+            search_results.append(Document(page_content=result, metadata=metadata))
+        await self.retriever.add_documents(search_results)
+        return search_results
+
+    def format_prompt(self, question, context):
+        """Return a copy of the prompt template with ``{question}``/``{context}`` filled in."""
+        # Deep copy so formatting never alters the stored template.
+        formatted_prompt = copy.deepcopy(self.prompt_template)
+        for message in formatted_prompt:
+            if "content" in message:
+                message["content"] = message["content"].format(question=question, context=context)
+        return formatted_prompt
+
+    async def get_result(self, chat_memory : List[Dict] = [], enable_search : bool = False, question : str = ""):
+        """Stream the answer to ``question`` as event dicts.
+
+        Yields a "metadata" event, one "chunk" event per piece of generated text
+        and a closing "complete" event; a failure is reported as one "error"
+        event instead of being raised. ``chat_memory`` is sent ahead of the
+        formatted prompt; ``enable_search`` indexes web hits before retrieval.
+        """
+        start_time = datetime.now()
+        # Start each answer from scratch rather than appending to the previous one.
+        self.full_response = ""
+        try:
+            if enable_search:
+                # web_search is a coroutine function: without ``await`` it never runs.
+                await self.web_search(question)
+
+            retrieval_result = await self.retriever.retrieve(query=question)
+            contexts = self._format_contexts(retrieval_result.documents)
+            yield meta_data_response(
+                query = question,
+                setup_time = (datetime.now() - start_time).total_seconds(),
+                # Count documents: ``contexts`` is text, so len() of it counts characters.
+                num_contexts = len(retrieval_result.documents),
+                enable_rerank = False
+            )
+
+            # Concatenate into a new list so the caller's history is left untouched.
+            messages = chat_memory + self.format_prompt(question, contexts)
+            response = self.openai_client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=messages,
+                max_tokens=200,
+                stream=True
+            )
+            response_start = datetime.now()
+            accumulated_text = ""
+            for stream_data in response:
+                if not stream_data.choices:
+                    continue
+                choice = stream_data.choices[0]
+                delta = choice.delta.content if choice.delta else ""
+                if delta:
+                    self.full_response += delta
+                    accumulated_text += delta
+                    yield chunk_response(
+                        chunk=delta,
+                        accumulated_text=accumulated_text,
+                        generation_times=(datetime.now() - response_start).total_seconds()
+                    )
+                if choice.finish_reason is not None:
+                    break
+
+            # Always close the stream with a "complete" event, whatever the finish reason.
+            yield complete_response(
+                total_time=(datetime.now() - start_time).total_seconds(),
+                accumulated_text=accumulated_text,
+                contexts=contexts
+            )
+        except Exception as e:
+            logging.error(f"GPT response generation failed: {e}")
+            yield error_response(e=e, error_time=(datetime.now() - start_time).total_seconds())
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/query_maker_agent.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/query_maker_agent.py
index 0e30964542549dc0730f9989cec95eb53284dd4b..0a6ef843f9fbec89cfdabc0dc20ce804d2f8a661 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/query_maker_agent.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/agents/query_maker_agent.py
@@ -1,4 +1,4 @@
-from src.internal.rag.agents.agents import Agent
+from src.internal.rag.agents.base_agents import Agent
 from src.internal.rag.inference.inferencer import Inferencer
 
 class QueryMakerAgent(Agent):
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer.py
index dc727bc1e123e39878fc992c9d360b1c4616b7a9..a6a1c27c382b0e74b52a392d71baba14838c4028 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer.py
@@ -1,6 +1,7 @@
 from src.internal.rag.retriever.langchain_retriever import LangChainRetriever
 from src.internal.rag.pipeline.language_model import LanguageModel, LanguageModelConfig
 from src.internal.rag.retriever.retriever_types import RetrievalResult
+from src.internal.rag.inference.inferencer_types import chunk_response, meta_data_response, complete_response, error_response
 from typing import List, Union, Dict, Any, Optional, AsyncGenerator
 import asyncio
 import logging
@@ -117,7 +118,8 @@ class Inferencer:
             # Return original contexts if reranking fails
             return contexts
     
-    async def generate_response(self, 
+    async def generate_response(self,
+                              chat_memory: List[Dict],
                               contexts: RetrievalResult, 
                               query: Union[str, List[str]], 
                               response_type: Union[List[str], str] = None,
@@ -146,6 +148,7 @@ class Inferencer:
                     rag_responses = {}
                     for i, q in enumerate(query):
                         rag_response = await self.model.rag_generate(
+                            chat_memory=chat_memory,
                             question=q,
                             contexts=contexts,
                             max_new_tokens=max_new_tokens,
@@ -155,6 +158,7 @@ class Inferencer:
                     responses.append({"rag_response": rag_responses})
                 else:
                     rag_response = await self.model.rag_generate(
+                        chat_memory = chat_memory,
                         question=query,
                         contexts=contexts,
                         max_new_tokens=max_new_tokens,
@@ -223,6 +227,7 @@ class Inferencer:
             raise
     
     async def generate_response_stream(self, 
+                                     chat_memory: List[Dict],
                                      contexts: RetrievalResult, 
                                      query: str,
                                      max_new_tokens: Optional[int] = None,
@@ -231,6 +236,7 @@ class Inferencer:
         await self._ensure_model_loaded()
         
         async for chunk in self.model.rag_generate_stream(
+            chat_memory = chat_memory,
             question=query,
             contexts=contexts,
             max_new_tokens=max_new_tokens,
@@ -239,6 +245,8 @@ class Inferencer:
             yield chunk
     
     async def infer(self, 
+                   enable_search:bool,
+                   chat_memory:List[Dict],
                    query: str, 
                    response_type: Union[List[str], str] = None,
                    k: Optional[int] = None,
@@ -252,7 +260,7 @@ class Inferencer:
         
         try:
             
-            if(self.search_engine):
+            if(enable_search):
                 await self.retrieve_from_search_engine(query, k = k)
             if(self.retriever):
                 retrieved_contexts = await self.retrieve_context(main_query, k=k)
@@ -267,6 +275,7 @@ class Inferencer:
             
             # Step 3: Generate responses
             responses = await self.generate_response(
+                chat_memory = chat_memory,
                 contexts=contexts,
                 query=query,
                 response_type=response_type,
@@ -326,6 +335,8 @@ class Inferencer:
             self.logger.error(f"Error in retrieve_from_search_engine_alternative: {e}", exc_info=True)
             raise
     async def infer_stream(self, 
+                          enable_search:bool,
+                          chat_memory:List[Dict],
                           query: str,
                           k: Optional[int] = None,
                           enable_reranking: Optional[bool] = None,
@@ -335,7 +346,7 @@ class Inferencer:
         start_time = datetime.now()
         
         try:
-            if(self.search_engine):
+            if(enable_search):
                 await self.retrieve_from_search_engine(query, k = k)
             if(self.retriever is not None):
                 retrieved_contexts = await self.retrieve_context(query, k=k)
@@ -349,54 +360,45 @@ class Inferencer:
                 contexts = retrieved_contexts
 
             setup_time = (datetime.now() - start_time).total_seconds()
-            yield {
-                "type": "metadata",
-                "data": {
-                    "query": query,
-                    "setup_time": setup_time,
-                    "num_contexts": len(contexts.documents) if hasattr(contexts, 'documents') else len(contexts),
-                    "reranking_enabled": enable_rerank,
-                }
-            }
-        
+            
+            yield meta_data_response(
+                query = query,
+                setup_time = setup_time,
+                num_contexts = len(contexts.documents) if hasattr(contexts, 'documents') else len(contexts),
+                enable_rerank = enable_rerank
+            )
+            
 
             response_start = datetime.now()
             accumulated_text = ""
             
             async for chunk in self.generate_response_stream(
+                chat_memory = chat_memory,
                 contexts=contexts,
                 query=query,
                 max_new_tokens=max_new_tokens,
                 **generation_kwargs
             ):
                 accumulated_text += chunk
-                yield {
-                    "type": "chunk",
-                    "data": {
-                        "chunk": chunk,
-                        "accumulated_text": accumulated_text,
-                        "generation_time": (datetime.now() - response_start).total_seconds()
-                    }
-                }
+                yield chunk_response(
+                    chunk = chunk,
+                    accumulated_text=accumulated_text,
+                    generation_times = (datetime.now() - response_start).total_seconds()
+                )
+            
             total_time = (datetime.now() - start_time).total_seconds()
-            yield {
-                "type": "complete",
-                "data": {
-                    "total_time": total_time,
-                    "final_response": accumulated_text,
-                    "contexts": contexts
-                }
-            }
+            yield complete_response(
+                total_time = total_time, 
+                accumulated_text = accumulated_text,
+                contexts = contexts
+            )
             
         except Exception as e:
             self.logger.error(f"Error during streaming inference: {e}")
-            yield {
-                "type": "error",
-                "data": {
-                    "error": str(e),
-                    "error_time": (datetime.now() - start_time).total_seconds()
-                }
-            }
+            yield error_response(
+                e = str(e),
+                error_time = (datetime.now() - start_time).total_seconds()
+            )
     
     async def batch_infer(self, 
                          queries: List[str],
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer_types.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..28915b825443fbfcd838eda42c24b0ee9e26da07
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/inference/inferencer_types.py
@@ -0,0 +1,41 @@
+from src.internal.rag.retriever.retriever_types import RetrievalResult
+
+def meta_data_response(query: str, setup_time: float, num_contexts: int, enable_rerank: bool):
+    """Build the "metadata" event dict describing a query's setup phase."""
+    payload = {
+        "query": query,
+        "setup_time": setup_time,
+        "num_contexts": num_contexts,
+        # The emitted key differs from the parameter name.
+        "reranking_enabled": enable_rerank,
+    }
+    return {"type": "metadata", "data": payload}
+
+def chunk_response(chunk: str, accumulated_text: str, generation_times: float):
+    """Build the "chunk" event dict for one streamed piece of text."""
+    data = {
+        "chunk": chunk,
+        "accumulated_text": accumulated_text,
+        # Emitted under the singular key "generation_time".
+        "generation_time": generation_times,
+    }
+    return {"type": "chunk", "data": data}
+
+def complete_response(total_time: float, accumulated_text: str, contexts: RetrievalResult):
+    """Build the "complete" event dict that closes a streamed answer."""
+    data = {
+        "total_time": total_time,
+        # The full text is exposed to consumers as "final_response".
+        "final_response": accumulated_text,
+        "contexts": contexts,
+    }
+    return {"type": "complete", "data": data}
+
+def error_response(e: Exception, error_time: float):
+    """Build the "error" event dict reported when a streamed answer fails."""
+    # str() accepts either the exception itself or an already-formatted message.
+    message = str(e)
+    return {
+        "type": "error",
+        "data": {"error": message, "error_time": error_time},
+    }
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/language_model.py b/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/language_model.py
index 4b11ac9ee510bbd8aaa646229eba213c78a7622f..424c14280186d26dc1a94fd14c68097a12680c76 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/language_model.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rag/pipeline/language_model.py
@@ -187,7 +187,7 @@ class LanguageModel:
                                 custom_template: Optional[str] = None,
                                 include_metadata: bool = True,
                                 context_numbering: bool = True,
-                                max_contexts: Optional[int] = None) -> str:
+                                max_contexts: Optional[int] = None) -> List[Dict]:
         """
         Format prompt untuk RAG dengan berbagai template options (async)
         """
@@ -372,6 +372,7 @@ class LanguageModel:
             raise
 
     async def rag_generate_stream(self,
+                                 chat_memory: List[Dict],
                                  question: str,
                                  contexts: Union[List[str], RetrievalResult],
                                  max_new_tokens: Optional[int] = None,
@@ -380,7 +381,9 @@ class LanguageModel:
         await self._check_model_loaded()
 
         prompt = await self.format_rag_prompt(question, contexts)
-        
+
+        chat_memory = chat_memory + prompt
+
         temp = temperature if temperature is not None else 0.3
         
         async for chunk in self.generate_stream(
@@ -393,6 +396,7 @@ class LanguageModel:
 
 
     async def rag_generate(self,
+                          chat_memory: List[Dict],
                           question: str,
                           contexts: Union[List[str], RetrievalResult],
                           max_new_tokens: Optional[int] = None,
@@ -404,11 +408,12 @@ class LanguageModel:
         
         prompt = await self.format_rag_prompt(question, contexts)
         
+        chat_memory = chat_memory + prompt
         
         temp = temperature if temperature is not None else 0.3
         
         return await self.generate(
-            prompt=prompt,
+            prompt=chat_memory,
             max_new_tokens=max_new_tokens,
             temperature=temp,
             **kwargs
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call.py b/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call.py
index c5deff744fd053e12ebdb84edae18cd81ce024ce..6092d45fc5f177504f8e9aa3a5a62a00d67f7efa 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/rtc/rtc_call.py
@@ -4,9 +4,9 @@ from fastrtc import ReplyOnPause, Stream, AlgoOptions, SileroVadOptions, get_clo
 from src.utils.audio_helper import audio_to_bytes, resample_audio
 from dotenv import load_dotenv
 
-from src.internal.tts.audio_edge_tts import EdgeTTS
-from src.internal.stt.whisper_stt import WhisperSTT
-from src.internal.rag.agents.customer_service_agent import CSAgent
+from src.internal.tts.base_tts import TTS
+from src.internal.stt.base_stt import STT
+from src.internal.rag import Agent
 
 import logging
 import time
@@ -23,10 +23,10 @@ load_dotenv()
 logging.basicConfig(level=logging.INFO)
 
 class RTCHandler:
-    def __init__(self, cs_agent: CSAgent, whisper_stt: WhisperSTT, edge_tts : EdgeTTS):
-        self.cs_agent = cs_agent
-        self.whisper_stt = whisper_stt
-        self.edge_tts = edge_tts
+    def __init__(self, agent : Agent , stt: STT, tts : TTS):
+        self.agent = agent
+        self.stt = stt
+        self.tts = tts
         self.full_response = ""
         self.stream = None
         self.app = None
@@ -49,28 +49,27 @@ class RTCHandler:
     
     def echo(self, audio):
             try:
+                chat_memory = []
                 stt_time = time.time()
                 logging.info("Performing STT")
 
-                transcription = self.whisper_stt.transcribe(audio_to_bytes(audio))
+                transcription = self.stt.transcribe(audio_to_bytes(audio))
                 prompt = transcription
                 if prompt == "":
                     logging.info("STT returned empty string")
                     return
 
                 logging.info(f"STT response: {transcription}")
-                self.messages.append({"role": "user", "content": prompt})
                 logging.info(f"STT took {time.time() - stt_time} seconds")
 
                 llm_time = time.time()
 
                 self.full_response = ""
-                self.cs_agent.inferencer.model.prompt_template.append(self.messages)
 
                 async def stream_text_to_audio():
                     chunk_size = 1024
                     text_buffer = ""
-                    async for stream_data in self.cs_agent.get_result(question = prompt):
+                    async for stream_data in self.agent.get_result(question = prompt):
                         if stream_data["type"] == "chunk":
                             chunk = stream_data["data"]["chunk"]
                             self.full_response += chunk
@@ -78,7 +77,7 @@ class RTCHandler:
                             
                             if re.search(r'[.,?;!]', chunk):
                                 try:
-                                    audio_buffer_gen =  await self.edge_tts.generate_audio_buffer(text_buffer)
+                                    audio_buffer_gen =  await self.tts.generate_audio_buffer(text_buffer)
                                     audio_buffer = audio_buffer_gen[0]
                                     resampled = resample_audio(audio_buffer)
                                     for i in range(0, len(resampled), chunk_size):
@@ -112,8 +111,9 @@ class RTCHandler:
                             break
                 finally:
                     loop.close()
-
-                self.messages.append({"role": "assistant", "content": self.full_response + " "})
+                if(len(chat_memory) >= 15):
+                    chat_memory = []
+                chat_memory.append({"role": "assistant", "content": self.full_response + " "})
                 logging.info(f"LLM response: {self.full_response}")
                 logging.info(f"LLM took {time.time() - llm_time} seconds")
 
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/stt/__init__.py b/space/space/space/space/space/space/space/space/space/src/internal/stt/__init__.py
index 79e70e10f7b1601fc3ce285279d75b37501eeba1..d0b5143c121b232f7c3e57888ce490e7ce82eef2 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/stt/__init__.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/stt/__init__.py
@@ -1 +1,2 @@
-from src.internal.stt.whisper_stt import WhisperSTT
\ No newline at end of file
+from src.internal.stt.whisper_stt import WhisperSTT
+from src.internal.stt.openai_stt import OpenAISTT
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/stt/base_stt.py b/space/space/space/space/space/space/space/space/space/src/internal/stt/base_stt.py
new file mode 100644
index 0000000000000000000000000000000000000000..e78bee9c6b102d25564f1bf7dd0d7dfb85c8fc6a
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/src/internal/stt/base_stt.py
@@ -0,0 +1,7 @@
+from abc import ABC, abstractmethod
+class STT(ABC):  # interface shared by the speech-to-text backends
+    def __init__(self):
+        pass
+    @abstractmethod
+    def transcribe(self, audio):
+        """Return the text transcribed from ``audio``; backends must override this."""
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/stt/openai_stt.py b/space/space/space/space/space/space/space/space/space/src/internal/stt/openai_stt.py
new file mode 100644
index 0000000000000000000000000000000000000000..96098ece41fe9a73339f2c25aef3af5c78b95c07
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/src/internal/stt/openai_stt.py
@@ -0,0 +1,15 @@
+from openai import OpenAI
+from src.internal.stt.base_stt import STT
+import io
+class OpenAISTT(STT):
+    """Speech-to-text backend that calls the OpenAI audio transcription API."""
+    def __init__(self, open_ai_client: OpenAI):
+        self.openai_client = open_ai_client
+    def transcribe(self, audio: io.BytesIO, model: str = "whisper-1", language: str = "id"):
+        """Return the text transcribed from ``audio`` with ``model`` and the ``language`` hint."""
+        # Forward the caller's model and language instead of fixed literals.
+        transcription = self.openai_client.audio.transcriptions.create(
+            model=model, file=audio, language=language)
+        return transcription.text
+
+    
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/stt/whisper_stt.py b/space/space/space/space/space/space/space/space/space/src/internal/stt/whisper_stt.py
index 6313b8c777794ca375869f6f98eb60ae9f120add..af72da58191b90183e326eab106f4bf5c587710f 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/stt/whisper_stt.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/stt/whisper_stt.py
@@ -2,16 +2,16 @@
 import whisper
 import torch
 from fastrtc.utils import audio_to_int16
+from src.internal.stt.base_stt import STT
 import io
 import os
 import tempfile
 
-class WhisperSTT:
+class WhisperSTT(STT):
     def __init__(self, model_size: str = "base", device: str = "auto"):
         cache_dir = os.environ.get('WHISPER_CACHE_DIR', '/tmp/.cache/whisper')
         os.makedirs(cache_dir, exist_ok=True)
         
-        
         if device == "auto":
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
         else:
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/tts/audio_edge_tts.py b/space/space/space/space/space/space/space/space/space/src/internal/tts/audio_edge_tts.py
index 003f26c7416dbd504b619e054bc73749dd76b53f..1671684a920d72d89688df0cceb990b5333c417d 100644
--- a/space/space/space/space/space/space/space/space/space/src/internal/tts/audio_edge_tts.py
+++ b/space/space/space/space/space/space/space/space/space/src/internal/tts/audio_edge_tts.py
@@ -2,9 +2,10 @@
 import edge_tts
 import asyncio
 from typing import AsyncGenerator, Optional, Tuple
+from src.internal.tts.base_tts import TTS
 import io
 
-class EdgeTTS:
+class EdgeTTS(TTS):
     def __init__(self, voice_short_name: str,  rate: str = "+0%", volume: str = "+0%", pitch: str = "+0Hz",):
         self.voice_short_name = voice_short_name
         self.rate_str = rate
diff --git a/space/space/space/space/space/space/space/space/space/src/internal/tts/base_tts.py b/space/space/space/space/space/space/space/space/space/src/internal/tts/base_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..81572515cca0e4fd309aef52a1aacea915ca0fd2
--- /dev/null
+++ b/space/space/space/space/space/space/space/space/space/src/internal/tts/base_tts.py
@@ -0,0 +1,17 @@
+
+from typing import AsyncGenerator, Optional, Tuple
+import io
+from abc import ABC, abstractmethod
+class TTS(ABC):  # contract implemented by every text-to-speech backend
+    @abstractmethod
+    def __init__(self):
+        """Abstract so each backend must provide its own constructor."""
+    @abstractmethod
+    async def generate_audio_stream(self, text: str) -> AsyncGenerator[bytes, None]:
+        """Stream the audio synthesized for ``text`` as byte chunks."""
+    @abstractmethod
+    async def generate_audio_buffer(self, text: str) -> Tuple[Optional[io.BytesIO], Optional[str]]:
+        """Synthesize ``text``; the first tuple item is the in-memory audio (second item: TODO confirm)."""
+    @abstractmethod
+    async def generate_audio_with_callback(self, text: str, callback_func):
+        """Synthesize ``text`` and pass the result to ``callback_func`` (exact protocol: TODO confirm)."""
diff --git a/space/space/space/space/space/space/space/space/space/src/provider/app_provider.py b/space/space/space/space/space/space/space/space/space/src/provider/app_provider.py
index a6d9a96ea9eed7cb29c32356caaea92f9395dc8e..d59b64895580c6c0d75266fabd757db25a401936 100644
--- a/space/space/space/space/space/space/space/space/space/src/provider/app_provider.py
+++ b/space/space/space/space/space/space/space/space/space/src/provider/app_provider.py
@@ -3,17 +3,18 @@ from src.provider.chatbot_provider import ChatBotProvider
 from src.provider.stt_provider import STTProvider
 from src.provider.tts_provider import TTSProvider
 from src.provider.rtc_provider import RTCProvider
+from openai import OpenAI
 
 class AppProvider:
-    def __init__(self):
+    def __init__(self, openai_client : OpenAI):
         
-        self.agents_provider = RAGAgentsProvider()
+        self.agents_provider = RAGAgentsProvider(openai_client)
         self.chatbot_provider = ChatBotProvider(self.agents_provider)
-        self.stt_provider = STTProvider()
+        self.stt_provider = STTProvider(openai_client)
         self.tts_provider = TTSProvider()
         self.rtc_provider = RTCProvider(self.agents_provider, self.stt_provider, self.tts_provider )
 
-    def provide_rag_agents(self) -> RAGAgentsProvider:
+    def provide_rag_agents(self, openai_client : OpenAI = None) -> RAGAgentsProvider:  # argument is ignored; optional so existing no-argument callers keep working
         return self.agents_provider
     def provide_chatbot(self) -> ChatBotProvider:
         return self.chatbot_provider
diff --git a/space/space/space/space/space/space/space/space/space/src/provider/rag_agents_provider.py b/space/space/space/space/space/space/space/space/space/src/provider/rag_agents_provider.py
index 66619e40c83f1d8c234858688fd629c614006973..2feff9666f95c4b1b3df5a9ca36616284b03f807 100644
--- a/space/space/space/space/space/space/space/space/space/src/provider/rag_agents_provider.py
+++ b/space/space/space/space/space/space/space/space/space/src/provider/rag_agents_provider.py
@@ -8,13 +8,15 @@ from src.internal.rag import (
     Inferencer, 
     InferencerConfig,
     CSAgent,
+    GPTCSAgent,
     QueryMakerAgent,
     DuckDuckGoSearch,
     get_chat_template
 )
-
+from openai import OpenAI
 class RAGAgentsProvider:
-    def __init__(self):
+    def __init__(self, open_ai_client : OpenAI):
+
         self.bnb = BitsAndBytesConfig(
                                     load_in_4bit=True,                    
                                     bnb_4bit_use_double_quant=True,         
@@ -67,7 +69,12 @@ class RAGAgentsProvider:
             inferencer = self.cs_inferencer,
             prompt_template = get_chat_template("customer_service")
         )
-
+        self.openai_client = open_ai_client
+        self.gpt_cs_agent = GPTCSAgent(ddgs = self.ddgs,
+                                       openai_client = self.openai_client,
+                                       prompt_template = get_chat_template("customer_service"),
+                                       retriever = self.document_retriever
+                                       )
         self.query_maker_chat_template = get_chat_template("query_maker")
         self.query_maker_chat_template[1]["content"] = """{question}"""
 
@@ -81,4 +88,8 @@ class RAGAgentsProvider:
     
     def provide_cs_agent(self) -> CSAgent:
         return self.cs_agent
-
+    
+    def provide_gpt_cs_agent(self) -> GPTCSAgent:
+        return self.gpt_cs_agent
+    
+    
diff --git a/space/space/space/space/space/space/space/space/space/src/provider/rtc_provider.py b/space/space/space/space/space/space/space/space/space/src/provider/rtc_provider.py
index a4c2883889c10cd7d82e23cdf25cc8a1a271198a..a628257f261cd61aba8af2570a761cbed8eb2932 100644
--- a/space/space/space/space/space/space/space/space/space/src/provider/rtc_provider.py
+++ b/space/space/space/space/space/space/space/space/space/src/provider/rtc_provider.py
@@ -5,16 +5,16 @@ from src.provider.tts_provider import TTSProvider
 from src.internal.rtc import RTCHandler
 class RTCProvider:
     def __init__(self, rag_agent_provider : RAGAgentsProvider, stt_provider : STTProvider, tts_provider : TTSProvider  ):
-        
-        self.tts_provider = tts_provider
-        self.stt_provider = stt_provider
-        self.rag_agent_provider = rag_agent_provider
 
         cs_agent = rag_agent_provider.provide_cs_agent()
+        cs_gpt_agent = rag_agent_provider.provide_gpt_cs_agent()
         whisper_stt = stt_provider.provide_whisper_stt()
         edge_tts = tts_provider.provide_edge_tts()
-
+        openai_stt = stt_provider.provide_openai_stt()
         self.rtc_handler = RTCHandler(cs_agent, whisper_stt , edge_tts)
-    
+        self.rtc_gpt_handler = RTCHandler(cs_gpt_agent, openai_stt, edge_tts )
+
     def provide_rtc_handler(self) -> RTCHandler:
         return self.rtc_handler
+    def provide_rtc_gpt_handler(self) -> RTCHandler:
+        return self.rtc_gpt_handler
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/src/provider/stt_provider.py b/space/space/space/space/space/space/space/space/space/src/provider/stt_provider.py
index a5ee1021f787393267d3ac29714f787de1a0128d..d9f609003cdb45726e5094fa09d013f61f472a9e 100644
--- a/space/space/space/space/space/space/space/space/space/src/provider/stt_provider.py
+++ b/space/space/space/space/space/space/space/space/space/src/provider/stt_provider.py
@@ -1,6 +1,15 @@
-from src.internal.stt.whisper_stt import WhisperSTT
+from src.internal.stt import(
+    WhisperSTT,
+    OpenAISTT
+)
+
+from openai import OpenAI
 class STTProvider:
-    def __init__(self):
+    def __init__(self, open_ai_client : OpenAI):
         self.whisper_stt = WhisperSTT(model_size = "base", device = "cuda")
+        self.open_ai_client = open_ai_client
+        self.openai_stt = OpenAISTT(open_ai_client)
     def provide_whisper_stt(self) -> WhisperSTT:
-        return self.whisper_stt
\ No newline at end of file
+        return self.whisper_stt
+    def provide_openai_stt(self) -> OpenAISTT:
+        return self.openai_stt
\ No newline at end of file
diff --git a/space/space/space/space/space/space/space/space/space/src/utils/audio_helper.py b/space/space/space/space/space/space/space/space/space/src/utils/audio_helper.py
index d93784fe88ff4008f30aed5418136e1f910f1d90..3e65ad5a6a696cac191725c7084b4d8cfab60f93 100644
--- a/space/space/space/space/space/space/space/space/space/src/utils/audio_helper.py
+++ b/space/space/space/space/space/space/space/space/space/src/utils/audio_helper.py
@@ -6,7 +6,7 @@ import torch
 import torchaudio
 import numpy as np
 
-def audio_to_bytes(self, audio_tuple, sample_rate=24000) -> io.BufferedReader:
+def audio_to_bytes(audio_tuple, sample_rate=24000) -> io.BufferedReader:
         sr, audio_data = audio_tuple
         audio_int16 = audio_to_int16(audio_tuple)
 
diff --git a/space/space/space/space/space/src/internal/stt/openai_stt.py b/space/space/space/space/space/src/internal/stt/openai_stt.py
index 96098ece41fe9a73339f2c25aef3af5c78b95c07..fc4eae795017754cfc72097fad2ed56605d1b300 100644
--- a/space/space/space/space/space/src/internal/stt/openai_stt.py
+++ b/space/space/space/space/space/src/internal/stt/openai_stt.py
@@ -6,9 +6,9 @@ class OpenAISTT(STT):
         self.openai_client = open_ai_client
     def transcribe(self, audio: io.BytesIO, model: str = "whisper-1", language: str = "id"):
         transcription = self.openai_client.audio.transcriptions.create(
-                model="whisper-1",
+                model=model,
                 file=audio,
-                language="id"
+                language=language
         )
         return transcription.text
 
diff --git a/space/space/space/space/src/internal/chatbot/chatbot_handler.py b/space/space/space/space/src/internal/chatbot/chatbot_handler.py
index 50dc9d6a5b4ed51da743f22a25160d243a05b732..f2f29403fd1a3fc6bfe282be188a1123a69722a7 100644
--- a/space/space/space/space/src/internal/chatbot/chatbot_handler.py
+++ b/space/space/space/space/src/internal/chatbot/chatbot_handler.py
@@ -15,7 +15,6 @@ class RAGChatbot:
         self.query_maker_agent = query_maker_agent
         self.title = title
         self.css = self._get_default_css()
-
         async def document_loader() -> AsyncGenerator[str, None]:
             await self.rag_agent.load_documents()
             yield "done"
@@ -48,13 +47,11 @@ class RAGChatbot:
         }
         """
     
-    async def _stream_response(self, message: str) -> AsyncGenerator[str, None]:
+    async def _stream_response(self, message: str, chat_memory : List[Dict], ) -> AsyncGenerator[str, None]:
         """Internal method untuk streaming response"""
         try:
             partial_response = ""
-            
-            
-            async for stream_data in self.rag_agent.get_result(question=message):
+            async for stream_data in self.rag_agent.get_result(chat_memory = chat_memory, question=message):
                 if stream_data["type"] == "chunk":
                     chunk = stream_data["data"]["chunk"]
                     partial_response += chunk
@@ -67,18 +64,19 @@ class RAGChatbot:
                 elif stream_data["type"] == "complete":
                     total_time = stream_data['data']['total_time']
                     print(f"\nTotal time: {total_time:.2f}s")
-                    
+            chat_memory.append({"role": "assistant", "content": partial_response })
+            print("Chat Memory :", chat_memory)
         except Exception as e:
             yield f"❌ Error: {str(e)}"
     
-    def _chatbot_response(self, message: str, history: List[Tuple[str, str]]):
+    def _chatbot_response(self, message: str, history: List[Tuple[str, str]], chat_memory : List[Dict]):
         """Generate chatbot response with proper async handling"""
         try:
             # Create new event loop for this thread
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             
-            async_gen = self._stream_response(message)
+            async_gen = self._stream_response(message = message, chat_memory = chat_memory)
             
             try:
                 while True:
@@ -87,6 +85,7 @@ class RAGChatbot:
             except StopAsyncIteration:
                 pass
             finally:
+                
                 loop.close()
                 
         except Exception as e:
@@ -103,12 +102,12 @@ class RAGChatbot:
             return "", history, True, gr.update(visible=True), gr.update(interactive=False)
         return message, history, generating, gr.update(visible=False), gr.update(interactive=True)
     
-    def _bot_message_stream(self, history: List, generating: bool):
+    def _bot_message_stream(self,  history: List, generating: bool, chat_memory : List[Dict],):
         """Handle streaming bot response"""
         if history and history[-1][1] is None and generating:
             user_msg = history[-1][0]
             
-            for partial_response in self._chatbot_response(user_msg, history):
+            for partial_response in self._chatbot_response(user_msg, history, chat_memory):
                 history[-1][1] = partial_response
                 yield history, True, gr.update(visible=True), gr.update(interactive=False)
             
diff --git a/space/space/space/space/src/internal/chatbot/ui_chatbot.py b/space/space/space/space/src/internal/chatbot/ui_chatbot.py
index 73647b13f0e5b2361892afe4ec788d05146505c6..5974fd89c92cacc382fa79f2a7aa89ecf983be94 100644
--- a/space/space/space/space/src/internal/chatbot/ui_chatbot.py
+++ b/space/space/space/space/src/internal/chatbot/ui_chatbot.py
@@ -113,13 +113,14 @@ class ChatbotUI(ChatbotInterface):
     def _bind_events(self, msg, chatbot_ui, send_btn, clear_btn, stop_btn, is_generating):
         """Bind all event handlers"""
         # Submit message events
+        chat_memory = gr.State([]) 
         submit_event = msg.submit(
             self.chatbot._user_message,
             inputs=[msg, chatbot_ui, is_generating],
             outputs=[msg, chatbot_ui, is_generating, stop_btn, send_btn]
         ).then(
             self.chatbot._bot_message_stream,
-            inputs=[chatbot_ui, is_generating],
+            inputs=[chatbot_ui, is_generating, chat_memory],
             outputs=[chatbot_ui, is_generating, stop_btn, send_btn]
         )
         
@@ -129,7 +130,7 @@ class ChatbotUI(ChatbotInterface):
             outputs=[msg, chatbot_ui, is_generating, stop_btn, send_btn]
         ).then(
             self.chatbot._bot_message_stream,
-            inputs=[chatbot_ui, is_generating],
+            inputs=[chatbot_ui, is_generating, chat_memory],
             outputs=[chatbot_ui, is_generating, stop_btn, send_btn]
         )
         
diff --git a/space/space/space/space/src/internal/rag/agents/customer_service_agent.py b/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
index 5c7789af288e8b0d85a1b5fe5811360ffa946437..d888eb3c95a356ebd8a351e8a3b291cca272ac80 100644
--- a/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
+++ b/space/space/space/space/src/internal/rag/agents/customer_service_agent.py
@@ -27,5 +27,5 @@ class CSAgent(Agent):
 
     async def get_result(self, chat_memory : List[Dict] = [], enable_search : bool = False, question : str = ""):
         self.inferencer.model.prompt_template = self.prompt_template
-        async for item in self.inferencer.infer_stream(chat_memory = chat_memory, query = question, enable_reranking=False, k=3):
+        async for item in self.inferencer.infer_stream(enable_search = enable_search, chat_memory = chat_memory, query = question, enable_reranking=False, k=3):
                 yield item
diff --git a/space/space/space/space/src/internal/rag/pipeline/language_model.py b/space/space/space/space/src/internal/rag/pipeline/language_model.py
index 424c14280186d26dc1a94fd14c68097a12680c76..3f775f8631dadf354a2db823d16d32079afe5988 100644
--- a/space/space/space/space/src/internal/rag/pipeline/language_model.py
+++ b/space/space/space/space/src/internal/rag/pipeline/language_model.py
@@ -382,12 +382,13 @@ class LanguageModel:
 
         prompt = await self.format_rag_prompt(question, contexts)
 
-        chat_memory = chat_memory + prompt
+        chat_memory += prompt
 
+        print("Chat Memory Recorded :", chat_memory)
         temp = temperature if temperature is not None else 0.3
         
         async for chunk in self.generate_stream(
-            prompt=prompt,
+            prompt=chat_memory,
             max_new_tokens=max_new_tokens,
             temperature=temp,
             **kwargs
@@ -407,8 +408,8 @@ class LanguageModel:
         
         
         prompt = await self.format_rag_prompt(question, contexts)
-        
-        chat_memory = chat_memory + prompt
+
+        chat_memory += prompt
         
         temp = temperature if temperature is not None else 0.3
         
diff --git a/space/space/space/src/internal/rag/pipeline/onnx_embedding.py b/space/space/space/src/internal/rag/pipeline/onnx_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..be86d03e34c6cbcee269f23b7172684af8603054
--- /dev/null
+++ b/space/space/space/src/internal/rag/pipeline/onnx_embedding.py
@@ -0,0 +1,40 @@
+from typing import List
+from langchain_core.embeddings import Embeddings
+from transformers import AutoTokenizer
+import onnxruntime as ort
+import numpy as np
+
+
+class ONNXEmbedding(Embeddings):  # LangChain Embeddings implementation backed by an ONNX Runtime session
+    def __init__(self, embedding_model_name: str, onnx_model_path: str):
+        self.onnx_embedding_model = ort.InferenceSession(
+            onnx_model_path,
+            providers=["CPUExecutionProvider"]  # Can be switched to CUDAExecutionProvider to run on GPU
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
+
+    def _embed(self, texts: List[str]) -> List[List[float]]:
+        """Tokenize `texts` as one padded, truncated batch, run the ONNX session, and return its first output as lists."""
+        inputs = self.tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            return_tensors="np"  # return NumPy arrays directly
+        )
+
+        outputs = self.onnx_embedding_model.run(
+            None,  # None requests all model outputs
+            {k: v for k, v in inputs.items()}
+        )
+        # NOTE(review): assumes output 0 holds one embedding vector per input text; confirm for the exported model
+        embeddings = outputs[0]
+        
+        return embeddings.tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a single query text."""
+        return self._embed([text])[0]
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of texts."""
+        return self._embed(texts)
diff --git a/space/space/space/src/internal/rag/retriever/langchain_retriever.py b/space/space/space/src/internal/rag/retriever/langchain_retriever.py
index f7ec2e66d671bb66e557f0d42a9a86652ab5305c..34722f027ef8d1114e60b4d531622aedcce1e099 100644
--- a/space/space/space/src/internal/rag/retriever/langchain_retriever.py
+++ b/space/space/space/src/internal/rag/retriever/langchain_retriever.py
@@ -13,7 +13,8 @@ from typing import Dict, Optional, List
 from src.internal.rag.retriever.document_loader import MultiFormatDocumentLoader
 from src.internal.rag.retriever.document_processor import DocumentProcessor
 from src.internal.rag.retriever.retriever_types import ProcessingResult, ProcessingStatus, RetrievalResult, DocumentMetadata
-
+from src.internal.rag.pipeline.onnx_embedding import ONNXEmbedding
+from langchain_community.embeddings import QuantizedBgeEmbeddings
 import asyncio
 from pathlib import Path
 import logging
@@ -50,6 +51,11 @@ class LangChainRetriever(BaseRetriever):
         try:
             if self.embedding_model.startswith("text-embedding"):
                 return OpenAIEmbeddings(model=self.embedding_model)
+            elif ".onnx" in self.embedding_model:
+                embedding_str = self.embedding_model.split(":")
+                embedding_model_name = embedding_str[0]
+                onnx_embedding_model_path = embedding_str[1]
+                return ONNXEmbedding(embedding_model_name, f"onnx/{onnx_embedding_model_path}")
             else:
                 return HuggingFaceEmbeddings(model_name=self.embedding_model)
         except Exception as e:
diff --git a/space/space/space/src/provider/rag_agents_provider.py b/space/space/space/src/provider/rag_agents_provider.py
index 2feff9666f95c4b1b3df5a9ca36616284b03f807..c8d2ca7d1d7a3c64eb02b8178f74aa69f1fdc919 100644
--- a/space/space/space/src/provider/rag_agents_provider.py
+++ b/space/space/space/src/provider/rag_agents_provider.py
@@ -42,7 +42,7 @@ class RAGAgentsProvider:
         )
 
         self.document_retriever = LangChainRetriever(
-                embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+                embedding_model="BAAI/bge-m3:bge-m3/model.onnx",
                 vectorstore_type="chroma",
                 vectorstore_path="data/vectorstore/",
                 use_hybrid_search=True,
diff --git a/space/space/src/provider/rag_agents_provider.py b/space/space/src/provider/rag_agents_provider.py
index c8d2ca7d1d7a3c64eb02b8178f74aa69f1fdc919..f3530b85c4494af0eea526fe28efbfbd857091b8 100644
--- a/space/space/src/provider/rag_agents_provider.py
+++ b/space/space/src/provider/rag_agents_provider.py
@@ -69,6 +69,7 @@ class RAGAgentsProvider:
             inferencer = self.cs_inferencer,
             prompt_template = get_chat_template("customer_service")
         )
+        
         self.openai_client = open_ai_client
         self.gpt_cs_agent = GPTCSAgent(ddgs = self.ddgs,
                                        openai_client = self.openai_client,
diff --git a/src/internal/rag/retriever/langchain_retriever.py b/src/internal/rag/retriever/langchain_retriever.py
index 34722f027ef8d1114e60b4d531622aedcce1e099..f0e7051f5f4253abe123cad92554ee60052e95fd 100644
--- a/src/internal/rag/retriever/langchain_retriever.py
+++ b/src/internal/rag/retriever/langchain_retriever.py
@@ -60,7 +60,6 @@ class LangChainRetriever(BaseRetriever):
                 return HuggingFaceEmbeddings(model_name=self.embedding_model)
         except Exception as e:
             logger.error(f"Error initializing embeddings: {str(e)}")
-            return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 
     def _initialize_vectorstore(self):
         try: