Spaces:

MuhammadMahmoud
/

Aoun-Ai

Sleeping

App Files Files Community

MuhammadMahmoud committed on 21 days ago

Commit

468ea61

1 Parent(s): 06bb2e7

enhance rag

Browse files

Files changed (21) hide show

app/api/chat.py +12 -36
app/api/feedback.py +8 -9
app/api/health.py +8 -14
app/api/kb_admin.py +50 -1
app/api/ocr.py +8 -17
app/api/prediction.py +8 -8
app/api/voice.py +8 -25
app/core/__pycache__/config.cpython-313.pyc +0 -0
app/core/config.py +2 -0
app/services/chat/chat_engine.py +18 -6
app/services/rag/README.md +30 -16
app/services/rag/__pycache__/knowledge_base.cpython-313.pyc +0 -0
app/services/rag/__pycache__/rag_engine.cpython-313.pyc +0 -0
app/services/rag/embedder.py +174 -0
app/services/rag/knowledge_base.py +39 -12
app/services/rag/rag_engine.py +62 -124
app/services/rag/vector_store.py +310 -0
docker-compose.yml +24 -0
main.py +19 -0
requirements.txt +1 -0
verify_rag.py +42 -0

app/api/chat.py CHANGED Viewed

@@ -20,17 +20,10 @@ logger = logging.getLogger(__name__)
 @router.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(payload: ChatRequest, request: Request):
     """
-    Standard chat endpoint — sends a message and returns the full response.
-    - **message**: The user's text message.
-    - **history**: Optional list of previous messages for context.
-    - **mode**: Execution mode ('chat' for Q&A only, 'agent' for tool execution).
-      Default: 'agent'
-    Returns:
-    - **response**: Text response from the assistant.
-    - **history**: Updated conversation history.
-    - **confirmation**: Pending confirmation request if risky tool needs user approval.
     """
     try:
         user_key = payload.session_id or "anonymous"
@@ -101,17 +94,10 @@ async def chat_endpoint(payload: ChatRequest, request: Request):
 )
 async def chat_stream(payload: ChatRequest, request: Request):
     """
-    Streaming chat endpoint — returns tokens as they arrive via Server-Sent Events (SSE).
-    The user sees the response character-by-character (like ChatGPT), reducing perceived
-    latency from ~4s to ~0.5s.
-    - **message**: The user's text message.
-    - **history**: Optional list of previous messages for context.
-    - **mode**: Execution mode ('chat' for Q&A only, 'agent' for tool execution).
-      Default: 'agent'
-    Returns a `text/event-stream` response. Each chunk is formatted as `data: <text>\\n\\n`.
-    The stream ends with `data: [DONE]\\n\\n`.
     """
     try:
         user_key = payload.session_id or "anonymous"
@@ -188,20 +174,10 @@ async def chat_confirm(
     approved: bool = Query(..., description="User decision: true to approve, false to reject"),
 ):
     """
-    Confirm or reject a pending tool execution request.
-    When a risky tool operation is detected (e.g., cancel_request), the system
-    creates a confirmation request and returns it to the user. This endpoint allows
-    the user to approve or reject the operation.
-    Query Parameters:
-    - **confirmation_id**: UUID of the pending confirmation request.
-    - **approved**: Boolean - true to execute the tool, false to reject.
-    Returns:
-    - **status**: "approved" or "rejected"
-    - **confirmation_id**: UUID of the confirmation
-    - **message**: Status message in Arabic
     """
     try:
         confirmation_manager = get_confirmation_manager()

 @router.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(payload: ChatRequest, request: Request):
     """
+    Processes standard text-based chat interactions returning a single synchronous response.
+    Expects a ChatRequest payload containing the user message, session ID, and chat mode.
+    Under the hood, it applies rate-limiting, RAG context retrieval, and multi-provider LLM processing.
+    Returns a ChatResponse encompassing the generated text, updated history, and optional tool confirmations.
     """
     try:
         user_key = payload.session_id or "anonymous"
 )
 async def chat_stream(payload: ChatRequest, request: Request):
     """
+    Handles streaming chat interactions utilizing Server-Sent Events (SSE) for low-latency responses.
+    Accepts the identical ChatRequest payload but yields generated text tokens in real-time.
+    Implements multi-tiered rate limiting and routes through the robust AI provider circuit-breaker.
+    Returns a sequential `text/event-stream` ending gracefully with a `data: [DONE]` signal.
     """
     try:
         user_key = payload.session_id or "anonymous"
     approved: bool = Query(..., description="User decision: true to approve, false to reject"),
 ):
     """
+    Processes mandatory user confirmations for sensitive tool operations triggered in AI Agent mode.
+    Requires a valid confirmation UUID and a boolean decision (true to approve, false to reject).
+    Validates expiration state, shifts the execution tracker, and securely unblocks backend tasks.
+    Returns a structured confirmation block containing the execution outcome and localized UI alerts.
     """
     try:
         confirmation_manager = get_confirmation_manager()

app/api/feedback.py CHANGED Viewed

@@ -31,13 +31,10 @@ class FeedbackResponse(BaseModel):
 @router.post("/feedback", response_model=FeedbackResponse)
 async def submit_feedback(data: FeedbackRequest):
     """
-    Submit a correction to an AI prediction.
-    - **prediction_id**: UUID from the original prediction response.
-    - **original_prediction**: What the AI predicted (e.g. "Medium").
-    - **corrected_prediction**: What it should have been (e.g. "High").
-    - **corrected_by**: Optional employee identifier.
-    - **reason**: Optional explanation for the correction.
     """
     feedback_id = str(uuid.uuid4())
     try:
@@ -58,8 +55,10 @@ async def submit_feedback(data: FeedbackRequest):
 @router.get("/feedback/summary")
 async def get_feedback_summary():
     """
-    Get aggregated statistics about AI prediction corrections.
-    Admin-only endpoint — shows how often corrections are made and what patterns exist.
     """
     try:
         return feedback_store.get_summary()

 @router.post("/feedback", response_model=FeedbackResponse)
 async def submit_feedback(data: FeedbackRequest):
     """
+    Ingests explicit user feedback validating or rejecting prior AI-driven predictions.
+    Receives payloads mapping the original AI output alongside the human correction footprint.
+    Persists evaluation data chronologically into the structured feedback reporting store.
+    Returns a standard confirmation receipt bridging data gaps between AI logic and human oversight.
     """
     feedback_id = str(uuid.uuid4())
     try:
 @router.get("/feedback/summary")
 async def get_feedback_summary():
     """
+    Aggregates accumulated AI prediction corrections into high-level evaluative statistics.
+    Scans the feedback repository to compute drift metrics, common patterns, and accuracy rates.
+    Designed primarily for platform administrators establishing continuous LLM refinement loops.
+    Returns a structured dictionary mapping distinct models to their respective human correction summaries.
     """
     try:
         return feedback_store.get_summary()

app/api/health.py CHANGED Viewed

@@ -10,8 +10,10 @@ router = APIRouter(prefix="/health", tags=["health"])
 @router.get("/readiness")
 async def readiness():
     """
-    Basic readiness check — verifies critical env vars are configured.
-    Returns 'ok' when the primary LLM (Groq) is configured.
     """
     checks = {
         "dotnet_configured": bool(settings.DOTNET_API_BASE_URL),
@@ -30,18 +32,10 @@ async def readiness():
 @router.get("/providers")
 async def provider_health():
     """
-    Real-time LLM provider health snapshot.
-    Returns the current state of every provider and model in the router:
-    - available: True if the provider can accept requests right now
-    - permanently_disabled: True if all models returned 404 (deprecated)
-    - cooldown_remaining_s: Seconds until the provider exits its cooldown window
-    - active_count / total_count: How many providers are currently usable
-    Use this endpoint to:
-    - Set up uptime alerts (active_count == 0 → page on-call)
-    - Debug fallback routing issues
-    - Verify model registry after a deployment
     """
     # Import here to avoid circular imports at module load time
     from app.services.chat.api.llm_router import llm_router

 @router.get("/readiness")
 async def readiness():
     """
+    Validates essential environment configurations confirming baseline operational readiness.
+    Performs deterministic environment checks for active LLM keys, Redis, and Qdrant connections.
+    Dictates whether the gateway node should accept ingress traffic or flag as heavily degraded.
+    Returns a consolidated status string alongside boolean flags for active backend infrastructure elements.
     """
     checks = {
         "dotnet_configured": bool(settings.DOTNET_API_BASE_URL),
 @router.get("/providers")
 async def provider_health():
     """
+    Exposes a real-time, circuit-breaker-aware snapshot of all active LLM upstream routing providers.
+    Aggregates connection health, permanent deprecation status, and active cooldown timeout thresholds.
+    Enables DevOps systems and administrative dashboards to monitor fallback degradation pipelines proactively.
+    Returns granular provider arrays mapped to their respective recovery timestamps and load configurations.
     """
     # Import here to avoid circular imports at module load time
     from app.services.chat.api.llm_router import llm_router

app/api/kb_admin.py CHANGED Viewed

@@ -1,7 +1,8 @@
-from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 from app.services.rag.rag_engine import rag_engine
 router = APIRouter(prefix="/kb", tags=["kb-admin"])
@@ -13,18 +14,66 @@ class KbDocumentInput(BaseModel):
 @router.get("/search")
 async def search_kb(q: str, top_k: int = 3):
     return {"results": rag_engine.search(q, top_k=top_k)}
 @router.post("/documents")
 async def add_document(payload: KbDocumentInput):
     if not payload.title.strip() or not payload.content.strip():
         raise HTTPException(status_code=400, detail="title/content مطلوبين")
     return rag_engine.add_document(payload.title, payload.content)
 @router.post("/refresh")
 async def refresh_kb():
     rag_engine.refresh_index()
     return {"refreshed": True}

+from fastapi import APIRouter, HTTPException, Query
 from pydantic import BaseModel
 from app.services.rag.rag_engine import rag_engine
+from app.services.rag.vector_store import vector_store
 router = APIRouter(prefix="/kb", tags=["kb-admin"])
 @router.get("/search")
 async def search_kb(q: str, top_k: int = 3):
+    """
+    Interrogates the underlying Retrieval-Augmented Generation (RAG) knowledge base for semantic similarities.
+    Accepts text queries and vector-embeds them utilizing the natively configured embedding parameters.
+    Retrieves the topmost relevant corporate document snippets utilizing optimized cosine similarity algorithms.
+    Returns a structured array of related textual contexts to synthesize or debug LLM contextual injections.
+    """
     return {"results": rag_engine.search(q, top_k=top_k)}
 @router.post("/documents")
 async def add_document(payload: KbDocumentInput):
+    """
+    Ingests authoritative organizational documents permanently embedding them into the vector index space.
+    Requires structured title and content pairings to partition distinct semantic knowledge representations.
+    Executes automated embedding generation sequences to populate the local or remote semantic structures.
+    Returns boolean confirmation markers signaling the successful propagation and indexing of the data chunks.
+    """
     if not payload.title.strip() or not payload.content.strip():
         raise HTTPException(status_code=400, detail="title/content مطلوبين")
     return rag_engine.add_document(payload.title, payload.content)
+@router.get("/documents")
+async def list_documents(limit: int = 50, offset: int = 0):
+    """
+    Lists documents currently indexed in the knowledge base.
+    Supports basic pagination via limit and offset parameters.
+    """
+    return {"documents": rag_engine.list_documents(limit=limit, offset=offset)}
+@router.delete("/documents/{doc_id}")
+async def delete_document(doc_id: str):
+    """
+    Removes a document from the active vector index.
+    Requires the unique UUID of the target document.
+    """
+    success = rag_engine.delete_document(doc_id)
+    if not success:
+        raise HTTPException(status_code=404, detail="Document not found or could not be deleted")
+    return {"deleted": True, "id": doc_id}
+@router.get("/stats")
+async def kb_stats():
+    """
+    Returns operational statistics about the underlying vector database.
+    Useful for health checks and capacity monitoring.
+    """
+    return {"vector_store": vector_store.get_collection_info()}
 @router.post("/refresh")
 async def refresh_kb():
+    """
+    Forces a synchronous refresh operation against the local disk storage or remote vector representation indexes.
+    Commands the RAG singleton engine to reload memory-mapped chunks directly from persistent state directory architectures.
+    Essential tool for ensuring high consistency following batched corporate knowledge base uploads via the admin portal.
+    Returns a simple acknowledgment flag dictating the flush cycle and data reloading methodologies have successfully terminated.
+    """
     rag_engine.refresh_index()
     return {"refreshed": True}

app/api/ocr.py CHANGED Viewed

@@ -22,13 +22,10 @@ async def process_ocr(
     document_type: DocumentType = Form(..., description="Type of the document being scanned"),
 ):
     """
-    Process an Arabic document image and extract structured data as JSON.
-    - **file**: The document image to scan (max 10 MB).
-    - **document_type**: One of the supported document types (id_card, income_proof, etc.).
-    The router automatically tries providers in order (Groq → Gemini → OpenAI → HuggingFace)
-    and falls back if any provider hits a rate limit or error.
     """
     contents = await _read_and_validate(file)
     try:
@@ -50,16 +47,10 @@ async def process_ocr_with_analysis(
     document_type: DocumentType = Form(..., description="Type of the document being scanned"),
 ):
     """
-    **OCR + LLM Analysis** — Extract structured data from a document image,
-    then run intelligent AI analysis to produce: summary, risk level (high/medium/low),
-    severity score (0–100), key findings, and a recommendation for the case worker.
-    Steps performed:
-    1. OCR extraction via multi-provider router (Groq → Gemini → OpenAI → HuggingFace)
-    2. LLM analysis via Groq Llama 3.3 70B
-    - **file**: The document image to scan (max 10 MB).
-    - **document_type**: One of the supported document types.
     """
     contents = await _read_and_validate(file)
     try:

     document_type: DocumentType = Form(..., description="Type of the document being scanned"),
 ):
     """
+    Processes official Arabic document images (e.g., IDs, proofs) to extract raw text logic.
+    Accepts standard image formats (PNG, JPG) routing them through a priority-based Vision AI chain.
+    Automatically handles provider rotation and fallback degradation mechanisms ensuring high availability.
+    Returns the processed string output alongside the identifier indicating the successful extraction engine.
     """
     contents = await _read_and_validate(file)
     try:
     document_type: DocumentType = Form(..., description="Type of the document being scanned"),
 ):
     """
+    Executes a multi-stage structured extraction paired with semantic risk assessment on Arabic uploads.
+    Initiates base OCR extractions which cleanly filter into a specialized Llama 3.3 70B analytical pipeline.
+    Synthesizes raw text blocks into concise bullet findings, hierarchical summaries, and relative risk severities.
+    Outputs comprehensive JSON formatting marrying granular text data with executive case worker recommendations.
     """
     contents = await _read_and_validate(file)
     try:

app/api/prediction.py CHANGED Viewed

@@ -24,10 +24,10 @@ logger = logging.getLogger(__name__)
 @router.post("/need-level", response_model=NeedLevelResponse)
 async def predict_need_level(request: Request, data: NeedLevelRequest):
     """
-    Assess the need level of a family's assistance request.
-    Uses rule-based guardrails first (for extreme cases), then falls back
-    to the ML model for nuanced classification.
     """
     prediction_id = str(uuid.uuid4())
     start_time = time.perf_counter()
@@ -115,10 +115,10 @@ async def predict_need_level(request: Request, data: NeedLevelRequest):
 @router.post("/assistance-type", response_model=AssistanceTypeResponse)
 async def classify_assistance(request: Request, data: NeedLevelRequest):
     """
-    Classify the type of assistance needed.
-    Uses deterministic rules first as guardrails, then falls back
-    to an ML model for nuanced classification if available.
     """
     prediction_id = str(uuid.uuid4())
     start_time = time.perf_counter()

 @router.post("/need-level", response_model=NeedLevelResponse)
 async def predict_need_level(request: Request, data: NeedLevelRequest):
     """
+    Evaluates the urgency and need score of an assistance request using a hybrid ML approach.
+    Expects structured financial and demographic data mapped to a standard NeedLevelRequest model.
+    Instantly applies deterministic guardrails before deferring to the XGBoost risk scoring engine.
+    Returns the categorized need level, confidence percentiles, and optional SHAP feature explanations.
     """
     prediction_id = str(uuid.uuid4())
     start_time = time.perf_counter()
 @router.post("/assistance-type", response_model=AssistanceTypeResponse)
 async def classify_assistance(request: Request, data: NeedLevelRequest):
     """
+    Classifies the most appropriate category of demographic assistance necessary for an applicant.
+    Ingests core family metrics to evaluate rigid deterministic rules (e.g., medical extremity).
+    Falls back transparently to a dedicated classification model capturing advanced socioeconomic contexts.
+    Returns the recommended assistance string definition, boolean rule triggers, and analytical confidence.
     """
     prediction_id = str(uuid.uuid4())
     start_time = time.perf_counter()

app/api/voice.py CHANGED Viewed

@@ -29,16 +29,10 @@ async def voice_chat(
     access_token: Optional[str] = Form(None, description="JWT Bearer token from .NET"),
 ):
     """
-    **Voice AI Chat** — Send audio, receive transcription + AI response.
-    Pipeline:
-    1. Audio file → Groq Whisper → Transcribed Arabic text
-    2. Text → Chat Engine (FAQ → Cache → Groq Llama 3.3 70B)
-    3. Returns: transcription + AI response + conversation history
-    **Supported formats:** mp3, wav, webm, m4a, ogg (max 25 MB)
-    **Session memory:** Pass `session_id` to maintain conversation context across calls.
     """
     # ── Guard: Rate Limits ──
     client_ip = getattr(request, "client", None)
@@ -136,21 +130,10 @@ async def voice_stream(
     access_token: Optional[str] = Form(None, description="JWT Bearer token from .NET"),
 ):
     """
-    **Streaming Voice AI Chat** — Send audio, receive transcription immediately then
-    stream the AI response token-by-token (like ChatGPT), reducing perceived latency.
-    Pipeline:
-    1. Audio file → Groq Whisper → Transcribed text
-    2. **First SSE event** immediately sends the transcription so the UI can show it.
-    3. Text → Chat Engine → Groq Llama 3.3 70B (streamed token-by-token)
-    4. Stream ends with `data: [DONE]`
-    **Event format:**
-    - Transcription event: `data: {"type": "transcription", "text": "..."}\\n\\n`
-    - Text chunk event:    `data: {"type": "chunk", "text": "..."}\\n\\n`
-    - Done event:          `data: [DONE]\\n\\n`
-    **Supported formats:** mp3, wav, webm, m4a, ogg (max 25 MB)
     """
     # ── Guard: Rate Limits ──
     client_ip = getattr(request, "client", None)

     access_token: Optional[str] = Form(None, description="JWT Bearer token from .NET"),
 ):
     """
+    End-to-end voice processing pipeline orchestrating internal Speech-to-Text inference and intelligent chat logic.
+    Accepts robust audio file uploads (mp3, wav, etc.) and performs latency-optimized Whisper transcriptions.
+    Feeds the transcribed Arabic text directly into the chat engine along with existing session context.
+    Outputs a consolidated VoiceResponse detailing the text transcription, the AI reply, and history trails.
     """
     # ── Guard: Rate Limits ──
     client_ip = getattr(request, "client", None)
     access_token: Optional[str] = Form(None, description="JWT Bearer token from .NET"),
 ):
     """
+    Low-latency streaming voice pipeline bridging Whisper transcription workflows with real-time SSE token events.
+    Synchronously transcribes the audio payload and immediately pushes the recognized text to the client UI.
+    Subsequently streams the LLM's dynamically generated conversational response token-by-token directly.
+    Inherits all core chat fallback integrations, ensuring high availability even during backend provider stress.
     """
     # ── Guard: Rate Limits ──
     client_ip = getattr(request, "client", None)

app/core/__pycache__/config.cpython-313.pyc CHANGED Viewed

Binary files a/app/core/__pycache__/config.cpython-313.pyc and b/app/core/__pycache__/config.cpython-313.pyc differ

app/core/config.py CHANGED Viewed

@@ -28,6 +28,8 @@ class Settings(BaseSettings):
     HUGGINGFACE_API_KEY: str = ""
     OPENROUTER_API_KEY: str = ""
     HF_EMBED_MODEL: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
     # Chat Settings (Groq — 14,400 req/day free)
     GROQ_API_KEY: str = ""

     HUGGINGFACE_API_KEY: str = ""
     OPENROUTER_API_KEY: str = ""
     HF_EMBED_MODEL: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+    EMBEDDING_MODE: str = "local"  # "local" (sentence-transformers) or "api" (HuggingFace Inference)
+    EMBEDDING_DIM: int = 384  # Dimension of the embedding model
     # Chat Settings (Groq — 14,400 req/day free)
     GROQ_API_KEY: str = ""

app/services/chat/chat_engine.py CHANGED Viewed

@@ -116,12 +116,6 @@ class ChatEngine:
                  msg = "لقد أجبتك على هذا السؤال للتو! 😊\nهل هناك شيء محدد لم يكن واضحاً في إجابتي السابقة؟ أنا هنا لمساعدتك."
                  return msg, cleaned_message, list(history or [])
-        # FAQ Pre-Check
-        faq_hit = find_faq_answer(cleaned_message)
-        if faq_hit:
-            logger.info("FAQ hit for: %r — skipping API call", cleaned_message[:40])
-            return faq_hit, cleaned_message, list(history or [])
         # Cache Pre-Check
         cache_key = get_cache_key(cleaned_message, history)
         cached_response = get_cached_response(cache_key)
@@ -188,6 +182,14 @@ class ChatEngine:
         from app.services.rag.rag_engine import rag_engine
         rag_context = rag_engine.get_context(cleaned_message)
         messages = self._build_messages(cleaned_message, history, rag_context=rag_context)
         cache_key = get_cache_key(cleaned_message, history)
@@ -236,6 +238,16 @@ class ChatEngine:
         from app.services.rag.rag_engine import rag_engine
         rag_context = rag_engine.get_context(cleaned_message)
         messages = self._build_messages(cleaned_message, history, rag_context=rag_context)
         cache_key = get_cache_key(cleaned_message, history)

                  msg = "لقد أجبتك على هذا السؤال للتو! 😊\nهل هناك شيء محدد لم يكن واضحاً في إجابتي السابقة؟ أنا هنا لمساعدتك."
                  return msg, cleaned_message, list(history or [])
         # Cache Pre-Check
         cache_key = get_cache_key(cleaned_message, history)
         cached_response = get_cached_response(cache_key)
         from app.services.rag.rag_engine import rag_engine
         rag_context = rag_engine.get_context(cleaned_message)
+        # Priority: RAG > FAQ
+        if not rag_context:
+            faq_hit = find_faq_answer(cleaned_message)
+            if faq_hit:
+                logger.info("FAQ hit for: %r — skipping API call", cleaned_message[:40])
+                updated_history = self._update_history(history, message, faq_hit, session_id)
+                return faq_hit, updated_history, None
         messages = self._build_messages(cleaned_message, history, rag_context=rag_context)
         cache_key = get_cache_key(cleaned_message, history)
         from app.services.rag.rag_engine import rag_engine
         rag_context = rag_engine.get_context(cleaned_message)
+        # Priority: RAG > FAQ
+        if not rag_context:
+            faq_hit = find_faq_answer(cleaned_message)
+            if faq_hit:
+                logger.info("FAQ hit for: %r — skipping API call in stream", cleaned_message[:40])
+                self._update_history(history, message, faq_hit, session_id)
+                yield f"data: {json.dumps(faq_hit, ensure_ascii=False)}\n\n"
+                yield "data: [DONE]\n\n"
+                return
         messages = self._build_messages(cleaned_message, history, rag_context=rag_context)
         cache_key = get_cache_key(cleaned_message, history)

app/services/rag/README.md CHANGED Viewed

@@ -1,20 +1,34 @@
 # خدمة البحث المعزز بالاسترجاع (RAG)
-محرك هجين يجمع بين بحث متجهات (Qdrant + HuggingFace Embeddings) وFallback سريع بـ TF‑IDF محلي.
 ## كيف يعمل؟
-- يبني مصفوفة TF‑IDF من `knowledge_base.py` عند التهيئة.
-- يحاول تفعيل Qdrant إذا توفرت (`QDRANT_URL` + `HUGGINGFACE_API_KEY`)، وإلا يستخدم TF‑IDF فقط.
-- `search(query)`: يفضّل المتجهات، ويعود إلى TF‑IDF عند الفشل أو غياب المفاتيح.
-- `get_context(query)`: يجمع أفضل النتائج في نص واحد يحقن داخل Prompt الشات.
-- `add_document` و`refresh_index`: لإضافة/إعادة بناء الفهارس.
-## الاعتمادات
-- مفاتيح: `QDRANT_URL`, `QDRANT_API_KEY`, `HUGGINGFACE_API_KEY`, `HF_EMBED_MODEL`, `QDRANT_COLLECTION_NAME`.
-- مكتبات: `qdrant-client`, `httpx`, `scikit-learn`.
-## تحسينات مقترحة
-- توسيع قاعدة المعرفة بـ 10–15 مقالة إضافية تغطي السياسات والأسئلة المتكررة (P1).
-- إضافة واجهة إدارة بسيطة (لوحة أو Endpoint) لإدخال/تعديل المقالات مع صلاحيات.
-- تخزين نتائج البحث في Cache قصير لتحسين زمن الاستجابة لأسئلة متكررة.
-- تسجيل درجات التشابه في `audit_log` لتحليل جودة الاسترجاع بمرور الوقت.

 # خدمة البحث المعزز بالاسترجاع (RAG)
+محرك هجين يجمع بين بحث متجهات متقدم (Qdrant + Local Embeddings) وFallback سريع بـ TF‑IDF محلي.
+## البنية المعمارية الجديدة
+تم ترقية النظام من (TF-IDF + RAM) إلى نظام إنتاجي متكامل:
+- **Vector DB:** يستخدم Qdrant لتخزين دائم ومستمر للبيانات.
+- **Embeddings:** يعتمد على نموذج `sentence-transformers` محلي (`paraphrase-multilingual-MiniLM-L12-v2`) بـ 384 بعد بسرعة عالية (بدون انتظار API خارجي).
+- **Fallback:** في حال توقف Qdrant، يعود النظام فوراً لاستخدام الـ TF-IDF الموجود بالذاكرة لضمان عدم توقف الخدمة.
 ## كيف يعمل؟
+- عند بدء التشغيل، يتم محاولة تحميل نموذج الـ Embeddings وفتح اتصال بقاعدة بيانات Qdrant.
+- إذا كانت قاعدة البيانات فارغة، سيتم تلقائياً تزويدها بالبيانات الأساسية الموجودة في `knowledge_base.py`.
+- `search(query)`: يفضل البحث باستخدام المتجهات أولاً، ويعود إلى TF-IDF كخيار احتياطي.
+- `get_context(query)`: يجمع أفضل النتائج لتغذية الـ LLM بالمعلومات الدقيقة.
+## الاعتمادات المطلوبة
+- `qdrant-client` للاتصال بقاعدة البيانات.
+- `sentence-transformers` و `torch` لعملية الـ Embedding محلياً.
+## إدارة المعرفة (KB Admin API)
+تم توفير مسارات متكاملة للتحكم بقاعدة المعرفة (تستمر البيانات بعد إعادة التشغيل):
+1. `GET /api/kb/documents` : استعراض المقالات الموجودة.
+2. `POST /api/kb/documents`: إضافة مقالة جديدة يتم تحويلها إلى متجهات (Embeddings) وإضافتها لـ Qdrant والـ Fallback معاً.
+3. `DELETE /api/kb/documents/{id}`: مسح مقالة.
+4. `GET /api/kb/stats`: معرفة إحصائيات قاعدة البيانات (لغرض الصحة والمراقبة).
+## التشغيل
+يرجى استخدام Docker Compose لتشغيل التطبيق و Qdrant معاً:
+```bash
+docker-compose up -d
+```

app/services/rag/__pycache__/knowledge_base.cpython-313.pyc CHANGED Viewed

Binary files a/app/services/rag/__pycache__/knowledge_base.cpython-313.pyc and b/app/services/rag/__pycache__/knowledge_base.cpython-313.pyc differ

app/services/rag/__pycache__/rag_engine.cpython-313.pyc CHANGED Viewed

Binary files a/app/services/rag/__pycache__/rag_engine.cpython-313.pyc and b/app/services/rag/__pycache__/rag_engine.cpython-313.pyc differ

app/services/rag/embedder.py ADDED Viewed

	@@ -0,0 +1,174 @@

+"""
+Embedding Service — Generates vector embeddings for text.
+Primary:  Local `sentence-transformers` model (no network round-trip, no rate limits).
+Fallback: HuggingFace Inference API (HTTP) if local model fails to load.
+The model is loaded once at startup and reused for all embedding requests.
+"""
+import logging
+from typing import List, Optional
+import httpx
+from app.core.config import settings
+logger = logging.getLogger(__name__)
+# ── Lazy imports for sentence-transformers (heavy dependency) ────────────────
+_local_model = None
+_local_model_loaded = False
+def _load_local_model():
+    """Load the sentence-transformers model into memory (one-time)."""
+    global _local_model, _local_model_loaded
+    if _local_model_loaded:
+        return _local_model
+    try:
+        from sentence_transformers import SentenceTransformer
+        model_name = settings.HF_EMBED_MODEL
+        logger.info("Loading local embedding model: %s ...", model_name)
+        _local_model = SentenceTransformer(model_name)
+        _local_model_loaded = True
+        logger.info("Local embedding model loaded successfully (dim=%d).", _local_model.get_sentence_embedding_dimension())
+        return _local_model
+    except Exception as exc:
+        logger.warning("Failed to load local embedding model: %s — will use API fallback.", exc)
+        _local_model_loaded = True  # Don't retry on every call
+        _local_model = None
+        return None
+class Embedder:
+    """Singleton embedding service with local-first, API-fallback strategy."""
+    def __init__(self):
+        self._mode = settings.EMBEDDING_MODE  # "local" or "api"
+        self._dim = settings.EMBEDDING_DIM
+        self._model = None
+        self._ready = False
+    # ── Lifecycle ────────────────────────────────────────────────────────────
+    def initialize(self):
+        """Load model on startup. Call from app lifespan."""
+        if self._mode == "local":
+            self._model = _load_local_model()
+            if self._model:
+                self._ready = True
+                self._dim = self._model.get_sentence_embedding_dimension()
+            else:
+                logger.warning("Local model unavailable, embedding will attempt API fallback.")
+        elif self._mode == "api":
+            if settings.HUGGINGFACE_API_KEY:
+                self._ready = True
+                logger.info("Embedding mode: HuggingFace Inference API.")
+            else:
+                logger.warning("API mode selected but HUGGINGFACE_API_KEY is not set.")
+        else:
+            logger.warning("Unknown EMBEDDING_MODE: '%s'. Disabling embeddings.", self._mode)
+    @property
+    def is_ready(self) -> bool:
+        return self._ready
+    @property
+    def dimension(self) -> int:
+        return self._dim
+    # ── Single Text ──────────────────────────────────────────────────────────
+    def embed_text(self, text: str) -> Optional[List[float]]:
+        """
+        Generate embedding vector for a single text string.
+        Returns None on failure (caller should fallback to TF-IDF).
+        """
+        if not text or not text.strip():
+            return None
+        # Try local model first
+        if self._model:
+            return self._embed_local(text)
+        # Fallback to API
+        if settings.HUGGINGFACE_API_KEY:
+            return self._embed_api(text)
+        return None
+    # ── Batch ────────────────────────────────────────────────────────────────
+    def embed_batch(self, texts: List[str]) -> List[Optional[List[float]]]:
+        """
+        Generate embeddings for a batch of texts.
+        Returns a list of vectors (None entries for failures).
+        """
+        if not texts:
+            return []
+        # Local batch (efficient — single forward pass)
+        if self._model:
+            return self._embed_local_batch(texts)
+        # API fallback (one-by-one, slow but functional)
+        return [self._embed_api(t) if t and t.strip() else None for t in texts]
+    # ── Private: Local ───────────────────────────────────────────────────────
+    def _embed_local(self, text: str) -> Optional[List[float]]:
+        try:
+            vector = self._model.encode(text, normalize_embeddings=True)
+            return vector.tolist()
+        except Exception as exc:
+            logger.warning("Local embedding failed for text: %s", exc)
+            # Try API fallback
+            if settings.HUGGINGFACE_API_KEY:
+                return self._embed_api(text)
+            return None
+    def _embed_local_batch(self, texts: List[str]) -> List[Optional[List[float]]]:
+        try:
+            vectors = self._model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
+            return [v.tolist() for v in vectors]
+        except Exception as exc:
+            logger.warning("Local batch embedding failed: %s — falling back to single.", exc)
+            return [self._embed_local(t) for t in texts]
+    # ── Private: API ─────────────────────────────────────────────────────────
+    def _embed_api(self, text: str) -> Optional[List[float]]:
+        """HuggingFace Inference API fallback."""
+        try:
+            model_name = settings.HF_EMBED_MODEL
+            url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_name}"
+            headers = {"Authorization": f"Bearer {settings.HUGGINGFACE_API_KEY}"}
+            with httpx.Client(timeout=20.0) as client:
+                response = client.post(url, headers=headers, json={"inputs": text})
+            response.raise_for_status()
+            data = response.json()
+            # Handle different response shapes
+            if data and isinstance(data[0], list):
+                if data[0] and isinstance(data[0][0], list):
+                    # data[0] is a list of per-token vectors: mean-pool them into one vector
+                    token_vectors = data[0]
+                    size = len(token_vectors[0])
+                    pooled = [0.0] * size
+                    for row in token_vectors:
+                        for i, val in enumerate(row):
+                            pooled[i] += float(val)
+                    return [v / len(token_vectors) for v in pooled]
+                return [float(v) for v in data[0]]
+            if isinstance(data, list):
+                return [float(v) for v in data]
+            return None
+        except Exception as exc:
+            logger.warning("API embedding failed: %s", exc)
+            return None
+# ── Singleton ────────────────────────────────────────────────────────────────
+embedder = Embedder()

app/services/rag/knowledge_base.py CHANGED Viewed

@@ -5,27 +5,54 @@ Contains detailed policies, rules, and facts about the Awn platform.
 KNOWLEDGE_BASE_DATA = [
     {
-        "title": "أنواع المساعدات ومعايير الأهلية",
-        "content": "تقدم منصة عون عدة أنواع من المساعدات: 1. المساعدة الغذائية: تستهدف الأسر التي لا يتجاوز دخلها الشهري 1500 جنيه. 2. الدعم الطبي: مخصص للحالات التي تعاني من أمراض مزمنة أو إعاقات تتطلب علاجاً مستمراً. 3. مساعدة التعليم: للأسر التي لديها أطفال في سن الدراسة (أقل من 18 عاماً) ودخلها الشهري أقل من 3000 جنيه. 4. دعم السكن: للحالات التي تعاني من ظروف سكن غير ملائمة وتتطلب تدخلاً عاجلاً. 5. الدعم المالي العام: للحالات التي لا تندرج تحت الفئات السابقة وتعاني من أزمات طارئة أو ديون تتجاوز ضعف الدخل الشهري."
     },
     {
-        "title": "سياسة التسجيل وأنواع الحسابات",
-        "content": "تتيح المنصة تسجيل ثلاثة أنواع من الحسابات: 1. حساب أسرة (مستفيد): لتقديم طلبات الدعم، ويتطلب إدخال بيانات دقيقة عن الدخل وعدد الأفراد والحالة الصحية وإرفاق المستندات الداعمة. 2. حساب مانح: للأفراد أو الجهات الراغبة في تقديم الدعم المالي أو العيني للحالات المعروضة. يمكن للمانح تصفح الحالات واختيار من يدعمه مباشرة. 3. حساب منظمة: للجمعيات الخيرية والمؤسسات المعتمدة التي تقوم بدور الوسيط للتحقق من صحة بيانات الأسر ومراجعة الطلبات والزيارات الميدانية قبل عرض الطلبات للمانحين."
     },
     {
-        "title": "دورة حياة طلب المساعدة",
-        "content": "يمر طلب المساعدة بالخطوات التالية: 1. تقديم الطلب: تقوم الأسرة بتعبئة النموذج وإرفاق المستندات. 2. التقييم المبدئي: يستخدم نظام الذكاء الاصطناعي لتحديد مستوى الاحتياج (عالي، متوسط، منخفض) وتصنيف نوع المساعدة بناءً على البيانات. 3. المراجعة والتحقق: تقوم منظمة معتمدة بمراجعة الطلب والمستندات وفي بعض الحالات إجراء زيارة ميدانية للتأكد من استحقاق الأسرة. 4. النشر: بعد الموافقة، يُعرض الطلب للمانحين على المنصة بهوية مجهلة لحماية كرامة الأسرة. 5. الدعم: يقوم المانح باختيار الطلب وتقديم الدعم المطلوب."
     },
     {
-        "title": "الوثائق والمستندات المطلوبة",
-        "content": "لضمان سرعة معالجة الطلبات، يجب إرفاق المستندات التالية إن وجدت: 1. إثبات الهوية (بطاقة الرقم القومي أو جواز السفر). 2. إثبات الدخل (مفردات مرتب، معاش، أو إقرار بعدم وجود دخل ثابت). 3. وثائق داعمة (عقد إيجار السكن، روشتات طبية أو تقارير تثبت الحالة الصحية، إيصالات مرافق، أو شهادات ميلاد الأطفال للمساعدة التعليمية). يقوم النظام بتحليل هذه المستندات آلياً لاستخراج البيانات وتقييم مستوى المخاطرة."
     },
     {
-        "title": "سياسة الخصوصية وأمان البيانات",
-        "content": "تلتزم منصة عون التزاماً كاملاً بحماية خصوصية المستخدمين: 1. يتم تشفير جميع البيانات الشخصية والمستندات المرفوعة. 2. عند عرض طلبات المساعدة للمانحين، يتم إخفاء هوية الأسرة واستخدام أسماء مستعارة، وتكتفي المنصة بعرض تفاصيل الاحتياج فقط. 3. لا يتم مشاركة البيانات الحساسة إلا مع المنظمات المعتمدة لأغراض التحقق والمراجعة. 4. المنصة لا تحتفظ بأي بيانات بطاقات ائتمانية ولا تعالج التحويلات المالية داخلياً."
     },
     {
-        "title": "آلية التواصل والدعم المباشر",
-        "content": "بمجرد أن يقرر المانح دعم حالة معينة، تقوم المنصة بتوفير قناة تواصل آمنة أو تزويد المانح بطريقة لدعم الأسرة مباشرة (حسب رغبتهم ووفقاً لسياسة المنظمة المشرفة). المنصة لا تتقاضى أي عمولات ولا تتدخل في عملية تسليم المساعدات بين المانح والمستفيد لضمان وصول الدعم بالكامل لمستحقيه."
     }
 ]

 KNOWLEDGE_BASE_DATA = [
     {
+        "title": "تعريف منصة عون وأهدافها الأساسية",
+        "content": "منصة عون هي جسر رقمي آمن يربط بين الأسر المحتاجة، المنظمات الخيرية، والمانحين لتقديم الدعم بطريقة شفافة وفعالة. تهدف المنصة إلى تسهيل إيصال المساعدات لمستحقيها لحفظ كرامتهم، من خلال عرض الحالات ببيانات مجهّلة (أسماء مستعارة). المنصة لا تقوم بأي عمليات تحويل مالي أو خصم عمولات داخلياً، بل تقتصر على كونها أداة ربط وتوثيق لتسهيل التكافل الاجتماعي والتواصل المباشر."
     },
     {
+        "title": "هوية ومهام المساعد الذكي في منصة عون",
+        "content": "المساعد الذكي لمنصة عون هو نظام ذكاء اصطناعي تم برمجته رسمياً لخدمة مستخدمي المنصة على مدار الساعة. مهامه تشمل: 1. الاستعلام عن حالة الطلبات برقم الطلب. 2. توضيح المستندات المطلوبة لكل مساعدة. 3. التنبؤ الآلي بأهلية الأسرة للدعم (بناءً على الدخل وعدد الأفراد). 4. توفير إحصائيات ملخصة للطلبات. 5. تمكين المستخدم من تحديث بياناته الأساسية. 6. معالجة طلبات الإلغاء للطلبات الحالية."
     },
     {
+        "title": "أنواع الحسابات المتاحة وطريقة التسجيل",
+        "content": "تتيح منصة عون 3 أنواع من الحسابات: 1. حساب أسرة (مستفيد): لتقديم طلبات الدعم، ويتطلب إدخال بيانات دقيقة عن الدخل والحالة الصحية. 2. حساب مانح: للأفراد أو الجهات الراغبة في رؤية طلبات المساعدة وتقديم الدعم المالي أو العيني. 3. حساب منظمة: لجهات المجتمع المدني التي تقوم بمراجعة وتدقيق الطلبات والقيام بزيارات ميدانية. للتسجيل، يتم الدخول لصفحة التسجيل، اختيار نوع الحساب المنشود، وإدخال البريد الإلكتروني وكلمة المرور وتأكيد الحساب."
     },
     {
+        "title": "خطوات الاستفادة وتقديم طلب المساعدة",
+        "content": "لتقديم طلب مساعدة يجب اتباع الخطوات التالية: 1. تسجيل الدخول بحساب أسرة. 2. التوجه إلى لوحة التحكم ثم الضغط على إنشاء طلب جديد عبر مسار الواجهة المخصص. 3. تحديد نوع الطلب وكتابة وصف دقيق وكامل للحالة والاحتياج. 4. رفع المستندات الداعمة وتفاصيل الدخل والصحة. 5. إرسال الطلب. يتم تقييم الطلبات وتُدرج تحت حالة قيد المراجعة حتى تتولاها منظمة."
     },
     {
+        "title": "أنواع المساعدات ومعايير الأهلية المحددة",
+        "content": "تقدم منصة عون 5 أنواع محددة من الدعم: 1. المساعدة الغذائية: للأسر التي لا يتعدى دخلها الشهري 1500 جنيه. 2. الدعم الطبي: مخصص لمرضى الحالات المزمنة والإعاقات والمشاكل الصحية الطارئة. 3. الدعم التعليمي: للأسر التي تعول أطفالاً في سن التعليم (أقل من 18 عاماً) ولديها دخل لا يجاوز 3000 جنيه. 4. دعم السكن: للحالات التي تعاني من طرد، أو سكن غير آمن. 5. مساعدة مالية عامة: للديون الطارئة والأساسيات المتأخرة المستعصية."
     },
     {
+        "title": "سياسة التبرعات والتحويلات المالية المباشرة",
+        "content": "تمنع سياسة منصة عون استلام أو إجراء أي تحويلات مالية أو تبرعات عبر النظام نفسه. دور المنصة يقتصر فقط على عملية العرض والربط. إذا قرر المانح دعم أسرة معينة، يتم توفير آلية تواصل آمنة خارج البوابة المالية للمنصة لكي يتم تحويل المبلغ مباشرة إلى حساب المستفيد أو لمنظمة وسيطة، وبذلك نضمن وصول مبلغ التبرع بالكامل لمستحقيه."
+    },
+    {
+        "title": "دورة حياة الطلب ومدة المراجعة المتوقعة",
+        "content": "دورة حياة الطلب: تقديم الطلب -> التقييم الآلي بالذكاء الاصطناعي -> المراجعة البشرية من المنظمات -> النشر للمانحين عبر أسماء مستعارة للحماية. مدة المراجعة ليست ثابتة بل تعتمد كلياً على مقدار اكتمال المستندات المرفوعة وشفافيتها. الطلبات المكتملة ذات التقارير الطبية وإثباتات الدخل الواضحة تخضع للقبول أسرع من غيرها."
+    },
+    {
+        "title": "الوثائق والمستندات الثبوتية الشائعة لجميع الطلبات",
+        "content": "لضمان اعتماد الطلب واستلام الدعم بسرعة، تنصح المنصة برفع: بطاقة الرقم القومي سارية، ومستند يثبت مستويات الدخل (كمفردات مرتب، معاش، أو إقرار بعدم العمل)، وإرفاق مستندات الحالة (مثل عقد إيجار موثق لدعم السكن، أو تقارير طبية معتمدة للدعم الطبي، أو بطاقات مدرسية وشهادات قيد لدعم التعليم). يقوم محرك OCR باستخراج النصوص آلياً من هذه الصور."
+    },
+    {
+        "title": "شروط أمان البيانات، التشفير، وحماية خصوصية المستخدمين",
+        "content": "المنصة مُلتزمة بحماية فائقة للخصوصية. يتم تشفير كافة البيانات الشخصية. كما أن الطلبات عندما تُنشر للمانحين تُخفى كل البيانات المباشرة مثل البطاقات الشخصية والأسماء الحقيقية بتعويضها بأسماء مستعارة، حيث يظهر فقط ملخص معتمد وموثوق للحالة. بالإضافة لاعتماد نظام حماية يمنع أي تسريب لتفاصيل سكن العوائل المستفيدة."
+    },
+    {
+        "title": "تجاوز مشاكل الدخول واستعادة كلمات المرور",
+        "content": "إذا فقد أحد المستخدمين القدرة للوصول لحسابه، يمكنه ببساطة استعادة كلمة المرور عبر الضغط على 'نسيت كلمة المرور' في صفحة تسجيل الدخول، ليرسل له النظام رابطاً آمناً عبر بريده الإلكتروني يتيح له تعيين رقم سري جديد."
+    },
+    {
+        "title": "التواصل مع الدعم الفني لحل المشاكل والإشعارات التقنية",
+        "content": "توفر منصة عون قنوات للدعم والمساعدة. يمكن الإبلاغ عن الأعطال والمشكلات من خلال قسم 'الدعم' أو بالتواصل مع المساعد الذكي، والذي في حالة استشعاره لمشكلة فنية قوية أو فشل، يقوم تلقائياً بإنشاء إنذار (Alert) وتصنيفه كـ مشكلة عاجلة للإسراع بحلها من فريق الدعم الهندسي للمنصة."
+    },
+    {
+        "title": "دور الذكاء الاصطناعي في التقييم المبدئي للحالات (AI Triage)",
+        "content": "تعتمد المنصة على نماذج Machine Learning لتقييم الاحتياج عند إرسال الطلب (عالي الأهمية، متوسط، منخفض)، ولتصنيف الحالة آلياً لاكتشاف الطوارئ (مثل الطرد من السكن). ورغم هذا التصنيف الذكي السريع، يظل القرار النهائي مرتبطاً بالمنظمة المراجعة."
     }
 ]
+class SEED_VERSION:
+    """Namespace holding the version tag of the seed knowledge base."""
+    # NOTE(review): in the visible RAGEngine.initialize this value is only
+    # written to a log message, and seeding is gated on the collection being
+    # empty; confirm that a version comparison exists elsewhere before relying
+    # on a bump to trigger re-seeding.
+    version = "1.1.0"  # Increment this to force re-seeding the Qdrant DB

app/services/rag/rag_engine.py CHANGED Viewed

@@ -1,124 +1,57 @@
 """
 Hybrid RAG Engine:
-- Primary: vector retrieval via Qdrant + HuggingFace embeddings
 - Fallback: local TF-IDF retrieval
 """
 import logging
-from typing import List, Optional
-import httpx
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from app.core.config import settings
-from app.services.rag.knowledge_base import KNOWLEDGE_BASE_DATA
 logger = logging.getLogger(__name__)
-try:
-    from qdrant_client import QdrantClient
-    from qdrant_client.models import Distance, PointStruct, VectorParams
-except Exception:  # pragma: no cover
-    QdrantClient = None
-    Distance = None
-    PointStruct = None
-    VectorParams = None
 class RAGEngine:
     def __init__(self):
-        self.collection_name = settings.QDRANT_COLLECTION_NAME
         self.documents = list(KNOWLEDGE_BASE_DATA)
         self.vectorizer = TfidfVectorizer()
         self.tfidf_matrix = None
         self._build_tfidf_fallback()
-        self.qdrant_client: Optional[QdrantClient] = None
-        self._vector_enabled = False
-        self._init_vector_backend()
     def _build_tfidf_fallback(self):
         docs = [f"{item['title']} {item['content']}" for item in self.documents]
         if not docs:
             self.tfidf_matrix = None
             return
         self.tfidf_matrix = self.vectorizer.fit_transform(docs)
-    def _init_vector_backend(self):
-        if not QdrantClient or not settings.QDRANT_URL or not settings.HUGGINGFACE_API_KEY:
-            logger.info("Vector RAG disabled, fallback TF-IDF is active.")
-            return
-        try:
-            self.qdrant_client = QdrantClient(
-                url=settings.QDRANT_URL,
-                api_key=settings.QDRANT_API_KEY or None,
-                timeout=8.0,
-            )
-            try:
-                self.qdrant_client.get_collection(self.collection_name)
-            except Exception:
-                self.qdrant_client.create_collection(
-                    collection_name=self.collection_name,
-                    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
-                )
-                self._seed_qdrant()
-            self._vector_enabled = True
-            logger.info("Vector RAG enabled with collection '%s'.", self.collection_name)
-        except Exception as exc:
-            logger.warning("Vector RAG init failed: %s. TF-IDF fallback stays active.", exc)
-            self._vector_enabled = False
-    def _embed(self, text: str) -> Optional[list[float]]:
-        try:
-            model_name = settings.HF_EMBED_MODEL
-            url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_name}"
-            headers = {"Authorization": f"Bearer {settings.HUGGINGFACE_API_KEY}"}
-            with httpx.Client(timeout=20.0) as client:
-                response = client.post(url, headers=headers, json={"inputs": text})
-            response.raise_for_status()
-            data = response.json()
-            if data and isinstance(data[0], list):
-                # average token vectors if 2D response
-                if data and data[0] and isinstance(data[0][0], list):
-                    token_vectors = data[0]
-                    size = len(token_vectors[0])
-                    pooled = [0.0] * size
-                    for row in token_vectors:
-                        for i, val in enumerate(row):
-                            pooled[i] += float(val)
-                    return [v / len(token_vectors) for v in pooled]
-                return [float(v) for v in data[0]]
-            if isinstance(data, list):
-                return [float(v) for v in data]
-            return None
-        except Exception as exc:
-            logger.warning("Embedding failed, fallback to TF-IDF: %s", exc)
-            return None
-    def _seed_qdrant(self):
-        if not self.qdrant_client:
             return
-        points = []
-        for idx, doc in enumerate(self.documents):
-            emb = self._embed(f"{doc['title']} {doc['content']}")
-            if not emb:
-                continue
-            points.append(
-                PointStruct(
-                    id=idx,
-                    vector=emb,
-                    payload={"title": doc["title"], "content": doc["content"]},
-                )
-            )
-        if points:
-            self.qdrant_client.upsert(collection_name=self.collection_name, points=points)
-    def _search_tfidf(self, query: str, top_k: int = 3, threshold: float = 0.15) -> List[dict]:
         if self.tfidf_matrix is None or not query.strip():
             return []
         query_vec = self.vectorizer.transform([query])
         similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
         top_indices = similarities.argsort()[-top_k:][::-1]
         results = []
         for idx in top_indices:
             score = float(similarities[idx])
@@ -128,64 +61,69 @@ class RAGEngine:
                         "score": score,
                         "title": self.documents[idx]["title"],
                         "content": self.documents[idx]["content"],
                     }
                 )
         return results
-    def search(self, query: str, top_k: int = 3, threshold: float = 0.20) -> List[dict]:
         if not query.strip():
             return []
-        if self._vector_enabled and self.qdrant_client:
-            embedding = self._embed(query)
-            if embedding:
-                try:
-                    points = self.qdrant_client.search(
-                        collection_name=self.collection_name,
-                        query_vector=embedding,
-                        limit=top_k,
-                        score_threshold=threshold,
-                    )
-                    return [
-                        {
-                            "score": float(p.score),
-                            "title": p.payload.get("title", ""),
-                            "content": p.payload.get("content", ""),
-                        }
-                        for p in points
-                    ]
-                except Exception as exc:
-                    logger.warning("Vector search failed, using TF-IDF fallback: %s", exc)
         return self._search_tfidf(query, top_k=top_k, threshold=0.12)
     def get_context(self, query: str, top_k: int = 3) -> str:
         results = self.search(query, top_k=top_k)
         if not results:
             return ""
         return "\n\n".join([f"[{r['title']}]: {r['content']}" for r in results])
-    def add_document(self, title: str, content: str) -> dict:
         doc = {"title": title.strip(), "content": content.strip()}
         self.documents.append(doc)
         self._build_tfidf_fallback()
-        if self._vector_enabled and self.qdrant_client:
-            emb = self._embed(f"{doc['title']} {doc['content']}")
-            if emb:
-                self.qdrant_client.upsert(
-                    collection_name=self.collection_name,
-                    points=[
-                        PointStruct(
-                            id=len(self.documents),
-                            vector=emb,
-                            payload=doc,
-                        )
-                    ],
-                )
-        return {"added": True, "count": len(self.documents)}
     def refresh_index(self):
         self._build_tfidf_fallback()
-        if self._vector_enabled and self.qdrant_client:
-            self._seed_qdrant()
 rag_engine = RAGEngine()

 """
 Hybrid RAG Engine:
+- Primary: vector retrieval via Qdrant (managed by vector_store)
 - Fallback: local TF-IDF retrieval
 """
 import logging
+from typing import Dict, List, Any
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from app.services.rag.knowledge_base import KNOWLEDGE_BASE_DATA, SEED_VERSION
+from app.services.rag.vector_store import vector_store
 logger = logging.getLogger(__name__)
 class RAGEngine:
     def __init__(self):
         self.documents = list(KNOWLEDGE_BASE_DATA)
         self.vectorizer = TfidfVectorizer()
         self.tfidf_matrix = None
         self._build_tfidf_fallback()
     def _build_tfidf_fallback(self):
+        """Build the in-memory TF-IDF matrix from self.documents."""
         docs = [f"{item['title']} {item['content']}" for item in self.documents]
         if not docs:
             self.tfidf_matrix = None
             return
         self.tfidf_matrix = self.vectorizer.fit_transform(docs)
+    def initialize(self):
+        """Called at app startup. Seeds Qdrant if empty."""
+        if not vector_store.is_connected:
+            logger.info("Vector store not connected. RAG will use TF-IDF fallback only.")
             return
+        # Check if we need to seed
+        if vector_store.collection_is_empty():
+            logger.info("Qdrant collection is empty. Seeding from knowledge base (version %s)...", getattr(SEED_VERSION, "version", "1"))
+            count = vector_store.upsert_documents_batch(self.documents, source="seed")
+            logger.info("Seeded %d documents into vector store.", count)
+    def _search_tfidf(self, query: str, top_k: int = 3, threshold: float = 0.15) -> List[Dict[str, Any]]:
+        """Fallback keyword search."""
         if self.tfidf_matrix is None or not query.strip():
             return []
         query_vec = self.vectorizer.transform([query])
         similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
         top_indices = similarities.argsort()[-top_k:][::-1]
         results = []
         for idx in top_indices:
             score = float(similarities[idx])
                         "score": score,
                         "title": self.documents[idx]["title"],
                         "content": self.documents[idx]["content"],
+                        "id": str(idx),  # TF-IDF uses array index as ID
                     }
                 )
         return results
+    def search(self, query: str, top_k: int = 3, threshold: float = 0.20) -> List[Dict[str, Any]]:
+        """Hybrid search: Prefer vector search, fallback to TF-IDF."""
         if not query.strip():
             return []
+        if vector_store.is_connected:
+            results = vector_store.search(query, top_k=top_k, threshold=threshold)
+            if results:
+                return results
+            logger.debug("Vector search returned 0 results. Trying TF-IDF fallback.")
+        # Fallback
         return self._search_tfidf(query, top_k=top_k, threshold=0.12)
     def get_context(self, query: str, top_k: int = 3) -> str:
+        """Get formatted context string for LLM injection."""
         results = self.search(query, top_k=top_k)
         if not results:
             return ""
         return "\n\n".join([f"[{r['title']}]: {r['content']}" for r in results])
+    def add_document(self, title: str, content: str) -> Dict[str, Any]:
+        """Add a document to both vector store and local fallback block."""
         doc = {"title": title.strip(), "content": content.strip()}
+        # 1. Update fallback
         self.documents.append(doc)
         self._build_tfidf_fallback()
+        # 2. Update vector store
+        doc_id = None
+        if vector_store.is_connected:
+            doc_id = vector_store.upsert_document(title=doc["title"], content=doc["content"])
+        return {
+            "added": True,
+            "id": doc_id,
+            "vector_store": bool(doc_id),
+            "fallback_count": len(self.documents)
+        }
+    def delete_document(self, doc_id: str) -> bool:
+        """Delete from vector store. (Note: difficult to delete from TF-IDF list without ID tracking)"""
+        if vector_store.is_connected:
+            return vector_store.delete_document(doc_id)
+        return False
+    def list_documents(self, limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]:
+        """List documents from vector store."""
+        if vector_store.is_connected:
+            return vector_store.list_documents(limit=limit, offset=offset)
+        # Fallback to local list (basic)
+        return [{"id": str(i), "title": d["title"], "content": d["content"]} for i, d in enumerate(self.documents[offset:offset+limit])]
     def refresh_index(self):
+        """Rebuild the TF-IDF fallback, then run the startup seeding check again.
+        The vector store is only re-seeded by ``initialize`` when it is connected
+        and its collection is empty.
+        """
         self._build_tfidf_fallback()
+        self.initialize()
 rag_engine = RAGEngine()

app/services/rag/vector_store.py ADDED Viewed

	@@ -0,0 +1,310 @@

+"""
+Vector Store — Qdrant-backed persistent vector storage for the RAG knowledge base.
+Manages the full lifecycle: connection, collection creation, document CRUD, and search.
+Designed to be initialized during app startup and shared across requests.
+"""
+import logging
+import uuid
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+from app.core.config import settings
+from app.services.rag.embedder import embedder
+logger = logging.getLogger(__name__)
+# ── Lazy imports (graceful degradation if qdrant-client not installed) ────────
+try:
+    from qdrant_client import QdrantClient
+    from qdrant_client.models import (
+        Distance,
+        FieldCondition,
+        Filter,
+        MatchValue,
+        PointStruct,
+        VectorParams,
+    )
+    QDRANT_AVAILABLE = True
+except ImportError:
+    QDRANT_AVAILABLE = False
+    QdrantClient = None
+    logger.warning("qdrant-client not installed — vector store disabled.")
+class VectorStore:
+    """Persistent vector storage backed by Qdrant."""
+    def __init__(self):
+        self.client: Optional[Any] = None
+        self.collection_name = settings.QDRANT_COLLECTION_NAME
+        self._connected = False
+    # ── Lifecycle ────────────────────────────────────────────────────────────
+    def connect(self) -> bool:
+        """
+        Establish connection to Qdrant and ensure the collection exists.
+        Returns True if connected, False otherwise.
+        """
+        if not QDRANT_AVAILABLE:
+            logger.warning("qdrant-client not available — vector store disabled.")
+            return False
+        if not settings.QDRANT_URL:
+            logger.info("QDRANT_URL not configured — vector store disabled.")
+            return False
+        try:
+            self.client = QdrantClient(
+                url=settings.QDRANT_URL,
+                api_key=settings.QDRANT_API_KEY or None,
+                timeout=10.0,
+            )
+            # Verify connectivity
+            self.client.get_collections()
+            self._connected = True
+            logger.info("Connected to Qdrant at %s", settings.QDRANT_URL)
+            # Ensure collection exists
+            self._ensure_collection()
+            return True
+        except Exception as exc:
+            logger.warning("Failed to connect to Qdrant: %s — vector store disabled.", exc)
+            self.client = None
+            self._connected = False
+            return False
+    def disconnect(self):
+        """Clean up Qdrant client on shutdown."""
+        if self.client:
+            try:
+                self.client.close()
+            except Exception:
+                pass
+            self.client = None
+            self._connected = False
+            logger.info("Disconnected from Qdrant.")
+    @property
+    def is_connected(self) -> bool:
+        return self._connected and self.client is not None
+    # ── Collection Management ────────────────────────────────────────────────
+    def _ensure_collection(self):
+        """Create collection if it doesn't exist."""
+        try:
+            self.client.get_collection(self.collection_name)
+            logger.info("Qdrant collection '%s' already exists.", self.collection_name)
+        except Exception:
+            dim = embedder.dimension
+            self.client.create_collection(
+                collection_name=self.collection_name,
+                vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
+            )
+            logger.info("Created Qdrant collection '%s' (dim=%d, cosine).", self.collection_name, dim)
+    def get_collection_info(self) -> Dict[str, Any]:
+        """Return collection statistics for health/admin endpoints."""
+        if not self.is_connected:
+            return {"status": "disconnected"}
+        try:
+            info = self.client.get_collection(self.collection_name)
+            return {
+                "status": "connected",
+                "collection": self.collection_name,
+                "vectors_count": info.vectors_count,
+                "points_count": info.points_count,
+                "segments_count": len(info.segments) if hasattr(info, "segments") else None,
+                "disk_data_size": getattr(info, "disk_data_size", None),
+            }
+        except Exception as exc:
+            return {"status": "error", "detail": str(exc)}
+    def collection_is_empty(self) -> bool:
+        """Check whether the collection has zero points."""
+        if not self.is_connected:
+            return True
+        try:
+            info = self.client.get_collection(self.collection_name)
+            return (info.points_count or 0) == 0
+        except Exception:
+            return True
+    # ── Document CRUD ────────────────────────────────────────────────────────
+    def upsert_document(
+        self,
+        title: str,
+        content: str,
+        doc_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> Optional[str]:
+        """
+        Insert or update a single document.
+        Returns the document ID (UUID string) on success, None on failure.
+        """
+        if not self.is_connected:
+            return None
+        text = f"{title} {content}"
+        vector = embedder.embed_text(text)
+        if not vector:
+            logger.warning("Failed to embed document: %s", title[:50])
+            return None
+        point_id = doc_id or str(uuid.uuid4())
+        payload = {
+            "title": title.strip(),
+            "content": content.strip(),
+            "created_at": datetime.now(timezone.utc).isoformat(),
+            "source": "api",
+            **(metadata or {}),
+        }
+        try:
+            self.client.upsert(
+                collection_name=self.collection_name,
+                points=[PointStruct(id=point_id, vector=vector, payload=payload)],
+            )
+            return point_id
+        except Exception as exc:
+            logger.error("Failed to upsert document '%s': %s", title[:50], exc)
+            return None
+    def upsert_documents_batch(
+        self,
+        documents: List[Dict[str, str]],
+        source: str = "seed",
+    ) -> int:
+        """
+        Batch insert documents. Each dict must have 'title' and 'content'.
+        Returns the number of successfully inserted documents.
+        """
+        if not self.is_connected or not documents:
+            return 0
+        texts = [f"{d['title']} {d['content']}" for d in documents]
+        vectors = embedder.embed_batch(texts)
+        points = []
+        for doc, vec in zip(documents, vectors):
+            if vec is None:
+                continue
+            point_id = doc.get("id") or str(uuid.uuid4())
+            points.append(
+                PointStruct(
+                    id=point_id,
+                    vector=vec,
+                    payload={
+                        "title": doc["title"].strip(),
+                        "content": doc["content"].strip(),
+                        "created_at": datetime.now(timezone.utc).isoformat(),
+                        "source": source,
+                    },
+                )
+            )
+        if not points:
+            return 0
+        try:
+            # Qdrant supports batches up to 100 by default
+            batch_size = 64
+            for i in range(0, len(points), batch_size):
+                self.client.upsert(
+                    collection_name=self.collection_name,
+                    points=points[i : i + batch_size],
+                )
+            logger.info("Batch upserted %d documents into Qdrant.", len(points))
+            return len(points)
+        except Exception as exc:
+            logger.error("Batch upsert failed: %s", exc)
+            return 0
+    def delete_document(self, doc_id: str) -> bool:
+        """Delete a document by its ID. Returns True on success."""
+        if not self.is_connected:
+            return False
+        try:
+            self.client.delete(
+                collection_name=self.collection_name,
+                points_selector=[doc_id],
+            )
+            return True
+        except Exception as exc:
+            logger.error("Failed to delete document '%s': %s", doc_id, exc)
+            return False
+    def list_documents(self, limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]:
+        """List documents in the collection (paginated via scroll)."""
+        if not self.is_connected:
+            return []
+        try:
+            results, _next = self.client.scroll(
+                collection_name=self.collection_name,
+                limit=limit,
+                offset=offset if offset else None,
+                with_payload=True,
+                with_vectors=False,
+            )
+            return [
+                {
+                    "id": str(point.id),
+                    "title": point.payload.get("title", ""),
+                    "content": point.payload.get("content", "")[:200],
+                    "source": point.payload.get("source", ""),
+                    "created_at": point.payload.get("created_at", ""),
+                }
+                for point in results
+            ]
+        except Exception as exc:
+            logger.error("Failed to list documents: %s", exc)
+            return []
+    # ── Search ───────────────────────────────────────────────────────────────
+    def search(
+        self,
+        query: str,
+        top_k: int = 3,
+        threshold: float = 0.20,
+    ) -> List[Dict[str, Any]]:
+        """
+        Semantic search: embed the query and find nearest documents.
+        Returns list of dicts with score, title, content.
+        """
+        if not self.is_connected:
+            return []
+        vector = embedder.embed_text(query)
+        if not vector:
+            return []
+        try:
+            points = self.client.search(
+                collection_name=self.collection_name,
+                query_vector=vector,
+                limit=top_k,
+                score_threshold=threshold,
+            )
+            return [
+                {
+                    "score": float(p.score),
+                    "title": p.payload.get("title", ""),
+                    "content": p.payload.get("content", ""),
+                    "id": str(p.id),
+                }
+                for p in points
+            ]
+        except Exception as exc:
+            logger.warning("Vector search failed: %s", exc)
+            return []
+# ── Singleton ────────────────────────────────────────────────────────────────
+vector_store = VectorStore()

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,24 @@

+services:
+  qdrant:
+    image: qdrant/qdrant:latest
+    ports:
+      - "6333:6333"
+    volumes:
+      - qdrant_data:/qdrant/storage
+    restart: unless-stopped
+  app:
+    build: .
+    ports:
+      - "7860:7860"
+    depends_on:
+      - qdrant
+    env_file:
+      - ../.env
+    environment:
+      # Override to use Docker network name instead of localhost
+      - QDRANT_URL=http://qdrant:6333
+    restart: unless-stopped
+volumes:
+  qdrant_data:

main.py CHANGED Viewed

@@ -59,6 +59,20 @@ async def lifespan(app: FastAPI):
         app.state.models = await asyncio.to_thread(get_or_train_models, False)
         logger.info("Successfully loaded/trained ML models.")
         # Start background model revalidation task
         asyncio.create_task(periodic_model_revalidation())
@@ -71,6 +85,11 @@ async def lifespan(app: FastAPI):
     yield
     app.state.models.clear()
     await redis_client.disconnect()
     logger.info("Shutting down %s...", settings.PROJECT_NAME)

         app.state.models = await asyncio.to_thread(get_or_train_models, False)
         logger.info("Successfully loaded/trained ML models.")
+        # Initialize RAG vector store and embeddings
+        from app.services.rag.embedder import embedder
+        from app.services.rag.vector_store import vector_store
+        from app.services.rag.rag_engine import rag_engine
+        logger.info("Initializing embedding service...")
+        embedder.initialize()
+        logger.info("Connecting to vector store...")
+        vector_store.connect()
+        logger.info("Initializing RAG engine...")
+        rag_engine.initialize()
         # Start background model revalidation task
         asyncio.create_task(periodic_model_revalidation())
     yield
     app.state.models.clear()
     await redis_client.disconnect()
+    # Graceful vector store disconnect
+    from app.services.rag.vector_store import vector_store
+    vector_store.disconnect()
     logger.info("Shutting down %s...", settings.PROJECT_NAME)

requirements.txt CHANGED Viewed

@@ -19,6 +19,7 @@ huggingface_hub>=0.24.0,<1.0.0
 cachetools>=5.5.0,<6.0.0
 shap==0.45.0
 qdrant-client>=1.11.0,<2.0.0
 upstash-redis>=1.1.0,<2.0.0
 presidio-analyzer>=2.2.0,<3.0.0
 presidio-anonymizer>=2.2.0,<3.0.0

 cachetools>=5.5.0,<6.0.0
 shap==0.45.0
 qdrant-client>=1.11.0,<2.0.0
+sentence-transformers>=3.0.0,<4.0.0
 upstash-redis>=1.1.0,<2.0.0
 presidio-analyzer>=2.2.0,<3.0.0
 presidio-anonymizer>=2.2.0,<3.0.0

verify_rag.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import os
+import sys
+# Add this script's own directory to sys.path so the absolute "app.*" imports below resolve
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__))))
+import asyncio
+from app.core.config import settings
+from app.services.rag.embedder import embedder
+from app.services.rag.vector_store import vector_store
+from app.services.rag.rag_engine import rag_engine
+async def main():
+    print("Initializing embedder...")
+    embedder.initialize()
+    print(f"Embedder mode: {embedder.is_ready}, dimension: {embedder.dimension}")
+    vec = embedder.embed_text("مرحبا بك في منصة عون")
+    if vec:
+        print(f"Embedding successful: Length {len(vec)}, first elements: {vec[:3]}")
+    else:
+        print("Embedding failed!")
+    print("\nConnecting to Qdrant (Make sure Docker is running!)...")
+    # This might fail if Docker Qdrant is not up, but it shouldn't crash the app.
+    success = vector_store.connect()
+    print(f"Qdrant connection: {success}")
+    if success:
+        print(vector_store.get_collection_info())
+    print("\nInitializing RAG Engine...")
+    rag_engine.initialize()
+    print(f"TF-IDF Matrix Shape: {rag_engine.tfidf_matrix.shape if rag_engine.tfidf_matrix is not None else 'None'}")
+    print("\nTesting Hybrid Search:")
+    res = rag_engine.search("كيف يتم دعم الأسر؟")
+    for r in res:
+         print(f"- [Score: {r['score']:.4f}]: {r['title']}")
+# Script entry point: run the async main() to completion when executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())