Spaces:

wessamelden
/

chatbot

Running

App Files Files Community

wessamelden committed 12 days ago

Commit

4c09ca7

1 Parent(s): f2c87c6

add API key protection

Browse files

Files changed (19) hide show

.env.example +14 -4
README.md +0 -31
app/__init__.py +0 -0
app/api/auth.py +0 -16
app/api/routes_chat.py +2 -1
app/api/routes_health.py +23 -9
app/core/config.py +12 -2
app/llm/ollama_client.py +1 -24
app/memory/conversation_memory.py +4 -15
app/pipeline/chat_pipeline.py +6 -14
app/pipeline/context_builder.py +1 -13
app/pipeline/prompt_builder.py +0 -8
app/pipeline/query_handler.py +16 -28
app/retrieval/__init__.py +1 -1
app/retrieval/retriever.py +8 -31
main.py +5 -31
requirements.txt +13 -8
setup.sh +27 -24
test_reranker.py +0 -28

.env.example CHANGED Viewed

@@ -1,7 +1,9 @@
-# ── Groq LLM ─────────────────────────────────────────────────────────────────
-# احصل على مفتاح مجاني من: https://console.groq.com/keys
 GROQ_API_KEY=gsk_your_groq_api_key_here
-# النماذج المتاحة: llama-3.3-70b-versatile, llama-3.1-8b-instant, mixtral-8x7b-32768
 GROQ_MODEL=llama-3.3-70b-versatile
 # ── الاسترجاع ─────────────────────────────────────────────────────────────────
@@ -11,6 +13,12 @@ CHROMA_COLLECTION=rag_docs
 TOP_K=8
 MAX_CONTEXT_CHARS=10000
 # ── ذاكرة المحادثة ────────────────────────────────────────────────────────────
 MAX_TURNS=6
 MAX_SESSIONS=200
@@ -22,4 +30,6 @@ CHUNK_SIZE=1600
 CHUNK_OVERLAP=200
 # ── الشبكة ─────────────────────────────────────────────────────────────────────
-ALLOWED_ORIGINS=*

+# ── نموذج اللغة ───────────────────────────────────────────────────────────────
+OLLAMA_MODEL=gemma3
+OLLAMA_URL=http://127.0.0.1:11434/api/chat
+# ── Groq LLM (Reranker & Chat) ────────────────────────────────────────────────
 GROQ_API_KEY=gsk_your_groq_api_key_here
 GROQ_MODEL=llama-3.3-70b-versatile
 # ── الاسترجاع ─────────────────────────────────────────────────────────────────
 TOP_K=8
 MAX_CONTEXT_CHARS=10000
+# ── Reranker (HuggingFace) ─────────────────────────────────────────────────────
+# احصل على token مجاني من: https://huggingface.co/settings/tokens
+# HF_API_TOKEN=hf_your_token_here
+# RERANKER_MODEL=Qwen/Qwen3-Reranker-0.6B
+# RERANKER_CONCURRENCY=4
 # ── ذاكرة المحادثة ────────────────────────────────────────────────────────────
 MAX_TURNS=6
 MAX_SESSIONS=200
 CHUNK_OVERLAP=200
 # ── الشبكة ─────────────────────────────────────────────────────────────────────
+# في التطوير: *
+# في الإنتاج: ضع عنوان تطبيق Flutter أو رابط الخادم
+ALLOWED_ORIGINS=*

README.md DELETED Viewed

@@ -1,31 +0,0 @@
----
-title: ASU RAG Chatbot
-emoji: 🎓
-colorFrom: blue
-colorTo: green
-sdk: docker
-app_port: 7860
-pinned: false
----
-# ASU RAG Chatbot
-مساعد أكاديمي ذكي لطلاب كلية العلوم - جامعة عين شمس
-## Setup
-Set the following secrets in your HuggingFace Space settings:
-| Secret | Description |
-|--------|-------------|
-| `GROQ_API_KEY` | Get free at [console.groq.com](https://console.groq.com/keys) |
-| `HF_API_TOKEN` | Get free at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) |
-## API
-| Endpoint | Description |
-|----------|-------------|
-| `POST /chat` | Streaming chat (SSE) |
-| `GET /health` | Health check |
-| `POST /retrieve` | Debug: raw retrieval results |
-| `DELETE /session/{id}` | Clear conversation history |

app/__init__.py DELETED Viewed

File without changes

app/api/auth.py DELETED Viewed

@@ -1,16 +0,0 @@
-from fastapi import Header, HTTPException, status
-from app.core.config import settings
-async def require_api_key(x_api_key: str = Header(default="")):
-    """
-    تحقق من مفتاح الـ API في header كل طلب.
-    إذا كان API_SECRET_KEY فارغاً في الـ env → الحماية معطّلة (للتطوير).
-    """
-    if not settings.api_secret_key:
-        return  # no key configured → open (local dev only)
-    if x_api_key != settings.api_secret_key:
-        raise HTTPException(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Invalid or missing API key",
-        )

app/api/routes_chat.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import uuid
 import json
 import time
-from fastapi import APIRouter, HTTPException
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 from app.memory import memory
 from app.pipeline import chat_pipeline
 from app.llm.groq_client import stream_response
 from app.core.logging_setup import get_logger

 import uuid
 import json
 import time
+from fastapi import APIRouter
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 from app.memory import memory
 from app.pipeline import chat_pipeline
+# from app.llm.ollama_client import stream_response
 from app.llm.groq_client import stream_response
 from app.core.logging_setup import get_logger

app/api/routes_health.py CHANGED Viewed

@@ -1,13 +1,6 @@
-"""
-app/api/routes_health.py
-=========================
-Endpoints للمراقبة والتشخيص.
-/health   → هل الـ server يعمل؟ هل Groq متصل؟
-/retrieve → أداة debug للتحقق من جودة الاسترجاع
-"""
 from groq import AsyncGroq
 from fastapi import APIRouter
 from pydantic import BaseModel
@@ -18,12 +11,30 @@ from app.memory import memory
 router = APIRouter()
 _groq_client = AsyncGroq(api_key=settings.groq_api_key)
 @router.get("/health")
 async def health():
     retriever = get_retriever()
     groq_ok = False
     try:
         await _groq_client.chat.completions.create(
@@ -36,6 +47,9 @@ async def health():
         pass
     return {
         "status":              "ok" if groq_ok else "error",
         "groq_connected":      groq_ok,
         "model":               settings.groq_model,
@@ -56,4 +70,4 @@ async def retrieve(req: RetrieveRequest):
     retriever = get_retriever()
     k = req.top_k or settings.top_k
     chunks = retriever.search(req.question, top_k=k)
-    return {"chunks": chunks}

 from groq import AsyncGroq
+# import httpx
 from fastapi import APIRouter
 from pydantic import BaseModel
 router = APIRouter()
+# _OLLAMA_BASE_URL = settings.ollama_url.split("/api/")[0]
 _groq_client = AsyncGroq(api_key=settings.groq_api_key)
 @router.get("/health")
 async def health():
+    """
+    تحقق من حالة النظام.
+    يُعيد:
+        status: "ok" أو "error"
+        ollama_connected: هل Ollama يستجيب؟
+        chunks_indexed: عدد chunks في قاعدة البيانات
+        sessions_active: عدد الجلسات النشطة في الذاكرة
+    """
     retriever = get_retriever()
+    # ollama_ok = False
+    # try:
+    #     async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client:
+    #         r = await client.get(f"{_OLLAMA_BASE_URL}/api/tags")
+    #         ollama_ok = r.status_code == 200
+    # except (httpx.HTTPError, OSError):
+    #     pass
     groq_ok = False
     try:
         await _groq_client.chat.completions.create(
         pass
     return {
+        # "status":           "ok" if ollama_ok else "error",
+        # "ollama_connected": ollama_ok,
+        # "model":            settings.ollama_model,
         "status":              "ok" if groq_ok else "error",
         "groq_connected":      groq_ok,
         "model":               settings.groq_model,
     retriever = get_retriever()
     k = req.top_k or settings.top_k
     chunks = retriever.search(req.question, top_k=k)
+    return {"chunks": chunks}

app/core/config.py CHANGED Viewed

@@ -5,18 +5,26 @@ load_dotenv()
 class Settings:
-    # ── نموذج اللغة (Groq) ───────────────────────────────────────────────────
     groq_api_key: str = os.getenv("GROQ_API_KEY", "")
     groq_model: str   = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
     # ── الاسترجاع والتضمين ───────────────────────────────────────────────────
-    # نموذج التضمين — يجب أن يكون نفسه في الاستيعاب والاسترجاع دائماً
     embed_model: str      = os.getenv("EMBED_MODEL", "paraphrase-multilingual-mpnet-base-v2")
     chroma_path: str      = os.getenv("CHROMA_PATH", "vectorstore")
     chroma_collection: str = os.getenv("CHROMA_COLLECTION", "rag_docs")
     top_k: int            = min(int(os.getenv("TOP_K", "8")), 8)
     max_context_chars: int = int(os.getenv("MAX_CONTEXT_CHARS", "10000"))
     # ── ذاكرة المحادثة ────────────────────────────────────────────────────────
     max_turns: int    = int(os.getenv("MAX_TURNS", "6"))
     max_sessions: int = int(os.getenv("MAX_SESSIONS", "200"))
@@ -36,6 +44,8 @@ class Settings:
     ]
     # ── Timeouts ─────────────────────────────────────────────────────────────
     keepalive_interval: int = 20  # ثانية — heartbeat للـ SSE

 class Settings:
+    # ── نموذج اللغة (Ollama) ──────────────────────────────────────────────────
+    ollama_model: str = os.getenv("OLLAMA_MODEL", "gemma3")
+    ollama_url: str   = os.getenv("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
+    # ── نموذج اللغة (Groq) ──────────────────────────────────────────────────
     groq_api_key: str = os.getenv("GROQ_API_KEY", "")
     groq_model: str   = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
     # ── الاسترجاع والتضمين ───────────────────────────────────────────────────
     embed_model: str      = os.getenv("EMBED_MODEL", "paraphrase-multilingual-mpnet-base-v2")
     chroma_path: str      = os.getenv("CHROMA_PATH", "vectorstore")
     chroma_collection: str = os.getenv("CHROMA_COLLECTION", "rag_docs")
     top_k: int            = min(int(os.getenv("TOP_K", "8")), 8)
     max_context_chars: int = int(os.getenv("MAX_CONTEXT_CHARS", "10000"))
+    # ── Reranker (HuggingFace) ────────────────────────────────────────────────
+    #hf_api_token: str       = os.getenv("HF_API_TOKEN", "")
+    #reranker_model: str     = os.getenv("RERANKER_MODEL", "Qwen/Qwen3-Reranker-0.6B")
+    #reranker_concurrency: int = int(os.getenv("RERANKER_CONCURRENCY", "4"))
     # ── ذاكرة المحادثة ────────────────────────────────────────────────────────
     max_turns: int    = int(os.getenv("MAX_TURNS", "6"))
     max_sessions: int = int(os.getenv("MAX_SESSIONS", "200"))
     ]
     # ── Timeouts ─────────────────────────────────────────────────────────────
+    # None = لا timeout (الاستنتاج على CPU قد يأخذ 200+ ثانية)
+    ollama_timeout = None
     keepalive_interval: int = 20  # ثانية — heartbeat للـ SSE

app/llm/ollama_client.py CHANGED Viewed

@@ -16,25 +16,7 @@ async def stream_response(
     session_id: str,
     original_question: str,
 ):
-    """
-    بث الرد من Ollama مع حفظ المحادثة في الذاكرة.
-    كيف يعمل الـ Streaming؟
-    ─────────────────────────
-    بدلاً من الانتظار حتى تكتمل الإجابة كاملاً (قد يستغرق 200 ثانية)،
-    نفتح اتصالاً مستمراً ونُرسل كل كلمة فور إنتاجها.
-    المستخدم يرى الرد يظهر تدريجياً كأن شخصاً يكتب.
-    Heartbeat (نبضة القلب):
-    ─────────────────────────
-    إذا لم يأتِ توكن لـ 20 ثانية، نُرسل مسافة صغيرة (zero-width space).
-    هذا يمنع المتصفح أو الشبكة من اعتبار الاتصال "مات" وقطعه.
-    الحفظ في الذاكرة:
-    ─────────────────────────
-    عند اكتمال الإجابة، نحفظ السؤال والإجابة في ConversationMemory
-    حتى يعمل الـ follow-up في الرسائل التالية.
-    """
     full_answer      = ""
     stream_completed = False
     logger.info("بدء توليد الإجابة | session=%s", session_id)
@@ -120,12 +102,7 @@ async def stream_response(
 async def warmup_model() -> bool:
-    """
-    حمّل النموذج في ذاكرة Ollama عند بدء التطبيق.
-    يمنع التأخير الكبير في أول طلب.
-    يُعاد True إذا نجح، False إذا فشل.
-    """
     try:
         async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
             await client.post(

     session_id: str,
     original_question: str,
 ):
     full_answer      = ""
     stream_completed = False
     logger.info("بدء توليد الإجابة | session=%s", session_id)
 async def warmup_model() -> bool:
     try:
         async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
             await client.post(

app/memory/conversation_memory.py CHANGED Viewed

@@ -1,20 +1,14 @@
-"""
-memory.py — Production conversation store with TTL eviction.
-Keeps the last N turns per session so "why?" follow-ups work correctly.
-Evicts sessions older than TTL and caps total sessions to prevent OOM.
-"""
 import time
 import threading
 from collections import OrderedDict, deque
 from dataclasses import dataclass
 from typing import Literal
-import os
 from app.core.config import settings
 from app.core.logging_setup import get_logger
 logger = get_logger(__name__)
 @dataclass
 class Turn:
     role: Literal["user", "assistant"]
@@ -108,13 +102,8 @@ class ConversationMemory:
     def session_count(self) -> int:
         return len(self._sessions)
-# Global singleton shared across all requests
-MAX_TURNS    = int(os.getenv("MAX_TURNS", "6"))
-MAX_SESSIONS = int(os.getenv("MAX_SESSIONS", "200"))
-SESSION_TTL  = int(os.getenv("SESSION_TTL", "3600"))
 memory = ConversationMemory(
-    max_turns=MAX_TURNS,
-    max_sessions=MAX_SESSIONS,
-    ttl_seconds=SESSION_TTL,
 )

 import time
 import threading
 from collections import OrderedDict, deque
 from dataclasses import dataclass
 from typing import Literal
 from app.core.config import settings
 from app.core.logging_setup import get_logger
 logger = get_logger(__name__)
 @dataclass
 class Turn:
     role: Literal["user", "assistant"]
     def session_count(self) -> int:
         return len(self._sessions)
 memory = ConversationMemory(
+    max_turns=settings.max_turns,
+    max_sessions=settings.max_sessions,
+    ttl_seconds=settings.session_ttl,
 )

app/pipeline/chat_pipeline.py CHANGED Viewed

@@ -5,7 +5,9 @@ from langdetect import detect as detect_lang, LangDetectException
 from app.core.config import settings
 from app.core.logging_setup import get_logger
-from app.retrieval import get_retriever, rerank_chunks
 from app.pipeline.query_handler import is_followup_question, rewrite_query
 from app.pipeline.context_builder import build_context, extract_sources
 from app.pipeline.prompt_builder import build_system_prompt
@@ -37,17 +39,6 @@ async def run(
     session_id: str,
     history: list[dict],
 ) -> PipelineResult:
-    """
-    نفّذ pipeline كامل لسؤال واحد.
-    المدخلات:
-        question:   نص السؤال (بعد trim)
-        session_id: معرّف الجلسة
-        history:    تاريخ المحادثة من ConversationMemory
-    المخرج:
-        PipelineResult جاهز للإرسال للـ LLM
-    """
     t_start = time.time()
     lang = _detect_language(question)
@@ -65,9 +56,10 @@ async def run(
     )
     # ── الخطوة 4: إعادة الترتيب بالـ Reranker ─────────────────────────────────
-    # إذا لم يوجد HF_API_TOKEN → يُعاد الترتيب الأصلي بدون تغيير (graceful degradation)
     chunks = await rerank_chunks(search_query, chunks, top_k=5, lang=lang)
     # ── الخطوة 5: بناء السياق ─────────────────────────────────────────────────
     context = build_context(chunks)
     sources = extract_sources(chunks)

 from app.core.config import settings
 from app.core.logging_setup import get_logger
+from app.retrieval import get_retriever
+# from app.retrieval import get_retriever # rerank_chunks DISABLED: using Groq API
+from app.retrieval.reranker import rerank_chunks
 from app.pipeline.query_handler import is_followup_question, rewrite_query
 from app.pipeline.context_builder import build_context, extract_sources
 from app.pipeline.prompt_builder import build_system_prompt
     session_id: str,
     history: list[dict],
 ) -> PipelineResult:
     t_start = time.time()
     lang = _detect_language(question)
     )
     # ── الخطوة 4: إعادة الترتيب بالـ Reranker ─────────────────────────────────
+    # NOTE: the local HuggingFace reranker was replaced by the Groq API before discussion day; rerank_chunks is still called below.
     chunks = await rerank_chunks(search_query, chunks, top_k=5, lang=lang)
+    # chunks = chunks[:5]  # fallback: take top 5 from RRF order
     # ── الخطوة 5: بناء السياق ─────────────────────────────────────────────────
     context = build_context(chunks)
     sources = extract_sources(chunks)

app/pipeline/context_builder.py CHANGED Viewed

@@ -2,19 +2,7 @@ from app.core.config import settings
 def format_chunk(index: int, chunk: dict) -> str:
-    """
-    نسّق chunk واحد مع ترويسة تحتوي على معلومات السياق.
-    المدخل:
-        index: رقم الـ chunk (0-based)
-        chunk: dict يحتوي على "text" و "metadata"
-    المخرج:
-        نص منسّق مع ترويسة بين أقواس مربعة
-        مثال:
-        [مقتطف 1 — السياق: برنامج الرياضيات — المستوى 3 — الفصل: الأول]
-        ... نص الـ chunk ...
-    """
     meta       = chunk.get("metadata", {})
     article    = meta.get("article_number", "")
     breadcrumb = (

 def format_chunk(index: int, chunk: dict) -> str:
     meta       = chunk.get("metadata", {})
     article    = meta.get("article_number", "")
     breadcrumb = (

app/pipeline/prompt_builder.py CHANGED Viewed

@@ -7,15 +7,7 @@ _PROMPTS_DIR = Path(__file__).resolve().parent.parent.parent / "prompts"
 def build_system_prompt(language: str) -> str:
-    """
-    ارجع الـ system prompt المناسب حسب اللغة.
-    المعاملات:
-        language: "ar" للعربية، "en" للإنجليزية
-    الإرجاع:
-        نص الـ prompt الكامل
-    """
     filename = "system_ar.txt" if language == "ar" else "system_en.txt"
     prompt_path = _PROMPTS_DIR / filename

 def build_system_prompt(language: str) -> str:
     filename = "system_ar.txt" if language == "ar" else "system_en.txt"
     prompt_path = _PROMPTS_DIR / filename

app/pipeline/query_handler.py CHANGED Viewed

@@ -1,25 +1,14 @@
 import json
-from groq import AsyncGroq
 from app.core.config import settings
 from app.core.logging_setup import get_logger
 logger = get_logger(__name__)
-_client = AsyncGroq(api_key=settings.groq_api_key)
 def is_followup_question(question: str) -> bool:
-    """
-    اكشف ما إذا كان السؤال يعتمد على context سابق.
-    يعتمد على 4 إشارات:
-    - قصير (≤8 كلمات)
-    - يحتوي على كلمة استفهام غير محددة
-    - يحتوي على ضمير إشاري (ده/هذا)
-    - يبدأ بحرف عطف (و/ف/لكن)
-    إذا توفّرت إشارتان أو أكثر → سؤال متابِع.
-    """
     followup_keywords = [
         "لماذا", "كيف", "ماذا", "وضح", "اشرح", "يعني", "طيب", "وإيه",
         "why", "how", "what do you mean", "explain", "elaborate",
@@ -37,11 +26,6 @@ def is_followup_question(question: str) -> bool:
 async def rewrite_query(question: str, history: list[dict]) -> str:
-    """
-    أعد صياغة السؤال ليكون مستقلاً باستخدام Groq.
-    إذا فشل الـ LLM لأي سبب → يُعاد السؤال الأصلي بدون تغيير.
-    """
     if not history:
         return question
@@ -55,16 +39,20 @@ async def rewrite_query(question: str, history: list[dict]) -> str:
     )
     try:
-        response = await _client.chat.completions.create(
-            model=settings.groq_model,
-            messages=[{"role": "user", "content": rewrite_prompt}],
-            max_tokens=100,
-            temperature=0.1,
-        )
-        rewritten = response.choices[0].message.content.strip()
-        logger.info("تمت إعادة صياغة السؤال: %s", rewritten)
-        return rewritten
     except Exception as exc:
         logger.warning("فشل إعادة الصياغة: %s", exc)
-    return question  # fallback: السؤال الأصلي

 import json
+import httpx
 from app.core.config import settings
 from app.core.logging_setup import get_logger
 logger = get_logger(__name__)
+_OLLAMA_TIMEOUT = httpx.Timeout(settings.ollama_timeout)
 def is_followup_question(question: str) -> bool:
     followup_keywords = [
         "لماذا", "كيف", "ماذا", "وضح", "اشرح", "يعني", "طيب", "وإيه",
         "why", "how", "what do you mean", "explain", "elaborate",
 async def rewrite_query(question: str, history: list[dict]) -> str:
     if not history:
         return question
     )
     try:
+        async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as client:
+            response = await client.post(
+                settings.ollama_url,
+                json={
+                    "model":    settings.ollama_model,
+                    "messages": [{"role": "user", "content": rewrite_prompt}],
+                    "stream":   False,
+                },
+            )
+            if response.status_code == 200:
+                rewritten = response.json()["message"]["content"].strip()
+                logger.info("تمت إعادة صياغة السؤال: %s", rewritten)
+                return rewritten
     except Exception as exc:
         logger.warning("فشل إعادة الصياغة: %s", exc)
+    return question  # fallback: السؤال الأصلي

app/retrieval/__init__.py CHANGED Viewed

	@@ -1,2 +1,2 @@
1	from app.retrieval.retriever import get_retriever, reset_retriever
2	- from app.retrieval.reranker import rerank_chunks, ~~warmup_reranker~~


1	from app.retrieval.retriever import get_retriever, reset_retriever
2	+ # from app.retrieval.reranker import rerank_chunks # DISABLED: using Groq API reranker

app/retrieval/retriever.py CHANGED Viewed

@@ -1,20 +1,3 @@
-"""
-retriever.py — Hybrid retriever with weighted RRF fusion (CPU-only production)
-==============================================================================
-  - CPU-only embedding (GPU reserved for Ollama LLM)
-  - Arabic-aware BM25 tokenizer (diacritics, prefix stripping, alef normalization)
-  - BM25 index persisted via joblib — skips rebuild if collection unchanged
-  - top_k hard-capped at 8 (raised from 5) to give LLM enough rows to
-    reconstruct multi-row academic tables without OOM risk on CPU
-  - fetch_k = top_k × 4 — wide candidate pool for fragmented tables
-  - Weighted RRF: structural queries (level/dept/course) get 2× vector weight
-    and 0.5× BM25 weight to suppress noise from ubiquitous terms like "ساعة"
-  - BM25 score threshold: skip BM25 results when max raw score < 0.1
-    (query has no meaningful keyword match — prevents random noise from
-     contaminating the fusion ranking)
-  - reset_retriever() holds _init_lock to prevent concurrent partial-reset reads
-"""
 import re
 import time
 import joblib
@@ -193,16 +176,6 @@ def _is_structural_query(query: str) -> bool:
 def _build_metadata_filter(query: str) -> dict | None:
-    """Build a ChromaDB `where` filter from the query's level/semester mentions.
-    When a user asks about "المستوى الأول الفصل الثاني", this returns a filter
-    that restricts vector search to chunks whose `level_number` = "1" AND
-    `semester` = "الثاني".  This prevents Level-2/3/4 chunks (which may be
-    semantically closer due to course-code overlap) from outranking the
-    actually-requested Level-1 chunks.
-    Returns None if no level/semester can be extracted (general query).
-    """
     level = _extract_level_number(query)
     semester = _extract_semester(query)
@@ -221,7 +194,11 @@ def _build_metadata_filter(query: str) -> dict | None:
 def _select_device() -> str:
-    return "cpu"   # embeddings run on CPU; LLM is cloud-based (Groq)
 # ── Retriever ─────────────────────────────────────────────────────────────────
@@ -229,8 +206,8 @@ def _select_device() -> str:
 class Retriever:
     def __init__(self):
         device = _select_device()
-        logger.info("[INIT] Embedding device: {device}")
-        self.embed_model = SentenceTransformer(settings.embed_model, ...)
         self.client     = chromadb.PersistentClient(path=settings.chroma_path)
         self.collection = self.client.get_or_create_collection(name=settings.chroma_collection)
@@ -255,7 +232,7 @@ class Retriever:
                     print(f"[CACHE] BM25 loaded ({len(self.documents)} docs)")
                     return
             except Exception as e:
-                logger.warning("[WARN] BM25 cache invalid: {e}")
         print("[BUILD] Building BM25 index...")
         all_docs       = self.collection.get()

 import re
 import time
 import joblib
 def _build_metadata_filter(query: str) -> dict | None:
     level = _extract_level_number(query)
     semester = _extract_semester(query)
 def _select_device() -> str:
+    try:
+        import torch
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    except ImportError:
+        return "cpu"
 # ── Retriever ─────────────────────────────────────────────────────────────────
 class Retriever:
     def __init__(self):
         device = _select_device()
+        logger.info("[INIT] Embedding device: %s", device)
+        self.embed_model = SentenceTransformer(settings.embed_model, device=device)
         self.client     = chromadb.PersistentClient(path=settings.chroma_path)
         self.collection = self.client.get_or_create_collection(name=settings.chroma_collection)
                     print(f"[CACHE] BM25 loaded ({len(self.documents)} docs)")
                     return
             except Exception as e:
+                logger.warning("[WARN] BM25 cache invalid: %s", e)
         print("[BUILD] Building BM25 index...")
         all_docs       = self.collection.get()

main.py CHANGED Viewed

@@ -1,17 +1,6 @@
-"""
-main.py — نقطة الدخول الوحيدة للتطبيق.
-=========================================
-هذا الملف مسؤول عن شيء واحد فقط:
-    تجميع كل أجزاء التطبيق وتشغيله.
-لا يحتوي على أي منطق.
-أي منطق يجب أن يكون في app/
-"""
 import os
 import sys
-# إجبار UTF-8 على Windows
 if sys.stdout.encoding != "utf-8":
     sys.stdout.reconfigure(encoding="utf-8", errors="replace")
 if sys.stderr.encoding != "utf-8":
@@ -29,26 +18,21 @@ from app.core.logging_setup import setup_logging, get_logger
 from app.api.routes_chat import router as chat_router
 from app.api.routes_health import router as health_router
-# إعداد الـ logging فوراً
 setup_logging()
 logger = get_logger("startup")
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """
-    Startup/Shutdown hooks.
-    يُنفَّذ عند بدء التشغيل، و yield يعني "التطبيق يعمل الآن".
-    """
     from pathlib import Path
-    from app.retrieval import get_retriever, reset_retriever, warmup_reranker
     from app.llm.groq_client import warmup_model
-    # أنشئ المجلدات اللازمة
     Path(settings.data_dir).mkdir(parents=True, exist_ok=True)
     Path("data/pdfs").mkdir(parents=True, exist_ok=True)
-    # تحقق من الـ vectorstore — أعد الاستيعاب تلقائياً إذا كان فارغاً
     guide_path = Path(settings.data_dir) / "guide.md"
     if guide_path.exists():
         retriever = get_retriever()
@@ -56,30 +40,20 @@ async def lifespan(app: FastAPI):
             logger.info("قاعدة البيانات المتجهية فارغة — بدء الاستيعاب التلقائي...")
             from app.ingestion import ingest_all_markdown
             ingest_all_markdown(settings.data_dir)
-            # ── إعادة بناء الـ Retriever بعد الاستيعاب ─────────────────────
-            # الـ singleton القديم أُنشئ والمجموعة فارغة، لذا documents=[]
-            # و bm25=None. يجب إعادة إنشائه ليقرأ البيانات الجديدة.
-            reset_retriever()
-            logger.info("تم إعادة تهيئة الـ Retriever بعد الاستيعاب")
-    # تسخين نموذج التضمين
     retriever = get_retriever()
-    logger.info("عدد الـ chunks في المجموعة: %d", retriever.collection.count())
     retriever.embed_model.encode(["warm up"], normalize_embeddings=True)
     logger.info("تم تسخين نموذج التضمين")
-    # تحميل Groq مسبقاً
     await warmup_model()
-    # تحقق من الـ Reranker
     await warmup_reranker()
-    logger.info("✅ التطبيق جاهز — %s", settings.groq_model)
     yield
     logger.info("التطبيق يُغلق...")
-# ── إنشاء التطبيق ──────────────────────────────────────────────────────────────
 app = FastAPI(
     title="ASU RAG Chatbot",
     version="3.0.0",

 import os
 import sys
 if sys.stdout.encoding != "utf-8":
     sys.stdout.reconfigure(encoding="utf-8", errors="replace")
 if sys.stderr.encoding != "utf-8":
 from app.api.routes_chat import router as chat_router
 from app.api.routes_health import router as health_router
 setup_logging()
 logger = get_logger("startup")
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     from pathlib import Path
+    from app.retrieval import get_retriever
+    # from app.llm.ollama_client import warmup_model
     from app.llm.groq_client import warmup_model
+    from app.retrieval.reranker import warmup_reranker
     Path(settings.data_dir).mkdir(parents=True, exist_ok=True)
     Path("data/pdfs").mkdir(parents=True, exist_ok=True)
     guide_path = Path(settings.data_dir) / "guide.md"
     if guide_path.exists():
         retriever = get_retriever()
             logger.info("قاعدة البيانات المتجهية فارغة — بدء الاستيعاب التلقائي...")
             from app.ingestion import ingest_all_markdown
             ingest_all_markdown(settings.data_dir)
     retriever = get_retriever()
     retriever.embed_model.encode(["warm up"], normalize_embeddings=True)
     logger.info("تم تسخين نموذج التضمين")
     await warmup_model()
     await warmup_reranker()
+    # logger.info("التطبيق جاهز — %s", settings.ollama_model)
+    logger.info("التطبيق جاهز — %s", settings.groq_model)
     yield
     logger.info("التطبيق يُغلق...")
 app = FastAPI(
     title="ASU RAG Chatbot",
     version="3.0.0",

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
 fastapi==0.111.0
 uvicorn[standard]==0.29.0
 python-multipart==0.0.9
@@ -7,12 +10,12 @@ sentence-transformers==3.0.1
 # Vector DB (must match vectorstore format)
 chromadb==0.5.3
-posthog==3.0.2
-# Groq LLM client
 groq==0.9.0
-# Reranker HTTP client (HuggingFace Inference API)
 httpx==0.27.0
 # NLP utilities
@@ -23,7 +26,6 @@ rank-bm25==0.2.2
 # Data stack (CRITICAL pins)
 numpy==1.26.4
 scikit-learn==1.4.2
-pandas==2.2.2
 joblib>=1.3.0
 # Optional but stabilizes HF stack
@@ -35,6 +37,9 @@ python-dotenv==1.0.1
 pydantic==2.7.1
 pydantic-settings==2.2.1
-# Torch CPU build (HF Spaces compatible)
---extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.3.1+cpu

+# PyTorch index — CUDA 12.1 (GPU). For CPU: change cu121 to cpu
+--extra-index-url https://download.pytorch.org/whl/cu121
 fastapi==0.111.0
 uvicorn[standard]==0.29.0
 python-multipart==0.0.9
 # Vector DB (must match vectorstore format)
 chromadb==0.5.3
+# LLM + Reranker HTTP client
 groq==0.9.0
+# httpx is used for:
+#   1. Streaming Ollama responses
+#   2. Reranker API calls (currently Groq — HuggingFace is disabled for now)
 httpx==0.27.0
 # NLP utilities
 # Data stack (CRITICAL pins)
 numpy==1.26.4
 scikit-learn==1.4.2
 joblib>=1.3.0
 # Optional but stabilizes HF stack
 pydantic==2.7.1
 pydantic-settings==2.2.1
+# Torch — CUDA 12.1 build for RTX 3050 GPU acceleration (default)
+# To switch to CPU instead:
+#   Step 1: Comment out the --extra-index-url line at the top of this file
+#   Step 2: Replace the line below with: torch==2.3.1+cpu
+#   Step 3: Run: pip install torch==2.3.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.3.1+cu121

setup.sh CHANGED Viewed

@@ -6,34 +6,41 @@ set -e
 echo ""
 echo "╔══════════════════════════════════════════════╗"
-echo "║     Arabic RAG Chatbot — Setup Script        ║"
 echo "╚══════════════════════════════════════════════╝"
 echo ""
 # ── 1. Python virtual environment ──────────────────────────────────────────────
 if [ ! -d ".venv" ]; then
-    echo "📦 Creating Python virtual environment..."
     python -m venv .venv
 fi
 source .venv/bin/activate
-echo "📦 Installing Python dependencies..."
 pip install --upgrade pip -q
 pip install -r requirements.txt -q
-echo "✅ Python dependencies installed"
 # ── 2. Ollama check ────────────────────────────────────────────────────────────
 echo ""
-echo "🔍 Checking Ollama..."
 if ! command -v ollama &> /dev/null; then
-    echo "❌ Ollama not found. Install it from: https://ollama.com/download"
-    echo "   Then run:  ollama pull gemma3"
     exit 1
 fi
 # Check if Ollama is running
 if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
-    echo "🚀 Starting Ollama server in background..."
     ollama serve &
     sleep 3
 fi
@@ -42,38 +49,34 @@ echo "✅ Ollama is running"
 # ── 3. Pull LLM model if needed ────────────────────────────────────────────────
 echo ""
-echo "📥 Checking for gemma3 model..."
 if ! ollama list | grep -q "gemma3"; then
-    echo "📥 Pulling gemma3 (this downloads ~4 GB once)..."
     ollama pull gemma3
 else
-    echo "✅ gemma3 already available"
 fi
 # ── 4. Prepare Markdown knowledge base ────────────────────────────────────────
 echo ""
-echo "📁 Preparing data/markdown/ directory..."
 mkdir -p data/markdown
 MD_FILES=$(find data/markdown -name "*.md" 2>/dev/null | wc -l)
 if [ "$MD_FILES" -gt 0 ]; then
-    echo "📄 Found $MD_FILES Markdown file(s). Ingesting..."
-    python ingest_markdown.py
-    echo "✅ Knowledge base ready"
 else
-    echo "⚠️  No Markdown files found in data/markdown/"
-    echo "   Place your .md files there and either:"
-    echo "     • Run:  python ingest_markdown.py"
-    echo "     • Or upload via the web UI at http://localhost:8000"
 fi
 # ── 5. Start FastAPI ───────────────────────────────────────────────────────────
 echo ""
-echo "🚀 Starting FastAPI server..."
-echo "   UI:        http://localhost:8000"
-echo "   API docs:  http://localhost:8000/docs"
-echo "   Health:    http://localhost:8000/health"
-echo "   Press Ctrl+C to stop"
 echo ""
 uvicorn main:app --host 0.0.0.0 --port 8000 --reload

 echo ""
 echo "╔══════════════════════════════════════════════╗"
+echo "║     ASU RAG Chatbot — Setup Script        ║"
 echo "╚══════════════════════════════════════════════╝"
 echo ""
 # ── 1. Python virtual environment ──────────────────────────────────────────────
 if [ ! -d ".venv" ]; then
+    echo "Creating Python virtual environment..."
     python -m venv .venv
 fi
 source .venv/bin/activate
+echo "Installing Python dependencies..."
 pip install --upgrade pip -q
 pip install -r requirements.txt -q
+echo "Python dependencies installed"
+# ── .env setup ─────────────────────────────────────────────────────────────────
+if [ ! -f ".env" ]; then
+    echo "Creating .env from template..."
+    cp .env.example .env
+    echo ".env created — edit it if needed"
+fi
 # ── 2. Ollama check ────────────────────────────────────────────────────────────
 echo ""
+echo "Checking Ollama..."
 if ! command -v ollama &> /dev/null; then
+    echo "Ollama not found. Install it from: https://ollama.com/download"
+    echo "Then run:  ollama pull gemma3"
     exit 1
 fi
 # Check if Ollama is running
 if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
+    echo "Starting Ollama server in background..."
     ollama serve &
     sleep 3
 fi
 # ── 3. Pull LLM model if needed ────────────────────────────────────────────────
 echo ""
+echo "Checking for gemma3 model..."
 if ! ollama list | grep -q "gemma3"; then
+    echo "Pulling gemma3 (this downloads ~4 GB once)..."
     ollama pull gemma3
 else
+    echo "gemma3 already available"
 fi
 # ── 4. Prepare Markdown knowledge base ────────────────────────────────────────
 echo ""
+echo "Preparing data/markdown/ directory..."
 mkdir -p data/markdown
 MD_FILES=$(find data/markdown -name "*.md" 2>/dev/null | wc -l)
 if [ "$MD_FILES" -gt 0 ]; then
+    echo "Found $MD_FILES Markdown file(s). Ingestion will run automatically on server startup."
 else
+    echo "No Markdown files found in data/markdown/"
+    echo "• Place your .md files then restart the server — ingestion runs automatically"
 fi
 # ── 5. Start FastAPI ───────────────────────────────────────────────────────────
 echo ""
+echo "Starting FastAPI server..."
+echo "API docs:  http://localhost:8000/docs"
+echo "Health:    http://localhost:8000/health"
+echo "Press Ctrl+C to stop"
 echo ""
 uvicorn main:app --host 0.0.0.0 --port 8000 --reload

test_reranker.py DELETED Viewed

@@ -1,28 +0,0 @@
-import asyncio
-import logging
-from app.retrieval.reranker import rerank_chunks, warmup_reranker
-logging.basicConfig(level=logging.INFO)
-async def test():
-    print("Testing warmup...")
-    ok = await warmup_reranker()
-    print("Warmup OK:", ok)
-    chunks = [
-        {'text': 'تتكون كلية العلوم من اقسام الرياضيات والفيزياء والكيمياء', 'source': 'guide.md', 'rrf_score': 0.5, 'metadata': {}},
-        {'text': 'يجب على الطالب اجتياز 140 ساعة معتمدة', 'source': 'guide.md', 'rrf_score': 0.4, 'metadata': {}},
-        {'text': 'الطقس جميل اليوم', 'source': 'x.md', 'rrf_score': 0.3, 'metadata': {}},
-        {'text': 'قسم الرياضيات يضم تخصصات عديدة', 'source': 'guide.md', 'rrf_score': 0.2, 'metadata': {}},
-        {'text': 'مواعيد التسجيل في الفصل الاول', 'source': 'guide.md', 'rrf_score': 0.1, 'metadata': {}},
-        {'text': 'كلية العلوم جامعة عين شمس تأسست عام 1950', 'source': 'guide.md', 'rrf_score': 0.05, 'metadata': {}}
-    ]
-    print("\nTesting reranking...")
-    result = await rerank_chunks('ما هي اقسام كلية العلوم؟', chunks, top_k=3)
-    print("\nResults:")
-    for r in result:
-        print(f"{r['rerank_score']:.4f} | {r['text'][:60]}")
-if __name__ == "__main__":
-    asyncio.run(test())