therandomuser03 commited on
Commit
f0f84fb
Β·
1 Parent(s): 874733d

Add application file

Browse files
.dockerignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Python ────────────────────────────────────────────────────────────────────
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *.egg-info/
7
+ .pytest_cache/
8
+
9
+ # ── Virtual Environments ─────────────────────────────────────────────────────
10
+ venv/
11
+ .venv/
12
+ env/
13
+
14
+ # ── Secrets (never bake into image) ──────────────────────────────────────────
15
+ .env
16
+ .env.*
17
+
18
+ # ── Git ───────────────────────────────────────────────────────────────────────
19
+ .git/
20
+ .gitignore
21
+ .gitattributes
22
+
23
+ # ── IDE / OS ──────────────────────────────────────────────────────────────────
24
+ .vscode/
25
+ .idea/
26
+ .DS_Store
27
+ Thumbs.db
28
+
29
+ # ── Notebooks / Dev tools ────────────────────────────────────────────────────
30
+ notebooks/
31
+ *.ipynb
32
+
33
+ # ── README (not needed in image) ─────────────────────────────────────────────
34
+ README.md
.env.example CHANGED
@@ -2,9 +2,9 @@
2
  # Copy this file to .env and fill in any overrides needed.
3
  # All values below are production defaults.
4
 
5
- # ── Ollama / LLaMA 3 (Local Inference) ──────────────────────────────────────
6
  OLLAMA_BASE_URL=http://localhost:11434
7
- OLLAMA_MODEL=llama3
8
  OLLAMA_TIMEOUT_S=90
9
  OLLAMA_RETRIES=3
10
  OLLAMA_RETRY_DELAY_S=2.0
 
2
  # Copy this file to .env and fill in any overrides needed.
3
  # All values below are production defaults.
4
 
5
+ # ── Ollama / Phi-3.5 Mini (Local Inference) ──────────────────────────────────
6
  OLLAMA_BASE_URL=http://localhost:11434
7
+ OLLAMA_MODEL=phi3.5:3.8b-mini-instruct-q4_0
8
  OLLAMA_TIMEOUT_S=90
9
  OLLAMA_RETRIES=3
10
  OLLAMA_RETRY_DELAY_S=2.0
.gitattributes CHANGED
@@ -4,3 +4,4 @@ app/ml_assets/emotion_model_trained.h5 filter=lfs diff=lfs merge=lfs -text
4
  app/ml_assets/emotion_model_trained.keras filter=lfs diff=lfs merge=lfs -text
5
  app/ml_assets/*.h5 filter=lfs diff=lfs merge=lfs -text
6
  app/ml_assets/*.keras filter=lfs diff=lfs merge=lfs -text
 
 
4
  app/ml_assets/emotion_model_trained.keras filter=lfs diff=lfs merge=lfs -text
5
  app/ml_assets/*.h5 filter=lfs diff=lfs merge=lfs -text
6
  app/ml_assets/*.keras filter=lfs diff=lfs merge=lfs -text
7
+ *.gguf filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,47 +1,76 @@
1
- # 1. Use Python 3.10 slim
 
 
 
 
 
 
 
 
 
 
 
2
  FROM python:3.10-slim
3
 
4
- # 2. Set working directory
5
  WORKDIR /app
6
 
7
- # 3. Install system dependencies
8
- # - libgl1/libglib2.0-0: OpenCV needs these
9
- # - build-essential/cmake: needed to compile llama-cpp-python from source
10
- RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends -o Acquire::Retries=3 \
11
  libgl1 \
12
  libglib2.0-0 \
13
- build-essential \
14
- cmake \
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
- # 4. Install PyTorch CPU-only FIRST (saves ~1.5GB vs full CUDA torch)
18
- # This is a separate layer so it caches well
 
 
 
19
  RUN pip install --no-cache-dir \
20
  torch --index-url https://download.pytorch.org/whl/cpu
21
 
22
- # 5. Install llama-cpp-python from pre-built CPU wheels (avoids 30+ min C++ compile)
23
- # PINNED to 0.3.2 β€” the latest version with pre-built wheels on the abetlen index.
24
- # Without the pin, pip resolves 0.3.16+ from PyPI which has NO pre-built wheel
25
- # and falls back to compiling llama.cpp from source (times out on HF Spaces).
26
- RUN pip install --no-cache-dir --prefer-binary \
27
- llama-cpp-python==0.3.2 \
28
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
29
-
30
- # 6. Install remaining Python dependencies
31
  COPY requirements.txt .
32
  RUN pip install --no-cache-dir -r requirements.txt
33
 
34
- # 7. Copy your code
35
  COPY . .
36
 
37
- # 8. Download all ML models (Face, Text, LLaMA 3 GGUF) during build
 
 
 
 
 
 
 
 
 
38
  RUN python download_models.py
39
 
40
- # 9. Environment & Port settings (7860 is HF Spaces standard)
41
  ENV PYTHONPATH=/app
42
- ENV USE_EMBEDDED_LLM=True
 
 
 
 
 
43
  ENV HF_HUB_OFFLINE=1
 
 
 
 
44
  EXPOSE 7860
45
 
46
- # 10. Run the app with Uvicorn
47
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────────────────────
2
+ # PsyPredict β€” Backend Dockerfile for Hugging Face Spaces (CPU / Docker SDK)
3
+ #
4
+ # Architecture:
5
+ # - Ollama binary installed inside the container (serves Phi-3.5 on port 11434)
6
+ # - FastAPI app served by Uvicorn on port 7860 (HF Spaces standard port)
7
+ # - start.sh orchestrates: Ollama β†’ model pull β†’ Uvicorn
8
+ # - ML assets (Keras face model + CSV) are downloaded at BUILD time via gdown
9
+ # - DistilBERT + Crisis classifier are downloaded at BUILD time from HF Hub
10
+ # - HF_HUB_OFFLINE=1 at runtime so the container starts offline-capable
11
+ # ─────────────────────────────────────────────────────────────────────────────
12
+
13
  FROM python:3.10-slim
14
 
 
15
  WORKDIR /app
16
 
17
+ # ── 1. System dependencies ────────────────────────────────────────────────────
18
+ # libgl1 + libglib2.0-0: OpenCV headless needs these
19
+ # curl + ca-certificates: needed to download Ollama install script
20
+ RUN apt-get update && apt-get install -y --no-install-recommends \
21
  libgl1 \
22
  libglib2.0-0 \
23
+ curl \
24
+ ca-certificates \
25
  && rm -rf /var/lib/apt/lists/*
26
 
27
+ # ── 2. Install Ollama binary ──────────────────────────────────────────────────
28
+ # Uses the official install script β€” places `ollama` binary in /usr/local/bin
29
+ RUN curl -fsSL https://ollama.com/install.sh | sh
30
+
31
+ # ── 3. PyTorch CPU-only (separate layer β€” ~800MB, caches very well) ───────────
32
  RUN pip install --no-cache-dir \
33
  torch --index-url https://download.pytorch.org/whl/cpu
34
 
35
+ # ── 4. Install remaining Python dependencies ──────────────────────────────────
36
+ # Note: torch is already installed above; pip will skip it when it hits
37
+ # the torch line in requirements.txt (version constraint already satisfied).
 
 
 
 
 
 
38
  COPY requirements.txt .
39
  RUN pip install --no-cache-dir -r requirements.txt
40
 
41
+ # ── 5. Copy application source code ──────────────────────────────────────────
42
  COPY . .
43
 
44
+ # ── 6. Download ML assets at BUILD time ──────────────────────────────────────
45
+ # Downloads:
46
+ # - app/ml_assets/emotion_model_trained.h5 (Keras CNN face model, ~4MB, Google Drive)
47
+ # - app/ml_assets/MEDICATION.csv (remedy database, Google Drive)
48
+ # - app/ml_assets/distilbert_model/ (DistilBERT emotion classifier, ~260MB, HF Hub)
49
+ # - app/ml_assets/crisis_model/ (MiniLM zero-shot classifier, ~130MB, HF Hub)
50
+ #
51
+ # Skips files that already exist in the build context (e.g. haarcascade XML).
52
+ # HF_HUB_OFFLINE must be 0 here so transformers can reach HuggingFace.
53
+ ENV HF_HUB_OFFLINE=0
54
  RUN python download_models.py
55
 
56
+ # ── 7. Runtime environment ────────────────────────────────────────────────────
57
  ENV PYTHONPATH=/app
58
+ # Ollama runs locally inside the container
59
+ ENV OLLAMA_BASE_URL=http://localhost:11434
60
+ ENV OLLAMA_MODEL=phi3.5:3.8b-mini-instruct-q4_0
61
+ ENV OLLAMA_TIMEOUT_S=300
62
+ ENV OLLAMA_RETRIES=2
63
+ # All HF models were baked in at build time β€” go offline for faster startup
64
  ENV HF_HUB_OFFLINE=1
65
+ ENV LOG_LEVEL=INFO
66
+ ENV RATE_LIMIT=30/minute
67
+
68
+ # ── 8. Expose HF Spaces standard port ────────────────────────────────────────
69
  EXPOSE 7860
70
 
71
+ # ── 9. Startup script ─────────────────────────────────────────────────────────
72
+ # start.sh: starts Ollama daemon β†’ pulls Phi-3.5 model β†’ launches Uvicorn
73
+ COPY start.sh /start.sh
74
+ RUN chmod +x /start.sh
75
+
76
+ CMD ["/start.sh"]
app/api/endpoints/therapist.py CHANGED
@@ -5,7 +5,7 @@ Full inference pipeline:
5
  2. Text emotion classification (DistilBERT)
6
  3. Crisis evaluation (zero-shot NLI) β€” override if triggered
7
  4. Multimodal fusion (text + face)
8
- 5. Ollama/LLaMA 3 structured report generation
9
  6. PsychReport JSON schema validation
10
  7. Streaming response option
11
  """
@@ -52,7 +52,7 @@ EMOTION_TO_CONDITION: dict[str, str] = {
52
  # POST /api/chat
53
  # ---------------------------------------------------------------------------
54
 
55
- @router.post("/chat", response_model=ChatResponse)
56
  async def chat(req: ChatRequest): # type: ignore[misc]
57
  """
58
  Main inference endpoint.
@@ -98,16 +98,13 @@ async def chat(req: ChatRequest): # type: ignore[misc]
98
 
99
  # ── Step 4: Streaming Response ───────────────────────────────────────────
100
  if req.stream:
101
- import asyncio as _asyncio
102
  async def stream_generator():
103
- accumulated = ""
104
  async for token in ollama_engine.generate_stream(
105
  user_text=user_text,
106
  face_emotion=face_emotion,
107
  history=history,
108
  text_emotion_summary=text_emotion_summary,
109
  ):
110
- accumulated += token
111
  yield token
112
 
113
  return StreamingResponse(stream_generator(), media_type="text/plain")
 
5
  2. Text emotion classification (DistilBERT)
6
  3. Crisis evaluation (zero-shot NLI) β€” override if triggered
7
  4. Multimodal fusion (text + face)
8
+ 5. Ollama/Phi-3.5 Mini structured report generation
9
  6. PsychReport JSON schema validation
10
  7. Streaming response option
11
  """
 
52
  # POST /api/chat
53
  # ---------------------------------------------------------------------------
54
 
55
+ @router.post("/chat")
56
  async def chat(req: ChatRequest): # type: ignore[misc]
57
  """
58
  Main inference endpoint.
 
98
 
99
  # ── Step 4: Streaming Response ───────────────────────────────────────────
100
  if req.stream:
 
101
  async def stream_generator():
 
102
  async for token in ollama_engine.generate_stream(
103
  user_text=user_text,
104
  face_emotion=face_emotion,
105
  history=history,
106
  text_emotion_summary=text_emotion_summary,
107
  ):
 
108
  yield token
109
 
110
  return StreamingResponse(stream_generator(), media_type="text/plain")
app/config.py CHANGED
@@ -7,15 +7,15 @@ from functools import lru_cache
7
 
8
 
9
  class Settings(BaseSettings):
10
- # Ollama / LLM
11
- OLLAMA_BASE_URL: str = "http://localhost:11434"
12
- OLLAMA_MODEL: str = "llama3"
13
- OLLAMA_TIMEOUT_S: int = 120
 
 
 
14
 
15
- # --- Embedded LLM Settings (for Docker/HF Spaces) ---
16
- USE_EMBEDDED_LLM: bool = False # Set to True in .env for Docker/HF Spaces
17
- GGUF_MODEL_PATH: str = "app/ml_assets/llama-3-8b-instruct.Q4_K_M.gguf"
18
- LLM_CONTEXT_SIZE: int = 2048
19
  OLLAMA_RETRIES: int = 3
20
  OLLAMA_RETRY_DELAY_S: float = 2.0
21
 
 
7
 
8
 
9
  class Settings(BaseSettings):
10
+ # Ollama / LLM (Centralized API)
11
+ # Update this to your DigitalOcean/VPS IP address where Ollama is running
12
+ # Default is localhost (e.g. for development), but in production it should be like:
13
+ # OLLAMA_BASE_URL: str = "http://123.45.67.89:11434"
14
+ OLLAMA_BASE_URL: str = "http://127.0.0.1:11434"
15
+ OLLAMA_MODEL: str = "phi3.5:3.8b-mini-instruct-q4_0"
16
+ OLLAMA_TIMEOUT_S: int = 90
17
 
18
+ # Retry logic for external LLM API
 
 
 
19
  OLLAMA_RETRIES: int = 3
20
  OLLAMA_RETRY_DELAY_S: float = 2.0
21
 
app/main.py CHANGED
@@ -63,15 +63,17 @@ async def lifespan(app: FastAPI):
63
  logger.info("═══════════════════════════════════════")
64
  logger.info("Config: Ollama=%s model=%s", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)
65
 
66
- # Pre-warm DistilBERT text emotion model
67
- logger.info("Pre-warming DistilBERT text emotion model...")
 
 
68
  from app.services.text_emotion_engine import initialize as init_text
69
- init_text(settings.DISTILBERT_MODEL)
70
 
71
- # Pre-warm Crisis zero-shot classifier
72
- logger.info("Pre-warming crisis detection classifier...")
73
  from app.services.crisis_engine import initialize_crisis_classifier
74
- initialize_crisis_classifier()
75
 
76
  # Check Ollama availability (non-blocking warn only)
77
  from app.services.ollama_engine import ollama_engine
@@ -106,7 +108,7 @@ def create_app() -> FastAPI:
106
  title="PsyPredict API",
107
  description=(
108
  "Production-grade multimodal mental health AI system. "
109
- "Powered by LLaMA 3 (Ollama) + DistilBERT + Keras CNN facial emotion model."
110
  ),
111
  version="2.0.0",
112
  lifespan=lifespan,
 
63
  logger.info("═══════════════════════════════════════")
64
  logger.info("Config: Ollama=%s model=%s", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)
65
 
66
+ import asyncio as _asyncio
67
+
68
+ # Pre-warm DistilBERT text emotion model (in background)
69
+ logger.info("Initializing DistilBERT text emotion model (background)...")
70
  from app.services.text_emotion_engine import initialize as init_text
71
+ _asyncio.create_task(_asyncio.to_thread(init_text, settings.DISTILBERT_MODEL))
72
 
73
+ # Pre-warm Crisis zero-shot classifier (in background)
74
+ logger.info("Initializing crisis detection classifier (background)...")
75
  from app.services.crisis_engine import initialize_crisis_classifier
76
+ _asyncio.create_task(_asyncio.to_thread(initialize_crisis_classifier))
77
 
78
  # Check Ollama availability (non-blocking warn only)
79
  from app.services.ollama_engine import ollama_engine
 
108
  title="PsyPredict API",
109
  description=(
110
  "Production-grade multimodal mental health AI system. "
111
+ "Powered by Phi-3.5 Mini (Ollama) + DistilBERT + Keras CNN facial emotion model."
112
  ),
113
  version="2.0.0",
114
  lifespan=lifespan,
app/ml_assets/MEDICATION.csv CHANGED
The diff for this file is too large to render. See raw diff
 
app/schemas.py CHANGED
@@ -108,6 +108,31 @@ def fallback_report() -> PsychReport:
108
  )
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # ---------------------------------------------------------------------------
112
  # Chat Endpoint
113
  # ---------------------------------------------------------------------------
@@ -162,31 +187,6 @@ class TextAnalysisResponse(BaseModel):
162
  crisis_triggered: bool
163
 
164
 
165
- # ---------------------------------------------------------------------------
166
- # Facial / Emotion Endpoint
167
- # ---------------------------------------------------------------------------
168
-
169
- class EmotionResponse(BaseModel):
170
- emotion: Optional[str] = None
171
- confidence: Optional[float] = None
172
- face_box: Optional[List[int]] = None
173
- message: Optional[str] = None
174
- error: Optional[str] = None
175
-
176
-
177
- # ---------------------------------------------------------------------------
178
- # Remedy Endpoint
179
- # ---------------------------------------------------------------------------
180
-
181
- class RemedyResponse(BaseModel):
182
- condition: str
183
- symptoms: str
184
- treatments: str
185
- medications: str
186
- dosage: str
187
- gita_remedy: str
188
-
189
-
190
  # ---------------------------------------------------------------------------
191
  # Health Endpoint
192
  # ---------------------------------------------------------------------------
 
108
  )
109
 
110
 
111
+ # ---------------------------------------------------------------------------
112
+ # Remedy Endpoint (must be defined BEFORE ChatResponse which references it)
113
+ # ---------------------------------------------------------------------------
114
+
115
+ class RemedyResponse(BaseModel):
116
+ condition: str
117
+ symptoms: str
118
+ treatments: str
119
+ medications: str
120
+ dosage: str
121
+ gita_remedy: str
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Facial / Emotion Endpoint
126
+ # ---------------------------------------------------------------------------
127
+
128
+ class EmotionResponse(BaseModel):
129
+ emotion: Optional[str] = None
130
+ confidence: Optional[float] = None
131
+ face_box: Optional[List[int]] = None
132
+ message: Optional[str] = None
133
+ error: Optional[str] = None
134
+
135
+
136
  # ---------------------------------------------------------------------------
137
  # Chat Endpoint
138
  # ---------------------------------------------------------------------------
 
187
  crisis_triggered: bool
188
 
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  # ---------------------------------------------------------------------------
191
  # Health Endpoint
192
  # ---------------------------------------------------------------------------
app/services/ollama_engine.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- ollama_engine.py β€” PsyPredict Local LLM Engine
3
  Async Ollama client with:
4
  - Structured JSON output enforced via schema-in-prompt + Ollama format param
5
  - Context window trimming
@@ -97,47 +97,40 @@ FACE_DISTRESS_MAP: dict[str, float] = {
97
 
98
  class OllamaEngine:
99
  """
100
- Production async LLM engine backed by local Ollama/LLaMA 3.
101
  """
102
 
103
  def __init__(self) -> None:
104
  self.settings = get_settings()
105
  self._client: Optional[httpx.AsyncClient] = None
106
- self._local_llm: Optional[any] = None # llama_cpp.Llama instance
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  @property
109
  def client(self) -> httpx.AsyncClient:
110
  if self._client is None or self._client.is_closed:
111
- self._client = httpx.AsyncClient(
112
- base_url=self.settings.OLLAMA_BASE_URL,
113
- timeout=httpx.Timeout(
114
- connect=10.0,
115
- read=self.settings.OLLAMA_TIMEOUT_S,
116
- write=30.0,
117
- pool=5.0,
118
- ),
119
- )
120
  return self._client
121
 
122
- def _get_local_llm(self):
123
- """Lazy load llama-cpp-python model."""
124
- if self._local_llm is None:
125
  try:
126
- from llama_cpp import Llama
127
- logger.info("Loading local GGUF model from %s", self.settings.GGUF_MODEL_PATH)
128
- self._local_llm = Llama(
129
- model_path=self.settings.GGUF_MODEL_PATH,
130
- n_ctx=self.settings.LLM_CONTEXT_SIZE,
131
- n_threads=os.cpu_count() or 4,
132
- verbose=False
133
- )
134
- except ImportError:
135
- logger.error("llama-cpp-python not installed. Cannot use embedded LLM.")
136
- raise RuntimeError("llama-cpp-python not installed")
137
- except Exception as exc:
138
- logger.error("Failed to load local GGUF model: %s", exc)
139
- raise
140
- return self._local_llm
141
 
142
  async def close(self) -> None:
143
  if self._client and not self._client.is_closed:
@@ -254,61 +247,28 @@ class OllamaEngine:
254
  text_emotion_summary: Optional[str] = None,
255
  ) -> tuple[str, PsychReport]:
256
  """
257
- Calls either Ollama API or Embedded LLM based on settings,
258
- with automatic fallback to local if Ollama is unreachable.
259
  """
260
- # If user explicitly wants embedded mode
261
- if self.settings.USE_EMBEDDED_LLM:
262
- return await self._generate_local(user_text, face_emotion, history, text_emotion_summary)
263
-
264
- # Otherwise try Ollama, fallback to local if it fails and GGUF is available
 
 
 
 
 
 
265
  try:
266
- reply, report = await self._generate_ollama(user_text, face_emotion, history, text_emotion_summary)
267
- # If _generate_ollama returned the hardcoded fallback string, it failed its retries
268
- if "inference service is temporarily unavailable" in reply:
269
- # Check for GGUF before giving up
270
- if os.path.exists(self.settings.GGUF_MODEL_PATH):
271
- logger.info("Ollama service unreachable after retries, falling back to GGUF.")
272
- return await self._generate_local(user_text, face_emotion, history, text_emotion_summary)
273
- return reply, report
274
  except Exception as exc:
275
- if os.path.exists(self.settings.GGUF_MODEL_PATH):
276
- logger.info("Ollama failed, falling back to embedded GGUF model: %s", exc)
277
- return await self._generate_local(user_text, face_emotion, history, text_emotion_summary)
278
- else:
279
- logger.error("Ollama failed and no GGUF model found for fallback at %s", self.settings.GGUF_MODEL_PATH)
280
- return (
281
- "The inference service is temporarily unavailable and no local fallback is configured.",
282
- fallback_report(),
283
- )
284
-
285
- async def _generate_local(
286
- self,
287
- user_text: str,
288
- face_emotion: str,
289
- history: Optional[List[ConversationMessage]],
290
- text_emotion_summary: Optional[str]
291
- ) -> tuple[str, PsychReport]:
292
- """Embedded generation via llama-cpp-python."""
293
- if history is None: history = []
294
- prompt = self._build_prompt(user_text, face_emotion, history, text_emotion_summary)
295
-
296
- try:
297
- llm = self._get_local_llm()
298
- # Run blocking LLM call in a separate thread
299
- response = await asyncio.to_thread(
300
- llm,
301
- prompt=prompt,
302
- max_tokens=600,
303
- temperature=0.2,
304
- top_p=0.9,
305
- stop=["USER:", "CURRENT USER INPUT:"]
306
  )
307
- raw_text = response["choices"][0]["text"]
308
- return self._parse_response(raw_text)
309
- except Exception as exc:
310
- logger.error("Embedded local LLM failed: %s", exc)
311
- return "The local inference service encountered an error.", fallback_report()
312
 
313
  async def _generate_ollama(
314
  self,
@@ -327,9 +287,9 @@ class OllamaEngine:
327
  "prompt": prompt,
328
  "stream": False,
329
  "options": {
330
- "temperature": 0.2, # Low temp for determinism
331
  "top_p": 0.9,
332
- "num_ctx": 4096,
333
  "stop": [],
334
  },
335
  }
@@ -355,6 +315,7 @@ class OllamaEngine:
355
  except httpx.TimeoutException as exc:
356
  last_error = exc
357
  logger.warning("Ollama timeout on attempt %d: %s", attempt, exc)
 
358
  except httpx.HTTPStatusError as exc:
359
  last_error = exc
360
  logger.error("Ollama HTTP error %s: %s", exc.response.status_code, exc)
@@ -362,6 +323,7 @@ class OllamaEngine:
362
  except Exception as exc:
363
  last_error = exc
364
  logger.error("Ollama unexpected error: %s", exc)
 
365
 
366
  if attempt < self.settings.OLLAMA_RETRIES:
367
  await asyncio.sleep(delay)
@@ -388,45 +350,27 @@ class OllamaEngine:
388
  text_emotion_summary: Optional[str] = None,
389
  ) -> AsyncIterator[str]:
390
  """
391
- Yields raw text chunks as they arrive from either Ollama or Embedded LLM.
 
392
  """
393
- if self.settings.USE_EMBEDDED_LLM:
394
- async for chunk in self._generate_stream_local(user_text, face_emotion, history, text_emotion_summary):
395
- yield chunk
396
- else:
397
- async for chunk in self._generate_stream_ollama(user_text, face_emotion, history, text_emotion_summary):
398
- yield chunk
399
-
400
- async def _generate_stream_local(
401
- self,
402
- user_text: str,
403
- face_emotion: str,
404
- history: Optional[List[ConversationMessage]],
405
- text_emotion_summary: Optional[str]
406
- ) -> AsyncIterator[str]:
407
- """Embedded streaming via llama-cpp-python."""
408
- if history is None: history = []
409
- prompt = self._build_prompt(user_text, face_emotion, history, text_emotion_summary)
410
-
411
- try:
412
- llm = self._get_local_llm()
413
- # llama-cpp-python streaming is synchronous, so we need to wrap it
414
- stream = llm(
415
- prompt=prompt,
416
- max_tokens=600,
417
- temperature=0.2,
418
- top_p=0.9,
419
- stream=True,
420
- stop=["USER:", "CURRENT USER INPUT:"]
421
  )
422
- for chunk in stream:
423
- token = chunk["choices"][0]["text"]
424
- if token:
425
- yield token
426
- await asyncio.sleep(0) # Yield control
427
- except Exception as exc:
428
- logger.error("Embedded streaming failed: %s", exc)
429
- yield "\n[Local inference error]"
 
 
 
 
430
 
431
  async def _generate_stream_ollama(
432
  self,
@@ -437,7 +381,7 @@ class OllamaEngine:
437
  ) -> AsyncIterator[str]:
438
  """
439
  Yields raw text chunks as they arrive from Ollama.
440
- With automatic fallback to local streaming if Ollama is unreachable.
441
  """
442
  if history is None:
443
  history = []
@@ -448,11 +392,18 @@ class OllamaEngine:
448
  "model": self.settings.OLLAMA_MODEL,
449
  "prompt": prompt,
450
  "stream": True,
451
- "options": {"temperature": 0.2, "top_p": 0.9, "num_ctx": 4096},
 
 
 
 
452
  }
453
 
 
 
 
454
  try:
455
- async with self.client.stream("POST", "/api/generate", json=payload) as resp:
456
  resp.raise_for_status()
457
  async for line in resp.aiter_lines():
458
  if not line.strip():
@@ -468,12 +419,9 @@ class OllamaEngine:
468
  continue
469
  except Exception as exc:
470
  logger.error("Ollama streaming failed: %s", exc)
471
- if os.path.exists(self.settings.GGUF_MODEL_PATH):
472
- logger.info("Falling back to local GGUF streaming.")
473
- async for chunk in self._generate_stream_local(user_text, face_emotion, history, text_emotion_summary):
474
- yield chunk
475
- else:
476
- yield "\n[Inference service error β€” please retry]\n"
477
 
478
 
479
  # ---------------------------------------------------------------------------
 
1
  """
2
+ ollama_engine.py β€” PsyPredict Local LLM Engine (Phi-3.5 Mini)
3
  Async Ollama client with:
4
  - Structured JSON output enforced via schema-in-prompt + Ollama format param
5
  - Context window trimming
 
97
 
98
  class OllamaEngine:
99
  """
100
+ Production async LLM engine backed by local Ollama/Phi-3.5 Mini.
101
  """
102
 
103
  def __init__(self) -> None:
104
  self.settings = get_settings()
105
  self._client: Optional[httpx.AsyncClient] = None
106
+
107
+ def _make_client(self, stream: bool = False) -> httpx.AsyncClient:
108
+ """Create a fresh httpx client. For streaming, read timeout is None (unbounded)."""
109
+ read_timeout = None if stream else float(self.settings.OLLAMA_TIMEOUT_S)
110
+ return httpx.AsyncClient(
111
+ base_url=self.settings.OLLAMA_BASE_URL,
112
+ timeout=httpx.Timeout(
113
+ connect=10.0,
114
+ read=read_timeout,
115
+ write=30.0,
116
+ pool=5.0,
117
+ ),
118
+ )
119
 
120
  @property
121
  def client(self) -> httpx.AsyncClient:
122
  if self._client is None or self._client.is_closed:
123
+ self._client = self._make_client(stream=False)
 
 
 
 
 
 
 
 
124
  return self._client
125
 
126
+ async def _reset_client(self) -> None:
127
+ """Close and discard the current client so the next call gets a fresh one."""
128
+ if self._client and not self._client.is_closed:
129
  try:
130
+ await self._client.aclose()
131
+ except Exception:
132
+ pass
133
+ self._client = None
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  async def close(self) -> None:
136
  if self._client and not self._client.is_closed:
 
247
  text_emotion_summary: Optional[str] = None,
248
  ) -> tuple[str, PsychReport]:
249
  """
250
+ Calls external Ollama API with early reachability check.
 
251
  """
252
+ # Fast-fail: check reachability before waiting for full timeout
253
+ if not await self.is_reachable():
254
+ logger.warning(
255
+ "Ollama unreachable at %s β€” skipping inference, returning fallback.",
256
+ self.settings.OLLAMA_BASE_URL,
257
+ )
258
+ return (
259
+ "The inference service is currently offline. Please ensure Ollama is running "
260
+ f"at {self.settings.OLLAMA_BASE_URL} with model '{self.settings.OLLAMA_MODEL}'.",
261
+ fallback_report(),
262
+ )
263
  try:
264
+ return await self._generate_ollama(user_text, face_emotion, history, text_emotion_summary)
 
 
 
 
 
 
 
265
  except Exception as exc:
266
+ logger.error("Ollama API call failed entirely: %s", exc)
267
+ await self._reset_client()
268
+ return (
269
+ "The inference service is temporarily unavailable. Please verify your external Ollama server is running.",
270
+ fallback_report(),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  )
 
 
 
 
 
272
 
273
  async def _generate_ollama(
274
  self,
 
287
  "prompt": prompt,
288
  "stream": False,
289
  "options": {
290
+ "temperature": 0.2,
291
  "top_p": 0.9,
292
+ "num_ctx": 8192, # Match model's full context window
293
  "stop": [],
294
  },
295
  }
 
315
  except httpx.TimeoutException as exc:
316
  last_error = exc
317
  logger.warning("Ollama timeout on attempt %d: %s", attempt, exc)
318
+ await self._reset_client() # Reset client after timeout
319
  except httpx.HTTPStatusError as exc:
320
  last_error = exc
321
  logger.error("Ollama HTTP error %s: %s", exc.response.status_code, exc)
 
323
  except Exception as exc:
324
  last_error = exc
325
  logger.error("Ollama unexpected error: %s", exc)
326
+ await self._reset_client()
327
 
328
  if attempt < self.settings.OLLAMA_RETRIES:
329
  await asyncio.sleep(delay)
 
350
  text_emotion_summary: Optional[str] = None,
351
  ) -> AsyncIterator[str]:
352
  """
353
+ Yields raw text chunks as they arrive from External Ollama.
354
+ Fast-fails with a clear message if Ollama is unreachable.
355
  """
356
+ # Early reachability check β€” prevents indefinite hang on dead server
357
+ if not await self.is_reachable():
358
+ logger.warning(
359
+ "Ollama unreachable at %s β€” aborting stream, returning fallback.",
360
+ self.settings.OLLAMA_BASE_URL,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  )
362
+ fallback_msg = (
363
+ f"The inference service is currently offline. "
364
+ f"Please ensure Ollama is running at {self.settings.OLLAMA_BASE_URL} "
365
+ f"with model '{self.settings.OLLAMA_MODEL}'.\n"
366
+ f"---JSON---\n"
367
+ + __import__('json').dumps(fallback_report().model_dump())
368
+ )
369
+ yield fallback_msg
370
+ return
371
+
372
+ async for chunk in self._generate_stream_ollama(user_text, face_emotion, history, text_emotion_summary):
373
+ yield chunk
374
 
375
  async def _generate_stream_ollama(
376
  self,
 
381
  ) -> AsyncIterator[str]:
382
  """
383
  Yields raw text chunks as they arrive from Ollama.
384
+ Uses an unbounded read timeout so slow CPU inference never times out mid-stream.
385
  """
386
  if history is None:
387
  history = []
 
392
  "model": self.settings.OLLAMA_MODEL,
393
  "prompt": prompt,
394
  "stream": True,
395
+ "options": {
396
+ "temperature": 0.2,
397
+ "top_p": 0.9,
398
+ "num_ctx": 8192, # Match model's full context window
399
+ },
400
  }
401
 
402
+ # Use a dedicated streaming client with no read timeout
403
+ # (tokens trickle in slowly on CPU β€” we must not cut the connection)
404
+ stream_client = self._make_client(stream=True)
405
  try:
406
+ async with stream_client.stream("POST", "/api/generate", json=payload) as resp:
407
  resp.raise_for_status()
408
  async for line in resp.aiter_lines():
409
  if not line.strip():
 
419
  continue
420
  except Exception as exc:
421
  logger.error("Ollama streaming failed: %s", exc)
422
+ yield "\n[Inference error β€” Ollama took too long or disconnected. Try again.]\n"
423
+ finally:
424
+ await stream_client.aclose()
 
 
 
425
 
426
 
427
  # ---------------------------------------------------------------------------
download_models.py CHANGED
@@ -6,15 +6,10 @@ from huggingface_hub import hf_hub_download
6
  MODEL_ID = "10GWSogJNKlPlTeWtJkDq_zc4roB1Vmnu" # Keras Face Emotion
7
  CSV_ID = "1bJ8C1BY0rvPNKuWcBgqiUtiSzHziZokH" # Medication CSV
8
 
9
- # Llama-3-8B-Instruct GGUF (Quantized for CPU/RAM efficiency)
10
- LLAMA_REPO = "MaziyarPanahi/Llama-3-8B-Instruct-v0.1-GGUF"
11
- LLAMA_FILE = "Llama-3-8B-Instruct-v0.1.Q4_K_M.gguf"
12
-
13
  # Destinations
14
  ML_ASSETS = "app/ml_assets"
15
  FACE_MODEL_PATH = os.path.join(ML_ASSETS, "emotion_model_trained.h5")
16
  MEDS_CSV_PATH = os.path.join(ML_ASSETS, "MEDICATION.csv")
17
- LLAMA_GGUF_PATH = os.path.join(ML_ASSETS, "llama-3-8b-instruct.Q4_K_M.gguf")
18
 
19
  # HF Transformers (Downloaded via snapshot_download for full directory)
20
  CRISIS_MODEL_REPO = "cross-encoder/nli-MiniLM2-L6-H768"
@@ -69,13 +64,7 @@ if __name__ == "__main__":
69
  download_drive_file(MODEL_ID, FACE_MODEL_PATH)
70
  download_drive_file(CSV_ID, MEDS_CSV_PATH)
71
 
72
- # 2. HF Models (Llama 3)
73
- try:
74
- download_hf_model(LLAMA_REPO, LLAMA_FILE, LLAMA_GGUF_PATH)
75
- except Exception as e:
76
- print(f"⚠️ HF LLaMA Download failed (expected on local dev if no internet): {e}")
77
-
78
- # 3. HF Transformers Pipeline Models
79
  try:
80
  download_hf_directory(CRISIS_MODEL_REPO, CRISIS_MODEL_PATH)
81
  download_hf_directory(DISTILBERT_MODEL_REPO, DISTILBERT_MODEL_PATH)
 
6
  MODEL_ID = "10GWSogJNKlPlTeWtJkDq_zc4roB1Vmnu" # Keras Face Emotion
7
  CSV_ID = "1bJ8C1BY0rvPNKuWcBgqiUtiSzHziZokH" # Medication CSV
8
 
 
 
 
 
9
  # Destinations
10
  ML_ASSETS = "app/ml_assets"
11
  FACE_MODEL_PATH = os.path.join(ML_ASSETS, "emotion_model_trained.h5")
12
  MEDS_CSV_PATH = os.path.join(ML_ASSETS, "MEDICATION.csv")
 
13
 
14
  # HF Transformers (Downloaded via snapshot_download for full directory)
15
  CRISIS_MODEL_REPO = "cross-encoder/nli-MiniLM2-L6-H768"
 
64
  download_drive_file(MODEL_ID, FACE_MODEL_PATH)
65
  download_drive_file(CSV_ID, MEDS_CSV_PATH)
66
 
67
+ # 2. HF Transformers Pipeline Models
 
 
 
 
 
 
68
  try:
69
  download_hf_directory(CRISIS_MODEL_REPO, CRISIS_MODEL_PATH)
70
  download_hf_directory(DISTILBERT_MODEL_REPO, DISTILBERT_MODEL_PATH)
main.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ main.py β€” PsyPredict FastAPI Application (Production)
3
+ Replaces Flask. Key features:
4
+ - Async request handling (FastAPI + Uvicorn)
5
+ - CORS middleware
6
+ - Rate limiting (SlowAPI)
7
+ - Structured logging (Python logging)
8
+ - Startup model pre-warming
9
+ - Graceful shutdown (Ollama client cleanup)
10
+ - FastAPI auto docs at /docs (Swagger) and /redoc
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ import logging
16
+ import sys
17
+ from contextlib import asynccontextmanager
18
+
19
+ from fastapi import FastAPI, Request
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from fastapi.responses import JSONResponse
22
+ from slowapi import Limiter, _rate_limit_exceeded_handler
23
+ from slowapi.errors import RateLimitExceeded
24
+ from slowapi.util import get_remote_address
25
+
26
+ from app.config import get_settings
27
+ from app.api.endpoints.facial import router as facial_router
28
+ from app.api.endpoints.remedies import router as remedies_router
29
+ from app.api.endpoints.therapist import router as therapist_router
30
+ from app.api.endpoints.analysis import router as analysis_router
31
+
32
# ---------------------------------------------------------------------------
# Windows asyncio fix — prevents noisy "ConnectionResetError: [WinError 10054]"
# when a streaming client disconnects before the response finishes.
# SelectorEventLoop handles abrupt pipe closures gracefully unlike the default
# ProactorEventLoop on Windows.
# ---------------------------------------------------------------------------
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Application settings loaded from app.config (read once at import time).
settings = get_settings()

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# NOTE: getattr() silently falls back to INFO when LOG_LEVEL does not name a
# valid logging constant (e.g. a typo in the .env file).
logging.basicConfig(
    level=getattr(logging, settings.LOG_LEVEL, logging.INFO),
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],  # stdout: container-friendly logs
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Rate Limiter — keyed by client IP; default limit string comes from settings.
# ---------------------------------------------------------------------------
limiter = Limiter(key_func=get_remote_address, default_limits=[settings.RATE_LIMIT])
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Lifespan (startup / shutdown events)
63
+ # ---------------------------------------------------------------------------
64
+
65
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler.

    Startup: pre-warm ML models (DistilBERT text-emotion + crisis classifier)
    in background threads and probe Ollama reachability (warn-only).
    Shutdown: close the shared Ollama async client.
    """
    logger.info("═══════════════════════════════════════")
    logger.info("🚀 PsyPredict v2.0 — Production Backend")
    logger.info("═══════════════════════════════════════")
    logger.info("Config: Ollama=%s model=%s", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)

    # asyncio keeps only a *weak* reference to tasks made with create_task();
    # an otherwise-unreferenced warm-up task may be garbage-collected before it
    # finishes. Hold strong references on app.state for the app's lifetime.
    warmup_tasks: list[asyncio.Task] = []

    # Pre-warm DistilBERT text emotion model (in a background thread).
    logger.info("Initializing DistilBERT text emotion model (background)...")
    from app.services.text_emotion_engine import initialize as init_text
    warmup_tasks.append(
        asyncio.create_task(asyncio.to_thread(init_text, settings.DISTILBERT_MODEL))
    )

    # Pre-warm Crisis zero-shot classifier (in a background thread).
    logger.info("Initializing crisis detection classifier (background)...")
    from app.services.crisis_engine import initialize_crisis_classifier
    warmup_tasks.append(
        asyncio.create_task(asyncio.to_thread(initialize_crisis_classifier))
    )
    app.state.warmup_tasks = warmup_tasks

    # Check Ollama availability (non-blocking warn only — chat endpoints fall
    # back to canned responses when Ollama is down).
    from app.services.ollama_engine import ollama_engine
    reachable = await ollama_engine.is_reachable()
    if reachable:
        logger.info("✅ Ollama reachable at %s (model: %s)", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)
    else:
        logger.warning(
            "⚠️ Ollama NOT reachable at %s — chat will return fallback responses. "
            "Run: ollama serve && ollama pull %s",
            settings.OLLAMA_BASE_URL,
            settings.OLLAMA_MODEL,
        )

    logger.info("✅ Startup complete. Listening on port 7860.")
    logger.info("   Docs: http://localhost:7860/docs")
    logger.info("═══════════════════════════════════════")

    yield  # ── Application Running ──

    logger.info("Shutting down PsyPredict backend...")
    await ollama_engine.close()
    logger.info("Goodbye.")
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # FastAPI App
114
+ # ---------------------------------------------------------------------------
115
+
116
def create_app() -> FastAPI:
    """
    Build and configure the FastAPI application.

    Wires the rate limiter, CORS middleware, a global exception handler, and
    all API routers under the /api prefix.

    Returns:
        The fully configured FastAPI instance (served by Uvicorn).
    """
    app = FastAPI(
        title="PsyPredict API",
        description=(
            "Production-grade multimodal mental health AI system. "
            "Powered by Phi-3.5 Mini (Ollama) + DistilBERT + Keras CNN facial emotion model."
        ),
        version="2.0.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
    )

    # ── Rate Limiter ─────────────────────────────────────────────────────────
    app.state.limiter = limiter
    app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

    # ── CORS ────────────────────────────────────────────────────────────────
    # NOTE(review): wildcard origins combined with allow_credentials=True is
    # disallowed by the CORS spec — browsers reject credentialed responses
    # carrying "Access-Control-Allow-Origin: *". Tighten allow_origins to the
    # real frontend origin(s) in production.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],  # Tighten to specific origin in production
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # ── Global Exception Handler ─────────────────────────────────────────────
    @app.exception_handler(Exception)
    async def global_exception_handler(request: Request, exc: Exception):
        # exc_info=exc records the full traceback; a bare logger.error() would
        # log only the message, making production 500s impossible to diagnose.
        logger.error(
            "Unhandled exception: %s | path=%s", exc, request.url.path, exc_info=exc
        )
        return JSONResponse(
            status_code=500,
            content={"detail": "Internal server error. Please try again."},
        )

    # ── Routers ──────────────────────────────────────────────────────────────
    app.include_router(facial_router, prefix="/api", tags=["Facial Emotion"])
    app.include_router(remedies_router, prefix="/api", tags=["Remedies"])
    app.include_router(therapist_router, prefix="/api", tags=["AI Therapist"])
    app.include_router(analysis_router, prefix="/api", tags=["Text Analysis & Health"])

    return app
158
+
159
+
160
# Module-level ASGI application — the object `uvicorn app.main:app` imports.
app = create_app()

# ---------------------------------------------------------------------------
# Entry point (direct execution; production uses start.sh / the uvicorn CLI)
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "app.main:app",   # import string form (required for reload/workers > 1)
        host="0.0.0.0",   # bind all interfaces — containerized deployment
        port=7860,        # Hugging Face Spaces default port
        reload=False,
        log_level=settings.LOG_LEVEL.lower(),
        workers=1,  # Keep at 1: models are singletons loaded in memory
    )
requirements.txt CHANGED
@@ -1,31 +1,49 @@
1
- # --- Core Backend (FastAPI) ---
 
 
 
 
 
 
 
 
 
 
 
 
2
  fastapi>=0.111.0
3
  uvicorn[standard]>=0.30.0
4
  python-dotenv>=1.0.0
5
  pydantic>=2.0.0
6
  pydantic-settings>=2.0.0
7
 
8
- # --- HTTP + Async ---
9
  httpx>=0.27.0
10
  anyio>=4.0.0
11
 
12
- # --- Rate Limiting ---
13
  slowapi>=0.1.9
14
 
15
- # --- AI & Vision (CPU-only TensorFlow β€” saves ~500MB vs full tensorflow) ---
16
  numpy<2.0
17
  opencv-python-headless
 
 
 
 
18
  tensorflow-cpu
19
  pandas
20
- pillow
21
  gdown
22
 
23
- # --- NLP (PyTorch CPU-only via --index-url, see Dockerfile) ---
24
- # llama-cpp-python is installed separately in Dockerfile from pre-built CPU wheels
 
 
 
25
  transformers>=4.40.0
26
  sentencepiece==0.1.99
27
  huggingface-hub>=0.23.0
28
 
29
- # --- Utilities ---
30
  requests
31
- python-multipart
 
1
+ # ─────────────────────────────────────────────────────────────────────────────
2
+ # PsyPredict Backend β€” Python Dependencies
3
+ #
4
+ # HOW TORCH IS HANDLED:
5
+ # Docker: torch is pre-installed in a separate layer BEFORE this file runs:
6
+ # RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
7
+ # pip will then skip the torch line below (version already satisfied).
8
+ # Local: Run manually first:
9
+ # pip install torch --index-url https://download.pytorch.org/whl/cpu
10
+ # Then: pip install -r requirements.txt
11
+ # ─────────────────────────────────────────────────────────────────────────────
12
+
13
+ # ── Core Backend (FastAPI) ────────────────────────────────────────────────────
14
  fastapi>=0.111.0
15
  uvicorn[standard]>=0.30.0
16
  python-dotenv>=1.0.0
17
  pydantic>=2.0.0
18
  pydantic-settings>=2.0.0
19
 
20
+ # ── HTTP + Async ──────────────────────────────────────────────────────────────
21
  httpx>=0.27.0
22
  anyio>=4.0.0
23
 
24
+ # ── Rate Limiting ─────────────────────────────────────────────────────────────
25
  slowapi>=0.1.9
26
 
27
+ # ── Computer Vision (CPU-only, no CUDA) ──────────────────────────────────────
28
  numpy<2.0
29
  opencv-python-headless
30
+ pillow
31
+
32
+ # ── Deep Learning: TensorFlow CPU (Keras face emotion model) ─────────────────
33
+ # tensorflow-cpu is ~500MB lighter than full tensorflow (no CUDA/ROCm)
34
  tensorflow-cpu
35
  pandas
 
36
  gdown
37
 
38
+ # ── Deep Learning: PyTorch CPU + HuggingFace Transformers ────────────────────
39
+ # torch is pre-installed by Dockerfile (CPU wheel from PyTorch index).
40
+ # The line below is kept so `pip install -r requirements.txt` works locally
41
+ # after you have manually installed the CPU torch wheel (see note above).
+ # NOTE: pip does not support a per-requirement --index-url option on a
+ # requirement line — supply the index URL on the pip command line instead.
+ torch>=2.0.0
43
  transformers>=4.40.0
44
  sentencepiece==0.1.99
45
  huggingface-hub>=0.23.0
46
 
47
+ # ── Utilities ─────────────────────────────────────────────────────────────────
48
  requests
49
+ python-multipart
start.sh ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# ─────────────────────────────────────────────────────────────────────────────
# start.sh — PsyPredict HF Spaces Startup Orchestrator
#
# Execution order:
#   1. Start Ollama server daemon in the background
#   2. Wait until Ollama API is healthy (up to 60 seconds)
#   3. Pull the Phi-3.5 quantized model (skips if already cached in this run)
#   4. Launch FastAPI / Uvicorn on port 7860
#
# Environment variables (set in Dockerfile or HF Space secrets):
#   OLLAMA_MODEL — model tag to pull (default: phi3.5:3.8b-mini-instruct-q4_0)
# ─────────────────────────────────────────────────────────────────────────────

set -euo pipefail  # strict mode: exit on error, unset var, or pipeline failure

# Print a message to stderr and abort the container start.
die() { printf '%s\n' "$*" >&2; exit 1; }

echo "═══════════════════════════════════════════════"
echo "🚀 PsyPredict — Hugging Face Spaces Startup"
echo "═══════════════════════════════════════════════"

# Fail with a clear message if the image was built without the Ollama binary.
command -v ollama >/dev/null 2>&1 || die "❌ 'ollama' not found on PATH. Exiting."

# ── Step 1: Start Ollama server in the background ─────────────────────────────
echo "▶ Starting Ollama server..."
ollama serve &
OLLAMA_PID=$!

# ── Step 2: Wait for Ollama to become healthy (max 60 seconds) ────────────────
echo "⏳ Waiting for Ollama to be ready..."
readonly RETRIES=30  # 30 attempts x 2 s sleep = 60 s budget
for (( i = 1; i <= RETRIES; i++ )); do
  if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
    echo "✅ Ollama is ready (attempt $i/$RETRIES)."
    break
  fi
  # Fail fast if the daemon itself died instead of burning the whole budget.
  kill -0 "$OLLAMA_PID" 2>/dev/null \
    || die "❌ Ollama server process exited unexpectedly. Exiting."
  if (( i == RETRIES )); then
    die "❌ Ollama failed to start within 60 seconds. Exiting."
  fi
  sleep 2
done

# ── Step 3: Pull the Phi-3.5 model ────────────────────────────────────────────
# 'ollama pull' is idempotent — safe to call even if the model is cached.
# On HF Spaces, the first pull will download ~2.4 GB; subsequent restarts
# are faster because the container's /root/.ollama layer is reused.
MODEL="${OLLAMA_MODEL:-phi3.5:3.8b-mini-instruct-q4_0}"
echo "▶ Pulling model: $MODEL"
echo "   (First run downloads ~2.4 GB — may take several minutes on CPU)"
ollama pull "$MODEL"
echo "✅ Model ready: $MODEL"

# ── Step 4: Launch FastAPI on port 7860 ───────────────────────────────────────
# exec replaces this shell so Uvicorn receives container signals (SIGTERM)
# directly, enabling graceful shutdown.
echo "▶ Starting FastAPI (Uvicorn) on port 7860..."
echo "   API docs → http://localhost:7860/docs"
echo "═══════════════════════════════════════════════"
exec uvicorn app.main:app \
  --host 0.0.0.0 \
  --port 7860 \
  --workers 1 \
  --log-level info