therandomuser03 commited on
Commit
f0f84fb
Β·
1 Parent(s): 874733d

Add application file

Browse files
.dockerignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Python ────────────────────────────────────────────────────────────────────
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *.egg-info/
7
+ .pytest_cache/
8
+
9
+ # ── Virtual Environments ─────────────────────────────────────────────────────
10
+ venv/
11
+ .venv/
12
+ env/
13
+
14
+ # ── Secrets (never bake into image) ──────────────────────────────────────────
15
+ .env
16
+ .env.*
17
+
18
+ # ── Git ───────────────────────────────────────────────────────────────────────
19
+ .git/
20
+ .gitignore
21
+ .gitattributes
22
+
23
+ # ── IDE / OS ──────────────────────────────────────────────────────────────────
24
+ .vscode/
25
+ .idea/
26
+ .DS_Store
27
+ Thumbs.db
28
+
29
+ # ── Notebooks / Dev tools ────────────────────────────────────────────────────
30
+ notebooks/
31
+ *.ipynb
32
+
33
+ # ── README (not needed in image) ─────────────────────────────────────────────
34
+ README.md
.env.example CHANGED
@@ -2,9 +2,9 @@
2
  # Copy this file to .env and fill in any overrides needed.
3
  # All values below are production defaults.
4
 
5
- # ── Ollama / LLaMA 3 (Local Inference) ──────────────────────────────────────
6
  OLLAMA_BASE_URL=http://localhost:11434
7
- OLLAMA_MODEL=llama3
8
  OLLAMA_TIMEOUT_S=90
9
  OLLAMA_RETRIES=3
10
  OLLAMA_RETRY_DELAY_S=2.0
 
2
  # Copy this file to .env and fill in any overrides needed.
3
  # All values below are production defaults.
4
 
5
+ # ── Ollama / Phi-3.5 Mini (Local Inference) ──────────────────────────────────
6
  OLLAMA_BASE_URL=http://localhost:11434
7
+ OLLAMA_MODEL=phi3.5:3.8b-mini-instruct-q4_0
8
  OLLAMA_TIMEOUT_S=90
9
  OLLAMA_RETRIES=3
10
  OLLAMA_RETRY_DELAY_S=2.0
.gitattributes CHANGED
@@ -4,3 +4,4 @@ app/ml_assets/emotion_model_trained.h5 filter=lfs diff=lfs merge=lfs -text
4
  app/ml_assets/emotion_model_trained.keras filter=lfs diff=lfs merge=lfs -text
5
  app/ml_assets/*.h5 filter=lfs diff=lfs merge=lfs -text
6
  app/ml_assets/*.keras filter=lfs diff=lfs merge=lfs -text
 
 
4
  app/ml_assets/emotion_model_trained.keras filter=lfs diff=lfs merge=lfs -text
5
  app/ml_assets/*.h5 filter=lfs diff=lfs merge=lfs -text
6
  app/ml_assets/*.keras filter=lfs diff=lfs merge=lfs -text
7
+ *.gguf filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,47 +1,76 @@
1
- # 1. Use Python 3.10 slim
 
 
 
 
 
 
 
 
 
 
 
2
  FROM python:3.10-slim
3
 
4
- # 2. Set working directory
5
  WORKDIR /app
6
 
7
- # 3. Install system dependencies
8
- # - libgl1/libglib2.0-0: OpenCV needs these
9
- # - build-essential/cmake: needed to compile llama-cpp-python from source
10
- RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends -o Acquire::Retries=3 \
11
  libgl1 \
12
  libglib2.0-0 \
13
- build-essential \
14
- cmake \
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
- # 4. Install PyTorch CPU-only FIRST (saves ~1.5GB vs full CUDA torch)
18
- # This is a separate layer so it caches well
 
 
 
19
  RUN pip install --no-cache-dir \
20
  torch --index-url https://download.pytorch.org/whl/cpu
21
 
22
- # 5. Install llama-cpp-python from pre-built CPU wheels (avoids 30+ min C++ compile)
23
- # PINNED to 0.3.2 β€” the latest version with pre-built wheels on the abetlen index.
24
- # Without the pin, pip resolves 0.3.16+ from PyPI which has NO pre-built wheel
25
- # and falls back to compiling llama.cpp from source (times out on HF Spaces).
26
- RUN pip install --no-cache-dir --prefer-binary \
27
- llama-cpp-python==0.3.2 \
28
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
29
-
30
- # 6. Install remaining Python dependencies
31
  COPY requirements.txt .
32
  RUN pip install --no-cache-dir -r requirements.txt
33
 
34
- # 7. Copy your code
35
  COPY . .
36
 
37
- # 8. Download all ML models (Face, Text, LLaMA 3 GGUF) during build
 
 
 
 
 
 
 
 
 
38
  RUN python download_models.py
39
 
40
- # 9. Environment & Port settings (7860 is HF Spaces standard)
41
  ENV PYTHONPATH=/app
42
- ENV USE_EMBEDDED_LLM=True
 
 
 
 
 
43
  ENV HF_HUB_OFFLINE=1
 
 
 
 
44
  EXPOSE 7860
45
 
46
- # 10. Run the app with Uvicorn
47
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────────────────────
2
+ # PsyPredict β€” Backend Dockerfile for Hugging Face Spaces (CPU / Docker SDK)
3
+ #
4
+ # Architecture:
5
+ # - Ollama binary installed inside the container (serves Phi-3.5 on port 11434)
6
+ # - FastAPI app served by Uvicorn on port 7860 (HF Spaces standard port)
7
+ # - start.sh orchestrates: Ollama β†’ model pull β†’ Uvicorn
8
+ # - ML assets (Keras face model + CSV) are downloaded at BUILD time via gdown
9
+ # - DistilBERT + Crisis classifier are downloaded at BUILD time from HF Hub
10
+ # - HF_HUB_OFFLINE=1 at runtime so the container starts offline-capable
11
+ # ─────────────────────────────────────────────────────────────────────────────
12
+
13
  FROM python:3.10-slim
14
 
 
15
  WORKDIR /app
16
 
17
+ # ── 1. System dependencies ────────────────────────────────────────────────────
18
+ # libgl1 + libglib2.0-0: OpenCV headless needs these
19
+ # curl + ca-certificates: needed to download Ollama install script
20
+ RUN apt-get update && apt-get install -y --no-install-recommends \
21
  libgl1 \
22
  libglib2.0-0 \
23
+ curl \
24
+ ca-certificates \
25
  && rm -rf /var/lib/apt/lists/*
26
 
27
+ # ── 2. Install Ollama binary ──────────────────────────────────────────────────
28
+ # Uses the official install script β€” places `ollama` binary in /usr/local/bin
29
+ RUN curl -fsSL https://ollama.com/install.sh | sh
30
+
31
+ # ── 3. PyTorch CPU-only (separate layer β€” ~800MB, caches very well) ───────────
32
  RUN pip install --no-cache-dir \
33
  torch --index-url https://download.pytorch.org/whl/cpu
34
 
35
+ # ── 4. Install remaining Python dependencies ──────────────────────────────────
36
+ # Note: torch is already installed above; pip will skip it when it hits
37
+ # the torch line in requirements.txt (version constraint already satisfied).
 
 
 
 
 
 
38
  COPY requirements.txt .
39
  RUN pip install --no-cache-dir -r requirements.txt
40
 
41
+ # ── 5. Copy application source code ──────────────────────────────────────────
42
  COPY . .
43
 
44
+ # ── 6. Download ML assets at BUILD time ──────────────────────────────────────
45
+ # Downloads:
46
+ # - app/ml_assets/emotion_model_trained.h5 (Keras CNN face model, ~4MB, Google Drive)
47
+ # - app/ml_assets/MEDICATION.csv (remedy database, Google Drive)
48
+ # - app/ml_assets/distilbert_model/ (DistilBERT emotion classifier, ~260MB, HF Hub)
49
+ # - app/ml_assets/crisis_model/ (MiniLM zero-shot classifier, ~130MB, HF Hub)
50
+ #
51
+ # Skips files that already exist in the build context (e.g. haarcascade XML).
52
+ # HF_HUB_OFFLINE must be 0 here so transformers can reach HuggingFace.
53
+ ENV HF_HUB_OFFLINE=0
54
  RUN python download_models.py
55
 
56
+ # ── 7. Runtime environment ────────────────────────────────────────────────────
57
  ENV PYTHONPATH=/app
58
+ # Ollama runs locally inside the container
59
+ ENV OLLAMA_BASE_URL=http://localhost:11434
60
+ ENV OLLAMA_MODEL=phi3.5:3.8b-mini-instruct-q4_0
61
+ ENV OLLAMA_TIMEOUT_S=300
62
+ ENV OLLAMA_RETRIES=2
63
+ # All HF models were baked in at build time β€” go offline for faster startup
64
  ENV HF_HUB_OFFLINE=1
65
+ ENV LOG_LEVEL=INFO
66
+ ENV RATE_LIMIT=30/minute
67
+
68
+ # ── 8. Expose HF Spaces standard port ────────────────────────────────────────
69
  EXPOSE 7860
70
 
71
+ # ── 9. Startup script ─────────────────────────────────────────────────────────
72
+ # start.sh: starts Ollama daemon β†’ pulls Phi-3.5 model β†’ launches Uvicorn
73
+ COPY start.sh /start.sh
74
+ RUN chmod +x /start.sh
75
+
76
+ CMD ["/start.sh"]
app/api/endpoints/therapist.py CHANGED
@@ -5,7 +5,7 @@ Full inference pipeline:
5
  2. Text emotion classification (DistilBERT)
6
  3. Crisis evaluation (zero-shot NLI) β€” override if triggered
7
  4. Multimodal fusion (text + face)
8
- 5. Ollama/LLaMA 3 structured report generation
9
  6. PsychReport JSON schema validation
10
  7. Streaming response option
11
  """
@@ -52,7 +52,7 @@ EMOTION_TO_CONDITION: dict[str, str] = {
52
  # POST /api/chat
53
  # ---------------------------------------------------------------------------
54
 
55
- @router.post("/chat", response_model=ChatResponse)
56
  async def chat(req: ChatRequest): # type: ignore[misc]
57
  """
58
  Main inference endpoint.
@@ -98,16 +98,13 @@ async def chat(req: ChatRequest): # type: ignore[misc]
98
 
99
  # ── Step 4: Streaming Response ───────────────────────────────────────────
100
  if req.stream:
101
- import asyncio as _asyncio
102
  async def stream_generator():
103
- accumulated = ""
104
  async for token in ollama_engine.generate_stream(
105
  user_text=user_text,
106
  face_emotion=face_emotion,
107
  history=history,
108
  text_emotion_summary=text_emotion_summary,
109
  ):
110
- accumulated += token
111
  yield token
112
 
113
  return StreamingResponse(stream_generator(), media_type="text/plain")
 
5
  2. Text emotion classification (DistilBERT)
6
  3. Crisis evaluation (zero-shot NLI) β€” override if triggered
7
  4. Multimodal fusion (text + face)
8
+ 5. Ollama/Phi-3.5 Mini structured report generation
9
  6. PsychReport JSON schema validation
10
  7. Streaming response option
11
  """
 
52
  # POST /api/chat
53
  # ---------------------------------------------------------------------------
54
 
55
+ @router.post("/chat")
56
  async def chat(req: ChatRequest): # type: ignore[misc]
57
  """
58
  Main inference endpoint.
 
98
 
99
  # ── Step 4: Streaming Response ───────────────────────────────────────────
100
  if req.stream:
 
101
  async def stream_generator():
 
102
  async for token in ollama_engine.generate_stream(
103
  user_text=user_text,
104
  face_emotion=face_emotion,
105
  history=history,
106
  text_emotion_summary=text_emotion_summary,
107
  ):
 
108
  yield token
109
 
110
  return StreamingResponse(stream_generator(), media_type="text/plain")
app/config.py CHANGED
@@ -7,15 +7,15 @@ from functools import lru_cache
7
 
8
 
9
  class Settings(BaseSettings):
10
- # Ollama / LLM
11
- OLLAMA_BASE_URL: str = "http://localhost:11434"
12
- OLLAMA_MODEL: str = "llama3"
13
- OLLAMA_TIMEOUT_S: int = 120
 
 
 
14
 
15
- # --- Embedded LLM Settings (for Docker/HF Spaces) ---
16
- USE_EMBEDDED_LLM: bool = False # Set to True in .env for Docker/HF Spaces
17
- GGUF_MODEL_PATH: str = "app/ml_assets/llama-3-8b-instruct.Q4_K_M.gguf"
18
- LLM_CONTEXT_SIZE: int = 2048
19
  OLLAMA_RETRIES: int = 3
20
  OLLAMA_RETRY_DELAY_S: float = 2.0
21
 
 
7
 
8
 
9
  class Settings(BaseSettings):
10
+ # Ollama / LLM (Centralized API)
11
+ # Update this to your DigitalOcean/VPS IP address where Ollama is running
12
+ # Default is localhost (e.g. for development), but in production it should be like:
13
+ # OLLAMA_BASE_URL: str = "http://123.45.67.89:11434"
14
+ OLLAMA_BASE_URL: str = "http://127.0.0.1:11434"
15
+ OLLAMA_MODEL: str = "phi3.5:3.8b-mini-instruct-q4_0"
16
+ OLLAMA_TIMEOUT_S: int = 90
17
 
18
+ # Retry logic for external LLM API
 
 
 
19
  OLLAMA_RETRIES: int = 3
20
  OLLAMA_RETRY_DELAY_S: float = 2.0
21
 
app/main.py CHANGED
@@ -63,15 +63,17 @@ async def lifespan(app: FastAPI):
63
  logger.info("═══════════════════════════════════════")
64
  logger.info("Config: Ollama=%s model=%s", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)
65
 
66
- # Pre-warm DistilBERT text emotion model
67
- logger.info("Pre-warming DistilBERT text emotion model...")
 
 
68
  from app.services.text_emotion_engine import initialize as init_text
69
- init_text(settings.DISTILBERT_MODEL)
70
 
71
- # Pre-warm Crisis zero-shot classifier
72
- logger.info("Pre-warming crisis detection classifier...")
73
  from app.services.crisis_engine import initialize_crisis_classifier
74
- initialize_crisis_classifier()
75
 
76
  # Check Ollama availability (non-blocking warn only)
77
  from app.services.ollama_engine import ollama_engine
@@ -106,7 +108,7 @@ def create_app() -> FastAPI:
106
  title="PsyPredict API",
107
  description=(
108
  "Production-grade multimodal mental health AI system. "
109
- "Powered by LLaMA 3 (Ollama) + DistilBERT + Keras CNN facial emotion model."
110
  ),
111
  version="2.0.0",
112
  lifespan=lifespan,
 
63
  logger.info("═══════════════════════════════════════")
64
  logger.info("Config: Ollama=%s model=%s", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)
65
 
66
+ import asyncio as _asyncio
67
+
68
+ # Pre-warm DistilBERT text emotion model (in background)
69
+ logger.info("Initializing DistilBERT text emotion model (background)...")
70
  from app.services.text_emotion_engine import initialize as init_text
71
+ _asyncio.create_task(_asyncio.to_thread(init_text, settings.DISTILBERT_MODEL))
72
 
73
+ # Pre-warm Crisis zero-shot classifier (in background)
74
+ logger.info("Initializing crisis detection classifier (background)...")
75
  from app.services.crisis_engine import initialize_crisis_classifier
76
+ _asyncio.create_task(_asyncio.to_thread(initialize_crisis_classifier))
77
 
78
  # Check Ollama availability (non-blocking warn only)
79
  from app.services.ollama_engine import ollama_engine
 
108
  title="PsyPredict API",
109
  description=(
110
  "Production-grade multimodal mental health AI system. "
111
+ "Powered by Phi-3.5 Mini (Ollama) + DistilBERT + Keras CNN facial emotion model."
112
  ),
113
  version="2.0.0",
114
  lifespan=lifespan,
app/ml_assets/MEDICATION.csv CHANGED
The diff for this file is too large to render. See raw diff
 
app/schemas.py CHANGED
@@ -108,6 +108,31 @@ def fallback_report() -> PsychReport:
108
  )
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # ---------------------------------------------------------------------------
112
  # Chat Endpoint
113
  # ---------------------------------------------------------------------------
@@ -162,31 +187,6 @@ class TextAnalysisResponse(BaseModel):
162
  crisis_triggered: bool
163
 
164
 
165
- # ---------------------------------------------------------------------------
166
- # Facial / Emotion Endpoint
167
- # ---------------------------------------------------------------------------
168
-
169
- class EmotionResponse(BaseModel):
170
- emotion: Optional[str] = None
171
- confidence: Optional[float] = None
172
- face_box: Optional[List[int]] = None
173
- message: Optional[str] = None
174
- error: Optional[str] = None
175
-
176
-
177
- # ---------------------------------------------------------------------------
178
- # Remedy Endpoint
179
- # ---------------------------------------------------------------------------
180
-
181
- class RemedyResponse(BaseModel):
182
- condition: str
183
- symptoms: str
184
- treatments: str
185
- medications: str
186
- dosage: str
187
- gita_remedy: str
188
-
189
-
190
  # ---------------------------------------------------------------------------
191
  # Health Endpoint
192
  # ---------------------------------------------------------------------------
 
108
  )
109
 
110
 
111
+ # ---------------------------------------------------------------------------
112
+ # Remedy Endpoint (must be defined BEFORE ChatResponse which references it)
113
+ # ---------------------------------------------------------------------------
114
+
115
+ class RemedyResponse(BaseModel):
116
+ condition: str
117
+ symptoms: str
118
+ treatments: str
119
+ medications: str
120
+ dosage: str
121
+ gita_remedy: str
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Facial / Emotion Endpoint
126
+ # ---------------------------------------------------------------------------
127
+
128
+ class EmotionResponse(BaseModel):
129
+ emotion: Optional[str] = None
130
+ confidence: Optional[float] = None
131
+ face_box: Optional[List[int]] = None
132
+ message: Optional[str] = None
133
+ error: Optional[str] = None
134
+
135
+
136
  # ---------------------------------------------------------------------------
137
  # Chat Endpoint
138
  # ---------------------------------------------------------------------------
 
187
  crisis_triggered: bool
188
 
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  # ---------------------------------------------------------------------------
191
  # Health Endpoint
192
  # ---------------------------------------------------------------------------
app/services/ollama_engine.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- ollama_engine.py β€” PsyPredict Local LLM Engine
3
  Async Ollama client with:
4
  - Structured JSON output enforced via schema-in-prompt + Ollama format param
5
  - Context window trimming
@@ -97,47 +97,40 @@ FACE_DISTRESS_MAP: dict[str, float] = {
97
 
98
  class OllamaEngine:
99
  """
100
- Production async LLM engine backed by local Ollama/LLaMA 3.
101
  """
102
 
103
  def __init__(self) -> None:
104
  self.settings = get_settings()
105
  self._client: Optional[httpx.AsyncClient] = None
106
- self._local_llm: Optional[any] = None # llama_cpp.Llama instance
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  @property
109
  def client(self) -> httpx.AsyncClient:
110
  if self._client is None or self._client.is_closed:
111
- self._client = httpx.AsyncClient(
112
- base_url=self.settings.OLLAMA_BASE_URL,
113
- timeout=httpx.Timeout(
114
- connect=10.0,
115
- read=self.settings.OLLAMA_TIMEOUT_S,
116
- write=30.0,
117
- pool=5.0,
118
- ),
119
- )
120
  return self._client
121
 
122
- def _get_local_llm(self):
123
- """Lazy load llama-cpp-python model."""
124
- if self._local_llm is None:
125
  try:
126
- from llama_cpp import Llama
127
- logger.info("Loading local GGUF model from %s", self.settings.GGUF_MODEL_PATH)
128
- self._local_llm = Llama(
129
- model_path=self.settings.GGUF_MODEL_PATH,
130
- n_ctx=self.settings.LLM_CONTEXT_SIZE,
131
- n_threads=os.cpu_count() or 4,
132
- verbose=False
133
- )
134
- except ImportError:
135
- logger.error("llama-cpp-python not installed. Cannot use embedded LLM.")
136
- raise RuntimeError("llama-cpp-python not installed")
137
- except Exception as exc:
138
- logger.error("Failed to load local GGUF model: %s", exc)
139
- raise
140
- return self._local_llm
141
 
142
  async def close(self) -> None:
143
  if self._client and not self._client.is_closed:
@@ -254,61 +247,28 @@ class OllamaEngine:
254
  text_emotion_summary: Optional[str] = None,
255
  ) -> tuple[str, PsychReport]:
256
  """
257
- Calls either Ollama API or Embedded LLM based on settings,
258
- with automatic fallback to local if Ollama is unreachable.
259
  """
260
- # If user explicitly wants embedded mode
261
- if self.settings.USE_EMBEDDED_LLM:
262
- return await self._generate_local(user_text, face_emotion, history, text_emotion_summary)
263
-
264
- # Otherwise try Ollama, fallback to local if it fails and GGUF is available
 
 
 
 
 
 
265
  try:
266
- reply, report = await self._generate_ollama(user_text, face_emotion, history, text_emotion_summary)
267
- # If _generate_ollama returned the hardcoded fallback string, it failed its retries
268
- if "inference service is temporarily unavailable" in reply:
269
- # Check for GGUF before giving up
270
- if os.path.exists(self.settings.GGUF_MODEL_PATH):
271
- logger.info("Ollama service unreachable after retries, falling back to GGUF.")
272
- return await self._generate_local(user_text, face_emotion, history, text_emotion_summary)
273
- return reply, report
274
  except Exception as exc:
275
- if os.path.exists(self.settings.GGUF_MODEL_PATH):
276
- logger.info("Ollama failed, falling back to embedded GGUF model: %s", exc)
277
- return await self._generate_local(user_text, face_emotion, history, text_emotion_summary)
278
- else:
279
- logger.error("Ollama failed and no GGUF model found for fallback at %s", self.settings.GGUF_MODEL_PATH)
280
- return (
281
- "The inference service is temporarily unavailable and no local fallback is configured.",
282
- fallback_report(),
283
- )
284
-
285
- async def _generate_local(
286
- self,
287
- user_text: str,
288
- face_emotion: str,
289
- history: Optional[List[ConversationMessage]],
290
- text_emotion_summary: Optional[str]
291
- ) -> tuple[str, PsychReport]:
292
- """Embedded generation via llama-cpp-python."""
293
- if history is None: history = []
294
- prompt = self._build_prompt(user_text, face_emotion, history, text_emotion_summary)
295
-
296
- try:
297
- llm = self._get_local_llm()
298
- # Run blocking LLM call in a separate thread
299
- response = await asyncio.to_thread(
300
- llm,
301
- prompt=prompt,
302
- max_tokens=600,
303
- temperature=0.2,
304
- top_p=0.9,
305
- stop=["USER:", "CURRENT USER INPUT:"]
306
  )
307
- raw_text = response["choices"][0]["text"]
308
- return self._parse_response(raw_text)
309
- except Exception as exc:
310
- logger.error("Embedded local LLM failed: %s", exc)
311
- return "The local inference service encountered an error.", fallback_report()
312
 
313
  async def _generate_ollama(
314
  self,
@@ -327,9 +287,9 @@ class OllamaEngine:
327
  "prompt": prompt,
328
  "stream": False,
329
  "options": {
330
- "temperature": 0.2, # Low temp for determinism
331
  "top_p": 0.9,
332
- "num_ctx": 4096,
333
  "stop": [],
334
  },
335
  }
@@ -355,6 +315,7 @@ class OllamaEngine:
355
  except httpx.TimeoutException as exc:
356
  last_error = exc
357
  logger.warning("Ollama timeout on attempt %d: %s", attempt, exc)
 
358
  except httpx.HTTPStatusError as exc:
359
  last_error = exc
360
  logger.error("Ollama HTTP error %s: %s", exc.response.status_code, exc)
@@ -362,6 +323,7 @@ class OllamaEngine:
362
  except Exception as exc:
363
  last_error = exc
364
  logger.error("Ollama unexpected error: %s", exc)
 
365
 
366
  if attempt < self.settings.OLLAMA_RETRIES:
367
  await asyncio.sleep(delay)
@@ -388,45 +350,27 @@ class OllamaEngine:
388
  text_emotion_summary: Optional[str] = None,
389
  ) -> AsyncIterator[str]:
390
  """
391
- Yields raw text chunks as they arrive from either Ollama or Embedded LLM.
 
392
  """
393
- if self.settings.USE_EMBEDDED_LLM:
394
- async for chunk in self._generate_stream_local(user_text, face_emotion, history, text_emotion_summary):
395
- yield chunk
396
- else:
397
- async for chunk in self._generate_stream_ollama(user_text, face_emotion, history, text_emotion_summary):
398
- yield chunk
399
-
400
- async def _generate_stream_local(
401
- self,
402
- user_text: str,
403
- face_emotion: str,
404
- history: Optional[List[ConversationMessage]],
405
- text_emotion_summary: Optional[str]
406
- ) -> AsyncIterator[str]:
407
- """Embedded streaming via llama-cpp-python."""
408
- if history is None: history = []
409
- prompt = self._build_prompt(user_text, face_emotion, history, text_emotion_summary)
410
-
411
- try:
412
- llm = self._get_local_llm()
413
- # llama-cpp-python streaming is synchronous, so we need to wrap it
414
- stream = llm(
415
- prompt=prompt,
416
- max_tokens=600,
417
- temperature=0.2,
418
- top_p=0.9,
419
- stream=True,
420
- stop=["USER:", "CURRENT USER INPUT:"]
421
  )
422
- for chunk in stream:
423
- token = chunk["choices"][0]["text"]
424
- if token:
425
- yield token
426
- await asyncio.sleep(0) # Yield control
427
- except Exception as exc:
428
- logger.error("Embedded streaming failed: %s", exc)
429
- yield "\n[Local inference error]"
 
 
 
 
430
 
431
  async def _generate_stream_ollama(
432
  self,
@@ -437,7 +381,7 @@ class OllamaEngine:
437
  ) -> AsyncIterator[str]:
438
  """
439
  Yields raw text chunks as they arrive from Ollama.
440
- With automatic fallback to local streaming if Ollama is unreachable.
441
  """
442
  if history is None:
443
  history = []
@@ -448,11 +392,18 @@ class OllamaEngine:
448
  "model": self.settings.OLLAMA_MODEL,
449
  "prompt": prompt,
450
  "stream": True,
451
- "options": {"temperature": 0.2, "top_p": 0.9, "num_ctx": 4096},
 
 
 
 
452
  }
453
 
 
 
 
454
  try:
455
- async with self.client.stream("POST", "/api/generate", json=payload) as resp:
456
  resp.raise_for_status()
457
  async for line in resp.aiter_lines():
458
  if not line.strip():
@@ -468,12 +419,9 @@ class OllamaEngine:
468
  continue
469
  except Exception as exc:
470
  logger.error("Ollama streaming failed: %s", exc)
471
- if os.path.exists(self.settings.GGUF_MODEL_PATH):
472
- logger.info("Falling back to local GGUF streaming.")
473
- async for chunk in self._generate_stream_local(user_text, face_emotion, history, text_emotion_summary):
474
- yield chunk
475
- else:
476
- yield "\n[Inference service error β€” please retry]\n"
477
 
478
 
479
  # ---------------------------------------------------------------------------
 
1
  """
2
+ ollama_engine.py β€” PsyPredict Local LLM Engine (Phi-3.5 Mini)
3
  Async Ollama client with:
4
  - Structured JSON output enforced via schema-in-prompt + Ollama format param
5
  - Context window trimming
 
97
 
98
  class OllamaEngine:
99
  """
100
+ Production async LLM engine backed by local Ollama/Phi-3.5 Mini.
101
  """
102
 
103
  def __init__(self) -> None:
104
  self.settings = get_settings()
105
  self._client: Optional[httpx.AsyncClient] = None
106
+
107
+ def _make_client(self, stream: bool = False) -> httpx.AsyncClient:
108
+ """Create a fresh httpx client. For streaming, read timeout is None (unbounded)."""
109
+ read_timeout = None if stream else float(self.settings.OLLAMA_TIMEOUT_S)
110
+ return httpx.AsyncClient(
111
+ base_url=self.settings.OLLAMA_BASE_URL,
112
+ timeout=httpx.Timeout(
113
+ connect=10.0,
114
+ read=read_timeout,
115
+ write=30.0,
116
+ pool=5.0,
117
+ ),
118
+ )
119
 
120
  @property
121
  def client(self) -> httpx.AsyncClient:
122
  if self._client is None or self._client.is_closed:
123
+ self._client = self._make_client(stream=False)
 
 
 
 
 
 
 
 
124
  return self._client
125
 
126
+ async def _reset_client(self) -> None:
127
+ """Close and discard the current client so the next call gets a fresh one."""
128
+ if self._client and not self._client.is_closed:
129
  try:
130
+ await self._client.aclose()
131
+ except Exception:
132
+ pass
133
+ self._client = None
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  async def close(self) -> None:
136
  if self._client and not self._client.is_closed:
 
247
  text_emotion_summary: Optional[str] = None,
248
  ) -> tuple[str, PsychReport]:
249
  """
250
+ Calls external Ollama API with early reachability check.
 
251
  """
252
+ # Fast-fail: check reachability before waiting for full timeout
253
+ if not await self.is_reachable():
254
+ logger.warning(
255
+ "Ollama unreachable at %s β€” skipping inference, returning fallback.",
256
+ self.settings.OLLAMA_BASE_URL,
257
+ )
258
+ return (
259
+ "The inference service is currently offline. Please ensure Ollama is running "
260
+ f"at {self.settings.OLLAMA_BASE_URL} with model '{self.settings.OLLAMA_MODEL}'.",
261
+ fallback_report(),
262
+ )
263
  try:
264
+ return await self._generate_ollama(user_text, face_emotion, history, text_emotion_summary)
 
 
 
 
 
 
 
265
  except Exception as exc:
266
+ logger.error("Ollama API call failed entirely: %s", exc)
267
+ await self._reset_client()
268
+ return (
269
+ "The inference service is temporarily unavailable. Please verify your external Ollama server is running.",
270
+ fallback_report(),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  )
 
 
 
 
 
272
 
273
  async def _generate_ollama(
274
  self,
 
287
  "prompt": prompt,
288
  "stream": False,
289
  "options": {
290
+ "temperature": 0.2,
291
  "top_p": 0.9,
292
+ "num_ctx": 8192, # Match model's full context window
293
  "stop": [],
294
  },
295
  }
 
315
  except httpx.TimeoutException as exc:
316
  last_error = exc
317
  logger.warning("Ollama timeout on attempt %d: %s", attempt, exc)
318
+ await self._reset_client() # Reset client after timeout
319
  except httpx.HTTPStatusError as exc:
320
  last_error = exc
321
  logger.error("Ollama HTTP error %s: %s", exc.response.status_code, exc)
 
323
  except Exception as exc:
324
  last_error = exc
325
  logger.error("Ollama unexpected error: %s", exc)
326
+ await self._reset_client()
327
 
328
  if attempt < self.settings.OLLAMA_RETRIES:
329
  await asyncio.sleep(delay)
 
350
  text_emotion_summary: Optional[str] = None,
351
  ) -> AsyncIterator[str]:
352
  """
353
+ Yields raw text chunks as they arrive from External Ollama.
354
+ Fast-fails with a clear message if Ollama is unreachable.
355
  """
356
+ # Early reachability check β€” prevents indefinite hang on dead server
357
+ if not await self.is_reachable():
358
+ logger.warning(
359
+ "Ollama unreachable at %s β€” aborting stream, returning fallback.",
360
+ self.settings.OLLAMA_BASE_URL,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  )
362
+ fallback_msg = (
363
+ f"The inference service is currently offline. "
364
+ f"Please ensure Ollama is running at {self.settings.OLLAMA_BASE_URL} "
365
+ f"with model '{self.settings.OLLAMA_MODEL}'.\n"
366
+ f"---JSON---\n"
367
+ + __import__('json').dumps(fallback_report().model_dump())
368
+ )
369
+ yield fallback_msg
370
+ return
371
+
372
+ async for chunk in self._generate_stream_ollama(user_text, face_emotion, history, text_emotion_summary):
373
+ yield chunk
374
 
375
  async def _generate_stream_ollama(
376
  self,
 
381
  ) -> AsyncIterator[str]:
382
  """
383
  Yields raw text chunks as they arrive from Ollama.
384
+ Uses an unbounded read timeout so slow CPU inference never times out mid-stream.
385
  """
386
  if history is None:
387
  history = []
 
392
  "model": self.settings.OLLAMA_MODEL,
393
  "prompt": prompt,
394
  "stream": True,
395
+ "options": {
396
+ "temperature": 0.2,
397
+ "top_p": 0.9,
398
+ "num_ctx": 8192, # Match model's full context window
399
+ },
400
  }
401
 
402
+ # Use a dedicated streaming client with no read timeout
403
+ # (tokens trickle in slowly on CPU β€” we must not cut the connection)
404
+ stream_client = self._make_client(stream=True)
405
  try:
406
+ async with stream_client.stream("POST", "/api/generate", json=payload) as resp:
407
  resp.raise_for_status()
408
  async for line in resp.aiter_lines():
409
  if not line.strip():
 
419
  continue
420
  except Exception as exc:
421
  logger.error("Ollama streaming failed: %s", exc)
422
+ yield "\n[Inference error β€” Ollama took too long or disconnected. Try again.]\n"
423
+ finally:
424
+ await stream_client.aclose()
 
 
 
425
 
426
 
427
  # ---------------------------------------------------------------------------
download_models.py CHANGED
@@ -6,15 +6,10 @@ from huggingface_hub import hf_hub_download
6
  MODEL_ID = "10GWSogJNKlPlTeWtJkDq_zc4roB1Vmnu" # Keras Face Emotion
7
  CSV_ID = "1bJ8C1BY0rvPNKuWcBgqiUtiSzHziZokH" # Medication CSV
8
 
9
- # Llama-3-8B-Instruct GGUF (Quantized for CPU/RAM efficiency)
10
- LLAMA_REPO = "MaziyarPanahi/Llama-3-8B-Instruct-v0.1-GGUF"
11
- LLAMA_FILE = "Llama-3-8B-Instruct-v0.1.Q4_K_M.gguf"
12
-
13
  # Destinations
14
  ML_ASSETS = "app/ml_assets"
15
  FACE_MODEL_PATH = os.path.join(ML_ASSETS, "emotion_model_trained.h5")
16
  MEDS_CSV_PATH = os.path.join(ML_ASSETS, "MEDICATION.csv")
17
- LLAMA_GGUF_PATH = os.path.join(ML_ASSETS, "llama-3-8b-instruct.Q4_K_M.gguf")
18
 
19
  # HF Transformers (Downloaded via snapshot_download for full directory)
20
  CRISIS_MODEL_REPO = "cross-encoder/nli-MiniLM2-L6-H768"
@@ -69,13 +64,7 @@ if __name__ == "__main__":
69
  download_drive_file(MODEL_ID, FACE_MODEL_PATH)
70
  download_drive_file(CSV_ID, MEDS_CSV_PATH)
71
 
72
- # 2. HF Models (Llama 3)
73
- try:
74
- download_hf_model(LLAMA_REPO, LLAMA_FILE, LLAMA_GGUF_PATH)
75
- except Exception as e:
76
- print(f"⚠️ HF LLaMA Download failed (expected on local dev if no internet): {e}")
77
-
78
- # 3. HF Transformers Pipeline Models
79
  try:
80
  download_hf_directory(CRISIS_MODEL_REPO, CRISIS_MODEL_PATH)
81
  download_hf_directory(DISTILBERT_MODEL_REPO, DISTILBERT_MODEL_PATH)
 
6
  MODEL_ID = "10GWSogJNKlPlTeWtJkDq_zc4roB1Vmnu" # Keras Face Emotion
7
  CSV_ID = "1bJ8C1BY0rvPNKuWcBgqiUtiSzHziZokH" # Medication CSV
8
 
 
 
 
 
9
  # Destinations
10
  ML_ASSETS = "app/ml_assets"
11
  FACE_MODEL_PATH = os.path.join(ML_ASSETS, "emotion_model_trained.h5")
12
  MEDS_CSV_PATH = os.path.join(ML_ASSETS, "MEDICATION.csv")
 
13
 
14
  # HF Transformers (Downloaded via snapshot_download for full directory)
15
  CRISIS_MODEL_REPO = "cross-encoder/nli-MiniLM2-L6-H768"
 
64
  download_drive_file(MODEL_ID, FACE_MODEL_PATH)
65
  download_drive_file(CSV_ID, MEDS_CSV_PATH)
66
 
67
+ # 2. HF Transformers Pipeline Models
 
 
 
 
 
 
68
  try:
69
  download_hf_directory(CRISIS_MODEL_REPO, CRISIS_MODEL_PATH)
70
  download_hf_directory(DISTILBERT_MODEL_REPO, DISTILBERT_MODEL_PATH)
main.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ main.py β€” PsyPredict FastAPI Application (Production)
3
+ Replaces Flask. Key features:
4
+ - Async request handling (FastAPI + Uvicorn)
5
+ - CORS middleware
6
+ - Rate limiting (SlowAPI)
7
+ - Structured logging (Python logging)
8
+ - Startup model pre-warming
9
+ - Graceful shutdown (Ollama client cleanup)
10
+ - FastAPI auto docs at /docs (Swagger) and /redoc
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ import logging
16
+ import sys
17
+ from contextlib import asynccontextmanager
18
+
19
+ from fastapi import FastAPI, Request
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from fastapi.responses import JSONResponse
22
+ from slowapi import Limiter, _rate_limit_exceeded_handler
23
+ from slowapi.errors import RateLimitExceeded
24
+ from slowapi.util import get_remote_address
25
+
26
+ from app.config import get_settings
27
+ from app.api.endpoints.facial import router as facial_router
28
+ from app.api.endpoints.remedies import router as remedies_router
29
+ from app.api.endpoints.therapist import router as therapist_router
30
+ from app.api.endpoints.analysis import router as analysis_router
31
+
32
# ---------------------------------------------------------------------------
# Windows asyncio fix — prevents noisy "ConnectionResetError: [WinError 10054]"
# when a streaming client disconnects before the response finishes.
# SelectorEventLoop handles abrupt pipe closures gracefully unlike the default
# ProactorEventLoop on Windows.
# ---------------------------------------------------------------------------
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Application settings loaded from app.config (read once at import time).
settings = get_settings()

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# NOTE: getattr() silently falls back to INFO when LOG_LEVEL does not name a
# valid logging constant (e.g. a typo in the .env file).
logging.basicConfig(
    level=getattr(logging, settings.LOG_LEVEL, logging.INFO),
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],  # stdout: container-friendly logs
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Rate Limiter — keyed by client IP; default limit string comes from settings.
# ---------------------------------------------------------------------------
limiter = Limiter(key_func=get_remote_address, default_limits=[settings.RATE_LIMIT])
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Lifespan (startup / shutdown events)
63
+ # ---------------------------------------------------------------------------
64
+
65
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler.

    Startup: pre-warm ML models (DistilBERT text-emotion + crisis classifier)
    in background threads and probe Ollama reachability (warn-only).
    Shutdown: close the shared Ollama async client.
    """
    logger.info("═══════════════════════════════════════")
    logger.info("🚀 PsyPredict v2.0 — Production Backend")
    logger.info("═══════════════════════════════════════")
    logger.info("Config: Ollama=%s model=%s", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)

    # asyncio keeps only a *weak* reference to tasks made with create_task();
    # an otherwise-unreferenced warm-up task may be garbage-collected before it
    # finishes. Hold strong references on app.state for the app's lifetime.
    warmup_tasks: list[asyncio.Task] = []

    # Pre-warm DistilBERT text emotion model (in a background thread).
    logger.info("Initializing DistilBERT text emotion model (background)...")
    from app.services.text_emotion_engine import initialize as init_text
    warmup_tasks.append(
        asyncio.create_task(asyncio.to_thread(init_text, settings.DISTILBERT_MODEL))
    )

    # Pre-warm Crisis zero-shot classifier (in a background thread).
    logger.info("Initializing crisis detection classifier (background)...")
    from app.services.crisis_engine import initialize_crisis_classifier
    warmup_tasks.append(
        asyncio.create_task(asyncio.to_thread(initialize_crisis_classifier))
    )
    app.state.warmup_tasks = warmup_tasks

    # Check Ollama availability (non-blocking warn only — chat endpoints fall
    # back to canned responses when Ollama is down).
    from app.services.ollama_engine import ollama_engine
    reachable = await ollama_engine.is_reachable()
    if reachable:
        logger.info("✅ Ollama reachable at %s (model: %s)", settings.OLLAMA_BASE_URL, settings.OLLAMA_MODEL)
    else:
        logger.warning(
            "⚠️ Ollama NOT reachable at %s — chat will return fallback responses. "
            "Run: ollama serve && ollama pull %s",
            settings.OLLAMA_BASE_URL,
            settings.OLLAMA_MODEL,
        )

    logger.info("✅ Startup complete. Listening on port 7860.")
    logger.info("   Docs: http://localhost:7860/docs")
    logger.info("═══════════════════════════════════════")

    yield  # ── Application Running ──

    logger.info("Shutting down PsyPredict backend...")
    await ollama_engine.close()
    logger.info("Goodbye.")
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # FastAPI App
114
+ # ---------------------------------------------------------------------------
115
+
116
def create_app() -> FastAPI:
    """
    Build and configure the FastAPI application.

    Wires the rate limiter, CORS middleware, a global exception handler, and
    all API routers under the /api prefix.

    Returns:
        The fully configured FastAPI instance (served by Uvicorn).
    """
    app = FastAPI(
        title="PsyPredict API",
        description=(
            "Production-grade multimodal mental health AI system. "
            "Powered by Phi-3.5 Mini (Ollama) + DistilBERT + Keras CNN facial emotion model."
        ),
        version="2.0.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
    )

    # ── Rate Limiter ─────────────────────────────────────────────────────────
    app.state.limiter = limiter
    app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

    # ── CORS ────────────────────────────────────────────────────────────────
    # NOTE(review): wildcard origins combined with allow_credentials=True is
    # disallowed by the CORS spec — browsers reject credentialed responses
    # carrying "Access-Control-Allow-Origin: *". Tighten allow_origins to the
    # real frontend origin(s) in production.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],  # Tighten to specific origin in production
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # ── Global Exception Handler ─────────────────────────────────────────────
    @app.exception_handler(Exception)
    async def global_exception_handler(request: Request, exc: Exception):
        # exc_info=exc records the full traceback; a bare logger.error() would
        # log only the message, making production 500s impossible to diagnose.
        logger.error(
            "Unhandled exception: %s | path=%s", exc, request.url.path, exc_info=exc
        )
        return JSONResponse(
            status_code=500,
            content={"detail": "Internal server error. Please try again."},
        )

    # ── Routers ──────────────────────────────────────────────────────────────
    app.include_router(facial_router, prefix="/api", tags=["Facial Emotion"])
    app.include_router(remedies_router, prefix="/api", tags=["Remedies"])
    app.include_router(therapist_router, prefix="/api", tags=["AI Therapist"])
    app.include_router(analysis_router, prefix="/api", tags=["Text Analysis & Health"])

    return app
158
+
159
+
160
# Module-level ASGI application — the object `uvicorn app.main:app` imports.
app = create_app()

# ---------------------------------------------------------------------------
# Entry point (direct execution; production uses start.sh / the uvicorn CLI)
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "app.main:app",   # import string form (required for reload/workers > 1)
        host="0.0.0.0",   # bind all interfaces — containerized deployment
        port=7860,        # Hugging Face Spaces default port
        reload=False,
        log_level=settings.LOG_LEVEL.lower(),
        workers=1,  # Keep at 1: models are singletons loaded in memory
    )
requirements.txt CHANGED
@@ -1,31 +1,49 @@
1
- # --- Core Backend (FastAPI) ---
 
 
 
 
 
 
 
 
 
 
 
 
2
  fastapi>=0.111.0
3
  uvicorn[standard]>=0.30.0
4
  python-dotenv>=1.0.0
5
  pydantic>=2.0.0
6
  pydantic-settings>=2.0.0
7
 
8
- # --- HTTP + Async ---
9
  httpx>=0.27.0
10
  anyio>=4.0.0
11
 
12
- # --- Rate Limiting ---
13
  slowapi>=0.1.9
14
 
15
- # --- AI & Vision (CPU-only TensorFlow β€” saves ~500MB vs full tensorflow) ---
16
  numpy<2.0
17
  opencv-python-headless
 
 
 
 
18
  tensorflow-cpu
19
  pandas
20
- pillow
21
  gdown
22
 
23
- # --- NLP (PyTorch CPU-only via --index-url, see Dockerfile) ---
24
- # llama-cpp-python is installed separately in Dockerfile from pre-built CPU wheels
 
 
 
25
  transformers>=4.40.0
26
  sentencepiece==0.1.99
27
  huggingface-hub>=0.23.0
28
 
29
- # --- Utilities ---
30
  requests
31
- python-multipart
 
1
+ # ─────────────────────────────────────────────────────────────────────────────
2
+ # PsyPredict Backend β€” Python Dependencies
3
+ #
4
+ # HOW TORCH IS HANDLED:
5
+ # Docker: torch is pre-installed in a separate layer BEFORE this file runs:
6
+ # RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
7
+ # pip will then skip the torch line below (version already satisfied).
8
+ # Local: Run manually first:
9
+ # pip install torch --index-url https://download.pytorch.org/whl/cpu
10
+ # Then: pip install -r requirements.txt
11
+ # ─────────────────────────────────────────────────────────────────────────────
12
+
13
+ # ── Core Backend (FastAPI) ────────────────────────────────────────────────────
14
  fastapi>=0.111.0
15
  uvicorn[standard]>=0.30.0
16
  python-dotenv>=1.0.0
17
  pydantic>=2.0.0
18
  pydantic-settings>=2.0.0
19
 
20
+ # ── HTTP + Async ──────────────────────────────────────────────────────────────
21
  httpx>=0.27.0
22
  anyio>=4.0.0
23
 
24
+ # ── Rate Limiting ─────────────────────────────────────────────────────────────
25
  slowapi>=0.1.9
26
 
27
+ # ── Computer Vision (CPU-only, no CUDA) ──────────────────────────────────────
28
  numpy<2.0
29
  opencv-python-headless
30
+ pillow
31
+
32
+ # ── Deep Learning: TensorFlow CPU (Keras face emotion model) ─────────────────
33
+ # tensorflow-cpu is ~500MB lighter than full tensorflow (no CUDA/ROCm)
34
  tensorflow-cpu
35
  pandas
 
36
  gdown
37
 
38
+ # ── Deep Learning: PyTorch CPU + HuggingFace Transformers ────────────────────
39
+ # torch is pre-installed by Dockerfile (CPU wheel from PyTorch index).
40
+ # The line below is kept so `pip install -r requirements.txt` works locally
41
+ # after you have manually installed the CPU torch wheel (see note above).
+ # NOTE: pip does not support a per-requirement --index-url option on a
+ # requirement line — supply the index URL on the pip command line instead.
+ torch>=2.0.0
43
  transformers>=4.40.0
44
  sentencepiece==0.1.99
45
  huggingface-hub>=0.23.0
46
 
47
+ # ── Utilities ─────────────────────────────────────────────────────────────────
48
  requests
49
+ python-multipart
start.sh ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# ─────────────────────────────────────────────────────────────────────────────
# start.sh — PsyPredict HF Spaces Startup Orchestrator
#
# Execution order:
#   1. Start Ollama server daemon in the background
#   2. Wait until Ollama API is healthy (up to 60 seconds)
#   3. Pull the Phi-3.5 quantized model (skips if already cached in this run)
#   4. Launch FastAPI / Uvicorn on port 7860
#
# Environment variables (set in Dockerfile or HF Space secrets):
#   OLLAMA_MODEL — model tag to pull (default: phi3.5:3.8b-mini-instruct-q4_0)
# ─────────────────────────────────────────────────────────────────────────────

set -euo pipefail  # strict mode: exit on error, unset var, or pipeline failure

# Print a message to stderr and abort the container start.
die() { printf '%s\n' "$*" >&2; exit 1; }

echo "═══════════════════════════════════════════════"
echo "🚀 PsyPredict — Hugging Face Spaces Startup"
echo "═══════════════════════════════════════════════"

# Fail with a clear message if the image was built without the Ollama binary.
command -v ollama >/dev/null 2>&1 || die "❌ 'ollama' not found on PATH. Exiting."

# ── Step 1: Start Ollama server in the background ─────────────────────────────
echo "▶ Starting Ollama server..."
ollama serve &
OLLAMA_PID=$!

# ── Step 2: Wait for Ollama to become healthy (max 60 seconds) ────────────────
echo "⏳ Waiting for Ollama to be ready..."
readonly RETRIES=30  # 30 attempts x 2 s sleep = 60 s budget
for (( i = 1; i <= RETRIES; i++ )); do
  if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
    echo "✅ Ollama is ready (attempt $i/$RETRIES)."
    break
  fi
  # Fail fast if the daemon itself died instead of burning the whole budget.
  kill -0 "$OLLAMA_PID" 2>/dev/null \
    || die "❌ Ollama server process exited unexpectedly. Exiting."
  if (( i == RETRIES )); then
    die "❌ Ollama failed to start within 60 seconds. Exiting."
  fi
  sleep 2
done

# ── Step 3: Pull the Phi-3.5 model ────────────────────────────────────────────
# 'ollama pull' is idempotent — safe to call even if the model is cached.
# On HF Spaces, the first pull will download ~2.4 GB; subsequent restarts
# are faster because the container's /root/.ollama layer is reused.
MODEL="${OLLAMA_MODEL:-phi3.5:3.8b-mini-instruct-q4_0}"
echo "▶ Pulling model: $MODEL"
echo "   (First run downloads ~2.4 GB — may take several minutes on CPU)"
ollama pull "$MODEL"
echo "✅ Model ready: $MODEL"

# ── Step 4: Launch FastAPI on port 7860 ───────────────────────────────────────
# exec replaces this shell so Uvicorn receives container signals (SIGTERM)
# directly, enabling graceful shutdown.
echo "▶ Starting FastAPI (Uvicorn) on port 7860..."
echo "   API docs → http://localhost:7860/docs"
echo "═══════════════════════════════════════════════"
exec uvicorn app.main:app \
  --host 0.0.0.0 \
  --port 7860 \
  --workers 1 \
  --log-level info