pierreramez committed on
Commit
d840997
·
verified ·
1 Parent(s): 65db64e
Files changed (1) hide show
  1. app.py +73 -91
app.py CHANGED
@@ -1,16 +1,5 @@
1
  """
2
  Enhanced FastAPI Backend with Feedback Management
3
- --------------------------------------------------
4
- New endpoints for production continuous learning workflow:
5
- - GET /download-feedback: Download feedback for training
6
- - POST /clear-feedback: Clear feedback after training
7
- - GET /correction-count: Monitor training readiness
8
- - POST /reload-adapter: Hot reload new model without restart
9
-
10
- Deploy to HuggingFace Spaces (FREE):
11
- 1. Create new Space: "YourUsername/chatbot-api"
12
- 2. Select: SDK = "Docker"
13
- 3. Upload: app.py, requirements.txt, Dockerfile, README.md
14
  """
15
 
16
  from fastapi import FastAPI, HTTPException
@@ -23,6 +12,7 @@ from pathlib import Path
23
  import torch
24
  from transformers import AutoTokenizer, AutoModelForCausalLM
25
  from peft import PeftModel
 
26
 
27
  app = FastAPI(
28
  title="Personalized Chatbot API",
@@ -114,39 +104,58 @@ class ModelManager:
114
  if adapter_path:
115
  print(f"With adapter: {adapter_path}")
116
 
 
117
  self._device = "cuda" if torch.cuda.is_available() else "cpu"
118
  print(f"Using device: {self._device}")
119
 
120
- self._tokenizer = AutoTokenizer.from_pretrained(
121
- model_name,
122
- trust_remote_code=True
123
- )
 
 
 
 
 
124
 
125
  if self._tokenizer.pad_token is None:
126
  self._tokenizer.pad_token = self._tokenizer.eos_token
127
 
128
- if use_4bit and torch.cuda.is_available():
129
- from transformers import BitsAndBytesConfig
130
-
131
- bnb_config = BitsAndBytesConfig(
132
- load_in_4bit=True,
133
- bnb_4bit_quant_type="nf4",
134
- bnb_4bit_compute_dtype=torch.float16,
135
- bnb_4bit_use_double_quant=True,
136
- )
137
-
138
- base_model = AutoModelForCausalLM.from_pretrained(
139
- model_name,
140
- quantization_config=bnb_config,
141
- device_map="auto",
142
- trust_remote_code=True,
143
- torch_dtype=torch.float16,
144
- )
 
 
 
 
 
 
 
 
 
 
145
  else:
 
146
  base_model = AutoModelForCausalLM.from_pretrained(
147
  model_name,
148
- device_map="auto",
149
  trust_remote_code=True,
 
 
150
  )
151
 
152
  if adapter_path and (isinstance(adapter_path, str) and adapter_path.strip()):
@@ -155,7 +164,7 @@ class ModelManager:
155
  self._model = PeftModel.from_pretrained(
156
  base_model,
157
  adapter_path,
158
- torch_dtype=torch.float16
159
  )
160
  self._current_adapter = adapter_path
161
  print(f"✅ Adapter loaded successfully")
@@ -218,6 +227,10 @@ class ModelManager:
218
  skip_special_tokens=True
219
  ).strip()
220
 
 
 
 
 
221
  return reply
222
 
223
 
@@ -225,6 +238,7 @@ class FeedbackManager:
225
  """Manages feedback storage and statistics."""
226
  def __init__(self, feedback_file: str = "data/feedback.jsonl"):
227
  self.feedback_file = Path(feedback_file)
 
228
  self.feedback_file.parent.mkdir(parents=True, exist_ok=True)
229
 
230
  def save_interaction(
@@ -295,9 +309,16 @@ async def startup_event():
295
  print("Starting up...")
296
 
297
  model_manager.initialize(
298
- model_name="meta-llama/Llama-3.2-1B-Instruct",
299
- adapter_path=None, # Update this after training: "username/adapter-v1"
300
- use_4bit=True
 
 
 
 
 
 
 
301
  )
302
 
303
  print("Ready to serve!")
@@ -310,6 +331,7 @@ async def root():
310
  "message": "Personalized Chatbot API v2.0",
311
  "version": "2.0.0",
312
  "current_adapter": model_manager._current_adapter,
 
313
  "endpoints": {
314
  "chat": "POST /chat",
315
  "feedback": "POST /feedback",
@@ -358,6 +380,7 @@ async def chat(request: ChatRequest):
358
  )
359
 
360
  except Exception as e:
 
361
  raise HTTPException(status_code=500, detail=str(e))
362
 
363
 
@@ -365,10 +388,15 @@ async def chat(request: ChatRequest):
365
  async def submit_feedback(request: FeedbackRequest):
366
  """Submit correction for a model response."""
367
  try:
368
- with open(feedback_manager.feedback_file, "r", encoding="utf-8") as f:
369
- lines = f.readlines()
 
 
 
 
370
 
371
  found = False
 
372
  for i in range(len(lines) - 1, -1, -1):
373
  try:
374
  record = json.loads(lines[i])
@@ -395,6 +423,7 @@ async def submit_feedback(request: FeedbackRequest):
395
  message="Feedback recorded successfully"
396
  )
397
  else:
 
398
  feedback_manager.save_interaction(
399
  user_input=request.user_input,
400
  model_reply=request.model_reply,
@@ -420,11 +449,7 @@ async def get_stats():
420
 
421
  @app.get("/correction-count", response_model=CorrectionCountResponse)
422
  async def get_correction_count():
423
- """
424
- Get count of corrections for training readiness monitoring.
425
-
426
- Use this to check if you have enough corrections to train.
427
- """
428
  if not feedback_manager.feedback_file.exists():
429
  return CorrectionCountResponse(
430
  corrections=0,
@@ -454,21 +479,7 @@ async def get_correction_count():
454
 
455
  @app.get("/download-feedback", response_model=DownloadFeedbackResponse)
456
  async def download_feedback():
457
- """
458
- Download feedback file for training.
459
-
460
- Use this endpoint to download feedback from production backend
461
- to your training notebook.
462
-
463
- Example:
464
- ```python
465
- response = requests.get(f"{API_URL}/download-feedback")
466
- feedback_data = response.json()
467
-
468
- with open(HITL_FILE, 'w') as f:
469
- f.write(feedback_data["content"])
470
- ```
471
- """
472
  if not feedback_manager.feedback_file.exists():
473
  return DownloadFeedbackResponse(
474
  content="",
@@ -487,17 +498,7 @@ async def download_feedback():
487
 
488
  @app.post("/clear-feedback")
489
  async def clear_feedback():
490
- """
491
- Clear feedback file after training.
492
-
493
- Call this after you've downloaded feedback and completed training
494
- to start collecting fresh feedback for the next training cycle.
495
-
496
- Example:
497
- ```python
498
- requests.post(f"{API_URL}/clear-feedback")
499
- ```
500
- """
501
  try:
502
  if feedback_manager.feedback_file.exists():
503
  feedback_manager.feedback_file.unlink()
@@ -516,20 +517,7 @@ async def clear_feedback():
516
 
517
  @app.post("/reload-adapter")
518
  async def reload_adapter(request: ReloadAdapterRequest):
519
- """
520
- Hot reload model with new adapter without restarting the Space.
521
-
522
- This allows you to deploy new models without downtime.
523
-
524
- Example:
525
- ```python
526
- # After training and pushing to HF Hub
527
- requests.post(
528
- f"{API_URL}/reload-adapter",
529
- json={"adapter_path": "username/adapter-v2"}
530
- )
531
- ```
532
- """
533
  try:
534
  model_manager.initialize(
535
  model_name="meta-llama/Llama-3.2-1B-Instruct",
@@ -550,10 +538,4 @@ async def reload_adapter(request: ReloadAdapterRequest):
550
 
551
  if __name__ == "__main__":
552
  import uvicorn
553
-
554
- uvicorn.run(
555
- "app:app",
556
- host="0.0.0.0",
557
- port=7860,
558
- reload=True
559
- )
 
1
  """
2
  Enhanced FastAPI Backend with Feedback Management
 
 
 
 
 
 
 
 
 
 
 
3
  """
4
 
5
  from fastapi import FastAPI, HTTPException
 
12
  import torch
13
  from transformers import AutoTokenizer, AutoModelForCausalLM
14
  from peft import PeftModel
15
+ import os
16
 
17
  app = FastAPI(
18
  title="Personalized Chatbot API",
 
104
  if adapter_path:
105
  print(f"With adapter: {adapter_path}")
106
 
107
+ # Check for GPU
108
  self._device = "cuda" if torch.cuda.is_available() else "cpu"
109
  print(f"Using device: {self._device}")
110
 
111
+ try:
112
+ self._tokenizer = AutoTokenizer.from_pretrained(
113
+ model_name,
114
+ trust_remote_code=True
115
+ )
116
+ except Exception as e:
117
+ print(f"Error loading tokenizer: {e}")
118
+ print("Did you set HF_TOKEN in Settings > Secrets?")
119
+ raise e
120
 
121
  if self._tokenizer.pad_token is None:
122
  self._tokenizer.pad_token = self._tokenizer.eos_token
123
 
124
+ # CRITICAL FIX: Only try 4-bit if we actually have a GPU
125
+ if use_4bit and self._device == "cuda":
126
+ print("🚀 GPU detected: Loading in 4-bit mode")
127
+ try:
128
+ from transformers import BitsAndBytesConfig
129
+
130
+ bnb_config = BitsAndBytesConfig(
131
+ load_in_4bit=True,
132
+ bnb_4bit_quant_type="nf4",
133
+ bnb_4bit_compute_dtype=torch.float16,
134
+ bnb_4bit_use_double_quant=True,
135
+ )
136
+
137
+ base_model = AutoModelForCausalLM.from_pretrained(
138
+ model_name,
139
+ quantization_config=bnb_config,
140
+ device_map="auto",
141
+ trust_remote_code=True,
142
+ torch_dtype=torch.float16,
143
+ )
144
+ except ImportError:
145
+ print("⚠️ bitsandbytes not installed. Falling back to standard loading.")
146
+ base_model = AutoModelForCausalLM.from_pretrained(
147
+ model_name,
148
+ device_map="auto",
149
+ trust_remote_code=True,
150
+ )
151
  else:
152
+ print(f"⚠️ Using {self._device} (No GPU or use_4bit=False). Loading standard model.")
153
  base_model = AutoModelForCausalLM.from_pretrained(
154
  model_name,
155
+ device_map=self._device,
156
  trust_remote_code=True,
157
+ # Use float32 for CPU stability
158
+ torch_dtype=torch.float32 if self._device == "cpu" else torch.float16
159
  )
160
 
161
  if adapter_path and (isinstance(adapter_path, str) and adapter_path.strip()):
 
164
  self._model = PeftModel.from_pretrained(
165
  base_model,
166
  adapter_path,
167
+ torch_dtype=torch.float16 if self._device == "cuda" else torch.float32
168
  )
169
  self._current_adapter = adapter_path
170
  print(f"✅ Adapter loaded successfully")
 
227
  skip_special_tokens=True
228
  ).strip()
229
 
230
+ # Remove the system/user prompt if it leaked into response
231
+ if "assistant" in reply.lower() and len(reply.split("assistant")) > 1:
232
+ reply = reply.split("assistant")[-1].strip()
233
+
234
  return reply
235
 
236
 
 
238
  """Manages feedback storage and statistics."""
239
  def __init__(self, feedback_file: str = "data/feedback.jsonl"):
240
  self.feedback_file = Path(feedback_file)
241
+ # Ensure directory exists (Handled by Dockerfile too, but good safety)
242
  self.feedback_file.parent.mkdir(parents=True, exist_ok=True)
243
 
244
  def save_interaction(
 
309
  print("Starting up...")
310
 
311
  model_manager.initialize(
312
+ # 1. The Base Model (The heavy lifter)
313
+ # We use the official Llama 3.2 3B Instruct as the foundation
314
+ model_name="meta-llama/Llama-3.2-3B-Instruct",
315
+
316
+ # 2. Adapter (The personalization)
317
+ adapter_path="pierreramez/Llama-3.2-3B-Instruct-bnb-4bit_finetuned",
318
+
319
+ # 3. CPU Optimization
320
+ # Must be False for the free CPU tier
321
+ use_4bit=False
322
  )
323
 
324
  print("Ready to serve!")
 
331
  "message": "Personalized Chatbot API v2.0",
332
  "version": "2.0.0",
333
  "current_adapter": model_manager._current_adapter,
334
+ "device": model_manager._device,
335
  "endpoints": {
336
  "chat": "POST /chat",
337
  "feedback": "POST /feedback",
 
380
  )
381
 
382
  except Exception as e:
383
+ print(f"Error during chat: {e}")
384
  raise HTTPException(status_code=500, detail=str(e))
385
 
386
 
 
388
  async def submit_feedback(request: FeedbackRequest):
389
  """Submit correction for a model response."""
390
  try:
391
+ # Optimistic feedback update: try to find existing entry
392
+ if feedback_manager.feedback_file.exists():
393
+ with open(feedback_manager.feedback_file, "r", encoding="utf-8") as f:
394
+ lines = f.readlines()
395
+ else:
396
+ lines = []
397
 
398
  found = False
399
+ # Search backwards to find the most recent matching interaction
400
  for i in range(len(lines) - 1, -1, -1):
401
  try:
402
  record = json.loads(lines[i])
 
423
  message="Feedback recorded successfully"
424
  )
425
  else:
426
+ # If not found (e.g., app restarted), just append new record
427
  feedback_manager.save_interaction(
428
  user_input=request.user_input,
429
  model_reply=request.model_reply,
 
449
 
450
  @app.get("/correction-count", response_model=CorrectionCountResponse)
451
  async def get_correction_count():
452
+ """Get count of corrections for training readiness monitoring."""
 
 
 
 
453
  if not feedback_manager.feedback_file.exists():
454
  return CorrectionCountResponse(
455
  corrections=0,
 
479
 
480
  @app.get("/download-feedback", response_model=DownloadFeedbackResponse)
481
  async def download_feedback():
482
+ """Download feedback file for training."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  if not feedback_manager.feedback_file.exists():
484
  return DownloadFeedbackResponse(
485
  content="",
 
498
 
499
  @app.post("/clear-feedback")
500
  async def clear_feedback():
501
+ """Clear feedback file after training."""
 
 
 
 
 
 
 
 
 
 
502
  try:
503
  if feedback_manager.feedback_file.exists():
504
  feedback_manager.feedback_file.unlink()
 
517
 
518
  @app.post("/reload-adapter")
519
  async def reload_adapter(request: ReloadAdapterRequest):
520
+ """Hot reload model with new adapter."""
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  try:
522
  model_manager.initialize(
523
  model_name="meta-llama/Llama-3.2-1B-Instruct",
 
538
 
539
if __name__ == "__main__":
    import os

    import uvicorn

    # reload=True is a development-only feature: uvicorn forks a file
    # watcher and restarts the worker whenever source files change, which
    # drops the in-memory model and doubles memory/startup cost. Inside
    # the HF Spaces Docker container we want a plain single-process
    # server, so the reloader is opt-in via UVICORN_RELOAD=1.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=os.environ.get("UVICORN_RELOAD", "0") == "1",
    )