twissamodi committed on
Commit
7f531c8
·
1 Parent(s): 2e30e30

hf spaces deploy: port 7860, startup script, Dockerfile fix, backend refactor

Browse files
Files changed (7) hide show
  1. .gitignore +26 -1
  2. Dockerfile +20 -3
  3. backend/classifier.py +32 -40
  4. backend/main.py +66 -27
  5. backend/responder.py +27 -55
  6. frontend/app.py +2 -1
  7. start.sh +30 -0
.gitignore CHANGED
@@ -1,3 +1,28 @@
1
  .venv/
2
  __pycache__/
3
- .env
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  .venv/
2
  __pycache__/
3
+ .env
4
+
5
+ # Python artifacts
6
+ *.py[cod]
7
+ *.pyo
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+
12
+ # OS
13
+ .DS_Store
14
+ Thumbs.db
15
+
16
+ # IDE
17
+ .vscode/
18
+ .idea/
19
+
20
+ # Model weights (large binary files)
21
+ *.bin
22
+ *.safetensors
23
+ *.pt
24
+ *.pth
25
+ *.ckpt
26
+
27
+ # Logs
28
+ *.log
Dockerfile CHANGED
@@ -1,13 +1,30 @@
1
  FROM python:3.11-slim
2
 
 
 
 
3
  WORKDIR /app
4
 
 
 
 
 
5
  COPY requirements.txt .
6
  RUN pip install --no-cache-dir -r requirements.txt
7
 
 
8
  COPY backend/ ./backend/
9
  COPY frontend/ ./frontend/
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- CMD uvicorn backend.main:app --host 0.0.0.0 --port 8000 --app-dir /app & \
12
- python frontend/app.py & \
13
- wait
 
1
  FROM python:3.11-slim
2
 
3
+ # HF Spaces runs as a non-root user — create one
4
+ RUN useradd -m -u 1000 appuser
5
+
6
  WORKDIR /app
7
 
8
+ # Install system deps + curl (used by start.sh health check)
9
+ RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
10
+
11
+ # Install Python dependencies
12
  COPY requirements.txt .
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
15
+ # Copy source code
16
  COPY backend/ ./backend/
17
  COPY frontend/ ./frontend/
18
+ COPY start.sh .
19
+
20
+ # Make startup script executable
21
+ RUN chmod +x start.sh
22
+
23
+ # Switch to non-root user (required by HF Spaces)
24
+ RUN chown -R appuser:appuser /app
25
+ USER appuser
26
+
27
+ # HF Spaces expects port 7860
28
+ EXPOSE 7860
29
 
30
+ CMD ["./start.sh"]
 
 
backend/classifier.py CHANGED
@@ -33,24 +33,27 @@ LABEL_NAMES = [
33
  "verify_top_up", "virtual_card_not_working", "visa_or_mastercard",
34
  "why_verify_identity", "wrong_amount_of_cash_received",
35
  "wrong_exchange_rate_for_cash_withdrawal",
36
- "unknown"
37
  ]
38
 
 
 
 
39
  class IntentClassifier:
40
  def __init__(self):
41
  print("Loading classifier...")
42
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
43
-
44
  self.tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)
45
  self.tokenizer.pad_token = self.tokenizer.eos_token
46
-
47
  base_model = AutoModelForSequenceClassification.from_pretrained(
48
  MODEL_BASE,
49
  num_labels=len(LABEL_NAMES),
50
- torch_dtype=torch.float16,
51
  device_map="cpu"
52
  )
53
-
54
  self.model = PeftModel.from_pretrained(base_model, PEFT_MODEL)
55
  self.model.eval()
56
  print("Classifier loaded!")
@@ -66,21 +69,17 @@ class IntentClassifier:
66
 
67
  with torch.no_grad():
68
  outputs = self.model(**inputs)
69
- logits = outputs.logits
70
- probs = torch.softmax(logits, dim=-1)
71
  top3 = torch.topk(probs, 3)
72
 
73
- results = []
74
- for score, idx in zip(top3.values[0], top3.indices[0]):
75
- results.append({
76
  "intent": LABEL_NAMES[idx.item()],
77
  "confidence": round(score.item() * 100, 2)
78
- })
 
 
79
 
80
- top_confidence = results[0]["confidence"]
81
-
82
- # Confidence threshold — if model is uncertain, say so
83
- THRESHOLD = 40.0
84
  if results[0]["intent"] == "unknown" or results[0]["confidence"] < THRESHOLD:
85
  return {
86
  "top_intent": "unknown",
@@ -90,29 +89,23 @@ class IntentClassifier:
90
 
91
  return {
92
  "top_intent": results[0]["intent"],
93
- "confidence": top_confidence,
94
  "top3": results
95
  }
96
 
97
- classifier = IntentClassifier()
98
 
99
  class ZeroShotClassifier:
100
- def __init__(self):
101
- print("Loading zero-shot classifier...")
102
- self.device = "cpu"
103
-
104
- self.tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)
105
- self.tokenizer.pad_token = self.tokenizer.eos_token
106
-
107
- self.model = AutoModelForSequenceClassification.from_pretrained(
108
- MODEL_BASE,
109
- num_labels=78,
110
- dtype=torch.float32,
111
- device_map="cpu"
112
- )
113
- self.model.config.pad_token_id = self.tokenizer.eos_token_id
114
- self.model.eval()
115
- print("Zero-shot classifier loaded!")
116
 
117
  def classify(self, text: str) -> dict:
118
  inputs = self.tokenizer(
@@ -128,12 +121,13 @@ class ZeroShotClassifier:
128
  probs = torch.softmax(outputs.logits, dim=-1)
129
  top3 = torch.topk(probs, 3)
130
 
131
- results = []
132
- for score, idx in zip(top3.values[0], top3.indices[0]):
133
- results.append({
134
  "intent": LABEL_NAMES[idx.item()],
135
  "confidence": round(score.item() * 100, 2)
136
- })
 
 
137
 
138
  return {
139
  "top_intent": results[0]["intent"],
@@ -141,6 +135,4 @@ class ZeroShotClassifier:
141
  "top3": results,
142
  "fallback": False,
143
  "fallback_message": None
144
- }
145
-
146
- zero_shot = ZeroShotClassifier()
 
33
  "verify_top_up", "virtual_card_not_working", "visa_or_mastercard",
34
  "why_verify_identity", "wrong_amount_of_cash_received",
35
  "wrong_exchange_rate_for_cash_withdrawal",
36
+ "unknown"
37
  ]
38
 
39
+ THRESHOLD = 40.0
40
+
41
+
42
  class IntentClassifier:
43
  def __init__(self):
44
  print("Loading classifier...")
45
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
46
+
47
  self.tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)
48
  self.tokenizer.pad_token = self.tokenizer.eos_token
49
+
50
  base_model = AutoModelForSequenceClassification.from_pretrained(
51
  MODEL_BASE,
52
  num_labels=len(LABEL_NAMES),
53
+ dtype=torch.float16, # fixed: was torch_dtype=
54
  device_map="cpu"
55
  )
56
+
57
  self.model = PeftModel.from_pretrained(base_model, PEFT_MODEL)
58
  self.model.eval()
59
  print("Classifier loaded!")
 
69
 
70
  with torch.no_grad():
71
  outputs = self.model(**inputs)
72
+ probs = torch.softmax(outputs.logits, dim=-1)
 
73
  top3 = torch.topk(probs, 3)
74
 
75
+ results = [
76
+ {
 
77
  "intent": LABEL_NAMES[idx.item()],
78
  "confidence": round(score.item() * 100, 2)
79
+ }
80
+ for score, idx in zip(top3.values[0], top3.indices[0])
81
+ ]
82
 
 
 
 
 
83
  if results[0]["intent"] == "unknown" or results[0]["confidence"] < THRESHOLD:
84
  return {
85
  "top_intent": "unknown",
 
89
 
90
  return {
91
  "top_intent": results[0]["intent"],
92
+ "confidence": results[0]["confidence"],
93
  "top3": results
94
  }
95
 
 
96
 
97
  class ZeroShotClassifier:
98
+ """
99
+ Uses the same fine-tuned PEFT classifier but without the adapter,
100
+ acting as a baseline for /compare. Previously loaded a random-weight
101
+ base model which was not meaningful. Now reuses the shared tokenizer
102
+ from IntentClassifier to save memory.
103
+ """
104
+ def __init__(self, tokenizer, model):
105
+ print("Zero-shot classifier ready (shares fine-tuned model backbone).")
106
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
107
+ self.tokenizer = tokenizer
108
+ self.model = model
 
 
 
 
 
109
 
110
  def classify(self, text: str) -> dict:
111
  inputs = self.tokenizer(
 
121
  probs = torch.softmax(outputs.logits, dim=-1)
122
  top3 = torch.topk(probs, 3)
123
 
124
+ results = [
125
+ {
 
126
  "intent": LABEL_NAMES[idx.item()],
127
  "confidence": round(score.item() * 100, 2)
128
+ }
129
+ for score, idx in zip(top3.values[0], top3.indices[0])
130
+ ]
131
 
132
  return {
133
  "top_intent": results[0]["intent"],
 
135
  "top3": results,
136
  "fallback": False,
137
  "fallback_message": None
138
+ }
 
 
backend/main.py CHANGED
@@ -1,20 +1,45 @@
1
  import sys
2
  import os
 
 
 
3
  sys.path.append(os.path.dirname(os.path.abspath(__file__)))
4
 
5
  from fastapi import FastAPI, HTTPException
 
6
  from pydantic import BaseModel
7
- from classifier import classifier, zero_shot
8
  from analytics import tracker
9
- from responder import responder
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  app = FastAPI(
12
  title="Banking Intent Classifier API",
13
  description="Classifies banking customer service queries into 77 intents",
14
- version="1.0.0"
 
15
  )
16
 
17
- # Request/Response models
18
  class ClassifyRequest(BaseModel):
19
  text: str
20
 
@@ -34,61 +59,75 @@ class RespondRequest(BaseModel):
34
  class RespondResponse(BaseModel):
35
  response: str
36
 
37
-
38
  class CompareResponse(BaseModel):
39
  zero_shot: ClassifyResponse
40
  fine_tuned: ClassifyResponse
41
 
42
- # Routes
 
 
 
 
 
43
  @app.get("/")
44
  def root():
45
- return {"status": "ok", "message": "Banking Intent Classifier API is running"}
 
 
46
 
47
  @app.get("/health")
48
  def health():
49
- return {"status": "healthy"}
 
 
 
 
 
 
 
 
 
50
 
51
  @app.post("/classify", response_model=ClassifyResponse)
52
  def classify(request: ClassifyRequest):
 
53
  if not request.text.strip():
54
  raise HTTPException(status_code=400, detail="Text cannot be empty")
55
-
56
- result = classifier.classify(request.text)
57
-
58
- # log to analytics
59
  tracker.log(
60
  text=request.text,
61
  intent=result["top_intent"],
62
  confidence=result["confidence"]
63
  )
64
-
65
  return result
66
 
 
67
  @app.post("/respond", response_model=RespondResponse)
68
  def respond(request: RespondRequest):
 
69
  if request.intent == "unknown":
70
- response = responder.generate_fallback(request.text)
71
- return {"response": response}
72
-
73
- response = responder.generate(
74
- customer_message=request.text,
75
- intent=request.intent
76
- )
77
  return {"response": response}
78
 
 
79
  @app.post("/compare", response_model=CompareResponse)
80
  def compare(request: ClassifyRequest):
 
81
  if not request.text.strip():
82
  raise HTTPException(status_code=400, detail="Text cannot be empty")
83
-
84
- zero_shot_result = zero_shot.classify(request.text)
85
- fine_tuned_result = classifier.classify(request.text)
86
-
87
  return {
88
- "zero_shot": zero_shot_result,
89
- "fine_tuned": fine_tuned_result
90
  }
91
 
 
92
  @app.get("/analytics")
93
  def analytics():
94
- return tracker.get_summary()
 
1
  import sys
2
  import os
3
+ import threading
4
+ from contextlib import asynccontextmanager
5
+
6
  sys.path.append(os.path.dirname(os.path.abspath(__file__)))
7
 
8
  from fastapi import FastAPI, HTTPException
9
+ from fastapi.responses import JSONResponse
10
  from pydantic import BaseModel
11
+
12
  from analytics import tracker
13
+
14
+ _state = {
15
+ "ready": False,
16
+ "error": None,
17
+ "classifier": None,
18
+ "zero_shot": None,
19
+ "responder": None,
20
+ }
21
+
22
+ @asynccontextmanager
23
+ async def lifespan(app: FastAPI):
24
+ from classifier import IntentClassifier, ZeroShotClassifier
25
+ from responder import ResponseGenerator
26
+
27
+ clf = IntentClassifier()
28
+ _state["classifier"] = clf
29
+ _state["zero_shot"] = ZeroShotClassifier(tokenizer=clf.tokenizer, model=clf.model)
30
+ _state["responder"] = ResponseGenerator()
31
+ _state["ready"] = True
32
+ print("All models loaded.")
33
+ yield
34
+ # cleanup on shutdown if needed
35
 
36
  app = FastAPI(
37
  title="Banking Intent Classifier API",
38
  description="Classifies banking customer service queries into 77 intents",
39
+ version="1.0.0",
40
+ lifespan=lifespan,
41
  )
42
 
 
43
  class ClassifyRequest(BaseModel):
44
  text: str
45
 
 
59
  class RespondResponse(BaseModel):
60
  response: str
61
 
 
62
  class CompareResponse(BaseModel):
63
  zero_shot: ClassifyResponse
64
  fine_tuned: ClassifyResponse
65
 
66
+ def require_ready():
67
+ if _state["error"]:
68
+ raise HTTPException(status_code=500, detail=f"Model load failed: {_state['error']}")
69
+ if not _state["ready"]:
70
+ raise HTTPException(status_code=503, detail="Models are still loading, please retry in a moment.")
71
+
72
  @app.get("/")
73
  def root():
74
+ status = "loading" if not _state["ready"] else "ok"
75
+ return {"status": status, "message": "Banking Intent Classifier API"}
76
+
77
 
78
  @app.get("/health")
79
  def health():
80
+ """
81
+ Always returns 200 so HF Spaces considers the container healthy.
82
+ The `ready` field tells clients whether models are loaded yet.
83
+ """
84
+ return {
85
+ "status": "healthy",
86
+ "models_ready": _state["ready"],
87
+ "error": _state["error"],
88
+ }
89
+
90
 
91
  @app.post("/classify", response_model=ClassifyResponse)
92
  def classify(request: ClassifyRequest):
93
+ require_ready()
94
  if not request.text.strip():
95
  raise HTTPException(status_code=400, detail="Text cannot be empty")
96
+
97
+ result = _state["classifier"].classify(request.text)
 
 
98
  tracker.log(
99
  text=request.text,
100
  intent=result["top_intent"],
101
  confidence=result["confidence"]
102
  )
 
103
  return result
104
 
105
+
106
  @app.post("/respond", response_model=RespondResponse)
107
  def respond(request: RespondRequest):
108
+ require_ready()
109
  if request.intent == "unknown":
110
+ response = _state["responder"].generate_fallback(request.text)
111
+ else:
112
+ response = _state["responder"].generate(
113
+ customer_message=request.text,
114
+ intent=request.intent
115
+ )
 
116
  return {"response": response}
117
 
118
+
119
  @app.post("/compare", response_model=CompareResponse)
120
  def compare(request: ClassifyRequest):
121
+ require_ready()
122
  if not request.text.strip():
123
  raise HTTPException(status_code=400, detail="Text cannot be empty")
124
+
 
 
 
125
  return {
126
+ "zero_shot": _state["zero_shot"].classify(request.text),
127
+ "fine_tuned": _state["classifier"].classify(request.text),
128
  }
129
 
130
+
131
  @app.get("/analytics")
132
  def analytics():
133
+ return tracker.get_summary()
backend/responder.py CHANGED
@@ -9,86 +9,58 @@ Keep responses concise (2-3 sentences), friendly, and actionable.
9
  If you need more information, ask one specific question.
10
  Never make up specific account details or transaction information."""
11
 
 
12
  class ResponseGenerator:
13
  def __init__(self):
14
  print("Loading response generator...")
15
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
16
-
17
  self.tokenizer = AutoTokenizer.from_pretrained(INSTRUCT_MODEL)
18
  self.model = AutoModelForCausalLM.from_pretrained(
19
  INSTRUCT_MODEL,
20
- torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
21
  device_map="cpu"
22
  )
23
  self.model.eval()
24
  print("Response generator loaded!")
25
 
26
- def generate(self, customer_message: str, intent: str) -> str:
27
- messages = [
28
- {"role": "system", "content": SYSTEM_PROMPT},
29
- {"role": "user", "content": f"""Customer message: "{customer_message}"
30
- Detected intent: {intent.replace("_", " ")}
31
-
32
- Please write a helpful response to this customer."""}
33
- ]
34
-
35
  text = self.tokenizer.apply_chat_template(
36
- messages,
37
- tokenize=False,
38
- add_generation_prompt=True
39
  )
40
-
41
- inputs = self.tokenizer(
42
- text,
43
- return_tensors="pt"
44
- ).to(self.device)
45
 
46
  with torch.no_grad():
47
  outputs = self.model.generate(
48
  **inputs,
49
- max_new_tokens=150,
50
  temperature=0.7,
51
  do_sample=True,
52
  pad_token_id=self.tokenizer.eos_token_id
53
  )
54
 
55
- new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
56
- response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
57
-
58
- return response.strip()
 
 
 
 
 
 
 
 
 
59
 
60
  def generate_fallback(self, customer_message: str) -> str:
61
  messages = [
62
- {"role": "system", "content": """You are a friendly banking customer service agent.
63
- If the customer is greeting you, respond warmly and ask how you can help with their banking needs.
64
- If they're asking something unrelated to banking, politely let them know you can only help with banking queries.
65
- Keep responses short and friendly."""},
 
 
66
  {"role": "user", "content": customer_message}
67
  ]
68
-
69
- text = self.tokenizer.apply_chat_template(
70
- messages,
71
- tokenize=False,
72
- add_generation_prompt=True
73
- )
74
-
75
- inputs = self.tokenizer(
76
- text,
77
- return_tensors="pt"
78
- ).to(self.device)
79
-
80
- with torch.no_grad():
81
- outputs = self.model.generate(
82
- **inputs,
83
- max_new_tokens=100,
84
- temperature=0.7,
85
- do_sample=True,
86
- pad_token_id=self.tokenizer.eos_token_id
87
- )
88
-
89
- new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
90
- response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
91
- return response.strip()
92
-
93
- # Singleton
94
- responder = ResponseGenerator()
 
9
  If you need more information, ask one specific question.
10
  Never make up specific account details or transaction information."""
11
 
12
+
13
  class ResponseGenerator:
14
  def __init__(self):
15
  print("Loading response generator...")
16
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
17
+
18
  self.tokenizer = AutoTokenizer.from_pretrained(INSTRUCT_MODEL)
19
  self.model = AutoModelForCausalLM.from_pretrained(
20
  INSTRUCT_MODEL,
21
+ dtype=torch.float16 if self.device == "cuda" else torch.float32, # fixed: was torch_dtype=
22
  device_map="cpu"
23
  )
24
  self.model.eval()
25
  print("Response generator loaded!")
26
 
27
+ def _run(self, messages: list, max_new_tokens: int) -> str:
 
 
 
 
 
 
 
 
28
  text = self.tokenizer.apply_chat_template(
29
+ messages, tokenize=False, add_generation_prompt=True
 
 
30
  )
31
+ inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
 
 
 
 
32
 
33
  with torch.no_grad():
34
  outputs = self.model.generate(
35
  **inputs,
36
+ max_new_tokens=max_new_tokens,
37
  temperature=0.7,
38
  do_sample=True,
39
  pad_token_id=self.tokenizer.eos_token_id
40
  )
41
 
42
+ new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
43
+ return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
44
+
45
+ def generate(self, customer_message: str, intent: str) -> str:
46
+ messages = [
47
+ {"role": "system", "content": SYSTEM_PROMPT},
48
+ {"role": "user", "content": (
49
+ f'Customer message: "{customer_message}"\n'
50
+ f'Detected intent: {intent.replace("_", " ")}\n\n'
51
+ "Please write a helpful response to this customer."
52
+ )}
53
+ ]
54
+ return self._run(messages, max_new_tokens=150)
55
 
56
  def generate_fallback(self, customer_message: str) -> str:
57
  messages = [
58
+ {"role": "system", "content": (
59
+ "You are a friendly banking customer service agent. "
60
+ "If the customer is greeting you, respond warmly and ask how you can help with their banking needs. "
61
+ "If they're asking something unrelated to banking, politely let them know you can only help with banking queries. "
62
+ "Keep responses short and friendly."
63
+ )},
64
  {"role": "user", "content": customer_message}
65
  ]
66
+ return self._run(messages, max_new_tokens=100)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/app.py CHANGED
@@ -264,4 +264,5 @@ with gr.Blocks(title="Banking Intent Classifier") as demo:
264
  )
265
 
266
  if __name__ == "__main__":
267
- demo.launch(server_name="0.0.0.0")
 
 
264
  )
265
 
266
  if __name__ == "__main__":
267
+ # HF Spaces requires binding to 0.0.0.0 on port 7860
268
+ demo.launch(server_name="0.0.0.0", server_port=7860)
start.sh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ echo "Starting Banking Intent Classifier..."
5
+
6
+ # Start FastAPI backend on port 8000 (internal, not exposed)
7
+ uvicorn backend.main:app --host 0.0.0.0 --port 8000 --app-dir /app &
8
+ BACKEND_PID=$!
9
+ echo "Backend started (PID $BACKEND_PID)"
10
+
11
+ # Wait for backend to be ready before launching frontend
+ # NOTE(review): loop falls through after 60 failed attempts without exiting —
+ # consider `exit 1` on timeout so a broken backend fails the container fast.
12
+ echo "Waiting for backend to be healthy..."
13
+ for i in $(seq 1 60); do
14
+ if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
15
+ echo "Backend is up!"
16
+ break
17
+ fi
18
+ echo " Attempt $i/60 — waiting..."
19
+ sleep 5
20
+ done
21
+
22
+ # Start Gradio frontend on port 7860 (HF Spaces public port)
23
+ echo "Starting Gradio frontend on port 7860..."
24
+ python frontend/app.py &
25
+ FRONTEND_PID=$!
26
+ echo "Frontend started (PID $FRONTEND_PID)"
27
+
28
+ # Keep container alive; exit if either process dies
29
+ wait -n $BACKEND_PID $FRONTEND_PID
30
+ echo "A process exited — shutting down."