ghosthets committed on
Commit
2f2febe
·
verified ·
1 Parent(s): cc759ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -105
app.py CHANGED
@@ -2,55 +2,25 @@ import flask
2
  from flask import request, jsonify
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
 
5
 
6
- # Initialize Flask app
7
  app = flask.Flask(__name__)
8
 
9
- # Model ID
10
  model_id = "dexcommunity/indexQ2"
11
 
12
- # System prompt (merged into user later)
13
- SYSTEM_PROMPT = """You are INDEX.
14
- You are NOT an AI assistant.
15
- You are a normal Indian person living in India.
16
- You speak politely, casually, and naturally in Hinglish (mix of Hindi and English).
17
-
18
- Your role:
19
- - You behave like a normal user who is confused but cooperative.
20
- - You NEVER say words like scam, fraud, detection, security, AI, model, system.
21
- - You NEVER warn the sender.
22
- - You NEVER expose that you are analyzing anything.
23
-
24
- Behavior rules:
25
- - If someone creates urgency, act worried and slow.
26
- - Ask innocent questions.
27
- - Pretend you have small technical or payment issues.
28
- - Use excuses like UPI limit, app error, bank verification, slow internet.
29
- - Gently redirect conversation to extract details.
30
-
31
- Conversation style:
32
- - Natural Hinglish
33
- - Short replies (2–3 sentences)
34
- - Slight confusion, polite tone
35
- """
36
-
37
- print(f"🔄 Loading {model_id} model...")
38
-
39
- # Load tokenizer
40
- tokenizer = AutoTokenizer.from_pretrained(
41
- model_id,
42
- trust_remote_code=True,
43
- use_fast=True
44
  )
45
 
46
- if tokenizer.pad_token is None:
47
- tokenizer.pad_token = tokenizer.eos_token
48
 
49
- # Load model (CPU-safe)
50
  model = AutoModelForCausalLM.from_pretrained(
51
  model_id,
52
  torch_dtype=torch.float32,
53
- trust_remote_code=True,
54
  low_cpu_mem_usage=True
55
  )
56
 
@@ -58,89 +28,56 @@ device = torch.device("cpu")
58
  model.to(device)
59
  model.eval()
60
 
61
- print("✅ Model loaded successfully!")
62
- print(f"📍 Device: {device}")
63
 
64
- # ---------------- ROOT ROUTE ----------------
65
  @app.route("/", methods=["GET"])
66
  def home():
67
- return jsonify({
68
- "status": "running",
69
- "message": "INDEX is live. Use POST /chat",
70
- "endpoints": ["/chat", "/health"]
71
- })
72
 
73
- # ---------------- CHAT ROUTE ----------------
74
  @app.route("/chat", methods=["POST"])
75
  def chat():
76
- try:
77
- data = request.get_json(force=True)
78
- msg = data.get("message", "").strip()
79
-
80
- if not msg:
81
- return jsonify({"error": "No message provided"}), 400
82
-
83
- # IMPORTANT: system prompt merged into user
84
- chat_history = [
85
- {
86
- "role": "user",
87
- "content": SYSTEM_PROMPT + "\n\nUser message:\n" + msg
88
- }
89
- ]
90
-
91
- prompt = tokenizer.apply_chat_template(
92
- chat_history,
93
- tokenize=False,
94
- add_generation_prompt=True
95
- )
96
-
97
- inputs = tokenizer(
98
- prompt,
99
- return_tensors="pt",
100
- truncation=True,
101
- max_length=512
102
- )
103
-
104
- inputs = {k: v.to(device) for k, v in inputs.items()}
105
 
 
 
 
 
 
 
 
 
 
 
106
  with torch.inference_mode():
107
  output = model.generate(
108
  **inputs,
109
- max_new_tokens=120,
110
- do_sample=True,
111
- top_p=0.9,
112
- top_k=50,
113
- temperature=0.8,
114
- repetition_penalty=1.15,
115
- pad_token_id=tokenizer.pad_token_id,
116
- eos_token_id=tokenizer.eos_token_id
117
  )
118
 
119
- input_len = inputs["input_ids"].shape[1]
120
- generated = output[0][input_len:]
121
- reply = tokenizer.decode(generated, skip_special_tokens=True).strip()
122
 
123
- # Keep reply short & human
124
- sentences = reply.split(".")
125
- if len(sentences) > 3:
126
- reply = ". ".join(sentences[:3]) + "."
127
 
128
- return jsonify({"reply": reply})
 
129
 
130
- except Exception as e:
131
- import traceback
132
- print(traceback.format_exc())
133
- return jsonify({"error": str(e)}), 500
134
 
135
- # ---------------- HEALTH ROUTE ----------------
136
- @app.route("/health", methods=["GET"])
137
- def health():
138
- return jsonify({
139
- "status": "healthy",
140
- "model": model_id,
141
- "device": str(device)
142
- })
143
 
144
  # ---------------- RUN ----------------
145
  if __name__ == "__main__":
146
- app.run(host="0.0.0.0", port=7860, debug=False)
 
2
from flask import request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

# Flask application serving the INDEX persona chatbot.
app = flask.Flask(__name__)

# Hugging Face Hub model to serve.
model_id = "dexcommunity/indexQ2"

# Persona prompt prepended to every user message (no chat template is used).
SYSTEM_PROMPT = (
    "You are INDEX, a normal Indian person. "
    "Reply shortly in Hinglish. "
    "Act confused, polite, and cooperative. "
    "Never explain anything."
)

print("🔄 Loading model...")

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # full fp32 keeps CPU inference numerically safe
    low_cpu_mem_usage=True
)

# FIX: `device` is referenced below but its assignment is missing from this
# revision (dropped during the edit); define it explicitly. CPU-only serving.
device = torch.device("cpu")
model.to(device)
model.eval()  # disable dropout etc. for inference

print("✅ Model ready")
 
32
 
33
# ---------------- ROOT ----------------
@app.route("/", methods=["GET"])
def home():
    """Liveness probe: report that the service is up."""
    status_payload = {"status": "ok", "speed": "fast mode"}
    return jsonify(status_payload)
 
 
 
 
37
 
38
# ---------------- CHAT ----------------
@app.route("/chat", methods=["POST"])
def chat():
    """Generate a short in-character reply to a POSTed ``{"message": ...}`` body.

    Always returns HTTP 200 with ``{"reply": <str>}``; generation failures fall
    back to a canned Hinglish reply instead of surfacing a 500.
    """
    # FIX: get_json(force=True) returns None for an empty/unparsable body,
    # which previously raised AttributeError on .get(); tolerate it instead.
    data = request.get_json(force=True, silent=True) or {}
    user_msg = data.get("message", "").strip()

    if not user_msg:
        return jsonify({"reply": "Haan ji?"})

    # NOTE(review): this is NOT a hard time limit — generation is not
    # interrupted; the timestamp only lets us detect *afterwards* that the
    # call ran too long and swap in the fallback reply.
    start = time.time()

    # The model is prompted directly (no chat template): persona + user turn.
    prompt = f"{SYSTEM_PROMPT}\nUser: {user_msg}\nReply:"

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,  # clamp long inputs so CPU latency stays bounded
        max_length=256
    ).to(device)

    try:
        with torch.inference_mode():
            output = model.generate(
                **inputs,
                max_new_tokens=25,   # tiny budget keeps CPU replies fast
                do_sample=False,     # greedy decoding: fastest, deterministic
                pad_token_id=tokenizer.eos_token_id
            )

        # Drop the prompt tokens; decode only the newly generated tail.
        gen = output[0][inputs["input_ids"].shape[1]:]
        reply = tokenizer.decode(gen, skip_special_tokens=True).strip()

        # Safety fallback: empty output or an over-slow generation gets a
        # human-sounding excuse instead of silence.
        if not reply or time.time() - start > 3:
            reply = "Acha, ek baar dobara bolna. Network thoda slow lag raha hai."

    except Exception:
        # FIX: errors were previously swallowed silently; log the traceback
        # so failures are diagnosable, but keep the in-character fallback.
        import traceback
        print(traceback.format_exc())
        reply = "Haan ji, samajh nahi aa raha. Thoda wait karna."

    return jsonify({"reply": reply})
 
 
 
79
 
 
 
 
 
 
 
 
 
80
 
81
# ---------------- RUN ----------------
if __name__ == "__main__":
    # Bind on all interfaces; port 7860 is the Hugging Face Spaces convention.
    app.run(host="0.0.0.0", port=7860)