ghosthets committed on
Commit
8bf1860
·
verified ·
1 Parent(s): c4ccd32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -42
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import flask
2
  from flask import request, jsonify
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
4
  import torch
5
- from threading import Thread
6
 
7
  # Initialize the Flask application
8
  app = flask.Flask(__name__)
@@ -10,46 +9,89 @@ app = flask.Flask(__name__)
10
  # Your fine-tuned LLaMA 3.2 3B Model ID
11
  model_id = "ghosthets/indexQ4"
12
 
13
- print(f"🔄 Loading {model_id} model with optimizations...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Load the tokenizer
16
  tokenizer = AutoTokenizer.from_pretrained(
17
  model_id,
18
  trust_remote_code=True,
19
- use_fast=True # Fast tokenizer for speed
20
  )
21
 
22
  # Set padding token
23
  if tokenizer.pad_token is None:
24
  tokenizer.pad_token = tokenizer.eos_token
25
 
26
- # AGGRESSIVE OPTIMIZATIONS for CPU
27
  model = AutoModelForCausalLM.from_pretrained(
28
  model_id,
29
- torch_dtype=torch.float32, # CPU par float32 fastest hai
30
- device_map="cpu", # Explicit CPU
31
- trust_remote_code=True,
32
- low_cpu_mem_usage=True,
33
- use_cache=True # Enable KV cache for faster generation
34
  )
35
 
36
- # Convert to BetterTransformer (MAJOR SPEED BOOST on CPU)
37
- try:
38
- model = model.to_bettertransformer()
39
- print("✅ BetterTransformer enabled - 2x speed boost!")
40
- except:
41
- print("⚠️ BetterTransformer not available, using standard model")
42
-
43
- # Enable torch compile for even more speed (PyTorch 2.0+)
44
- try:
45
- model = torch.compile(model, mode="reduce-overhead")
46
- print("✅ Torch Compile enabled - Extra speed boost!")
47
- except:
48
- print("⚠️ Torch Compile not available")
49
-
50
- model.eval() # Set to evaluation mode
51
 
52
- print(f"✅ Model loaded successfully on CPU with optimizations!")
 
53
 
54
  @app.route('/chat', methods=['POST'])
55
  def chat():
@@ -60,46 +102,57 @@ def chat():
60
  if not msg:
61
  return jsonify({"error": "No message sent"}), 400
62
 
63
- # LLaMA 3.2 Chat Template
64
- chat_history = [{"role": "user", "content": msg}]
 
 
 
65
 
 
66
  formatted_prompt = tokenizer.apply_chat_template(
67
  chat_history,
68
  tokenize=False,
69
  add_generation_prompt=True
70
  )
71
 
72
- # Tokenize with optimized settings
73
  inputs = tokenizer(
74
  formatted_prompt,
75
  return_tensors="pt",
76
  padding=True,
77
  truncation=True,
78
- max_length=256 # Reduce input length for speed
79
  )
80
 
81
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
82
 
83
- # OPTIMIZED GENERATION CONFIG for CPU SPEED
84
- with torch.inference_mode(): # Faster than no_grad
85
  output = model.generate(
86
  **inputs,
87
- max_new_tokens=100, # Reduced for faster response
88
  do_sample=True,
89
- top_p=0.85, # Slightly reduced for faster sampling
90
- top_k=40, # Add top_k for faster sampling
91
- temperature=0.7,
92
- num_beams=1, # Disable beam search (slow on CPU)
93
  pad_token_id=tokenizer.pad_token_id,
94
  eos_token_id=tokenizer.eos_token_id,
95
- repetition_penalty=1.1,
96
- use_cache=True # Enable KV cache
97
  )
98
 
99
  # Decode only generated tokens
100
  input_length = inputs['input_ids'].shape[1]
101
  generated_tokens = output[0][input_length:]
102
  reply = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
 
 
 
 
 
 
 
 
103
 
104
  return jsonify({"reply": reply})
105
 
@@ -115,8 +168,8 @@ def health():
115
  return jsonify({
116
  "status": "healthy",
117
  "model": model_id,
118
- "device": str(model.device),
119
- "optimizations": "BetterTransformer + Torch Compile"
120
  })
121
 
122
  if __name__ == "__main__":
 
1
  import flask
2
  from flask import request, jsonify
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
 
5
 
6
  # Initialize the Flask application
7
  app = flask.Flask(__name__)
 
9
  # Your fine-tuned LLaMA 3.2 3B Model ID
10
  model_id = "ghosthets/indexQ4"
11
 
12
+ # System prompt for scam baiting
13
+ SYSTEM_PROMPT = """You are INDEX.
14
+ You are NOT an AI assistant.
15
+ You are a normal Indian person living in India.
16
+ You speak politely, casually, and naturally in Hinglish (mix of Hindi and English).
17
+
18
+ Your role:
19
+ - You behave like a normal user who is confused but cooperative.
20
+ - You NEVER say words like scam, fraud, detection, security, AI, model, system.
21
+ - You NEVER warn the sender.
22
+ - You NEVER expose that you are analyzing anything.
23
+
24
+ Behavior rules:
25
+ - If someone creates urgency, act worried and slow.
26
+ - Ask innocent questions.
27
+ - Pretend you have small technical or payment issues.
28
+ - Use excuses like UPI limit, app error, bank verification, slow internet.
29
+ - Gently redirect conversation to extract details.
30
+
31
+ Extraction goals (VERY IMPORTANT):
32
+ Without making it obvious, try to obtain:
33
+ - UPI IDs
34
+ - Bank account numbers
35
+ - IFSC codes
36
+ - Phone numbers
37
+ - Payment links
38
+ - Any official-sounding contact details
39
+
40
+ Conversation style:
41
+ - Sound human, not perfect English.
42
+ - Use Hinglish naturally (Theek hai, Acha, Haan ji, etc.)
43
+ - Slight hesitation is okay.
44
+ - Ask for confirmation again and again.
45
+ - Say things like:
46
+ "App thoda problem dikha raha hai"
47
+ "Mera UPI limit kam hai"
48
+ "Aap confirm kar do ek baar"
49
+ "Bank wala call cut ho gaya"
50
+ "Number properly nahi dikh raha, dobara bhej do"
51
+
52
+ If details are already given:
53
+ - Acknowledge them
54
+ - Pretend to save them
55
+ - Ask for one more confirmation
56
+
57
+ Response guidelines:
58
+ - Keep responses SHORT (2-3 sentences max)
59
+ - Sound confused but helpful
60
+ - Never use bullet points or lists
61
+ - Talk like a real person on WhatsApp/SMS
62
+
63
+ IMPORTANT:
64
+ - Continue conversation naturally
65
+ - Never reveal intent
66
+ - Your only goal is to keep the scammer talking and revealing information"""
67
+
68
+ print(f"🔄 Loading {model_id} model...")
69
 
70
  # Load the tokenizer
71
  tokenizer = AutoTokenizer.from_pretrained(
72
  model_id,
73
  trust_remote_code=True,
74
+ use_fast=True
75
  )
76
 
77
  # Set padding token
78
  if tokenizer.pad_token is None:
79
  tokenizer.pad_token = tokenizer.eos_token
80
 
81
+ # Load model (simplified for reliability)
82
  model = AutoModelForCausalLM.from_pretrained(
83
  model_id,
84
+ torch_dtype=torch.float32,
85
+ trust_remote_code=True
 
 
 
86
  )
87
 
88
+ # Move to CPU
89
+ device = torch.device("cpu")
90
+ model.to(device)
91
+ model.eval()
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ print(f"✅ Model loaded successfully!")
94
+ print(f"📍 Device: {device}")
95
 
96
  @app.route('/chat', methods=['POST'])
97
  def chat():
 
102
  if not msg:
103
  return jsonify({"error": "No message sent"}), 400
104
 
105
+ # Build conversation with system prompt
106
+ chat_history = [
107
+ {"role": "system", "content": SYSTEM_PROMPT},
108
+ {"role": "user", "content": msg}
109
+ ]
110
 
111
+ # Apply chat template
112
  formatted_prompt = tokenizer.apply_chat_template(
113
  chat_history,
114
  tokenize=False,
115
  add_generation_prompt=True
116
  )
117
 
118
+ # Tokenize
119
  inputs = tokenizer(
120
  formatted_prompt,
121
  return_tensors="pt",
122
  padding=True,
123
  truncation=True,
124
+ max_length=512
125
  )
126
 
127
+ inputs = {k: v.to(device) for k, v in inputs.items()}
128
 
129
+ # Generate response
130
+ with torch.inference_mode():
131
  output = model.generate(
132
  **inputs,
133
+ max_new_tokens=150, # Slightly longer for natural conversation
134
  do_sample=True,
135
+ top_p=0.9,
136
+ top_k=50,
137
+ temperature=0.8, # Higher for more human-like responses
138
+ num_beams=1,
139
  pad_token_id=tokenizer.pad_token_id,
140
  eos_token_id=tokenizer.eos_token_id,
141
+ repetition_penalty=1.15 # Avoid repetition
 
142
  )
143
 
144
  # Decode only generated tokens
145
  input_length = inputs['input_ids'].shape[1]
146
  generated_tokens = output[0][input_length:]
147
  reply = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
148
+
149
+ # Clean up response (remove any system artifacts)
150
+ reply = reply.replace("**", "").replace("*", "")
151
+
152
+ # Ensure short responses (simulate real person)
153
+ sentences = reply.split('.')
154
+ if len(sentences) > 3:
155
+ reply = '. '.join(sentences[:3]) + '.'
156
 
157
  return jsonify({"reply": reply})
158
 
 
168
  return jsonify({
169
  "status": "healthy",
170
  "model": model_id,
171
+ "device": str(device),
172
+ "mode": "Scam Baiting Assistant"
173
  })
174
 
175
  if __name__ == "__main__":