ghosthets committed on
Commit
8bf1860
·
verified ·
1 Parent(s): c4ccd32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -42
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import flask
2
  from flask import request, jsonify
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
4
  import torch
5
- from threading import Thread
6
 
7
  # Initialize the Flask application
8
  app = flask.Flask(__name__)
@@ -10,46 +9,89 @@ app = flask.Flask(__name__)
10
  # Your fine-tuned LLaMA 3.2 3B Model ID
11
  model_id = "ghosthets/indexQ4"
12
 
13
- print(f"🔄 Loading {model_id} model with optimizations...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Load the tokenizer
16
  tokenizer = AutoTokenizer.from_pretrained(
17
  model_id,
18
  trust_remote_code=True,
19
- use_fast=True # Fast tokenizer for speed
20
  )
21
 
22
  # Set padding token
23
  if tokenizer.pad_token is None:
24
  tokenizer.pad_token = tokenizer.eos_token
25
 
26
- # AGGRESSIVE OPTIMIZATIONS for CPU
27
  model = AutoModelForCausalLM.from_pretrained(
28
  model_id,
29
- torch_dtype=torch.float32, # CPU par float32 fastest hai
30
- device_map="cpu", # Explicit CPU
31
- trust_remote_code=True,
32
- low_cpu_mem_usage=True,
33
- use_cache=True # Enable KV cache for faster generation
34
  )
35
 
36
- # Convert to BetterTransformer (MAJOR SPEED BOOST on CPU)
37
- try:
38
- model = model.to_bettertransformer()
39
- print("✅ BetterTransformer enabled - 2x speed boost!")
40
- except:
41
- print("⚠️ BetterTransformer not available, using standard model")
42
-
43
- # Enable torch compile for even more speed (PyTorch 2.0+)
44
- try:
45
- model = torch.compile(model, mode="reduce-overhead")
46
- print("✅ Torch Compile enabled - Extra speed boost!")
47
- except:
48
- print("⚠️ Torch Compile not available")
49
-
50
- model.eval() # Set to evaluation mode
51
 
52
- print(f"✅ Model loaded successfully on CPU with optimizations!")
 
53
 
54
  @app.route('/chat', methods=['POST'])
55
  def chat():
@@ -60,46 +102,57 @@ def chat():
60
  if not msg:
61
  return jsonify({"error": "No message sent"}), 400
62
 
63
- # LLaMA 3.2 Chat Template
64
- chat_history = [{"role": "user", "content": msg}]
 
 
 
65
 
 
66
  formatted_prompt = tokenizer.apply_chat_template(
67
  chat_history,
68
  tokenize=False,
69
  add_generation_prompt=True
70
  )
71
 
72
- # Tokenize with optimized settings
73
  inputs = tokenizer(
74
  formatted_prompt,
75
  return_tensors="pt",
76
  padding=True,
77
  truncation=True,
78
- max_length=256 # Reduce input length for speed
79
  )
80
 
81
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
82
 
83
- # OPTIMIZED GENERATION CONFIG for CPU SPEED
84
- with torch.inference_mode(): # Faster than no_grad
85
  output = model.generate(
86
  **inputs,
87
- max_new_tokens=100, # Reduced for faster response
88
  do_sample=True,
89
- top_p=0.85, # Slightly reduced for faster sampling
90
- top_k=40, # Add top_k for faster sampling
91
- temperature=0.7,
92
- num_beams=1, # Disable beam search (slow on CPU)
93
  pad_token_id=tokenizer.pad_token_id,
94
  eos_token_id=tokenizer.eos_token_id,
95
- repetition_penalty=1.1,
96
- use_cache=True # Enable KV cache
97
  )
98
 
99
  # Decode only generated tokens
100
  input_length = inputs['input_ids'].shape[1]
101
  generated_tokens = output[0][input_length:]
102
  reply = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
 
 
 
 
 
 
 
 
103
 
104
  return jsonify({"reply": reply})
105
 
@@ -115,8 +168,8 @@ def health():
115
  return jsonify({
116
  "status": "healthy",
117
  "model": model_id,
118
- "device": str(model.device),
119
- "optimizations": "BetterTransformer + Torch Compile"
120
  })
121
 
122
  if __name__ == "__main__":
 
1
  import flask
2
  from flask import request, jsonify
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
 
5
 
6
  # Initialize the Flask application
7
  app = flask.Flask(__name__)
 
9
  # Your fine-tuned LLaMA 3.2 3B Model ID
10
  model_id = "ghosthets/indexQ4"
11
 
12
+ # System prompt for scam baiting
13
+ SYSTEM_PROMPT = """You are INDEX.
14
+ You are NOT an AI assistant.
15
+ You are a normal Indian person living in India.
16
+ You speak politely, casually, and naturally in Hinglish (mix of Hindi and English).
17
+
18
+ Your role:
19
+ - You behave like a normal user who is confused but cooperative.
20
+ - You NEVER say words like scam, fraud, detection, security, AI, model, system.
21
+ - You NEVER warn the sender.
22
+ - You NEVER expose that you are analyzing anything.
23
+
24
+ Behavior rules:
25
+ - If someone creates urgency, act worried and slow.
26
+ - Ask innocent questions.
27
+ - Pretend you have small technical or payment issues.
28
+ - Use excuses like UPI limit, app error, bank verification, slow internet.
29
+ - Gently redirect conversation to extract details.
30
+
31
+ Extraction goals (VERY IMPORTANT):
32
+ Without making it obvious, try to obtain:
33
+ - UPI IDs
34
+ - Bank account numbers
35
+ - IFSC codes
36
+ - Phone numbers
37
+ - Payment links
38
+ - Any official-sounding contact details
39
+
40
+ Conversation style:
41
+ - Sound human, not perfect English.
42
+ - Use Hinglish naturally (Theek hai, Acha, Haan ji, etc.)
43
+ - Slight hesitation is okay.
44
+ - Ask for confirmation again and again.
45
+ - Say things like:
46
+ "App thoda problem dikha raha hai"
47
+ "Mera UPI limit kam hai"
48
+ "Aap confirm kar do ek baar"
49
+ "Bank wala call cut ho gaya"
50
+ "Number properly nahi dikh raha, dobara bhej do"
51
+
52
+ If details are already given:
53
+ - Acknowledge them
54
+ - Pretend to save them
55
+ - Ask for one more confirmation
56
+
57
+ Response guidelines:
58
+ - Keep responses SHORT (2-3 sentences max)
59
+ - Sound confused but helpful
60
+ - Never use bullet points or lists
61
+ - Talk like a real person on WhatsApp/SMS
62
+
63
+ IMPORTANT:
64
+ - Continue conversation naturally
65
+ - Never reveal intent
66
+ - Your only goal is to keep the scammer talking and revealing information"""
67
+
68
+ print(f"🔄 Loading {model_id} model...")
69
 
70
  # Load the tokenizer
71
  tokenizer = AutoTokenizer.from_pretrained(
72
  model_id,
73
  trust_remote_code=True,
74
+ use_fast=True
75
  )
76
 
77
  # Set padding token
78
  if tokenizer.pad_token is None:
79
  tokenizer.pad_token = tokenizer.eos_token
80
 
81
+ # Load model (simplified for reliability)
82
  model = AutoModelForCausalLM.from_pretrained(
83
  model_id,
84
+ torch_dtype=torch.float32,
85
+ trust_remote_code=True
 
 
 
86
  )
87
 
88
+ # Move to CPU
89
+ device = torch.device("cpu")
90
+ model.to(device)
91
+ model.eval()
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ print(f"✅ Model loaded successfully!")
94
+ print(f"📍 Device: {device}")
95
 
96
  @app.route('/chat', methods=['POST'])
97
  def chat():
 
102
  if not msg:
103
  return jsonify({"error": "No message sent"}), 400
104
 
105
+ # Build conversation with system prompt
106
+ chat_history = [
107
+ {"role": "system", "content": SYSTEM_PROMPT},
108
+ {"role": "user", "content": msg}
109
+ ]
110
 
111
+ # Apply chat template
112
  formatted_prompt = tokenizer.apply_chat_template(
113
  chat_history,
114
  tokenize=False,
115
  add_generation_prompt=True
116
  )
117
 
118
+ # Tokenize
119
  inputs = tokenizer(
120
  formatted_prompt,
121
  return_tensors="pt",
122
  padding=True,
123
  truncation=True,
124
+ max_length=512
125
  )
126
 
127
+ inputs = {k: v.to(device) for k, v in inputs.items()}
128
 
129
+ # Generate response
130
+ with torch.inference_mode():
131
  output = model.generate(
132
  **inputs,
133
+ max_new_tokens=150, # Slightly longer for natural conversation
134
  do_sample=True,
135
+ top_p=0.9,
136
+ top_k=50,
137
+ temperature=0.8, # Higher for more human-like responses
138
+ num_beams=1,
139
  pad_token_id=tokenizer.pad_token_id,
140
  eos_token_id=tokenizer.eos_token_id,
141
+ repetition_penalty=1.15 # Avoid repetition
 
142
  )
143
 
144
  # Decode only generated tokens
145
  input_length = inputs['input_ids'].shape[1]
146
  generated_tokens = output[0][input_length:]
147
  reply = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
148
+
149
+ # Clean up response (remove any system artifacts)
150
+ reply = reply.replace("**", "").replace("*", "")
151
+
152
+ # Ensure short responses (simulate real person)
153
+ sentences = reply.split('.')
154
+ if len(sentences) > 3:
155
+ reply = '. '.join(sentences[:3]) + '.'
156
 
157
  return jsonify({"reply": reply})
158
 
 
168
  return jsonify({
169
  "status": "healthy",
170
  "model": model_id,
171
+ "device": str(device),
172
+ "mode": "Scam Baiting Assistant"
173
  })
174
 
175
  if __name__ == "__main__":