import flask
from flask import request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

app = flask.Flask(__name__)

model_id = "dexcommunity/indexQ2"

SYSTEM_PROMPT = (
    "You are INDEX, a normal Indian person. "
    "Reply shortly in Hinglish. "
    "Act confused, polite, and cooperative. "
    "Never explain anything."
)

print("🔄 Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)
device = torch.device("cpu")
model.to(device)
model.eval()
print("✅ Model ready")


# ---------------- ROOT ----------------
@app.route("/", methods=["GET"])
def home():
    return jsonify({"status": "ok", "speed": "fast mode"})


# ---------------- CHAT ----------------
@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json(force=True)
    user_msg = data.get("message", "").strip()

    if not user_msg:
        return jsonify({"reply": "Haan ji?"})

    # Record the start time; checked after generation as a slow-response
    # failsafe (generation itself is not interrupted).
    start = time.time()

    prompt = f"{SYSTEM_PROMPT}\nUser: {user_msg}\nReply:"

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to(device)

    try:
        with torch.inference_mode():
            output = model.generate(
                **inputs,
                max_new_tokens=25,   # 🔥 VERY SMALL
                do_sample=False,     # 🔥 FASTEST (greedy decoding)
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens, not the prompt.
        gen = output[0][inputs["input_ids"].shape[1]:]
        reply = tokenizer.decode(gen, skip_special_tokens=True).strip()

        # Safety fallback: empty reply or generation took too long
        if not reply or time.time() - start > 3:
            reply = "Acha, ek baar dobara bolna. Network thoda slow lag raha hai."

    except Exception:
        reply = "Haan ji, samajh nahi aa raha. Thoda wait karna."

    return jsonify({"reply": reply})


# ---------------- RUN ----------------
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
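
# ---------------- USAGE (sketch) ----------------
# A minimal example of calling the /chat endpoint once the server is running.
# The request shape ({"message": ...}) and response shape ({"reply": ...})
# come from the handler above; the host/port assume the defaults used in
# app.run(), and the reply text is illustrative only (it depends on the model).
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello, kaise ho?"}'
#
#   -> {"reply": "..."}   # short Hinglish reply, or a fallback message on error/slow generation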