# indexq2 / app.py
import time

import torch
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM

app = Flask(__name__)
model_id = "dexcommunity/indexQ2"
SYSTEM_PROMPT = (
    "You are INDEX, a normal Indian person. "
    "Reply shortly in Hinglish. "
    "Act confused, polite, and cooperative. "
    "Never explain anything."
)
print("πŸ”„ Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.float32,
low_cpu_mem_usage=True
)
device = torch.device("cpu")
model.to(device)
model.eval()
print("βœ… Model ready")

# ---------------- ROOT ----------------
@app.route("/", methods=["GET"])
def home():
    return jsonify({"status": "ok", "speed": "fast mode"})

# ---------------- CHAT ----------------
@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json(force=True)
    user_msg = data.get("message", "").strip()
    if not user_msg:
        return jsonify({"reply": "Haan ji?"})

    # Failsafe timer: checked only after generation finishes, so it swaps in
    # a canned reply when generation ran long rather than enforcing a hard cap.
    start = time.time()
    prompt = f"{SYSTEM_PROMPT}\nUser: {user_msg}\nReply:"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256,
    ).to(device)

    try:
        with torch.inference_mode():
            output = model.generate(
                **inputs,
                max_new_tokens=25,  # 🔥 VERY SMALL: keeps CPU latency low
                do_sample=False,    # 🔥 FASTEST: greedy decoding
                pad_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens, skipping the prompt
        gen = output[0][inputs["input_ids"].shape[1]:]
        reply = tokenizer.decode(gen, skip_special_tokens=True).strip()
        # Safety fallback: empty output, or generation took longer than 3 s
        if not reply or time.time() - start > 3:
            reply = "Acha, ek baar dobara bolna. Network thoda slow lag raha hai."
    except Exception:
        reply = "Haan ji, samajh nahi aa raha. Thoda wait karna."

    return jsonify({"reply": reply})
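
# A sketch of how a client might call /chat, assuming the server runs locally
# on the port configured below (example message, not from the repo):
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello, kya haal hai?"}'
#
# The response is JSON of the shape {"reply": "..."}.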

# ---------------- RUN ----------------
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
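
# Note: app.run() starts Flask's development server. For production serving,
# one would typically front the app with a WSGI server instead (a sketch,
# assuming gunicorn is installed):
#
#   gunicorn --bind 0.0.0.0:7860 app:app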