# Hugging Face Space snapshot (status: Sleeping) — file size: 2,141 bytes
import flask
from flask import request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time
app = flask.Flask(__name__)

# Hugging Face model repo served by this app — presumably a small quantized
# chat model (it is run on CPU below); TODO confirm it fits the host's RAM.
model_id = "dexcommunity/indexQ2"

# Persona prompt prepended to every user message (see /chat below); keeps
# replies short, in-character, and in Hinglish.
SYSTEM_PROMPT = (
    "You are INDEX, a normal Indian person. "
    "Reply shortly in Hinglish. "
    "Act confused, polite, and cooperative. "
    "Never explain anything."
)

# Model load happens once at import time, before the server starts; the
# first boot downloads weights from the Hub, so it can take a while.
print("🔄 Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,   # full precision — CPU gains nothing from fp16
    low_cpu_mem_usage=True       # stream weights in to keep peak RAM low
)
device = torch.device("cpu")
model.to(device)
model.eval()  # inference only: disables dropout / training-mode layers
print("✅ Model ready")
# ---------------- ROOT ----------------
@app.route("/", methods=["GET"])
def home():
    """Health-check endpoint: report that the service is up."""
    status_payload = {"status": "ok", "speed": "fast mode"}
    return jsonify(status_payload)
# ---------------- CHAT ----------------
@app.route("/chat", methods=["POST"])
def chat():
    """POST /chat — JSON body {"message": str} -> JSON {"reply": str}.

    Runs greedy decoding with a very small token budget on CPU. Every
    failure mode (malformed payload, empty message, model error, slow
    generation) falls back to a canned in-character reply rather than
    returning an HTTP error to the client.
    """
    # silent=True: a malformed or non-JSON body yields None instead of a
    # 400 BadRequest, so bad payloads get the same polite fallback as an
    # empty message. The isinstance guard also covers valid-but-non-object
    # JSON (e.g. a bare list/number), which would otherwise crash .get().
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"reply": "Haan ji?"})
    # str() coercion: a non-string "message" value (number, null) must not
    # blow up on .strip().
    user_msg = str(data.get("message", "") or "").strip()
    if not user_msg:
        return jsonify({"reply": "Haan ji?"})
    # Time-limit failsafe — NOTE: this is checked *after* generation; it can
    # only swap a late reply for the fallback text, it cannot interrupt
    # model.generate() itself.
    start = time.time()
    prompt = f"{SYSTEM_PROMPT}\nUser: {user_msg}\nReply:"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,       # clamp long user messages to the 256-token window
        max_length=256
    ).to(device)
    try:
        with torch.inference_mode():
            output = model.generate(
                **inputs,
                max_new_tokens=25,   # 🔥 VERY SMALL
                do_sample=False,     # 🔥 FASTEST (greedy decoding, deterministic)
                pad_token_id=tokenizer.eos_token_id
            )
        # Slice off the prompt tokens; decode only the newly generated tail.
        gen = output[0][inputs["input_ids"].shape[1]:]
        reply = tokenizer.decode(gen, skip_special_tokens=True).strip()
        # Safety fallback: empty generation or >3s wall time gets canned text.
        if not reply or time.time() - start > 3:
            reply = "Acha, ek baar dobara bolna. Network thoda slow lag raha hai."
    except Exception:
        # Best-effort endpoint: never surface a model error to the client.
        reply = "Haan ji, samajh nahi aa raha. Thoda wait karna."
    return jsonify({"reply": reply})
# ---------------- RUN ----------------
if __name__ == "__main__":
    # Bind all interfaces on port 7860 — NOTE(review): 7860 looks like the
    # Hugging Face Spaces default container port; confirm deployment target.
    app.run(host="0.0.0.0", port=7860)