File size: 2,141 Bytes
7339449
 
8bf1860
7339449
2f2febe
7339449
 
 
91b60c4
7339449
2f2febe
 
 
 
 
313924f
7339449
2f2febe
05fe403
2f2febe
313924f
 
8bf1860
c6011c6
313924f
7339449
8bf1860
 
 
313924f
2f2febe
7339449
2f2febe
c6011c6
 
2f2febe
c6011c6
2f2febe
c6011c6
7339449
2f2febe
 
 
 
 
 
 
 
c6011c6
2f2febe
 
 
 
 
 
 
 
 
 
8bf1860
313924f
 
2f2febe
 
 
313924f
 
2f2febe
 
c6011c6
2f2febe
 
 
02f2b8a
2f2febe
 
7339449
2f2febe
7339449
 
cc759ac
7339449
2f2febe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import flask
from flask import request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

app = flask.Flask(__name__)

# Hugging Face model repo to load. NOTE(review): not a standard hub name —
# confirm this repo is accessible from the deployment environment.
model_id = "dexcommunity/indexQ2"

# Persona prompt prepended to every user message (see chat()).
SYSTEM_PROMPT = (
    "You are INDEX, a normal Indian person. "
    "Reply shortly in Hinglish. "
    "Act confused, polite, and cooperative. "
    "Never explain anything."
)

print("🔄 Loading model...")

# Loaded once at import time; both endpoints share these module-level objects.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,      # full precision on CPU
    low_cpu_mem_usage=True          # stream weights to reduce peak RAM
)

# CPU-only deployment; eval() disables dropout/batch-norm training behavior.
device = torch.device("cpu")
model.to(device)
model.eval()

print("✅ Model ready")

# ---------------- ROOT ----------------
@app.route("/", methods=["GET"])
def home():
    """Health-check endpoint: report that the service is up."""
    payload = {"status": "ok", "speed": "fast mode"}
    return jsonify(payload)

# ---------------- CHAT ----------------
@app.route("/chat", methods=["POST"])
def chat():
    """Generate a short INDEX-persona reply for the posted message.

    Expects a JSON body {"message": str}; always returns JSON
    {"reply": str}, falling back to an in-persona message on any
    malformed input or generation failure.
    """
    # silent=True: a malformed JSON body yields None (handled below)
    # instead of an opaque 400 abort from force=True alone.
    data = request.get_json(force=True, silent=True) or {}
    # Coerce defensively: a non-string "message" (e.g. a JSON number)
    # would otherwise raise AttributeError on .strip() and produce a 500.
    user_msg = str(data.get("message") or "").strip()

    if not user_msg:
        return jsonify({"reply": "Haan ji?"})

    # NOTE: post-hoc failsafe only — generation is NOT interrupted;
    # a slow (>3s) result is merely replaced by the fallback below.
    start = time.time()

    prompt = f"{SYSTEM_PROMPT}\nUser: {user_msg}\nReply:"

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256      # cap prompt length to bound CPU latency
    ).to(device)

    try:
        with torch.inference_mode():
            output = model.generate(
                **inputs,
                max_new_tokens=25,        # keep replies very short/fast
                do_sample=False,          # greedy decoding = fastest
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        gen = output[0][inputs["input_ids"].shape[1]:]
        reply = tokenizer.decode(gen, skip_special_tokens=True).strip()

        # Safety fallback: empty output or generation slower than 3s.
        if not reply or time.time() - start > 3:
            reply = "Acha, ek baar dobara bolna. Network thoda slow lag raha hai."

    except Exception:
        # Log the traceback instead of swallowing it silently, but still
        # return a graceful in-persona fallback to the client.
        app.logger.exception("generation failed")
        reply = "Haan ji, samajh nahi aa raha. Thoda wait karna."

    return jsonify({"reply": reply})


# ---------------- RUN ----------------
if __name__ == "__main__":
    # Bind on all interfaces; port 7860 is the Hugging Face Spaces default.
    app.run(host="0.0.0.0", port=7860)