Spaces:

Kelvin000010191
/

krypton-api-node

Running

App Files Files Community

Kelvin000010191 committed 7 days ago

Commit

4c94b66

verified ·

1 Parent(s): 7e5cb5d

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -23

app.py CHANGED Viewed

@@ -1,48 +1,70 @@
 from flask import Flask, request, jsonify
 from flask_cors import CORS
 import os
-from huggingface_hub import InferenceClient
 app = Flask(__name__)
-CORS(app)  # Unlocks your Acode connection securely
-# Pulls your hidden WRITE token from Space secrets
 HF_TOKEN = os.getenv("HF_TOKEN")
-# Connects directly to the official, fully-supported Mistral-7B Instruct cluster
-client = InferenceClient(token=HF_TOKEN)
 @app.route("/", methods=["GET"])
 def home():
-    return "Krypton-1 Core Engine Online."
 @app.route("/api/chat", methods=["POST"])
 def chat():
     try:
         data = request.json
         user_prompt = data.get("prompt", "")
-        # Targets the official active Mistral endpoint layout
-        response = client.chat_completion(
-            model="mistralai/Mistral-7B-Instruct-v0.3",
-            messages=[{"role": "user", "content": user_prompt}],
-            max_tokens=250,
-            temperature=0.7
-        )
-        reply_text = response.choices[0].message.content
         return jsonify({"reply": reply_text})
     except Exception as e:
-        raw_error = str(e)
-        if "401" in raw_error or "unauthorized" in raw_error.lower():
-            return jsonify({"reply": "SYS_AUTH_ERR: Token unauthorized. Check your HF_TOKEN in Space Secrets."})
-        if "loading" in raw_error.lower() or "503" in raw_error:
-            return jsonify({"reply": "SYS_BOOT: Mistral cluster is warming up weights. Give the cloud 30 seconds, then try again!"})
-        return jsonify({"reply": f"SYS_ALERT: Connection failure. Details: {raw_error}"})
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)

 from flask import Flask, request, jsonify
 from flask_cors import CORS
 import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 app = Flask(__name__)
+CORS(app)
+MODEL_NAME = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
 HF_TOKEN = os.getenv("HF_TOKEN")
+print("Initializing local memory nodes... Downloading Unsloth bits...")
+# 1. Load the tokenizer matching your exact model structure
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
+# 2. Configure 4-bit loading directly inside your Space container memory
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16
+)
+# 3. Pull weights directly into local cache space
+try:
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        quantization_config=bnb_config,
+        device_map="auto",
+        token=HF_TOKEN
+    )
+    print("Krypton-1 Core Engine Fully Loaded in Space Memory.")
+except Exception as e:
+    print(f"Boot Failure Error: {str(e)}")
+    model = None
 @app.route("/", methods=["GET"])
 def home():
+    return "Krypton-1 Dedicated Unsloth Node is Online."
 @app.route("/api/chat", methods=["POST"])
 def chat():
+    if model is None:
+        return jsonify({"reply": "SYS_ERR: Engine failed to cache local weights. Check container storage logs."})
     try:
         data = request.json
         user_prompt = data.get("prompt", "")
+        # Structure the prompt properly for Mistral Instruct layout
+        messages = [{"role": "user", "content": user_prompt}]
+        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
+        # Generate raw response tokens straight from your loaded weights
+        outputs = model.generate(inputs, max_new_tokens=250, temperature=0.7, do_sample=True)
+        # Decode the output text cleanly
+        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Clean up output text by stripping out the original prompt block if echoed
+        reply_text = decoded.split(user_prompt)[-1].strip() if user_prompt in decoded else decoded
         return jsonify({"reply": reply_text})
     except Exception as e:
+        return jsonify({"reply": f"SYS_ALERT: Internal processing breakdown. Details: {str(e)}"})
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)