Kelvin000010191 committed on
Commit
4c94b66
·
verified ·
1 Parent(s): 7e5cb5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -23
app.py CHANGED
@@ -1,48 +1,70 @@
1
  from flask import Flask, request, jsonify
2
  from flask_cors import CORS
3
  import os
4
- from huggingface_hub import InferenceClient
 
5
 
# Flask application object; the routes below register against it.
app = Flask(__name__)

# Enable CORS with flask_cors defaults so a client served from another
# origin (the original note mentions Acode) can call this API.
CORS(app)

# Hugging Face access token read from the environment; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Shared inference client for the chat route. No model is bound here;
# the route names the model on every call.
client = InferenceClient(token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
@app.route("/", methods=["GET"])
def home():
    """Return a plain-text banner showing the service is reachable."""
    banner = "Krypton-1 Core Engine Online."
    return banner
18
 
@app.route("/api/chat", methods=["POST"])
def chat():
    """Forward the posted prompt to the hosted Mistral model and return its reply.

    Expects a JSON body shaped like ``{"prompt": "<text>"}``. Every response
    is a JSON object with a single ``reply`` key. Failures are reported in
    that same field with a ``SYS_*`` prefix so the client always has a
    string to display.

    Returns:
        HTTP 400 when the body is not JSON or carries no usable prompt;
        otherwise HTTP 200 with either the model reply or a ``SYS_*``
        diagnostic (upstream failures keep the 200 status existing
        clients already receive).
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so bad input is no longer misreported as a connection failure.
    data = request.get_json(silent=True)
    user_prompt = data.get("prompt") if isinstance(data, dict) else None
    if not isinstance(user_prompt, str) or not user_prompt.strip():
        return jsonify({"reply": "SYS_INPUT_ERR: Send a JSON body with a non-empty 'prompt' string."}), 400

    try:
        response = client.chat_completion(
            model="mistralai/Mistral-7B-Instruct-v0.3",
            messages=[{"role": "user", "content": user_prompt}],
            max_tokens=250,
            temperature=0.7,
        )
        reply_text = response.choices[0].message.content
        return jsonify({"reply": reply_text})

    except Exception as e:
        # Request boundary: record the traceback server-side, then map the
        # failure onto a message the client can show.
        app.logger.exception("Hosted inference call failed")
        raw_error = str(e)
        lowered = raw_error.lower()

        if "401" in raw_error or "unauthorized" in lowered:
            return jsonify({"reply": "SYS_AUTH_ERR: Token unauthorized. Check your HF_TOKEN in Space Secrets."})

        if "loading" in lowered or "503" in raw_error:
            return jsonify({"reply": "SYS_BOOT: Mistral cluster is warming up weights. Give the cloud 30 seconds, then try again!"})

        return jsonify({"reply": f"SYS_ALERT: Connection failure. Details: {raw_error}"})
46
 
if __name__ == "__main__":
    # Direct-execution entry point: listen on every interface, port 7860.
    app.run(port=7860, host="0.0.0.0")
 
1
  from flask import Flask, request, jsonify
2
  from flask_cors import CORS
3
  import os
4
+ import torch
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
6
 
# Flask application object with CORS enabled (flask_cors defaults) so a
# client served from another origin can call the API.
app = Flask(__name__)
CORS(app)

MODEL_NAME = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
# Hugging Face access token read from the environment; None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# 4-bit NF4 quantization settings with float16 compute, passed to the
# model loader below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Both stay None when loading fails; the chat route treats a None model as
# "engine unavailable" and answers with a diagnostic instead of crashing.
tokenizer = None
model = None

print("Initializing local memory nodes... Downloading Unsloth bits...")

# The tokenizer load sits inside the same try as the model load so that a
# download/auth failure at either step degrades the service the same way
# rather than aborting the import of this module.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        token=HF_TOKEN,
    )
    print("Krypton-1 Core Engine Fully Loaded in Space Memory.")
except Exception as e:
    print(f"Boot Failure Error: {e}")
    # Never leave a half-initialised pair behind.
    tokenizer = None
    model = None
37
 
@app.route("/", methods=["GET"])
def home():
    """Serve the plain-text status line for the root URL."""
    status_line = "Krypton-1 Dedicated Unsloth Node is Online."
    return status_line
41
 
@app.route("/api/chat", methods=["POST"])
def chat():
    """Generate a reply to the posted prompt with the locally loaded model.

    Expects a JSON body shaped like ``{"prompt": "<text>"}``. Every response
    is a JSON object with a single ``reply`` key. Failures are reported in
    that same field with a ``SYS_*`` prefix so the client always has a
    string to display.

    Returns:
        HTTP 400 when the body is not JSON or carries no usable prompt;
        otherwise HTTP 200 with either the generated text or a ``SYS_*``
        diagnostic (engine/generation failures keep the 200 status
        existing clients already receive).
    """
    if model is None or tokenizer is None:
        return jsonify({"reply": "SYS_ERR: Engine failed to cache local weights. Check container storage logs."})

    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so bad input is reported as bad input.
    data = request.get_json(silent=True)
    user_prompt = data.get("prompt") if isinstance(data, dict) else None
    if not isinstance(user_prompt, str) or not user_prompt.strip():
        return jsonify({"reply": "SYS_INPUT_ERR: Send a JSON body with a non-empty 'prompt' string."}), 400

    try:
        messages = [{"role": "user", "content": user_prompt}]

        # return_dict=True also yields the attention mask; the tensors are
        # moved to wherever the model was placed rather than a guessed device.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        prompt_len = inputs["input_ids"].shape[-1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=250,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # The output sequence starts with the prompt tokens. Dropping them by
        # length is exact, unlike searching the decoded text for the prompt
        # string, which fails when decoding normalises the text and cuts the
        # answer short when the model repeats the prompt.
        new_tokens = outputs[0][prompt_len:]
        reply_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        return jsonify({"reply": reply_text})

    except Exception as e:
        # Request boundary: record the traceback server-side and still hand
        # the client a displayable message.
        app.logger.exception("Local generation failed")
        return jsonify({"reply": f"SYS_ALERT: Internal processing breakdown. Details: {e}"})
 
 
 
 
 
 
 
 
68
 
if __name__ == "__main__":
    # Started as a script: serve on all interfaces at port 7860.
    app.run(port=7860, host="0.0.0.0")