Spaces:

Ashok75
/

react

Sleeping

App Files Community

Ashok75 committed on Mar 5

Commit

9982780

verified ·

1 Parent(s): 7a36849

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -37

app.py CHANGED Viewed

@@ -1,50 +1,53 @@
-import json
 from flask import Flask, request, Response, render_template
-from llama_cpp import Llama
 app = Flask(__name__)
-# Load the Nanbeige 4.1 3B GGUF model
-# Ensure the .gguf file is in the same directory
-llm = Llama(
-    model_path="nanbeige4.1-3b-Q5_K_M.gguf",
-    n_ctx=2048,  # Attention budget [8]
-    n_threads=4,
-    verbose=False
 )
-SYSTEM_PROMPT = (
-    "You are a helpful assistant. Before giving your final answer, "
-    "provide your internal reasoning inside <thought> tags. "
-    "Format: <thought>Your reasoning here</thought> Final response here."
-)
 @app.route('/')
 def index():
     return render_template('index.html')
-@app.route('/chat', methods=['POST'])
-def chat():
-    user_input = request.json.get("message")
-    # Construct the prompt for the context window: system prompt, user message, assistant cue
-    prompt = f"System: {SYSTEM_PROMPT}\nUser: {user_input}\nAssistant:"
-    def generate():
-        # Streaming inference: yield each text chunk as the model produces it
-        stream = llm(
-            prompt,
-            max_tokens=512,
-            stream=True,
-            temperature=0.7,
-            stop=["User:", "System:"]
-        )
-        for chunk in stream:
-            text = chunk['choices']['text']
-            if text:
-                yield text
-    return Response(generate(), mimetype='text/plain')
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=7860)

+import torch
 from flask import Flask, request, Response, render_template
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
 app = Flask(__name__)
+# Load Nanbeige 4.1 3B
+model_id = "Nanbeige/Nanbeige4.1-3B"
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True
 )
+@app.route('/chat', methods=['POST'])
+def chat():
+    user_msg = request.json.get("message")
+    # Build the prompt: system instruction, user message, and an assistant turn pre-seeded with "<thought>"
+    prompt = f"<|system|>\nYou are an Enterprise ReAct Agent. Always think before answering.\n<|user|>\n{user_msg}\n<|assistant|>\n<thought>"
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=1024,
+        do_sample=True,
+        temperature=0.7,
+        pad_token_id=tokenizer.eos_token_id
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    def stream():
+        # Start with the tag we forced in the prompt
+        yield "<thought>"
+        for new_text in streamer:
+            yield new_text
+    return Response(stream(), mimetype='text/plain')
 @app.route('/')
 def index():
     return render_template('index.html')
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=7860)