Create app.py
app.py
ADDED
@@ -0,0 +1,58 @@
+import torch
+from flask import Flask, request, Response, render_template
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
+
+app = Flask(__name__)
+
+# 1. Load the Model and Tokenizer
+# The sources highlight that the LLM serves as the cognitive core or "brain" [5, 6].
+model_id = "AshokGakr/model-tiny"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto"
+)
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@app.route('/chat', methods=['POST'])
+def chat():
+    data = request.json
+    user_messages = data.get("messages", [])
+
+    # 2. Apply Chat Template
+    # Using the specific Jinja template from the model repo to format the prompt.
+    # This ensures the model follows the intended "Thought-Action-Observation" cycle [7].
+    input_ids = tokenizer.apply_chat_template(
+        user_messages,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    ).to(model.device)
+
+    # 3. Setup Streaming
+    # Context engineering involves curating the optimal set of tokens for inference [8].
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=512,
+        do_sample=True,
+        temperature=0.7
+    )
+
+    # Run generation in a separate thread to allow the Flask response to stream
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    def generate():
+        for new_text in streamer:
+            yield new_text
+
+    return Response(generate(), mimetype='text/plain')
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860)  # Standard HF Space port
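For reference, the /chat route above expects a JSON body with a "messages" list in the usual chat format (role/content dicts) and streams the completion back as plain text. The sketch below is a minimal client, not part of this commit; it assumes the app is running locally on port 7860 (adjust the URL for a deployed Space) and uses the requests library.

# Minimal client sketch for the /chat endpoint above (assumed local URL, not part of app.py).
import requests

messages = [
    {"role": "user", "content": "Hello! What can you do?"}
]

# stream=True lets us consume the response chunk by chunk as tokens are generated
with requests.post(
    "http://localhost:7860/chat",          # hypothetical local address for this Space
    json={"messages": messages},
    stream=True,
) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
print()

Note that the index route renders templates/index.html, which is not included in this commit; the app will need that template (or a separate commit adding it) before the root page works.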