Ashok75 committed on
Commit
cfc606a
·
verified ·
1 Parent(s): 066210c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from flask import Flask, request, Response, render_template
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
+ from threading import Thread
5
+
6
# Flask application serving a minimal streaming-chat API backed by a local LLM.
app = Flask(__name__)

# 1. Load the Model and Tokenizer
# Loaded once at import time so every request reuses the same weights.
# The sources highlight that the LLM serves as the cognitive core or "brain" [5, 6].
model_id = "AshokGakr/model-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # fp16 halves memory when a GPU is present; fall back to fp32 on CPU,
    # where half-precision kernels are generally unavailable.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    # NOTE(review): device_map="auto" delegates weight placement to
    # accelerate — requires `accelerate` to be installed; confirm it is in
    # the Space's requirements.
    device_map="auto"
)
17
+
18
@app.route('/')
def index():
    """Serve the chat front-end shell (templates/index.html)."""
    page = render_template('index.html')
    return page
21
+
22
@app.route('/chat', methods=['POST'])
def chat():
    """Stream a chat completion for the posted conversation.

    Expects a JSON body of the form ``{"messages": [...]}`` where each
    message matches the model's chat-template schema. Returns a plain-text
    streaming Response of generated tokens, or a 400 if no messages were
    supplied.
    """
    # request.json errors out (400/415 → generic 500 to the client) when the
    # body is absent or not JSON; get_json(silent=True) returns None instead,
    # so we can answer with a clear client error.
    data = request.get_json(silent=True) or {}
    user_messages = data.get("messages", [])
    if not user_messages:
        # apply_chat_template on an empty history would fail server-side;
        # reject the request explicitly instead.
        return Response("No messages provided.", status=400, mimetype='text/plain')

    # 2. Apply Chat Template
    # Format the conversation with the model's own Jinja template so
    # generation continues from a properly delimited assistant turn; this
    # preserves the intended "Thought-Action-Observation" cycle [7].
    input_ids = tokenizer.apply_chat_template(
        user_messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 3. Setup Streaming
    # TextIteratorStreamer yields decoded text fragments as generate()
    # produces them; skip_prompt avoids echoing the input back to the client.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7
    )

    # generate() blocks until completion, so run it on a worker thread and
    # let the Flask response generator drain the streamer concurrently.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    def generate():
        # The streamer's iterator terminates when generation finishes.
        for new_text in streamer:
            yield new_text
        # Reap the worker thread so it does not outlive the response.
        thread.join()

    return Response(generate(), mimetype='text/plain')
56
+
57
if __name__ == '__main__':
    # Bind to all interfaces on 7860 — the port Hugging Face Spaces routes
    # external traffic to.
    host, port = '0.0.0.0', 7860
    app.run(host=host, port=port)