Valtry committed
Commit b00a63a · verified · 1 Parent(s): e12212e

Update app.py

Files changed (1): app.py +88 -40
app.py CHANGED
@@ -1,9 +1,13 @@
-import gradio as gr
+from fastapi import FastAPI
+from pydantic import BaseModel
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from sse_starlette.sse import EventSourceResponse
 import torch
 from threading import Thread
+import json
 
-# Faster small model for CPU
+app = FastAPI()
+
 model_name = "Qwen/Qwen2-0.5B-Instruct"
 
 print("Loading tokenizer...")
@@ -18,49 +22,93 @@ model = AutoModelForCausalLM.from_pretrained(
 print("Model loaded successfully!")
 
 
-def chat(message):
+class ChatRequest(BaseModel):
+    model: str
+    messages: list
+    stream: bool = False
+
+
+@app.post("/v1/chat/completions")
+async def chat(req: ChatRequest):
+
+    user_message = req.messages[-1]["content"]
 
     prompt = f"""
-You are a helpful AI assistant.
+You are a helpful assistant.
 
-User: {message}
+User: {user_message}
 Assistant:
 """
 
     inputs = tokenizer(prompt, return_tensors="pt")
 
-    # streamer for token streaming
-    streamer = TextIteratorStreamer(
-        tokenizer,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-
-    generation_kwargs = dict(
-        **inputs,
-        streamer=streamer,
-        max_new_tokens=80,
-        temperature=0.7,
-        do_sample=True
-    )
-
-    # run generation in separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    partial_text = ""
-
-    for new_token in streamer:
-        partial_text += new_token
-        yield partial_text
-
-
-demo = gr.Interface(
-    fn=chat,
-    inputs=gr.Textbox(label="Ask something"),
-    outputs=gr.Textbox(label="AI Response"),
-    title="Auric AI Model Test (Streaming)",
-    description="Testing Qwen2-0.5B model with streaming output"
-)
-
-demo.launch()
+    # ---------- STREAM MODE ----------
+    if req.stream:
+
+        streamer = TextIteratorStreamer(
+            tokenizer,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+
+        generation_kwargs = dict(
+            **inputs,
+            streamer=streamer,
+            max_new_tokens=80,
+            temperature=0.7,
+            do_sample=True
+        )
+
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        async def event_generator():
+
+            for token in streamer:
+
+                data = {
+                    "choices": [
+                        {
+                            "delta": {
+                                "content": token
+                            }
+                        }
+                    ]
+                }
+
+                yield {
+                    "event": "message",
+                    "data": json.dumps(data)
+                }
+
+            yield {
+                "event": "message",
+                "data": "[DONE]"
+            }
+
+        return EventSourceResponse(event_generator())
+
+    # ---------- NORMAL MODE ----------
+    else:
+
+        output = model.generate(
+            **inputs,
+            max_new_tokens=80,
+            temperature=0.7
+        )
+
+        response = tokenizer.decode(output[0], skip_special_tokens=True)
+
+        if "Assistant:" in response:
+            response = response.split("Assistant:")[-1].strip()
+
+        return {
+            "choices": [
+                {
+                    "message": {
+                        "role": "assistant",
+                        "content": response
+                    }
+                }
+            ]
+        }
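For anyone testing the new endpoint, below is a minimal client sketch (not part of the commit). It assumes the app is served locally, e.g. with `uvicorn app:app --port 8000`, and uses the `requests` package; the URL, port, and prompt text are illustrative placeholders, not anything specified by the commit.

# client_example.py — a hedged sketch for exercising both modes of the endpoint.
import json
import requests

URL = "http://localhost:8000/v1/chat/completions"  # assumed host/port
body = {
    "model": "Qwen/Qwen2-0.5B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
}

# Non-streaming mode: the handler returns a single JSON object.
resp = requests.post(URL, json={**body, "stream": False})
print(resp.json()["choices"][0]["message"]["content"])

# Streaming mode: sse-starlette frames each event as "event: ..." /
# "data: ..." lines separated by blank lines; "[DONE]" ends the stream.
with requests.post(URL, json={**body, "stream": True}, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip "event:" lines and blank separators
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        print(json.loads(payload)["choices"][0]["delta"]["content"],
              end="", flush=True)

The streaming branch simply reads the `data:` lines emitted by `sse-starlette` and stops at the `[DONE]` sentinel, mirroring the OpenAI-style chat-completions stream the handler produces.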