Joe6636564 committed on
Commit
9af0f0c
·
verified ·
1 Parent(s): 4899a6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -349
app.py CHANGED
@@ -1,396 +1,160 @@
1
  import os
2
- import time
3
  import torch
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AutoProcessor
5
  import gradio as gr
6
  from threading import Thread
7
  from PIL import Image
8
- from flask import Flask, request, jsonify
9
- import threading
10
  import numpy as np
 
 
11
 
12
- # Disable CUDA and force CPU usage
13
  os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
14
  torch.cuda.is_available = lambda: False
15
 
16
- # Initialize Flask app
17
- flask_app = Flask(__name__)
18
 
19
- # Device detection - force CPU
20
- def get_device():
21
- device = "cpu"
22
- print("Using CPU (GPU disabled)")
23
- return device
24
-
25
- device = get_device()
26
-
27
- # Model and tokenizer for the chatbot
28
  MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
29
- MODEL_LIST1 = ["microsoft/Phi-3.5-mini-instruct"]
30
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
31
-
32
- print("Loading tokenizer and model...")
33
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
34
-
35
- # CPU-only model loading
36
  model = AutoModelForCausalLM.from_pretrained(
37
  MODEL_ID1,
38
- torch_dtype=torch.float32, # Use float32 for CPU
39
  device_map="cpu",
40
- low_cpu_mem_usage=True # Optimize for CPU memory
41
  )
42
 
43
- # Vision model setup
44
- print("Loading vision models...")
45
  models = {}
46
  processors = {}
47
 
48
  try:
49
  models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
50
- "microsoft/Phi-3.5-vision-instruct",
51
- trust_remote_code=True,
52
- torch_dtype=torch.float32, # Use float32 for CPU
53
  device_map="cpu",
54
- low_cpu_mem_usage=True # Optimize for CPU memory
55
  ).eval()
56
-
57
  processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
58
- "microsoft/Phi-3.5-vision-instruct",
59
  trust_remote_code=True
60
  )
61
- print("Vision model loaded successfully on CPU")
62
  except Exception as e:
63
- print(f"Error loading vision model: {e}")
64
-
65
- # Chatbot function
66
- def stream_chat(
67
- message: str,
68
- history: list,
69
- system_prompt: str,
70
- temperature: float = 0.8,
71
- max_new_tokens: int = 1024,
72
- top_p: float = 1.0,
73
- top_k: int = 20,
74
- penalty: float = 1.2,
75
- ):
76
- print(f'message: {message}')
77
- print(f'history: {history}')
78
  conversation = [{"role": "system", "content": system_prompt}]
79
-
80
- for prompt, answer in history:
81
- conversation.extend([
82
- {"role": "user", "content": prompt},
83
- {"role": "assistant", "content": answer},
84
- ])
85
-
86
  conversation.append({"role": "user", "content": message})
87
- input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(device)
88
-
89
- streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
90
- generate_kwargs = dict(
 
91
  input_ids=input_ids,
92
  max_new_tokens=max_new_tokens,
93
- do_sample=False if temperature == 0 else True,
 
94
  top_p=top_p,
95
  top_k=top_k,
96
- temperature=temperature,
97
- eos_token_id=[128001,128008,128009],
98
- streamer=streamer,
99
  )
100
 
101
- with torch.no_grad():
102
- thread = Thread(target=model.generate, kwargs=generate_kwargs)
103
- thread.start()
104
-
105
- buffer = ""
106
- for new_text in streamer:
107
- buffer += new_text
108
- yield buffer
109
-
110
- # Vision model function
111
- def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
 
112
  if model_id not in models:
113
- return "Vision model not available"
114
-
115
  model_vision = models[model_id]
116
  processor = processors[model_id]
117
-
118
- # Prepare the image list and corresponding tags
119
  images = [Image.fromarray(image).convert("RGB")]
120
  placeholder = "<|image_1|>\n"
121
-
122
- # Construct the prompt with the image tag and the user's text input
123
- if text_input:
124
- prompt_content = placeholder + text_input
125
- else:
126
- prompt_content = placeholder
127
-
128
- messages = [
129
- {"role": "user", "content": prompt_content},
130
- ]
131
-
132
- # Apply the chat template to the messages
133
- prompt = processor.tokenizer.apply_chat_template(
134
- messages, tokenize=False, add_generation_prompt=True
135
- )
136
-
137
- # Process the inputs with the processor
138
- inputs = processor(prompt, images, return_tensors="pt").to(device)
139
-
140
- # Generation parameters
141
- generation_args = {
142
- "max_new_tokens": 500, # Reduced for CPU
143
- "temperature": 0.0,
144
- "do_sample": False,
145
- }
146
-
147
- # Generate the response
148
- generate_ids = model_vision.generate(
149
  **inputs,
150
- eos_token_id=processor.tokenizer.eos_token_id,
151
- **generation_args
152
  )
153
-
154
- # Remove input tokens from the generated response
155
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
156
-
157
- # Decode the generated output
158
- response = processor.batch_decode(
159
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
160
- )[0]
161
-
162
- return response
163
-
164
- # Flask API Routes
165
- @flask_app.route('/health', methods=['GET'])
166
- def health_check():
167
- return jsonify({
168
- "status": "healthy",
169
- "device": device,
170
- "models_loaded": {
171
- "chatbot": MODEL_ID1 in globals() and 'model' in globals(),
172
- "vision": len(models) > 0
173
- }
174
- })
175
-
176
- @flask_app.route('/api/chat', methods=['POST'])
177
- def api_chat():
178
- try:
179
- data = request.json
180
- message = data.get('message', '')
181
- system_prompt = data.get('system_prompt', 'You are a helpful assistant')
182
- temperature = data.get('temperature', 0.8)
183
- max_new_tokens = data.get('max_new_tokens', 512) # Reduced for CPU
184
-
185
- # Prepare conversation
186
- conversation = [{"role": "system", "content": system_prompt}]
187
- conversation.append({"role": "user", "content": message})
188
-
189
- input_ids = tokenizer.apply_chat_template(
190
- conversation, add_generation_prompt=True, return_tensors="pt"
191
- ).to(device)
192
-
193
- # Generate response
194
- with torch.no_grad():
195
- generate_ids = model.generate(
196
- input_ids,
197
- max_new_tokens=max_new_tokens,
198
- temperature=temperature,
199
- do_sample=temperature > 0,
200
- eos_token_id=[128001, 128008, 128009]
201
- )
202
-
203
- # Decode response
204
- response = tokenizer.decode(
205
- generate_ids[0][input_ids.shape[1]:],
206
- skip_special_tokens=True
207
- )
208
-
209
- return jsonify({
210
- "response": response,
211
- "device": device,
212
- "model": MODEL_ID1
213
- })
214
-
215
- except Exception as e:
216
- return jsonify({"error": str(e)}), 500
217
-
218
- @flask_app.route('/api/vision', methods=['POST'])
219
- def api_vision():
220
- try:
221
- if 'image' not in request.files:
222
- return jsonify({"error": "No image provided"}), 400
223
-
224
- image_file = request.files['image']
225
- text_input = request.form.get('text_input', '')
226
- model_id = request.form.get('model_id', 'microsoft/Phi-3.5-vision-instruct')
227
-
228
- if model_id not in models:
229
- return jsonify({"error": "Vision model not available"}), 400
230
-
231
- # Process image
232
- image = Image.open(image_file.stream).convert("RGB")
233
-
234
- # Use the existing vision function
235
- response = stream_vision(
236
- image=np.array(image),
237
- text_input=text_input,
238
- model_id=model_id
239
- )
240
-
241
- return jsonify({
242
- "response": response,
243
- "device": device,
244
- "model": model_id
245
- })
246
-
247
- except Exception as e:
248
- return jsonify({"error": str(e)}), 500
249
-
250
- @flask_app.route('/api/models', methods=['GET'])
251
- def get_models():
252
- return jsonify({
253
- "chat_model": MODEL_ID1,
254
- "vision_models": list(models.keys()),
255
- "device": device
256
- })
257
-
258
- def run_flask():
259
- flask_app.run(host='0.0.0.0', port=5000, debug=False, threaded=True)
260
-
261
- def forward_flask(path, request):
262
- import requests
263
- url = f"http://localhost:5000/{path}"
264
-
265
- if request.method == "POST":
266
- r = requests.post(url, json=request.json)
267
- else:
268
- r = requests.get(url)
269
- return r.json()
270
-
271
- api = gr.routes.App.create_app()
272
- api.router.add_api_route(
273
- "/api/chat",
274
- forward_flask,
275
- methods=["POST"]
276
- )
277
- api.router.add_api_route(
278
- "/api/vision",
279
- forward_flask,
280
- methods=["POST"]
281
- )
282
 
283
- def run_gradio():
284
- # CSS for the interface
285
- CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
286
- PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
287
- TITLE = "<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (CPU Version)</center></h1>"
288
- EXPLANATION = """<div style="text-align: center; margin-top: 20px;">
289
- <p><strong>CPU-Only Version</strong> - This instance is running on CPU. Responses may be slower than GPU-accelerated versions.</p>
290
- <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
291
- <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support.</p>
292
- <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data. The model belongs to the Phi-3 model family and supports 128K token context length.</p>
293
- </div>"""
294
- footer = """<div style="text-align: center; margin-top: 20px;">
295
- <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
296
- <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
297
- <a href="https://huggingface.co/microsoft/Phi-3.5-mini-instruct" target="_blank">microsoft/Phi-3.5-mini-instruct</a> |
298
- <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct" target="_blank">microsoft/Phi-3.5-vision-instruct</a>
299
- <br> Made with 💖 by Pejman Ebrahimi | Running on CPU
300
- </div>"""
301
-
302
- # Gradio app with two tabs
303
- with gr.Blocks(css=CSS, theme="small_and_pretty") as demo:
304
- gr.HTML(TITLE)
305
- gr.HTML(EXPLANATION)
306
- gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
307
-
308
- with gr.Tab("Chatbot"):
309
- chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
310
- gr.ChatInterface(
311
- fn=stream_chat,
312
- chatbot=chatbot,
313
- fill_height=True,
314
- additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
315
- additional_inputs=[
316
- gr.Textbox(
317
- value="You are a helpful assistant",
318
- label="System Prompt",
319
- render=False,
320
- ),
321
- gr.Slider(
322
- minimum=0,
323
- maximum=1,
324
- step=0.1,
325
- value=0.8,
326
- label="Temperature",
327
- render=False,
328
- ),
329
- gr.Slider(
330
- minimum=128,
331
- maximum=2048, # Reduced for CPU
332
- step=1,
333
- value=512, # Reduced for CPU
334
- label="Max new tokens",
335
- render=False,
336
- ),
337
- gr.Slider(
338
- minimum=0.0,
339
- maximum=1.0,
340
- step=0.1,
341
- value=1.0,
342
- label="top_p",
343
- render=False,
344
- ),
345
- gr.Slider(
346
- minimum=1,
347
- maximum=20,
348
- step=1,
349
- value=20,
350
- label="top_k",
351
- render=False,
352
- ),
353
- gr.Slider(
354
- minimum=0.0,
355
- maximum=2.0,
356
- step=0.1,
357
- value=1.2,
358
- label="Repetition penalty",
359
- render=False,
360
- ),
361
- ],
362
- examples=[
363
- ["Hello, how are you?"],
364
- ["Explain quantum computing in simple terms"],
365
- ["What are the benefits of renewable energy?"],
366
- ["Write a short poem about technology"],
367
- ],
368
- cache_examples=False,
369
- )
370
-
371
  with gr.Tab("Vision"):
372
- with gr.Row():
373
- input_img = gr.Image(label="Input Picture")
374
- with gr.Row():
375
- model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
376
- with gr.Row():
377
- text_input = gr.Textbox(label="Question", value="What's in this image?")
378
- with gr.Row():
379
- submit_btn = gr.Button(value="Submit")
380
- with gr.Row():
381
- output_text = gr.Textbox(label="Output Text")
382
-
383
- submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
384
-
385
- gr.HTML(footer)
386
-
387
- # Launch the Gradio app
388
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
389
-
390
- if __name__ == "__main__":
391
- # Start Flask server in a separate thread
392
- flask_thread = threading.Thread(target=run_flask, daemon=True)
393
- flask_thread.start()
394
-
395
- # Run Gradio in main thread
396
- run_gradio()
 
1
  import os
 
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AutoProcessor
4
  import gradio as gr
5
  from threading import Thread
6
  from PIL import Image
 
 
7
  import numpy as np
8
+ from fastapi import FastAPI, UploadFile, File, Form
9
+ from gradio.routes import mount_gradio_app
10
 
11
+ # Disable CUDA
12
  os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
13
  torch.cuda.is_available = lambda: False
14
 
15
+ device = "cpu"
16
+ print("Using CPU only")
17
 
18
+ # Load Chat Model
 
 
 
 
 
 
 
 
19
  MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
 
 
 
 
20
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
 
 
21
  model = AutoModelForCausalLM.from_pretrained(
22
  MODEL_ID1,
23
+ torch_dtype=torch.float32,
24
  device_map="cpu",
25
+ low_cpu_mem_usage=True
26
  )
27
 
28
+ # Load Vision Model
 
29
  models = {}
30
  processors = {}
31
 
32
  try:
33
  models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
34
+ "microsoft/Phi-3.5-vision-instruct",
35
+ trust_remote_code=True,
36
+ torch_dtype=torch.float32,
37
  device_map="cpu",
38
+ low_cpu_mem_usage=True
39
  ).eval()
40
+
41
  processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
42
+ "microsoft/Phi-3.5-vision-instruct",
43
  trust_remote_code=True
44
  )
45
+ print("Vision model loaded ")
46
  except Exception as e:
47
+ print("Vision model failed to load:", e)
48
+
49
+
50
+ # -------------- CHAT FUNCTION --------------
51
+
52
def stream_chat(message, history, system_prompt="You are a helpful assistant",
                temperature=0.8, max_new_tokens=512, top_p=1.0, top_k=20, penalty=1.2):
    """Stream a chat completion from the Phi-3.5-mini model.

    Yields the accumulated response text after every new token so Gradio can
    render a live-updating reply.

    Defaults are required: the UI wires gr.ChatInterface with only
    (fn, chatbot), so Gradio calls this handler as stream_chat(message,
    history); without defaults every chat turn raised TypeError.
    """
    # Rebuild the full conversation: system prompt, prior turns, new message.
    # NOTE(review): assumes tuple-style (user, assistant) history pairs —
    # confirm against the installed Gradio version's ChatInterface format.
    conversation = [{"role": "system", "content": system_prompt}]
    for user_turn, assistant_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    )

    # timeout keeps the consumer loop from hanging forever if generate() stalls.
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,  # greedy decode when temperature is 0
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=penalty,
        eos_token_id=[128001, 128008, 128009],  # Phi-3.5 end-of-turn token ids
        streamer=streamer,
    )

    # generate() blocks, so run it on a worker thread and drain the streamer here.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial
82
+
83
+
84
+ # -------------- VISION FUNCTION --------------
85
+
86
def stream_vision(image, text_input, model_id):
    """Answer a question about a single image with Phi-3.5-vision.

    image: numpy array as produced by gr.Image / np.array(PIL image).
    Returns the decoded answer string, or a short error message when the
    vision model never loaded.
    """
    if model_id not in models:
        return "Vision model not loaded."

    model_vision = models[model_id]
    processor = processors[model_id]

    # Phi-3.5-vision expects an <|image_1|> placeholder ahead of the question.
    images = [Image.fromarray(image).convert("RGB")]
    placeholder = "<|image_1|>\n"
    prompt = placeholder + (text_input or "")

    messages = [{"role": "user", "content": prompt}]
    template = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(template, images, return_tensors="pt").to(device)

    with torch.no_grad():  # inference only — skip autograd bookkeeping on CPU
        output = model_vision.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=False,
            # Restored from the previous version: without an explicit eos id
            # generation can run to the max_new_tokens budget instead of
            # stopping at end-of-turn.
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # Strip the prompt tokens; decode only the newly generated ids.
    output = output[:, inputs["input_ids"].shape[1]:]
    text = processor.batch_decode(output, skip_special_tokens=True)[0]
    return text
111
+
112
+
113
+ # -------------- FASTAPI BACKEND --------------
114
+
115
api = FastAPI()


@api.get("/health")
def health():
    """Report service liveness plus which models made it into memory."""
    vision_ready = bool(models)
    return {
        "status": "ok",
        "device": device,
        "chat_model": MODEL_ID1,
        "vision_loaded": vision_ready,
    }
120
+
121
+ @api.post("/api/chat")
122
+ async def api_chat(message: str = Form(...), system_prompt: str = Form("You are a helpful assistant")):
123
+ conversation = [
124
+ {"role": "system", "content": system_prompt},
125
+ {"role": "user", "content": message}
126
+ ]
127
+ input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
128
+ out = model.generate(input_ids, max_new_tokens=512, do_sample=False)
129
+ reply = tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True)
130
+ return {"response": reply}
131
+
132
+ @api.post("/api/vision")
133
+ async def api_vision(image: UploadFile = File(...), text_input: str = Form(""), model_id: str = Form("microsoft/Phi-3.5-vision-instruct")):
134
+ img = Image.open(image.file).convert("RGB")
135
+ result = stream_vision(np.array(img), text_input, model_id)
136
+ return {"response": result}
137
+
138
+
139
+ # -------------- GRADIO UI --------------
140
+
141
def build_gradio_ui():
    """Build the two-tab (Chat + Vision) Gradio Blocks app and return it."""
    CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important;}"""
    with gr.Blocks(css=CSS) as demo:
        with gr.Tab("Chat"):
            chat = gr.Chatbot(height=600)
            # additional_inputs must mirror stream_chat's extra parameters;
            # without them Gradio calls stream_chat(message, history) only
            # and the handler raises TypeError on every turn.
            gr.ChatInterface(
                fn=stream_chat,
                chatbot=chat,
                additional_inputs=[
                    gr.Textbox(value="You are a helpful assistant", label="System Prompt", render=False),
                    gr.Slider(minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature", render=False),
                    gr.Slider(minimum=128, maximum=2048, step=1, value=512, label="Max new tokens", render=False),
                    gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p", render=False),
                    gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k", render=False),
                    gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.2, label="Repetition penalty", render=False),
                ],
            )

        with gr.Tab("Vision"):
            img = gr.Image(label="Input Picture")
            txt = gr.Textbox("What's in this image?", label="Question")
            # Guard the default: if the vision model failed to load, `models`
            # is empty and a hard-coded default would not be a valid choice.
            model_sel = gr.Dropdown(
                list(models.keys()),
                value="microsoft/Phi-3.5-vision-instruct" if models else None,
                label="Model",
            )
            out = gr.Textbox(label="Output Text")
            gr.Button("Analyze").click(stream_vision, [img, txt, model_sel], out)

    return demo
156
+
157
+
158
+ gradio_app = build_gradio_ui()
159
+
160
+ app = mount_gradio_app(api, gradio_app, path="/")