Joe6636564 committed on
Commit
1b4c043
·
verified ·
1 Parent(s): ecdc3e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -72
app.py CHANGED
@@ -1,103 +1,107 @@
1
  import os
2
  import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AutoProcessor
4
- import gradio as gr
5
  from threading import Thread
6
  from PIL import Image
7
  import numpy as np
 
8
  from fastapi import FastAPI, UploadFile, File, Form
9
  from gradio.routes import mount_gradio_app
 
10
 
11
- # Disable CUDA
 
 
 
 
 
 
 
12
  os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
13
  torch.cuda.is_available = lambda: False
14
-
15
  device = "cpu"
16
- print("Using CPU only")
17
 
18
- # Load Chat Model
19
- MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
20
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
 
 
21
  model = AutoModelForCausalLM.from_pretrained(
22
- MODEL_ID1,
23
  torch_dtype=torch.float32,
24
  device_map="cpu",
25
  low_cpu_mem_usage=True
26
- )
 
 
27
 
28
- # Load Vision Model
29
  models = {}
30
  processors = {}
31
 
32
  try:
33
- models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
34
- "microsoft/Phi-3.5-vision-instruct",
 
35
  trust_remote_code=True,
36
  torch_dtype=torch.float32,
37
  device_map="cpu",
38
- low_cpu_mem_usage=True
 
39
  ).eval()
40
 
41
- processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
42
- "microsoft/Phi-3.5-vision-instruct",
43
  trust_remote_code=True
44
  )
 
45
  print("Vision model loaded ✅")
46
  except Exception as e:
47
  print("Vision model failed to load:", e)
48
 
 
49
 
50
- # -------------- CHAT FUNCTION --------------
51
-
52
- def stream_chat(message, history, system_prompt, temperature, max_new_tokens, top_p, top_k, penalty):
53
  conversation = [{"role": "system", "content": system_prompt}]
54
  for user, assistant in history:
55
  conversation.append({"role": "user", "content": user})
56
  conversation.append({"role": "assistant", "content": assistant})
57
-
58
  conversation.append({"role": "user", "content": message})
59
 
60
- input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
61
-
62
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
63
- kwargs = dict(
64
- input_ids=input_ids,
65
- max_new_tokens=max_new_tokens,
66
- do_sample=temperature > 0,
67
- temperature=temperature,
68
- top_p=top_p,
69
- top_k=top_k,
70
- repetition_penalty=penalty,
71
- eos_token_id=[128001, 128008, 128009],
72
- streamer=streamer
73
  )
74
 
75
- thread = Thread(target=model.generate, kwargs=kwargs)
76
- thread.start()
77
-
78
- output = ""
79
- for token in streamer:
80
- output += token
81
- yield output
82
-
83
 
84
- # -------------- VISION FUNCTION --------------
85
 
86
- def stream_vision(image, text_input, model_id):
87
  if model_id not in models:
88
- return "Vision model not loaded."
89
 
90
  model_vision = models[model_id]
91
  processor = processors[model_id]
92
 
93
- images = [Image.fromarray(image).convert("RGB")]
94
  placeholder = "<|image_1|>\n"
95
  prompt = placeholder + (text_input or "")
96
 
97
  messages = [{"role": "user", "content": prompt}]
98
  template = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
99
-
100
- inputs = processor(template, images, return_tensors="pt").to(device)
101
 
102
  output = model_vision.generate(
103
  **inputs,
@@ -109,52 +113,49 @@ def stream_vision(image, text_input, model_id):
109
  text = processor.batch_decode(output, skip_special_tokens=True)[0]
110
  return text
111
 
112
-
113
- # -------------- FASTAPI BACKEND --------------
114
 
115
  api = FastAPI()
116
 
117
  @api.get("/health")
118
  def health():
119
- return {"status": "ok", "device": device, "chat_model": MODEL_ID1, "vision_loaded": len(models)>0}
 
 
 
 
 
120
 
121
  @api.post("/api/chat")
122
- async def api_chat(message: str = Form(...), system_prompt: str = Form("You are a helpful assistant")):
123
- conversation = [
124
- {"role": "system", "content": system_prompt},
125
- {"role": "user", "content": message}
126
- ]
127
  input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
128
- out = model.generate(input_ids, max_new_tokens=512, do_sample=False)
129
- reply = tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True)
130
  return {"response": reply}
131
 
132
  @api.post("/api/vision")
133
- async def api_vision(image: UploadFile = File(...), text_input: str = Form(""), model_id: str = Form("microsoft/Phi-3.5-vision-instruct")):
134
  img = Image.open(image.file).convert("RGB")
135
- result = stream_vision(np.array(img), text_input, model_id)
136
- return {"response": result}
137
-
138
 
139
- # -------------- GRADIO UI --------------
140
 
141
- def build_gradio_ui():
142
- CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important;}"""
143
- with gr.Blocks(css=CSS) as demo:
144
  with gr.Tab("Chat"):
145
- chat = gr.Chatbot(height=600)
146
- gr.ChatInterface(fn=stream_chat, chatbot=chat)
147
 
148
  with gr.Tab("Vision"):
149
  img = gr.Image()
150
  txt = gr.Textbox("What's in this image?")
151
- model_sel = gr.Dropdown(list(models.keys()), value="microsoft/Phi-3.5-vision-instruct")
152
  out = gr.Textbox()
153
- gr.Button("Analyze").click(stream_vision, [img, txt, model_sel], out)
154
 
155
  return demo
156
 
157
-
158
- gradio_app = build_gradio_ui()
159
-
160
  app = mount_gradio_app(api, gradio_app, path="/")
 
1
  import os
2
  import torch
 
 
3
  from threading import Thread
4
  from PIL import Image
5
  import numpy as np
6
+
7
  from fastapi import FastAPI, UploadFile, File, Form
8
  from gradio.routes import mount_gradio_app
9
+ import gradio as gr
10
 
11
+ from transformers import (
12
+ AutoModelForCausalLM,
13
+ AutoTokenizer,
14
+ TextIteratorStreamer,
15
+ AutoProcessor,
16
+ )
17
+
18
# Force CPU: hide all CUDA devices and stub out the availability check so
# downstream libraries never try to initialize a GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# NOTE: monkey-patching torch.cuda.is_available is a blunt instrument —
# any code that checks availability will now always see False.
torch.cuda.is_available = lambda: False
device = "cpu"
print("Running on CPU ")

# ---------------- LOAD MAIN CHAT MODEL ----------------

MODEL_ID = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,   # full precision on CPU (no fp16 kernels)
    device_map="cpu",
    low_cpu_mem_usage=True
).eval()  # inference-only: disable dropout etc.

# ---------------- LOAD VISION MODEL (FlashAttention disabled) ----------------

# Registries keyed by model id; the UI dropdown and the API look models up here.
models = {}
processors = {}

# Vision model load is best-effort: if it fails (e.g. OOM, download error),
# the chat features still work and run_vision reports "not loaded".
try:
    VISION_ID = "microsoft/Phi-3.5-vision-instruct"
    # trust_remote_code is required for Phi-3.5-vision's custom modeling code.
    models[VISION_ID] = AutoModelForCausalLM.from_pretrained(
        VISION_ID,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device_map="cpu",
        low_cpu_mem_usage=True,
        attn_implementation="eager"  # <<< KEY FIX ✅  (FlashAttention unavailable on CPU)
    ).eval()

    processors[VISION_ID] = AutoProcessor.from_pretrained(
        VISION_ID,
        trust_remote_code=True
    )

    print("Vision model loaded ✅")
except Exception as e:
    # Deliberate best-effort: log and continue with chat-only functionality.
    print("Vision model failed to load:", e)
60
 
61
+ # ---------------- CHAT FUNCTION (for UI) ----------------
62
 
63
def chat_simple(message, history):
    """Produce one assistant reply for the Gradio ChatInterface.

    Args:
        message: The latest user message (str).
        history: Prior turns as (user, assistant) pairs.

    Returns:
        The decoded reply text (prompt tokens stripped).
    """
    convo = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_turn, bot_turn in history:
        convo += [
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": bot_turn},
        ]
    convo.append({"role": "user", "content": message})

    prompt_ids = tokenizer.apply_chat_template(
        convo,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)

    generated = model.generate(
        prompt_ids,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = generated[0][prompt_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
 
 
 
 
 
 
88
 
89
+ # ---------------- VISION FUNCTION ----------------
90
 
91
+ def run_vision(image, text_input, model_id):
92
  if model_id not in models:
93
+ return "⚠️ Vision model not loaded."
94
 
95
  model_vision = models[model_id]
96
  processor = processors[model_id]
97
 
98
+ img = Image.fromarray(image).convert("RGB")
99
  placeholder = "<|image_1|>\n"
100
  prompt = placeholder + (text_input or "")
101
 
102
  messages = [{"role": "user", "content": prompt}]
103
  template = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
104
+ inputs = processor(template, [img], return_tensors="pt")
 
105
 
106
  output = model_vision.generate(
107
  **inputs,
 
113
  text = processor.batch_decode(output, skip_special_tokens=True)[0]
114
  return text
115
 
116
+ # ---------------- FASTAPI BACKEND API ----------------
 
117
 
118
  api = FastAPI()
119
 
120
@api.get("/health")
def health():
    """Liveness probe: report device, chat model id, and vision availability."""
    report = {
        "status": "ok",
        "device": device,
        "chat_model": MODEL_ID,
    }
    # Truthiness of the registry dict is equivalent to len(models) > 0.
    report["vision_loaded"] = bool(models)
    return report
128
 
129
@api.post("/api/chat")
async def api_chat(message: str = Form(...)):
    """One-shot REST chat endpoint (no history, greedy decoding).

    Args:
        message: User message, sent as form data.

    Returns:
        {"response": <decoded reply>}.
    """
    # NOTE(review): model.generate is a blocking CPU-bound call inside an
    # async handler, so it stalls the event loop for the whole generation;
    # consider run_in_executor / a sync def handler — confirm load profile.
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": message},
    ]

    # Consistency fix: move the prompt tensor with .to(device) like
    # chat_simple does (a no-op on CPU, but keeps placement uniform if
    # `device` ever changes).
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)

    output = model.generate(input_ids, max_new_tokens=256)
    # Strip the echoed prompt; decode only the generated continuation.
    reply = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
    return {"response": reply}
138
 
139
  @api.post("/api/vision")
140
+ async def api_vision(image: UploadFile = File(...), text_input: str = Form("Describe this"), model_id: str = Form("microsoft/Phi-3.5-vision-instruct")):
141
  img = Image.open(image.file).convert("RGB")
142
+ return {"response": run_vision(np.array(img), text_input, model_id)}
 
 
143
 
144
+ # ---------------- GRADIO UI ----------------
145
 
146
def create_ui():
    """Assemble the two-tab Gradio interface (Chat + Vision).

    Returns:
        The gr.Blocks app, ready to be mounted on FastAPI.
    """
    vision_choices = list(models.keys())
    default_choice = vision_choices[0] if vision_choices else None

    with gr.Blocks() as demo:
        with gr.Tab("Chat"):
            gr.ChatInterface(fn=chat_simple)

        with gr.Tab("Vision"):
            image_in = gr.Image()
            question = gr.Textbox("What's in this image?")
            # Dropdown is empty (value=None) when the vision model failed to load.
            picker = gr.Dropdown(choices=vision_choices, value=default_choice)
            answer = gr.Textbox()
            gr.Button("Analyze").click(run_vision, [image_in, question, picker], answer)

    return demo
159
 
160
# Build the Gradio UI once at import time and mount it at "/" on the
# FastAPI instance; `app` is the ASGI entry point (e.g. `uvicorn app:app`).
gradio_app = create_ui()

app = mount_gradio_app(api, gradio_app, path="/")