Joe6636564 committed on
Commit
9af0f0c
·
verified ·
1 Parent(s): 4899a6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -349
app.py CHANGED
@@ -1,396 +1,160 @@
1
  import os
2
- import time
3
  import torch
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AutoProcessor
5
  import gradio as gr
6
  from threading import Thread
7
  from PIL import Image
8
- from flask import Flask, request, jsonify
9
- import threading
10
  import numpy as np
 
 
11
 
12
- # Disable CUDA and force CPU usage
13
  os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
14
  torch.cuda.is_available = lambda: False
15
 
16
- # Initialize Flask app
17
- flask_app = Flask(__name__)
18
 
19
- # Device detection - force CPU
20
- def get_device():
21
- device = "cpu"
22
- print("Using CPU (GPU disabled)")
23
- return device
24
-
25
- device = get_device()
26
-
27
- # Model and tokenizer for the chatbot
28
  MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
29
- MODEL_LIST1 = ["microsoft/Phi-3.5-mini-instruct"]
30
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
31
-
32
- print("Loading tokenizer and model...")
33
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
34
-
35
- # CPU-only model loading
36
  model = AutoModelForCausalLM.from_pretrained(
37
  MODEL_ID1,
38
- torch_dtype=torch.float32, # Use float32 for CPU
39
  device_map="cpu",
40
- low_cpu_mem_usage=True # Optimize for CPU memory
41
  )
42
 
43
- # Vision model setup
44
- print("Loading vision models...")
45
  models = {}
46
  processors = {}
47
 
48
  try:
49
  models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
50
- "microsoft/Phi-3.5-vision-instruct",
51
- trust_remote_code=True,
52
- torch_dtype=torch.float32, # Use float32 for CPU
53
  device_map="cpu",
54
- low_cpu_mem_usage=True # Optimize for CPU memory
55
  ).eval()
56
-
57
  processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
58
- "microsoft/Phi-3.5-vision-instruct",
59
  trust_remote_code=True
60
  )
61
- print("Vision model loaded successfully on CPU")
62
  except Exception as e:
63
- print(f"Error loading vision model: {e}")
64
-
65
- # Chatbot function
66
- def stream_chat(
67
- message: str,
68
- history: list,
69
- system_prompt: str,
70
- temperature: float = 0.8,
71
- max_new_tokens: int = 1024,
72
- top_p: float = 1.0,
73
- top_k: int = 20,
74
- penalty: float = 1.2,
75
- ):
76
- print(f'message: {message}')
77
- print(f'history: {history}')
78
  conversation = [{"role": "system", "content": system_prompt}]
79
-
80
- for prompt, answer in history:
81
- conversation.extend([
82
- {"role": "user", "content": prompt},
83
- {"role": "assistant", "content": answer},
84
- ])
85
-
86
  conversation.append({"role": "user", "content": message})
87
- input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(device)
88
-
89
- streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
90
- generate_kwargs = dict(
 
91
  input_ids=input_ids,
92
  max_new_tokens=max_new_tokens,
93
- do_sample=False if temperature == 0 else True,
 
94
  top_p=top_p,
95
  top_k=top_k,
96
- temperature=temperature,
97
- eos_token_id=[128001,128008,128009],
98
- streamer=streamer,
99
  )
100
 
101
- with torch.no_grad():
102
- thread = Thread(target=model.generate, kwargs=generate_kwargs)
103
- thread.start()
104
-
105
- buffer = ""
106
- for new_text in streamer:
107
- buffer += new_text
108
- yield buffer
109
-
110
- # Vision model function
111
- def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
 
112
  if model_id not in models:
113
- return "Vision model not available"
114
-
115
  model_vision = models[model_id]
116
  processor = processors[model_id]
117
-
118
- # Prepare the image list and corresponding tags
119
  images = [Image.fromarray(image).convert("RGB")]
120
  placeholder = "<|image_1|>\n"
121
-
122
- # Construct the prompt with the image tag and the user's text input
123
- if text_input:
124
- prompt_content = placeholder + text_input
125
- else:
126
- prompt_content = placeholder
127
-
128
- messages = [
129
- {"role": "user", "content": prompt_content},
130
- ]
131
-
132
- # Apply the chat template to the messages
133
- prompt = processor.tokenizer.apply_chat_template(
134
- messages, tokenize=False, add_generation_prompt=True
135
- )
136
-
137
- # Process the inputs with the processor
138
- inputs = processor(prompt, images, return_tensors="pt").to(device)
139
-
140
- # Generation parameters
141
- generation_args = {
142
- "max_new_tokens": 500, # Reduced for CPU
143
- "temperature": 0.0,
144
- "do_sample": False,
145
- }
146
-
147
- # Generate the response
148
- generate_ids = model_vision.generate(
149
  **inputs,
150
- eos_token_id=processor.tokenizer.eos_token_id,
151
- **generation_args
152
  )
153
-
154
- # Remove input tokens from the generated response
155
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
156
-
157
- # Decode the generated output
158
- response = processor.batch_decode(
159
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
160
- )[0]
161
-
162
- return response
163
-
164
- # Flask API Routes
165
- @flask_app.route('/health', methods=['GET'])
166
- def health_check():
167
- return jsonify({
168
- "status": "healthy",
169
- "device": device,
170
- "models_loaded": {
171
- "chatbot": MODEL_ID1 in globals() and 'model' in globals(),
172
- "vision": len(models) > 0
173
- }
174
- })
175
-
176
- @flask_app.route('/api/chat', methods=['POST'])
177
- def api_chat():
178
- try:
179
- data = request.json
180
- message = data.get('message', '')
181
- system_prompt = data.get('system_prompt', 'You are a helpful assistant')
182
- temperature = data.get('temperature', 0.8)
183
- max_new_tokens = data.get('max_new_tokens', 512) # Reduced for CPU
184
-
185
- # Prepare conversation
186
- conversation = [{"role": "system", "content": system_prompt}]
187
- conversation.append({"role": "user", "content": message})
188
-
189
- input_ids = tokenizer.apply_chat_template(
190
- conversation, add_generation_prompt=True, return_tensors="pt"
191
- ).to(device)
192
-
193
- # Generate response
194
- with torch.no_grad():
195
- generate_ids = model.generate(
196
- input_ids,
197
- max_new_tokens=max_new_tokens,
198
- temperature=temperature,
199
- do_sample=temperature > 0,
200
- eos_token_id=[128001, 128008, 128009]
201
- )
202
-
203
- # Decode response
204
- response = tokenizer.decode(
205
- generate_ids[0][input_ids.shape[1]:],
206
- skip_special_tokens=True
207
- )
208
-
209
- return jsonify({
210
- "response": response,
211
- "device": device,
212
- "model": MODEL_ID1
213
- })
214
-
215
- except Exception as e:
216
- return jsonify({"error": str(e)}), 500
217
-
218
- @flask_app.route('/api/vision', methods=['POST'])
219
- def api_vision():
220
- try:
221
- if 'image' not in request.files:
222
- return jsonify({"error": "No image provided"}), 400
223
-
224
- image_file = request.files['image']
225
- text_input = request.form.get('text_input', '')
226
- model_id = request.form.get('model_id', 'microsoft/Phi-3.5-vision-instruct')
227
-
228
- if model_id not in models:
229
- return jsonify({"error": "Vision model not available"}), 400
230
-
231
- # Process image
232
- image = Image.open(image_file.stream).convert("RGB")
233
-
234
- # Use the existing vision function
235
- response = stream_vision(
236
- image=np.array(image),
237
- text_input=text_input,
238
- model_id=model_id
239
- )
240
-
241
- return jsonify({
242
- "response": response,
243
- "device": device,
244
- "model": model_id
245
- })
246
-
247
- except Exception as e:
248
- return jsonify({"error": str(e)}), 500
249
-
250
- @flask_app.route('/api/models', methods=['GET'])
251
- def get_models():
252
- return jsonify({
253
- "chat_model": MODEL_ID1,
254
- "vision_models": list(models.keys()),
255
- "device": device
256
- })
257
-
258
- def run_flask():
259
- flask_app.run(host='0.0.0.0', port=5000, debug=False, threaded=True)
260
-
261
- def forward_flask(path, request):
262
- import requests
263
- url = f"http://localhost:5000/{path}"
264
-
265
- if request.method == "POST":
266
- r = requests.post(url, json=request.json)
267
- else:
268
- r = requests.get(url)
269
- return r.json()
270
-
271
- api = gr.routes.App.create_app()
272
- api.router.add_api_route(
273
- "/api/chat",
274
- forward_flask,
275
- methods=["POST"]
276
- )
277
- api.router.add_api_route(
278
- "/api/vision",
279
- forward_flask,
280
- methods=["POST"]
281
- )
282
 
283
- def run_gradio():
284
- # CSS for the interface
285
- CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
286
- PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
287
- TITLE = "<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (CPU Version)</center></h1>"
288
- EXPLANATION = """<div style="text-align: center; margin-top: 20px;">
289
- <p><strong>CPU-Only Version</strong> - This instance is running on CPU. Responses may be slower than GPU-accelerated versions.</p>
290
- <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
291
- <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support.</p>
292
- <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data. The model belongs to the Phi-3 model family and supports 128K token context length.</p>
293
- </div>"""
294
- footer = """<div style="text-align: center; margin-top: 20px;">
295
- <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
296
- <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
297
- <a href="https://huggingface.co/microsoft/Phi-3.5-mini-instruct" target="_blank">microsoft/Phi-3.5-mini-instruct</a> |
298
- <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct" target="_blank">microsoft/Phi-3.5-vision-instruct</a>
299
- <br> Made with 💖 by Pejman Ebrahimi | Running on CPU
300
- </div>"""
301
-
302
- # Gradio app with two tabs
303
- with gr.Blocks(css=CSS, theme="small_and_pretty") as demo:
304
- gr.HTML(TITLE)
305
- gr.HTML(EXPLANATION)
306
- gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
307
-
308
- with gr.Tab("Chatbot"):
309
- chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
310
- gr.ChatInterface(
311
- fn=stream_chat,
312
- chatbot=chatbot,
313
- fill_height=True,
314
- additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
315
- additional_inputs=[
316
- gr.Textbox(
317
- value="You are a helpful assistant",
318
- label="System Prompt",
319
- render=False,
320
- ),
321
- gr.Slider(
322
- minimum=0,
323
- maximum=1,
324
- step=0.1,
325
- value=0.8,
326
- label="Temperature",
327
- render=False,
328
- ),
329
- gr.Slider(
330
- minimum=128,
331
- maximum=2048, # Reduced for CPU
332
- step=1,
333
- value=512, # Reduced for CPU
334
- label="Max new tokens",
335
- render=False,
336
- ),
337
- gr.Slider(
338
- minimum=0.0,
339
- maximum=1.0,
340
- step=0.1,
341
- value=1.0,
342
- label="top_p",
343
- render=False,
344
- ),
345
- gr.Slider(
346
- minimum=1,
347
- maximum=20,
348
- step=1,
349
- value=20,
350
- label="top_k",
351
- render=False,
352
- ),
353
- gr.Slider(
354
- minimum=0.0,
355
- maximum=2.0,
356
- step=0.1,
357
- value=1.2,
358
- label="Repetition penalty",
359
- render=False,
360
- ),
361
- ],
362
- examples=[
363
- ["Hello, how are you?"],
364
- ["Explain quantum computing in simple terms"],
365
- ["What are the benefits of renewable energy?"],
366
- ["Write a short poem about technology"],
367
- ],
368
- cache_examples=False,
369
- )
370
-
371
  with gr.Tab("Vision"):
372
- with gr.Row():
373
- input_img = gr.Image(label="Input Picture")
374
- with gr.Row():
375
- model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
376
- with gr.Row():
377
- text_input = gr.Textbox(label="Question", value="What's in this image?")
378
- with gr.Row():
379
- submit_btn = gr.Button(value="Submit")
380
- with gr.Row():
381
- output_text = gr.Textbox(label="Output Text")
382
-
383
- submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
384
-
385
- gr.HTML(footer)
386
-
387
- # Launch the Gradio app
388
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
389
-
390
- if __name__ == "__main__":
391
- # Start Flask server in a separate thread
392
- flask_thread = threading.Thread(target=run_flask, daemon=True)
393
- flask_thread.start()
394
-
395
- # Run Gradio in main thread
396
- run_gradio()
 
1
  import os
 
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AutoProcessor
4
  import gradio as gr
5
  from threading import Thread
6
  from PIL import Image
 
 
7
  import numpy as np
8
+ from fastapi import FastAPI, UploadFile, File, Form
9
+ from gradio.routes import mount_gradio_app
10
 
11
+ # Disable CUDA
12
  os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
13
  torch.cuda.is_available = lambda: False
14
 
15
+ device = "cpu"
16
+ print("Using CPU only")
17
 
18
+ # Load Chat Model
 
 
 
 
 
 
 
 
19
  MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
 
 
 
 
20
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
 
 
21
  model = AutoModelForCausalLM.from_pretrained(
22
  MODEL_ID1,
23
+ torch_dtype=torch.float32,
24
  device_map="cpu",
25
+ low_cpu_mem_usage=True
26
  )
27
 
28
+ # Load Vision Model
 
29
  models = {}
30
  processors = {}
31
 
32
  try:
33
  models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
34
+ "microsoft/Phi-3.5-vision-instruct",
35
+ trust_remote_code=True,
36
+ torch_dtype=torch.float32,
37
  device_map="cpu",
38
+ low_cpu_mem_usage=True
39
  ).eval()
40
+
41
  processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
42
+ "microsoft/Phi-3.5-vision-instruct",
43
  trust_remote_code=True
44
  )
45
+ print("Vision model loaded ")
46
  except Exception as e:
47
+ print("Vision model failed to load:", e)
48
+
49
+
50
+ # -------------- CHAT FUNCTION --------------
51
+
52
def stream_chat(message, history, system_prompt="You are a helpful assistant",
                temperature=0.8, max_new_tokens=512, top_p=1.0, top_k=20, penalty=1.2):
    """Stream a chat completion from the Phi-3.5-mini model.

    Yields the accumulated response text after every new token so Gradio can
    render a live-updating reply.

    Defaults are required: the UI wires gr.ChatInterface with only
    (fn, chatbot), so Gradio calls this handler as stream_chat(message,
    history); without defaults every chat turn raised TypeError.
    """
    # Rebuild the full conversation: system prompt, prior turns, new message.
    # NOTE(review): assumes tuple-style (user, assistant) history pairs —
    # confirm against the installed Gradio version's ChatInterface format.
    conversation = [{"role": "system", "content": system_prompt}]
    for user_turn, assistant_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    )

    # timeout keeps the consumer loop from hanging forever if generate() stalls.
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,  # greedy decode when temperature is 0
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=penalty,
        eos_token_id=[128001, 128008, 128009],  # Phi-3.5 end-of-turn token ids
        streamer=streamer,
    )

    # generate() blocks, so run it on a worker thread and drain the streamer here.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial
82
+
83
+
84
+ # -------------- VISION FUNCTION --------------
85
+
86
def stream_vision(image, text_input, model_id):
    """Answer a question about a single image with Phi-3.5-vision.

    image: numpy array as produced by gr.Image / np.array(PIL image).
    Returns the decoded answer string, or a short error message when the
    vision model never loaded.
    """
    if model_id not in models:
        return "Vision model not loaded."

    model_vision = models[model_id]
    processor = processors[model_id]

    # Phi-3.5-vision expects an <|image_1|> placeholder ahead of the question.
    images = [Image.fromarray(image).convert("RGB")]
    placeholder = "<|image_1|>\n"
    prompt = placeholder + (text_input or "")

    messages = [{"role": "user", "content": prompt}]
    template = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(template, images, return_tensors="pt").to(device)

    with torch.no_grad():  # inference only — skip autograd bookkeeping on CPU
        output = model_vision.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=False,
            # Restored from the previous version: without an explicit eos id
            # generation can run to the max_new_tokens budget instead of
            # stopping at end-of-turn.
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # Strip the prompt tokens; decode only the newly generated ids.
    output = output[:, inputs["input_ids"].shape[1]:]
    text = processor.batch_decode(output, skip_special_tokens=True)[0]
    return text
111
+
112
+
113
+ # -------------- FASTAPI BACKEND --------------
114
+
115
api = FastAPI()


@api.get("/health")
def health():
    """Report service liveness plus which models made it into memory."""
    vision_ready = bool(models)
    return {
        "status": "ok",
        "device": device,
        "chat_model": MODEL_ID1,
        "vision_loaded": vision_ready,
    }
120
+
121
+ @api.post("/api/chat")
122
+ async def api_chat(message: str = Form(...), system_prompt: str = Form("You are a helpful assistant")):
123
+ conversation = [
124
+ {"role": "system", "content": system_prompt},
125
+ {"role": "user", "content": message}
126
+ ]
127
+ input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
128
+ out = model.generate(input_ids, max_new_tokens=512, do_sample=False)
129
+ reply = tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True)
130
+ return {"response": reply}
131
+
132
+ @api.post("/api/vision")
133
+ async def api_vision(image: UploadFile = File(...), text_input: str = Form(""), model_id: str = Form("microsoft/Phi-3.5-vision-instruct")):
134
+ img = Image.open(image.file).convert("RGB")
135
+ result = stream_vision(np.array(img), text_input, model_id)
136
+ return {"response": result}
137
+
138
+
139
+ # -------------- GRADIO UI --------------
140
+
141
def build_gradio_ui():
    """Build the two-tab (Chat + Vision) Gradio Blocks app and return it."""
    CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important;}"""
    with gr.Blocks(css=CSS) as demo:
        with gr.Tab("Chat"):
            chat = gr.Chatbot(height=600)
            # additional_inputs must mirror stream_chat's extra parameters;
            # without them Gradio calls stream_chat(message, history) only
            # and the handler raises TypeError on every turn.
            gr.ChatInterface(
                fn=stream_chat,
                chatbot=chat,
                additional_inputs=[
                    gr.Textbox(value="You are a helpful assistant", label="System Prompt", render=False),
                    gr.Slider(minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature", render=False),
                    gr.Slider(minimum=128, maximum=2048, step=1, value=512, label="Max new tokens", render=False),
                    gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p", render=False),
                    gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k", render=False),
                    gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.2, label="Repetition penalty", render=False),
                ],
            )

        with gr.Tab("Vision"):
            img = gr.Image(label="Input Picture")
            txt = gr.Textbox("What's in this image?", label="Question")
            # Guard the default: if the vision model failed to load, `models`
            # is empty and a hard-coded default would not be a valid choice.
            model_sel = gr.Dropdown(
                list(models.keys()),
                value="microsoft/Phi-3.5-vision-instruct" if models else None,
                label="Model",
            )
            out = gr.Textbox(label="Output Text")
            gr.Button("Analyze").click(stream_vision, [img, txt, model_sel], out)

    return demo
156
+
157
+
158
+ gradio_app = build_gradio_ui()
159
+
160
+ app = mount_gradio_app(api, gradio_app, path="/")