Spaces:

arad1367
/

Phi-3.5-Chatbot-Vision-App

Running on Zero

App Files Files Community

arad1367 committed on Aug 22, 2024

Commit

c305876

verified ·

1 Parent(s): 24faa28

Upload 2 files

Browse files

Files changed (2) hide show

app.py +277 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,277 @@

+import os
+import time
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig, AutoProcessor
+import gradio as gr
+from threading import Thread
+from PIL import Image
+import subprocess
+import spaces
+# Install flash-attn if not already installed
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+# Model and tokenizer for the chatbot
+MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
+MODEL_LIST1 = ["microsoft/Phi-3.5-mini-instruct"]
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
+device = "cuda"  # for GPU usage or "cpu" for CPU usage / But you need GPU :)
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID1,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config)
+# Chatbot tab function
+@spaces.GPU()
+def stream_chat(
+    message: str,
+    history: list,
+    system_prompt: str,
+    temperature: float = 0.8,
+    max_new_tokens: int = 1024,
+    top_p: float = 1.0,
+    top_k: int = 20,
+    penalty: float = 1.2,
+):
+    print(f'message: {message}')
+    print(f'history: {history}')
+    conversation = [
+        {"role": "system", "content": system_prompt}
+    ]
+    for prompt, answer in history:
+        conversation.extend([
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": answer},
+        ])
+    conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        max_new_tokens = max_new_tokens,
+        do_sample = False if temperature == 0 else True,
+        top_p = top_p,
+        top_k = top_k,
+        temperature = temperature,
+        eos_token_id=[128001,128008,128009],
+        streamer=streamer,
+    )
+    with torch.no_grad():
+        thread = Thread(target=model.generate, kwargs=generate_kwargs)
+        thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer
+# Vision model setup
+models = {
+    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
+}
+processors = {
+    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
+}
+user_prompt = '\n'
+assistant_prompt = '\n'
+prompt_suffix = "\n"
+# Vision model tab function
+@spaces.GPU()
+def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
+    model = models[model_id]
+    processor = processors[model_id]
+    # Prepare the image list and corresponding tags
+    images = [Image.fromarray(image).convert("RGB")]
+    placeholder = "<|image_1|>\n"  # Using the image tag as per the example
+    # Construct the prompt with the image tag and the user's text input
+    if text_input:
+        prompt_content = placeholder + text_input
+    else:
+        prompt_content = placeholder
+    messages = [
+        {"role": "user", "content": prompt_content},
+    ]
+    # Apply the chat template to the messages
+    prompt = processor.tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    # Process the inputs with the processor
+    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
+    # Generation parameters
+    generation_args = {
+        "max_new_tokens": 1000,
+        "temperature": 0.0,
+        "do_sample": False,
+    }
+    # Generate the response
+    generate_ids = model.generate(
+        **inputs,
+        eos_token_id=processor.tokenizer.eos_token_id,
+        **generation_args
+    )
+    # Remove input tokens from the generated response
+    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+    # Decode the generated output
+    response = processor.batch_decode(
+        generate_ids,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )[0]
+    return response
+# CSS for the interface
+# Styles the `.duplicate-button` class and centers <h3> headings; handed to
+# gr.Blocks(css=...) when the app is built.
+CSS = """
+.duplicate-button {
+    margin: auto !important;
+    color: white !important;
+    background: black !important;
+    border-radius: 100vh !important;
+}
+h3 {
+    text-align: center;
+}
+"""
+# HTML used as the placeholder of the Chatbot component.
+PLACEHOLDER = """
+<center>
+<p>Hi! I'm your assistant. Feel free to ask your questions</p>
+</center>
+"""
+# Page heading rendered through gr.HTML at the top of the app.
+TITLE = "<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision</center></h1>"
+# Introductory HTML describing both models; rendered through gr.HTML below
+# the title.
+EXPLANATION = """
+<div style="text-align: center; margin-top: 20px;">
+    <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
+    <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures.</p>
+    <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data. The model belongs to the Phi-3 model family and supports 128K token context length. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning, proximal policy optimization, and direct preference optimization to ensure precise instruction adherence and robust safety measures.</p>
+</div>
+"""
+# Footer HTML with author and model links; rendered through gr.HTML at the
+# bottom of the app.
+footer = """
+<div style="text-align: center; margin-top: 20px;">
+    <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
+    <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
+    <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
+    <a href="https://huggingface.co/microsoft/Phi-3.5-mini-instruct" target="_blank">microsoft/Phi-3.5-mini-instruct</a> |
+    <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct" target="_blank">microsoft/Phi-3.5-vision-instruct</a>
+    <br>
+    Made with 💖 by Pejman Ebrahimi
+</div>
+"""
+# Gradio app with two tabs
+with gr.Blocks(css=CSS, theme="small_and_pretty") as demo:
+    gr.HTML(TITLE)
+    gr.HTML(EXPLANATION)
+    with gr.Tab("Chatbot"):
+        chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
+        gr.ChatInterface(
+            fn=stream_chat,
+            chatbot=chatbot,
+            fill_height=True,
+            additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+            additional_inputs=[
+                gr.Textbox(
+                    value="You are a helpful assistant",
+                    label="System Prompt",
+                    render=False,
+                ),
+                gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    step=0.1,
+                    value=0.8,
+                    label="Temperature",
+                    render=False,
+                ),
+                gr.Slider(
+                    minimum=128,
+                    maximum=8192,
+                    step=1,
+                    value=1024,
+                    label="Max new tokens",
+                    render=False,
+                ),
+                gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    step=0.1,
+                    value=1.0,
+                    label="top_p",
+                    render=False,
+                ),
+                gr.Slider(
+                    minimum=1,
+                    maximum=20,
+                    step=1,
+                    value=20,
+                    label="top_k",
+                    render=False,
+                ),
+                gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    step=0.1,
+                    value=1.2,
+                    label="Repetition penalty",
+                    render=False,
+                ),
+            ],
+            examples=[
+                ["How to make a self-driving car?"],
+                ["Give me a creative idea to establish a startup"],
+                ["How can I improve my programming skills?"],
+                ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
+            ],
+            cache_examples=False,
+        )
+    with gr.Tab("Vision"):
+        with gr.Row():
+            input_img = gr.Image(label="Input Picture")
+        with gr.Row():
+            model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
+        with gr.Row():
+            text_input = gr.Textbox(label="Question")
+        with gr.Row():
+            submit_btn = gr.Button(value="Submit")
+        with gr.Row():
+            output_text = gr.Textbox(label="Output Text")
+        submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
+    gr.HTML(footer)
+# Launch the combined app
+demo.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+accelerate==0.30.0
+bitsandbytes
+torch
+torchvision
+transformers==4.43.0
+einops
+sentencepiece
+numpy==1.24.4
+Pillow==10.3.0
+Requests==2.31.0
+gradio