Spaces:

Joe6636564
/

coderpilot

Sleeping

App Files Files Community

Joe6636564 committed on Nov 6, 2025

Commit

2949c89

verified ·

1 Parent(s): 9e0c9bc

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -157

app.py CHANGED Viewed

@@ -32,16 +32,12 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
-# Add padding token if it doesn't exist
-if tokenizer.pad_token is None:
-    tokenizer.pad_token = tokenizer.eos_token
 # CPU-only model loading
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID1,
-    torch_dtype=torch.float32,
     device_map="cpu",
-    low_cpu_mem_usage=True
 )
 # Vision model setup
@@ -53,10 +49,9 @@ try:
     models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
         "microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
-        torch_dtype=torch.float32,
         device_map="cpu",
-        low_cpu_mem_usage=True,
-        _attn_implementation=None
     ).eval()
     processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
@@ -67,20 +62,19 @@ try:
 except Exception as e:
     print(f"Error loading vision model: {e}")
-# Optimized chatbot function with better generation parameters
 def stream_chat(
     message: str,
     history: list,
     system_prompt: str,
-    temperature: float = 0.7,  # Lower temperature for more focused responses
     max_new_tokens: int = 1024,
-    top_p: float = 0.9,  # Lower top_p for less randomness
-    top_k: int = 40,     # Moderate top_k
-    repetition_penalty: float = 1.1,  # Lower repetition penalty
 ):
     print(f'message: {message}')
     print(f'history: {history}')
     conversation = [{"role": "system", "content": system_prompt}]
     for prompt, answer in history:
@@ -90,35 +84,18 @@ def stream_chat(
         ])
     conversation.append({"role": "user", "content": message})
-    # Apply chat template
-    input_ids = tokenizer.apply_chat_template(
-        conversation,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to(device)
-    streamer = TextIteratorStreamer(
-        tokenizer,
-        timeout=60.0,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-    # Optimized generation parameters to reduce repetition
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
-        temperature=temperature,
         top_p=top_p,
         top_k=top_k,
-        repetition_penalty=repetition_penalty,  # Use repetition_penalty instead of penalty
-        do_sample=True if temperature > 0 else False,
-        pad_token_id=tokenizer.eos_token_id,
-        eos_token_id=[tokenizer.eos_token_id, 128001, 128008, 128009],
         streamer=streamer,
-        no_repeat_ngram_size=3,  # Prevent repeating n-grams
-        early_stopping=True,
     )
     with torch.no_grad():
@@ -130,7 +107,7 @@ def stream_chat(
             buffer += new_text
             yield buffer
-# Optimized vision model function
 def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
     if model_id not in models:
         return "Vision model not available"
@@ -160,48 +137,39 @@ def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-ins
     # Process the inputs with the processor
     inputs = processor(prompt, images, return_tensors="pt").to(device)
-    # Optimized generation parameters for vision model
     generation_args = {
-        "max_new_tokens": 500,
-        "temperature": 0.3,  # Lower temperature for more factual responses
-        "top_p": 0.9,
-        "top_k": 30,
-        "repetition_penalty": 1.1,
-        "do_sample": True,
-        "no_repeat_ngram_size": 3,
-        "early_stopping": True,
-        "eos_token_id": processor.tokenizer.eos_token_id,
     }
     # Generate the response
-    try:
-        generate_ids = model_vision.generate(
-            **inputs,
-            **generation_args
-        )
-        # Remove input tokens from the generated response
-        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-        # Decode the generated output
-        response = processor.batch_decode(
-            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-        return response
-    except Exception as e:
-        return f"Error generating vision response: {str(e)}"
-# Flask API Routes with optimized parameters
 @flask_app.route('/health', methods=['GET'])
 def health_check():
-    vision_loaded = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
     return jsonify({
         "status": "healthy",
         "device": device,
         "models_loaded": {
             "chatbot": MODEL_ID1 in globals() and 'model' in globals(),
-            "vision": vision_loaded
         }
     })
@@ -211,10 +179,8 @@ def api_chat():
         data = request.json
         message = data.get('message', '')
         system_prompt = data.get('system_prompt', 'You are a helpful assistant')
-        temperature = data.get('temperature', 0.7)  # Default to lower temperature
-        max_new_tokens = data.get('max_new_tokens', 512)
-        top_p = data.get('top_p', 0.9)
-        repetition_penalty = data.get('repetition_penalty', 1.1)
         # Prepare conversation
         conversation = [{"role": "system", "content": system_prompt}]
@@ -224,26 +190,20 @@ def api_chat():
             conversation, add_generation_prompt=True, return_tensors="pt"
         ).to(device)
-        # Generate response with optimized parameters
         with torch.no_grad():
             generate_ids = model.generate(
                 input_ids,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
-                top_p=top_p,
-                repetition_penalty=repetition_penalty,
                 do_sample=temperature > 0,
-                no_repeat_ngram_size=3,
-                early_stopping=True,
-                eos_token_id=[tokenizer.eos_token_id, 128001, 128008, 128009],
-                pad_token_id=tokenizer.eos_token_id,
             )
         # Decode response
         response = tokenizer.decode(
             generate_ids[0][input_ids.shape[1]:],
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=True
         )
         return jsonify({
@@ -289,142 +249,126 @@ def api_vision():
 @flask_app.route('/api/models', methods=['GET'])
 def get_models():
-    vision_loaded = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
     return jsonify({
         "chat_model": MODEL_ID1,
-        "vision_models": list(models.keys()) if vision_loaded else [],
-        "device": device,
-        "vision_available": vision_loaded
     })
 def run_flask():
     flask_app.run(host='0.0.0.0', port=5000, debug=False, threaded=True)
 def run_gradio():
     CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
     PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
-    vision_available = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
-    vision_status = "Available" if vision_available else "Not Available"
-    TITLE = f"<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (Optimized CPU Version)</center></h1>"
-    EXPLANATION = f"""<div style="text-align: center; margin-top: 20px;">
-        <p><strong>Optimized CPU Version</strong> - Better response quality with reduced repetition</p>
-        <p><strong>Vision Model Status:</strong> {vision_status}</p>
-        <p><strong>Optimizations applied:</strong> Lower temperature, repetition penalty, and no-repeat n-gram size</p>
     </div>"""
     footer = """<div style="text-align: center; margin-top: 20px;">
-        <br> Made with 💖 by Pejman Ebrahimi | Running on CPU with optimized parameters
     </div>"""
-    with gr.Blocks(css=CSS, theme=gr.themes.Default()) as demo:
         gr.HTML(TITLE)
         gr.HTML(EXPLANATION)
         gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
         with gr.Tab("Chatbot"):
-            chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER, type="messages")
             gr.ChatInterface(
                 fn=stream_chat,
                 chatbot=chatbot,
                 fill_height=True,
-                additional_inputs_accordion=gr.Accordion(label="⚙️ Advanced Parameters", open=False),
                 additional_inputs=[
                     gr.Textbox(
-                        value="You are a helpful AI assistant. Provide accurate, concise, and non-repetitive responses.",
                         label="System Prompt",
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
                         step=0.1,
-                        value=0.7,
-                        label="Temperature (lower = more focused)",
                     ),
                     gr.Slider(
                         minimum=128,
-                        maximum=2048,
                         step=1,
-                        value=512,
                         label="Max new tokens",
                     ),
                     gr.Slider(
-                        minimum=0.5,
                         maximum=1.0,
                         step=0.1,
-                        value=0.9,
-                        label="Top-p (nucleus sampling)",
                     ),
                     gr.Slider(
                         minimum=1,
-                        maximum=100,
                         step=1,
-                        value=40,
-                        label="Top-k",
                     ),
                     gr.Slider(
-                        minimum=1.0,
                         maximum=2.0,
                         step=0.1,
-                        value=1.1,
-                        label="Repetition Penalty",
                     ),
                 ],
                 examples=[
-                    ["Explain the concept of machine learning in simple terms"],
-                    ["What are the main differences between Python and JavaScript?"],
-                    ["How does photosynthesis work in plants?"],
-                    ["Write a brief summary of the history of the internet"],
                 ],
                 cache_examples=False,
             )
-        if vision_available:
-            with gr.Tab("Vision"):
-                with gr.Row():
-                    input_img = gr.Image(label="Input Picture")
-                with gr.Row():
-                    model_selector = gr.Dropdown(
-                        choices=list(models.keys()),
-                        label="Model",
-                        value="microsoft/Phi-3.5-vision-instruct",
-                        allow_custom_value=False
-                    )
-                with gr.Row():
-                    text_input = gr.Textbox(
-                        label="Question",
-                        value="Describe what you see in this image in detail without repetition.",
-                        placeholder="Ask a specific question about the image..."
-                    )
-                with gr.Row():
-                    submit_btn = gr.Button(value="Analyze Image")
-                with gr.Row():
-                    output_text = gr.Textbox(
-                        label="Analysis Result",
-                        lines=5
-                    )
-                submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
-        else:
-            with gr.Tab("Vision"):
-                gr.HTML("""<div style="text-align: center; padding: 40px;">
-                    <h3>Vision Model Not Available</h3>
-                    <p>The vision model failed to load due to memory constraints.</p>
-                </div>""")
         gr.HTML(footer)
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 if __name__ == "__main__":
-    print("=" * 50)
-    print("Application Starting Up...")
-    print(f"Device: {device}")
-    print(f"Chat model loaded: {MODEL_ID1}")
-    print(f"Vision model loaded: {len(models) > 0}")
-    print("=" * 50)
     flask_thread = threading.Thread(target=run_flask, daemon=True)
     flask_thread.start()
     run_gradio()

 print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
 # CPU-only model loading
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID1,
+    torch_dtype=torch.float32,  # Use float32 for CPU
     device_map="cpu",
+    low_cpu_mem_usage=True  # Optimize for CPU memory
 )
 # Vision model setup
     models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
         "microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
+        torch_dtype=torch.float32,  # Use float32 for CPU
         device_map="cpu",
+        low_cpu_mem_usage=True  # Optimize for CPU memory
     ).eval()
     processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
 except Exception as e:
     print(f"Error loading vision model: {e}")
+# Chatbot function
 def stream_chat(
     message: str,
     history: list,
     system_prompt: str,
+    temperature: float = 0.8,
     max_new_tokens: int = 1024,
+    top_p: float = 1.0,
+    top_k: int = 20,
+    penalty: float = 1.2,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
     conversation = [{"role": "system", "content": system_prompt}]
     for prompt, answer in history:
         ])
     conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0 else True,
         top_p=top_p,
         top_k=top_k,
+        temperature=temperature,
+        eos_token_id=[128001,128008,128009],
         streamer=streamer,
     )
     with torch.no_grad():
             buffer += new_text
             yield buffer
+# Vision model function
 def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
     if model_id not in models:
         return "Vision model not available"
     # Process the inputs with the processor
     inputs = processor(prompt, images, return_tensors="pt").to(device)
+    # Generation parameters
     generation_args = {
+        "max_new_tokens": 500,  # Reduced for CPU
+        "temperature": 0.0,
+        "do_sample": False,
     }
     # Generate the response
+    generate_ids = model_vision.generate(
+        **inputs,
+        eos_token_id=processor.tokenizer.eos_token_id,
+        **generation_args
+    )
+    # Remove input tokens from the generated response
+    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+    # Decode the generated output
+    response = processor.batch_decode(
+        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    return response
+# Flask API Routes
 @flask_app.route('/health', methods=['GET'])
 def health_check():
     return jsonify({
         "status": "healthy",
         "device": device,
         "models_loaded": {
             "chatbot": MODEL_ID1 in globals() and 'model' in globals(),
+            "vision": len(models) > 0
         }
     })
         data = request.json
         message = data.get('message', '')
         system_prompt = data.get('system_prompt', 'You are a helpful assistant')
+        temperature = data.get('temperature', 0.8)
+        max_new_tokens = data.get('max_new_tokens', 512)  # Reduced for CPU
         # Prepare conversation
         conversation = [{"role": "system", "content": system_prompt}]
             conversation, add_generation_prompt=True, return_tensors="pt"
         ).to(device)
+        # Generate response
         with torch.no_grad():
             generate_ids = model.generate(
                 input_ids,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 do_sample=temperature > 0,
+                eos_token_id=[128001, 128008, 128009]
             )
         # Decode response
         response = tokenizer.decode(
             generate_ids[0][input_ids.shape[1]:],
+            skip_special_tokens=True
         )
         return jsonify({
 @flask_app.route('/api/models', methods=['GET'])
 def get_models():
     return jsonify({
         "chat_model": MODEL_ID1,
+        "vision_models": list(models.keys()),
+        "device": device
     })
 def run_flask():
     flask_app.run(host='0.0.0.0', port=5000, debug=False, threaded=True)
 def run_gradio():
+    # CSS for the interface
     CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
     PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
+    TITLE = "<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (CPU Version)</center></h1>"
+    EXPLANATION = """<div style="text-align: center; margin-top: 20px;">
+        <p><strong>CPU-Only Version</strong> - This instance is running on CPU. Responses may be slower than GPU-accelerated versions.</p>
+        <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
+        <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support.</p>
+        <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data. The model belongs to the Phi-3 model family and supports 128K token context length.</p>
     </div>"""
     footer = """<div style="text-align: center; margin-top: 20px;">
+        <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
+        <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
+        <a href="https://huggingface.co/microsoft/Phi-3.5-mini-instruct" target="_blank">microsoft/Phi-3.5-mini-instruct</a> |
+        <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct" target="_blank">microsoft/Phi-3.5-vision-instruct</a>
+        <br> Made with 💖 by Pejman Ebrahimi | Running on CPU
     </div>"""
+    # Gradio app with two tabs
+    with gr.Blocks(css=CSS, theme="small_and_pretty") as demo:
         gr.HTML(TITLE)
         gr.HTML(EXPLANATION)
         gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
         with gr.Tab("Chatbot"):
+            chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
             gr.ChatInterface(
                 fn=stream_chat,
                 chatbot=chatbot,
                 fill_height=True,
+                additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
                 additional_inputs=[
                     gr.Textbox(
+                        value="You are a helpful assistant",
                         label="System Prompt",
+                        render=False,
                     ),
                     gr.Slider(
+                        minimum=0,
+                        maximum=1,
                         step=0.1,
+                        value=0.8,
+                        label="Temperature",
+                        render=False,
                     ),
                     gr.Slider(
                         minimum=128,
+                        maximum=2048,  # Reduced for CPU
                         step=1,
+                        value=512,  # Reduced for CPU
                         label="Max new tokens",
+                        render=False,
                     ),
                     gr.Slider(
+                        minimum=0.0,
                         maximum=1.0,
                         step=0.1,
+                        value=1.0,
+                        label="top_p",
+                        render=False,
                     ),
                     gr.Slider(
                         minimum=1,
+                        maximum=20,
                         step=1,
+                        value=20,
+                        label="top_k",
+                        render=False,
                     ),
                     gr.Slider(
+                        minimum=0.0,
                         maximum=2.0,
                         step=0.1,
+                        value=1.2,
+                        label="Repetition penalty",
+                        render=False,
                     ),
                 ],
                 examples=[
+                    ["Hello, how are you?"],
+                    ["Explain quantum computing in simple terms"],
+                    ["What are the benefits of renewable energy?"],
+                    ["Write a short poem about technology"],
                 ],
                 cache_examples=False,
             )
+        with gr.Tab("Vision"):
+            with gr.Row():
+                input_img = gr.Image(label="Input Picture")
+            with gr.Row():
+                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
+            with gr.Row():
+                text_input = gr.Textbox(label="Question", value="What's in this image?")
+            with gr.Row():
+                submit_btn = gr.Button(value="Submit")
+            with gr.Row():
+                output_text = gr.Textbox(label="Output Text")
+            submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
         gr.HTML(footer)
+    # Launch the Gradio app
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 if __name__ == "__main__":
+    # Start Flask server in a separate thread
     flask_thread = threading.Thread(target=run_flask, daemon=True)
     flask_thread.start()
+    # Run Gradio in main thread
     run_gradio()