Spaces:

Joe6636564
/

coderpilot

Sleeping

App Files Files Community

Joe6636564 committed on Nov 6, 2025

Commit

9e0c9bc

verified ·

1 Parent(s): 0d19a07

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -98

app.py CHANGED Viewed

@@ -32,28 +32,30 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
 # CPU-only model loading
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID1,
-    torch_dtype=torch.float32,  # Use float32 for CPU
     device_map="cpu",
-    low_cpu_mem_usage=True  # Optimize for CPU memory
 )
-# Vision model setup - FIXED for CPU
 print("Loading vision models...")
 models = {}
 processors = {}
 try:
-    # Load vision model without flash_attention_2 for CPU
     models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
         "microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
-        torch_dtype=torch.float32,  # Use float32 for CPU
         device_map="cpu",
-        low_cpu_mem_usage=True,  # Optimize for CPU memory
-        # Remove flash_attention_2 for CPU compatibility
         _attn_implementation=None
     ).eval()
@@ -64,37 +66,21 @@ try:
     print("Vision model loaded successfully on CPU")
 except Exception as e:
     print(f"Error loading vision model: {e}")
-    # Try alternative loading method
-    try:
-        print("Trying alternative loading method for vision model...")
-        models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
-            "microsoft/Phi-3.5-vision-instruct",
-            trust_remote_code=True,
-            torch_dtype=torch.float32,
-            device_map="cpu"
-        ).eval()
-        processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
-            "microsoft/Phi-3.5-vision-instruct",
-            trust_remote_code=True
-        )
-        print("Vision model loaded successfully with alternative method")
-    except Exception as e2:
-        print(f"Failed to load vision model with alternative method: {e2}")
-# Chatbot function
 def stream_chat(
     message: str,
     history: list,
     system_prompt: str,
-    temperature: float = 0.8,
     max_new_tokens: int = 1024,
-    top_p: float = 1.0,
-    top_k: int = 20,
-    penalty: float = 1.2,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
     conversation = [{"role": "system", "content": system_prompt}]
     for prompt, answer in history:
@@ -104,18 +90,35 @@ def stream_chat(
         ])
     conversation.append({"role": "user", "content": message})
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
-        do_sample=False if temperature == 0 else True,
         top_p=top_p,
         top_k=top_k,
-        temperature=temperature,
-        eos_token_id=[128001,128008,128009],
         streamer=streamer,
     )
     with torch.no_grad():
@@ -127,7 +130,7 @@ def stream_chat(
             buffer += new_text
             yield buffer
-# Vision model function
 def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
     if model_id not in models:
         return "Vision model not available"
@@ -157,18 +160,23 @@ def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-ins
     # Process the inputs with the processor
     inputs = processor(prompt, images, return_tensors="pt").to(device)
-    # Generation parameters - reduced for CPU
     generation_args = {
-        "max_new_tokens": 300,  # Further reduced for CPU
-        "temperature": 0.0,
-        "do_sample": False,
     }
     # Generate the response
     try:
         generate_ids = model_vision.generate(
             **inputs,
-            eos_token_id=processor.tokenizer.eos_token_id,
             **generation_args
         )
@@ -184,7 +192,7 @@ def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-ins
     except Exception as e:
         return f"Error generating vision response: {str(e)}"
-# Flask API Routes
 @flask_app.route('/health', methods=['GET'])
 def health_check():
     vision_loaded = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
@@ -203,8 +211,10 @@ def api_chat():
         data = request.json
         message = data.get('message', '')
         system_prompt = data.get('system_prompt', 'You are a helpful assistant')
-        temperature = data.get('temperature', 0.8)
-        max_new_tokens = data.get('max_new_tokens', 512)  # Reduced for CPU
         # Prepare conversation
         conversation = [{"role": "system", "content": system_prompt}]
@@ -214,20 +224,26 @@ def api_chat():
             conversation, add_generation_prompt=True, return_tensors="pt"
         ).to(device)
-        # Generate response
         with torch.no_grad():
             generate_ids = model.generate(
                 input_ids,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 do_sample=temperature > 0,
-                eos_token_id=[128001, 128008, 128009]
             )
         # Decode response
         response = tokenizer.decode(
             generate_ids[0][input_ids.shape[1]:],
-            skip_special_tokens=True
         )
         return jsonify({
@@ -285,100 +301,84 @@ def run_flask():
     flask_app.run(host='0.0.0.0', port=5000, debug=False, threaded=True)
 def run_gradio():
-    # CSS for the interface
     CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
     PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
-    # Check if vision model is available
     vision_available = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
     vision_status = "Available" if vision_available else "Not Available"
-    TITLE = f"<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (CPU Version)</center></h1>"
     EXPLANATION = f"""<div style="text-align: center; margin-top: 20px;">
-        <p><strong>CPU-Only Version</strong> - This instance is running on CPU. Responses may be slower than GPU-accelerated versions.</p>
         <p><strong>Vision Model Status:</strong> {vision_status}</p>
-        <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
-        <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision.</p>
-        <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data.</p>
     </div>"""
     footer = """<div style="text-align: center; margin-top: 20px;">
-        <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
-        <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
-        <a href="https://huggingface.co/microsoft/Phi-3.5-mini-instruct" target="_blank">microsoft/Phi-3.5-mini-instruct</a> |
-        <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct" target="_blank">microsoft/Phi-3.5-vision-instruct</a>
-        <br> Made with 💖 by Pejman Ebrahimi | Running on CPU
     </div>"""
-    # Gradio app with two tabs
-    with gr.Blocks(css=CSS, theme=gr.themes.Default()) as demo:  # Changed to default theme
         gr.HTML(TITLE)
         gr.HTML(EXPLANATION)
         gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
         with gr.Tab("Chatbot"):
-            chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER, type="messages")  # Fixed deprecated type
             gr.ChatInterface(
                 fn=stream_chat,
                 chatbot=chatbot,
                 fill_height=True,
-                additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
                 additional_inputs=[
                     gr.Textbox(
-                        value="You are a helpful assistant",
                         label="System Prompt",
-                        render=False,
                     ),
                     gr.Slider(
-                        minimum=0,
-                        maximum=1,
                         step=0.1,
-                        value=0.8,
-                        label="Temperature",
-                        render=False,
                     ),
                     gr.Slider(
                         minimum=128,
-                        maximum=2048,  # Reduced for CPU
                         step=1,
-                        value=512,  # Reduced for CPU
                         label="Max new tokens",
-                        render=False,
                     ),
                     gr.Slider(
-                        minimum=0.0,
                         maximum=1.0,
                         step=0.1,
-                        value=1.0,
-                        label="top_p",
-                        render=False,
                     ),
                     gr.Slider(
                         minimum=1,
-                        maximum=20,
                         step=1,
-                        value=20,
-                        label="top_k",
-                        render=False,
                     ),
                     gr.Slider(
-                        minimum=0.0,
                         maximum=2.0,
                         step=0.1,
-                        value=1.2,
-                        label="Repetition penalty",
-                        render=False,
                     ),
                 ],
                 examples=[
-                    ["Hello, how are you?"],
-                    ["Explain quantum computing in simple terms"],
-                    ["What are the benefits of renewable energy?"],
-                    ["Write a short poem about technology"],
                 ],
                 cache_examples=False,
             )
-        # Only show vision tab if model is available
         if vision_available:
             with gr.Tab("Vision"):
                 with gr.Row():
@@ -388,27 +388,32 @@ def run_gradio():
                         choices=list(models.keys()),
                         label="Model",
                         value="microsoft/Phi-3.5-vision-instruct",
-                        allow_custom_value=False  # Fixed warning
                     )
                 with gr.Row():
-                    text_input = gr.Textbox(label="Question", value="What's in this image?")
                 with gr.Row():
-                    submit_btn = gr.Button(value="Submit")
                 with gr.Row():
-                    output_text = gr.Textbox(label="Output Text")
                 submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
         else:
             with gr.Tab("Vision"):
                 gr.HTML("""<div style="text-align: center; padding: 40px;">
                     <h3>Vision Model Not Available</h3>
-                    <p>The vision model failed to load. This is likely due to memory constraints on CPU.</p>
-                    <p>Try using the chat model instead, or run this on a system with more RAM.</p>
                 </div>""")
         gr.HTML(footer)
-    # Launch the Gradio app
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 if __name__ == "__main__":
@@ -419,9 +424,7 @@ if __name__ == "__main__":
     print(f"Vision model loaded: {len(models) > 0}")
     print("=" * 50)
-    # Start Flask server in a separate thread
     flask_thread = threading.Thread(target=run_flask, daemon=True)
     flask_thread.start()
-    # Run Gradio in main thread
     run_gradio()

 print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
+# Add padding token if it doesn't exist
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
 # CPU-only model loading
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID1,
+    torch_dtype=torch.float32,
     device_map="cpu",
+    low_cpu_mem_usage=True
 )
+# Vision model setup
 print("Loading vision models...")
 models = {}
 processors = {}
 try:
     models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
         "microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
+        torch_dtype=torch.float32,
         device_map="cpu",
+        low_cpu_mem_usage=True,
         _attn_implementation=None
     ).eval()
     print("Vision model loaded successfully on CPU")
 except Exception as e:
     print(f"Error loading vision model: {e}")
+# Optimized chatbot function with better generation parameters
 def stream_chat(
     message: str,
     history: list,
     system_prompt: str,
+    temperature: float = 0.7,  # Lower temperature for more focused responses
     max_new_tokens: int = 1024,
+    top_p: float = 0.9,  # Lower top_p for less randomness
+    top_k: int = 40,     # Moderate top_k
+    repetition_penalty: float = 1.1,  # Lower repetition penalty
 ):
     print(f'message: {message}')
     print(f'history: {history}')
     conversation = [{"role": "system", "content": system_prompt}]
     for prompt, answer in history:
         ])
     conversation.append({"role": "user", "content": message})
+    # Apply chat template
+    input_ids = tokenizer.apply_chat_template(
+        conversation,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    ).to(device)
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=60.0,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+    # Optimized generation parameters to reduce repetition
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
+        temperature=temperature,
         top_p=top_p,
         top_k=top_k,
+        repetition_penalty=repetition_penalty,  # Use repetition_penalty instead of penalty
+        do_sample=True if temperature > 0 else False,
+        pad_token_id=tokenizer.eos_token_id,
+        eos_token_id=[tokenizer.eos_token_id, 128001, 128008, 128009],
         streamer=streamer,
+        no_repeat_ngram_size=3,  # Prevent repeating n-grams
+        early_stopping=True,
     )
     with torch.no_grad():
             buffer += new_text
             yield buffer
+# Optimized vision model function
 def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
     if model_id not in models:
         return "Vision model not available"
     # Process the inputs with the processor
     inputs = processor(prompt, images, return_tensors="pt").to(device)
+    # Optimized generation parameters for vision model
     generation_args = {
+        "max_new_tokens": 500,
+        "temperature": 0.3,  # Lower temperature for more factual responses
+        "top_p": 0.9,
+        "top_k": 30,
+        "repetition_penalty": 1.1,
+        "do_sample": True,
+        "no_repeat_ngram_size": 3,
+        "early_stopping": True,
+        "eos_token_id": processor.tokenizer.eos_token_id,
     }
     # Generate the response
     try:
         generate_ids = model_vision.generate(
             **inputs,
             **generation_args
         )
     except Exception as e:
         return f"Error generating vision response: {str(e)}"
+# Flask API Routes with optimized parameters
 @flask_app.route('/health', methods=['GET'])
 def health_check():
     vision_loaded = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
         data = request.json
         message = data.get('message', '')
         system_prompt = data.get('system_prompt', 'You are a helpful assistant')
+        temperature = data.get('temperature', 0.7)  # Default to lower temperature
+        max_new_tokens = data.get('max_new_tokens', 512)
+        top_p = data.get('top_p', 0.9)
+        repetition_penalty = data.get('repetition_penalty', 1.1)
         # Prepare conversation
         conversation = [{"role": "system", "content": system_prompt}]
             conversation, add_generation_prompt=True, return_tensors="pt"
         ).to(device)
+        # Generate response with optimized parameters
         with torch.no_grad():
             generate_ids = model.generate(
                 input_ids,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
                 do_sample=temperature > 0,
+                no_repeat_ngram_size=3,
+                early_stopping=True,
+                eos_token_id=[tokenizer.eos_token_id, 128001, 128008, 128009],
+                pad_token_id=tokenizer.eos_token_id,
             )
         # Decode response
         response = tokenizer.decode(
             generate_ids[0][input_ids.shape[1]:],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=True
         )
         return jsonify({
     flask_app.run(host='0.0.0.0', port=5000, debug=False, threaded=True)
 def run_gradio():
     CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
     PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
     vision_available = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
     vision_status = "Available" if vision_available else "Not Available"
+    TITLE = f"<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (Optimized CPU Version)</center></h1>"
     EXPLANATION = f"""<div style="text-align: center; margin-top: 20px;">
+        <p><strong>Optimized CPU Version</strong> - Better response quality with reduced repetition</p>
         <p><strong>Vision Model Status:</strong> {vision_status}</p>
+        <p><strong>Optimizations applied:</strong> Lower temperature, repetition penalty, and no-repeat n-gram size</p>
     </div>"""
     footer = """<div style="text-align: center; margin-top: 20px;">
+        <br> Made with 💖 by Pejman Ebrahimi | Running on CPU with optimized parameters
     </div>"""
+    with gr.Blocks(css=CSS, theme=gr.themes.Default()) as demo:
         gr.HTML(TITLE)
         gr.HTML(EXPLANATION)
         gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
         with gr.Tab("Chatbot"):
+            chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER, type="messages")
             gr.ChatInterface(
                 fn=stream_chat,
                 chatbot=chatbot,
                 fill_height=True,
+                additional_inputs_accordion=gr.Accordion(label="⚙️ Advanced Parameters", open=False),
                 additional_inputs=[
                     gr.Textbox(
+                        value="You are a helpful AI assistant. Provide accurate, concise, and non-repetitive responses.",
                         label="System Prompt",
                     ),
                     gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
                         step=0.1,
+                        value=0.7,
+                        label="Temperature (lower = more focused)",
                     ),
                     gr.Slider(
                         minimum=128,
+                        maximum=2048,
                         step=1,
+                        value=512,
                         label="Max new tokens",
                     ),
                     gr.Slider(
+                        minimum=0.5,
                         maximum=1.0,
                         step=0.1,
+                        value=0.9,
+                        label="Top-p (nucleus sampling)",
                     ),
                     gr.Slider(
                         minimum=1,
+                        maximum=100,
                         step=1,
+                        value=40,
+                        label="Top-k",
                     ),
                     gr.Slider(
+                        minimum=1.0,
                         maximum=2.0,
                         step=0.1,
+                        value=1.1,
+                        label="Repetition Penalty",
                     ),
                 ],
                 examples=[
+                    ["Explain the concept of machine learning in simple terms"],
+                    ["What are the main differences between Python and JavaScript?"],
+                    ["How does photosynthesis work in plants?"],
+                    ["Write a brief summary of the history of the internet"],
                 ],
                 cache_examples=False,
             )
         if vision_available:
             with gr.Tab("Vision"):
                 with gr.Row():
                         choices=list(models.keys()),
                         label="Model",
                         value="microsoft/Phi-3.5-vision-instruct",
+                        allow_custom_value=False
                     )
                 with gr.Row():
+                    text_input = gr.Textbox(
+                        label="Question",
+                        value="Describe what you see in this image in detail without repetition.",
+                        placeholder="Ask a specific question about the image..."
+                    )
                 with gr.Row():
+                    submit_btn = gr.Button(value="Analyze Image")
                 with gr.Row():
+                    output_text = gr.Textbox(
+                        label="Analysis Result",
+                        lines=5
+                    )
                 submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
         else:
             with gr.Tab("Vision"):
                 gr.HTML("""<div style="text-align: center; padding: 40px;">
                     <h3>Vision Model Not Available</h3>
+                    <p>The vision model failed to load due to memory constraints.</p>
                 </div>""")
         gr.HTML(footer)
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 if __name__ == "__main__":
     print(f"Vision model loaded: {len(models) > 0}")
     print("=" * 50)
     flask_thread = threading.Thread(target=run_flask, daemon=True)
     flask_thread.start()
     run_gradio()