Commit 3bf5e4f
Parent(s): 537ecfd
updated app.py

app.py CHANGED
@@ -3,6 +3,8 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import time
+import gc
+import os

 # Configuration
 BASE_MODEL = "microsoft/phi-2"
@@ -17,6 +19,13 @@ class ModelWrapper:
     def load_model(self):
         if not self.loaded:
             try:
+                # Force CPU usage
+                os.environ["CUDA_VISIBLE_DEVICES"] = ""
+                device = torch.device("cpu")
+
+                # Clear memory
+                gc.collect()
+
                 print("Loading tokenizer...")
                 self.tokenizer = AutoTokenizer.from_pretrained(
                     BASE_MODEL,
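Hiding the GPU through an environment variable is order-sensitive: `CUDA_VISIBLE_DEVICES` is only honored before the process first initializes CUDA, which is why the commit sets it at the very top of `load_model()`. A minimal standalone sketch (not part of app.py) to confirm the setting took effect:

```python
import os

# Must happen before torch initializes CUDA, ideally before `import torch`.
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide every GPU from this process

import torch

# With no visible devices, torch falls back to CPU everywhere.
print(torch.cuda.is_available())  # expected: False
print(torch.cuda.device_count())  # expected: 0
```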
@@ -28,19 +37,25 @@ class ModelWrapper:
                 print("Loading base model...")
                 base_model = AutoModelForCausalLM.from_pretrained(
                     BASE_MODEL,
-                    torch_dtype=torch.
-                    device_map="
+                    torch_dtype=torch.float32,  # Use float32 for CPU
+                    device_map="cpu",
                     trust_remote_code=True,
-                    use_flash_attention_2=False
+                    use_flash_attention_2=False,
+                    low_cpu_mem_usage=True
                 )

                 print("Loading LoRA adapter...")
                 self.model = PeftModel.from_pretrained(
                     base_model,
                     ADAPTER_MODEL,
-                    torch_dtype=torch.
-                    device_map="
+                    torch_dtype=torch.float32,  # Use float32 for CPU
+                    device_map="cpu"
                 )
+
+                # Free up memory
+                del base_model
+                gc.collect()
+
                 self.model.eval()
                 print("Model loading complete!")
                 self.loaded = True
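Two things are worth checking after this hunk. First, `PeftModel` keeps its own reference to the base model, so `del base_model` only drops the local name; the weights themselves stay alive (as intended), and `gc.collect()` reclaims just the leftover bookkeeping. Second, the dtype and device choices can be verified directly. A small sketch, assuming `model_wrapper.load_model()` has already run:

```python
# Inspect the first parameter of the loaded PEFT model (standard PyTorch
# accessors; `model_wrapper` is the instance defined later in app.py).
param = next(model_wrapper.model.parameters())

print(param.device)  # expected: cpu            (device_map="cpu")
print(param.dtype)   # expected: torch.float32  (torch_dtype=torch.float32)
```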
@@ -61,9 +76,10 @@ Include:
 - Function implementation with comments
 - Example usage
 - Output demonstration
-
+
+Provide only the implementation, no conversation."""
             elif any(word in prompt.lower() for word in ["explain", "what is", "how does", "describe"]):
-                enhanced_prompt = f"""Below is a request for explanation.
+                enhanced_prompt = f"""Below is a request for explanation. Provide a complete, focused response without any conversation:

 {prompt}

@@ -72,13 +88,13 @@ Your response should include:
 2. Practical examples and applications
 3. Important concepts to understand

-
+End your response when the explanation is complete. Do not ask questions or engage in conversation."""
             else:
-                enhanced_prompt = f"""Below is a request.
+                enhanced_prompt = f"""Below is a request. Provide a complete, focused response without any conversation:

 {prompt}

-
+End your response when complete. Do not ask questions or engage in conversation."""

             print(f"Enhanced prompt: {enhanced_prompt}")  # Debug logging

@@ -89,26 +105,26 @@ Response:"""
                 truncation=True,
                 max_length=512,
                 padding=True
-            ).to(
+            ).to("cpu")  # Ensure CPU usage

-            # Generate
+            # Generate with more conservative parameters for CPU
             start_time = time.time()
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
-                    max_length=max_length,
-                    min_length=50,
-                    temperature=
-                    top_p=min(0.
+                    max_length=min(max_length, 384),  # Limit max length for CPU
+                    min_length=50,
+                    temperature=min(0.5, temperature),
+                    top_p=min(0.85, top_p),
                     do_sample=True,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
-                    repetition_penalty=1.
-                    no_repeat_ngram_size=
+                    repetition_penalty=1.3,
+                    no_repeat_ngram_size=4,
                     num_return_sequences=1,
                     early_stopping=True,
-                    num_beams=
-                    length_penalty=0.
+                    num_beams=2,  # Reduced beam search for CPU
+                    length_penalty=0.6
                 )

             # Decode response
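The clamping turns the UI sliders into upper bounds rather than exact settings, which keeps CPU latency predictable. Note also that `early_stopping=True` only takes effect here because `num_beams=2` enables beam search. A small sketch of the effective values, using hypothetical slider inputs:

```python
# Hypothetical slider inputs (these exact numbers are illustrative only).
max_length, temperature, top_p = 512, 0.7, 0.9

# The same clamping the commit applies inside generate():
effective = {
    "max_length": min(max_length, 384),    # hard cap for CPU latency
    "temperature": min(0.5, temperature),  # keep sampling focused
    "top_p": min(0.85, top_p),             # cap the nucleus mass
}
print(effective)
# {'max_length': 384, 'temperature': 0.5, 'top_p': 0.85}
```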
@@ -121,7 +137,7 @@ Response:"""

             print(f"After prompt removal: {response}")  # Debug logging

-            # Remove common closure patterns
+            # Remove common closure patterns and conversation starters
             closures = [
                 "Best regards,",
                 "Sincerely,",
@@ -134,21 +150,52 @@ Response:"""
                 "[Student]",
                 "Let me know if you need any clarification",
                 "I hope this helps",
-                "Feel free to ask"
+                "Feel free to ask",
+                "Can you provide",
+                "Would you like",
+                "Do you want",
+                "Let me know",
+                "Please let me know",
+                "Is there anything else",
+                "Do you have any questions",
+                "Sure!",
+                "Here are some examples:"
             ]

+            # First remove conversation starters from the end
             for closure in closures:
                 if response.lower().endswith(closure.lower()):
                     response = response[:-(len(closure))].strip()

-
+            # Then remove any remaining conversation patterns
+            conversation_patterns = [
+                r"\?\s*$",  # Questions at the end
+                r"Sure!.*$",  # Responses starting with Sure!
+                r"Here are.*examples:?\s*$",  # Incomplete example lists
+                r"Can you.*\?\s*$",  # Questions starting with Can you
+                r"Would you.*\?\s*$",  # Questions starting with Would you
+                r"Do you.*\?\s*$",  # Questions starting with Do you
+                r"Let me know.*$",  # Let me know phrases
+                r"I hope.*$",  # I hope phrases
+                r"Feel free.*$"  # Feel free phrases
+            ]
+
+            import re
+            for pattern in conversation_patterns:
+                response = re.sub(pattern, "", response).strip()
+
+            print(f"After conversation removal: {response}")  # Debug logging

             # Ensure code examples are properly formatted
             if "```python" not in response and "def " in response:
                 response = "```python\n" + response + "\n```"

-            # More lenient validation
-            if len(response.strip()) < 20 or
+            # More lenient validation but check for conversation markers
+            if (len(response.strip()) < 20 or
+                response.strip() == "Response:" or
+                response.strip().endswith("?") or
+                "can you" in response.lower() or
+                "let me know" in response.lower()):
                 print("Response validation failed - using fallback")  # Debug logging

                 if "machine learning" in prompt.lower():
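To see what the new cleanup pass actually strips, here is the same closure-plus-regex logic run on a made-up model output (a minimal sketch; the sample text is invented). One caveat: without `re.MULTILINE`, the `$` anchors match only at the very end of the string, so these patterns trim trailing chatter rather than anything mid-response.

```python
import re

response = "Binary search halves the search interval each step. I hope this helps"

# Subset of the closure list above, applied exactly as in the commit.
closures = ["Let me know if you need any clarification",
            "I hope this helps", "Feel free to ask"]
for closure in closures:
    if response.lower().endswith(closure.lower()):
        response = response[:-(len(closure))].strip()

# Subset of the conversation patterns above.
conversation_patterns = [r"\?\s*$", r"Let me know.*$", r"I hope.*$"]
for pattern in conversation_patterns:
    response = re.sub(pattern, "", response).strip()

print(response)  # Binary search halves the search interval each step.
```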
@@ -194,7 +241,7 @@ result = add_numbers(num1, num2)
 print(f"The sum of {num1} and {num2} is: {result}")  # Output: The sum of 5 and 3 is: 8
 ```"""
                 else:
-                    fallback_response = "I apologize, but I couldn't generate a complete response. Please try
+                    fallback_response = "I apologize, but I couldn't generate a complete response. Please try using a lower temperature (0.3-0.5) for more focused output."

                 response = fallback_response

@@ -207,7 +254,7 @@ print(f"The sum of {num1} and {num2} is: {result}")  # Output: The sum of 5 and
 # Initialize model wrapper
 model_wrapper = ModelWrapper()

-def generate_text(prompt, max_length=512, temperature=0.7, top_p=0.9):
+def generate_text(prompt, max_length=384, temperature=0.7, top_p=0.9):  # Reduced default max_length
     """Gradio interface function"""
     try:
         if not prompt.strip():
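With the reduced default in place, the function can also be exercised directly, bypassing the UI; a hypothetical call for local testing (argument values are illustrative):

```python
# Direct call; max_length falls back to the new 384 default.
reply = generate_text(
    "Write a Python function to reverse a string",
    temperature=0.4,  # low temperature suits code generation (see Tips below)
    top_p=0.85,
)
print(reply)
```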
@@ -224,7 +271,7 @@ def generate_text(prompt, max_length=512, temperature=0.7, top_p=0.9):
         print(f"Error in generate_text: {str(e)}")
         return f"Error generating response: {str(e)}\nPlease try again with a different prompt or parameters."

-# Create the Gradio interface
+# Create the Gradio interface with CPU-friendly defaults
 demo = gr.Interface(
     fn=generate_text,
     inputs=[
@@ -235,8 +282,8 @@ demo = gr.Interface(
         ),
         gr.Slider(
             minimum=64,
-            maximum=
-            value=
+            maximum=512,
+            value=384,  # Reduced default
             step=64,
             label="Maximum Length",
             info="Longer values = longer responses but slower generation"
@@ -244,7 +291,7 @@ demo = gr.Interface(
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
-            value=0.
+            value=0.5,  # Reduced default
             step=0.1,
             label="Temperature",
             info="Higher values = more creative, lower values = more focused"
@@ -252,14 +299,14 @@ demo = gr.Interface(
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
-            value=0.
+            value=0.85,  # Adjusted default
             step=0.1,
             label="Top P",
             info="Controls diversity of word choices"
         ),
     ],
     outputs=gr.Textbox(label="Generated Response", lines=8),
-    title="Phi-2 QLoRA Fine-tuned Assistant",
+    title="Phi-2 QLoRA Fine-tuned Assistant (CPU Version)",
     description="""This is a fine-tuned version of Microsoft's Phi-2 model using QLoRA.
 The model has been trained to provide helpful responses for various tasks including coding, writing, and general assistance.

@@ -270,39 +317,39 @@ demo = gr.Interface(

     Tips:
     - For code generation, use lower temperature (0.3-0.5)
-    - For creative writing, use higher temperature (0.
-    -
+    - For creative writing, use higher temperature (0.5-0.7)
+    - Keep max length lower (256-384) for faster responses
     """,
     examples=[
         [
             "Write a Python function to calculate the factorial of a number and provide additional recursive function examples",
-
+            384,
             0.5,
-            0.
+            0.85
         ],
         [
             "Explain what machine learning is in simple terms and provide some real-world applications",
-
-            0.
-            0.
+            384,
+            0.5,
+            0.85
         ],
         [
             "Write a professional email to schedule a team meeting for next week to discuss project progress",
-
-            0.
-            0.
+            384,
+            0.5,
+            0.85
         ],
         [
             "Write a Python function to implement binary search algorithm with detailed comments",
-
+            384,
             0.5,
-            0.
+            0.85
         ],
         [
             "Explain the concept of object-oriented programming using a real-world analogy",
-
-            0.
-            0.
+            384,
+            0.5,
+            0.85
         ]
     ],
     cache_examples=False
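The diff ends inside the `gr.Interface(...)` call, so the closing parenthesis and the launch code are presumably unchanged below the shown region. For completeness, a hypothetical tail (not shown in this commit):

```python
# Hypothetical, unchanged tail of app.py: launch the interface. queue() is
# optional but useful on CPU Spaces, where one generation can take a while.
demo.queue()
demo.launch()
```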