Commit a27324e · updated app.py
1 parent: 3bf5e4f

app.py CHANGED
@@ -5,11 +5,17 @@ from peft import PeftModel
 import time
 import gc
 import os
+import psutil
 
 # Configuration
 BASE_MODEL = "microsoft/phi-2"
 ADAPTER_MODEL = "pradeep6kumar2024/phi2-qlora-assistant"
 
+# Memory monitoring
+def get_memory_usage():
+    process = psutil.Process(os.getpid())
+    return process.memory_info().rss / (1024 * 1024)  # MB
+
 class ModelWrapper:
     def __init__(self):
         self.model = None
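The new `get_memory_usage()` helper reports the resident set size (RSS) of the current process via psutil. A standalone sketch of the same pattern; the allocation and figures below are illustrative only, not part of the commit:

```python
import os
import psutil

def get_memory_usage():
    # RSS of the current process, in MB
    return psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

before = get_memory_usage()
buf = bytearray(50 * 1024 * 1024)  # allocate ~50 MB
after = get_memory_usage()
print(f"RSS grew by roughly {after - before:.0f} MB")  # typically ~50
```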
@@ -26,6 +32,8 @@ class ModelWrapper:
             # Clear memory
             gc.collect()
 
+            print(f"Memory before loading: {get_memory_usage():.2f} MB")
+
             print("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 BASE_MODEL,
@@ -34,21 +42,26 @@ class ModelWrapper:
             )
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
+            print(f"Memory after tokenizer: {get_memory_usage():.2f} MB")
+
             print("Loading base model...")
             base_model = AutoModelForCausalLM.from_pretrained(
                 BASE_MODEL,
-                torch_dtype=torch.float32,
+                torch_dtype=torch.float32,
                 device_map="cpu",
                 trust_remote_code=True,
                 use_flash_attention_2=False,
-                low_cpu_mem_usage=True
+                low_cpu_mem_usage=True,
+                offload_folder="offload"
             )
 
+            print(f"Memory after base model: {get_memory_usage():.2f} MB")
+
             print("Loading LoRA adapter...")
             self.model = PeftModel.from_pretrained(
                 base_model,
                 ADAPTER_MODEL,
-                torch_dtype=torch.float32,
+                torch_dtype=torch.float32,
                 device_map="cpu"
             )
 
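Loading the adapter with `PeftModel.from_pretrained` keeps the LoRA weights as a wrapper around the base model. An alternative worth noting (not used in this commit) is PEFT's `merge_and_unload()`, which folds the adapter deltas into the base weights and returns a plain transformers model, removing the adapter indirection at CPU inference time. A hedged sketch reusing the names from the diff:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float32,
    device_map="cpu",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base_model, "pradeep6kumar2024/phi2-qlora-assistant")
model = model.merge_and_unload()  # fold LoRA deltas into the base weights
model.eval()
```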
@@ -56,6 +69,8 @@ class ModelWrapper:
             del base_model
             gc.collect()
 
+            print(f"Memory after adapter: {get_memory_usage():.2f} MB")
+
             self.model.eval()
             print("Model loading complete!")
             self.loaded = True
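One caveat with the cleanup above: `PeftModel` holds a reference to the model it wraps, so `del base_model` only removes the local name; the weights stay reachable through the adapter and `gc.collect()` cannot release them. The memory print added after the adapter makes this visible. A hypothetical check inside `load_model` (not in the commit):

```python
# The wrapped base model is still alive after `del base_model`:
assert self.model.get_base_model() is not None
```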
@@ -63,188 +78,79 @@
             print(f"Error during model loading: {str(e)}")
             raise
 
-    def generate_response(self, prompt, max_length=
+    def generate_response(self, prompt, max_length=256, temperature=0.7, top_p=0.9):
         if not self.loaded:
             self.load_model()
 
         try:
-            #
+            # Use shorter prompts to save memory
             if "function" in prompt.lower() and "python" in prompt.lower():
-                enhanced_prompt = f"""Write
-{prompt}
-Include:
-- Function implementation with comments
-- Example usage
-- Output demonstration
-
-Provide only the implementation, no conversation."""
+                enhanced_prompt = f"""Write Python function: {prompt}"""
             elif any(word in prompt.lower() for word in ["explain", "what is", "how does", "describe"]):
-                enhanced_prompt = f"""
-
-{prompt}
-
-Your response should include:
-1. A clear explanation in simple terms
-2. Practical examples and applications
-3. Important concepts to understand
-
-End your response when the explanation is complete. Do not ask questions or engage in conversation."""
+                enhanced_prompt = f"""Explain briefly: {prompt}"""
             else:
-                enhanced_prompt =
-
-{prompt}
-
-End your response when complete. Do not ask questions or engage in conversation."""
+                enhanced_prompt = prompt
 
-            print(f"Enhanced prompt: {enhanced_prompt}")
+            print(f"Enhanced prompt: {enhanced_prompt}")
 
-            # Tokenize input
+            # Tokenize input with shorter max length
             inputs = self.tokenizer(
                 enhanced_prompt,
                 return_tensors="pt",
                 truncation=True,
-                max_length=
+                max_length=256,  # Reduced for memory
                 padding=True
-            ).to("cpu")
+            ).to("cpu")
 
-            # Generate with
+            # Generate with minimal parameters
             start_time = time.time()
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
-                    max_length=min(max_length,
-                    min_length=
+                    max_length=min(max_length, 256),  # Strict limit
+                    min_length=10,  # Reduced minimum
                     temperature=min(0.5, temperature),
                     top_p=min(0.85, top_p),
                     do_sample=True,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
-                    repetition_penalty=1.
-                    no_repeat_ngram_size=
+                    repetition_penalty=1.2,
+                    no_repeat_ngram_size=3,
                     num_return_sequences=1,
                     early_stopping=True,
-                    num_beams=
+                    num_beams=1,  # Greedy decoding to save memory
                     length_penalty=0.6
                 )
 
             # Decode response
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            print(f"Raw response: {response}")  # Debug logging
 
             # Clean up the response
             if response.startswith(enhanced_prompt):
                 response = response[len(enhanced_prompt):].strip()
 
-
-
-            # Remove common closure patterns and conversation starters
-            closures = [
-                "Best regards,",
-                "Sincerely,",
-                "Thanks,",
-                "Thank you,",
-                "Regards,",
-                "Assistant:",
-                "Human:",
-                "[Your Name]",
-                "[Student]",
-                "Let me know if you need any clarification",
-                "I hope this helps",
-                "Feel free to ask",
-                "Can you provide",
-                "Would you like",
-                "Do you want",
-                "Let me know",
-                "Please let me know",
-                "Is there anything else",
-                "Do you have any questions",
-                "Sure!",
-                "Here are some examples:"
-            ]
-
-            # First remove conversation starters from the end
-            for closure in closures:
-                if response.lower().endswith(closure.lower()):
-                    response = response[:-(len(closure))].strip()
-
-            # Then remove any remaining conversation patterns
-            conversation_patterns = [
-                r"\?\s*$",  # Questions at the end
-                r"Sure!.*$",  # Responses starting with Sure!
-                r"Here are.*examples:?\s*$",  # Incomplete example lists
-                r"Can you.*\?\s*$",  # Questions starting with Can you
-                r"Would you.*\?\s*$",  # Questions starting with Would you
-                r"Do you.*\?\s*$",  # Questions starting with Do you
-                r"Let me know.*$",  # Let me know phrases
-                r"I hope.*$",  # I hope phrases
-                r"Feel free.*$"  # Feel free phrases
-            ]
-
-            import re
-            for pattern in conversation_patterns:
-                response = re.sub(pattern, "", response).strip()
-
-            print(f"After conversation removal: {response}")  # Debug logging
+            # Basic cleanup only
+            response = response.replace("Human:", "").replace("Assistant:", "")
 
             # Ensure code examples are properly formatted
             if "```python" not in response and "def " in response:
                 response = "```python\n" + response + "\n```"
 
-            #
-            if
-
-                response.strip().endswith("?") or
-                "can you" in response.lower() or
-                "let me know" in response.lower()):
-                print("Response validation failed - using fallback")  # Debug logging
-
-                if "machine learning" in prompt.lower():
-                    fallback_response = """Machine learning is a branch of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed. Think of it like teaching a child:
-
-1. Simple Explanation:
-   - Instead of giving strict rules, we show the computer many examples
-   - The computer finds patterns in these examples
-   - It uses these patterns to make decisions about new situations
-
-2. Real-World Applications:
-   - Email Spam Detection: Learning to identify unwanted emails based on past examples
-   - Netflix Recommendations: Suggesting movies based on what you've watched
-   - Face Recognition: Unlocking your phone by learning your facial features
-   - Virtual Assistants: Siri and Alexa understanding and responding to voice commands
-   - Medical Diagnosis: Helping doctors identify diseases in medical images
-   - Fraud Detection: Banks identifying suspicious transactions
-
-3. Key Benefits:
-   - Automation of complex tasks
-   - More accurate predictions over time
-   - Ability to handle large amounts of data
-   - Continuous improvement through learning
-
-Machine learning is transforming industries by automating tasks that once required human intelligence, making processes more efficient and enabling new possibilities in technology."""
-                elif "function" in prompt.lower():
+            # Simple validation
+            if len(response.strip()) < 10:
+                if "function" in prompt.lower():
                     fallback_response = """```python
 def add_numbers(a, b):
-    '''
-    Add two numbers and return the result
-    Args:
-        a: first number
-        b: second number
-    Returns:
-        sum of a and b
-    '''
     return a + b
-
-# Example usage
-num1 = 5
-num2 = 3
-result = add_numbers(num1, num2)
-print(f"The sum of {num1} and {num2} is: {result}")  # Output: The sum of 5 and 3 is: 8
 ```"""
                 else:
-                    fallback_response = "I apologize, but I couldn't generate a
+                    fallback_response = "I apologize, but I couldn't generate a response. Please try with a simpler prompt."
 
                 response = fallback_response
 
+            # Clear memory after generation
+            gc.collect()
+
             generation_time = time.time() - start_time
             return response, generation_time
         except Exception as e:
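The prompt-echo cleanup above compares decoded strings, which breaks whenever decoding does not round-trip the prompt exactly. A common alternative (an assumption here, not what the commit does) is to slice off the prompt tokens before decoding; `inputs`, `outputs`, and `self.tokenizer` are the names from the method above. Note also that `max_length` in `generate()` counts prompt plus new tokens, so `max_new_tokens` is often the clearer budget:

```python
# Decode only the newly generated tokens, skipping the prompt echo.
prompt_len = inputs["input_ids"].shape[1]
response = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
```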
@@ -254,7 +160,7 @@ print(f"The sum of {num1} and {num2} is: {result}")  # Output: The sum of 5 and
 # Initialize model wrapper
 model_wrapper = ModelWrapper()
 
-def generate_text(prompt, max_length=384, temperature=0.7, top_p=0.9):  # Reduce
+def generate_text(prompt, max_length=256, temperature=0.5, top_p=0.85):
     """Gradio interface function"""
     try:
         if not prompt.strip():
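A quick smoke test of the new handler outside Gradio (illustrative values; the first call triggers the full model load):

```python
if __name__ == "__main__":
    print(generate_text("Write a Python function to add two numbers", 192, 0.4, 0.8))
```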
@@ -269,91 +175,74 @@ def generate_text(prompt, max_length=384, temperature=0.7, top_p=0.9):  # Reduce
         return f"Generated in {gen_time:.2f} seconds:\n\n{response}"
     except Exception as e:
         print(f"Error in generate_text: {str(e)}")
-        return f"Error generating response: {str(e)}\nPlease try again with a
+        return f"Error generating response: {str(e)}\nPlease try again with a shorter prompt."
 
-# Create
+# Create a very lightweight Gradio interface
 demo = gr.Interface(
     fn=generate_text,
     inputs=[
         gr.Textbox(
             label="Enter your prompt",
             placeholder="Type your prompt here...",
-            lines=
+            lines=3
         ),
         gr.Slider(
             minimum=64,
-            maximum=
-            value=
-            step=
+            maximum=256,
+            value=192,
+            step=32,
             label="Maximum Length",
-            info="
+            info="Keep this low for CPU"
         ),
         gr.Slider(
             minimum=0.1,
-            maximum=
-            value=0.
+            maximum=0.7,
+            value=0.4,
             step=0.1,
             label="Temperature",
-            info="
+            info="Lower is better for CPU"
         ),
         gr.Slider(
-            minimum=0.
-            maximum=
-            value=0.
+            minimum=0.5,
+            maximum=0.9,
+            value=0.8,
             step=0.1,
             label="Top P",
-            info="Controls diversity
+            info="Controls diversity"
         ),
     ],
-    outputs=gr.Textbox(label="Generated Response", lines=
-    title="Phi-2 QLoRA
-    description="""This is a
-    The model has been trained to provide helpful responses for various tasks including coding, writing, and general assistance.
-
-    Example tasks:
-    - Writing Python functions and explaining code
-    - Explaining technical concepts in simple terms
-    - Drafting professional emails and documents
+    outputs=gr.Textbox(label="Generated Response", lines=6),
+    title="Phi-2 QLoRA Assistant (CPU-Optimized)",
+    description="""This is a lightweight CPU version of the fine-tuned Phi-2 model.
 
     Tips:
-    -
-    -
-    -
+    - Keep prompts short and specific
+    - Use lower maximum length (128-192) for faster responses
+    - Use lower temperature (0.3-0.5) for more reliable responses
     """,
     examples=[
         [
-            "Write a Python function to calculate
-
-            0.
-            0.
-        ],
-        [
-            "Explain what machine learning is in simple terms and provide some real-world applications",
-            384,
-            0.5,
-            0.85
-        ],
-        [
-            "Write a professional email to schedule a team meeting for next week to discuss project progress",
-            384,
-            0.5,
-            0.85
+            "Write a Python function to calculate factorial",
+            192,
+            0.4,
+            0.8
         ],
         [
-            "
-
-            0.
-            0.
+            "Explain machine learning simply",
+            192,
+            0.4,
+            0.8
         ],
         [
-            "
-
-            0.
-            0.
+            "Write a short email to schedule a meeting",
+            192,
+            0.4,
+            0.8
         ]
     ],
     cache_examples=False
 )
 
 if __name__ == "__main__":
+    demo.queue(concurrency_count=1)  # Limit concurrency
     demo.launch()
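A version note on the new queueing line: `queue(concurrency_count=...)` is the Gradio 3.x signature; Gradio 4.x renamed the keyword to `default_concurrency_limit`. A compatibility shim, assuming nothing beyond those two signatures:

```python
try:
    demo.queue(concurrency_count=1)          # Gradio 3.x
except TypeError:
    demo.queue(default_concurrency_limit=1)  # Gradio 4.x
demo.launch()
```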