Spaces:

navidfalah
/

3AI

Sleeping

App Files Community

navidfalah committed on Jul 4, 2025

Commit

e12356b

verified ·

1 Parent(s): 5f925e7

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -134

app.py CHANGED Viewed

@@ -98,181 +98,137 @@ except Exception as e:
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
-print(f"Loading your fine-tuned Mistral model from {model_path}...")
 try:
-    # Load your fine-tuned model weights
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
         torch_dtype=torch.float16,
         device_map="auto",
-        trust_remote_code=True,
-        low_cpu_mem_usage=True,
-        local_files_only=True
     )
-    print("Fine-tuned Mistral model loaded successfully!")
 except Exception as e:
-    print(f"Error loading fine-tuned model from {model_path}: {e}")
-    print("Trying without local_files_only...")
     try:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
             torch_dtype=torch.float16,
             device_map="auto",
-            trust_remote_code=True,
             low_cpu_mem_usage=True
         )
-        print("Fine-tuned Mistral model loaded successfully!")
     except Exception as e2:
-        print(f"Cannot load fine-tuned model: {e2}")
-        print("Exiting - cannot proceed without your fine-tuned model")
         exit(1)
 def chat_function(message):
     if not message or not message.strip():
-        return "Please enter a message to get started!"
-    # Limit input length
-    if len(message) > 300:
-        return "Message too long! Please keep it under 300 characters."
     try:
         # Use flexible prompt format based on tokenizer type
         if hasattr(tokenizer, 'chat_template') or 'mistral' in tokenizer.name_or_path.lower():
             # Use Mistral format if it's actually Mistral
-            prompt = f"<s>[INST] {message.strip()} [/INST]"
         else:
             # Use simple format for other tokenizers
-            prompt = f"Human: {message.strip()}\nAssistant:"
         # Tokenize input
-        try:
-            inputs = tokenizer(
-                prompt,
-                return_tensors='pt',
-                truncation=True,
-                max_length=512,
-                padding=True
-            )
-            input_ids = inputs['input_ids']
-            attention_mask = inputs.get('attention_mask', None)
-        except Exception as e:
-            print(f"Tokenization error: {e}")
-            return f"Error processing your message: {str(e)}"
-        # Validate input
-        if input_ids.shape[-1] == 0:
-            return "Error: Empty input after encoding"
-        print(f"Input shape: {input_ids.shape}")
         # Move to model device
-        try:
-            device = next(model.parameters()).device
-            input_ids = input_ids.to(device)
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(device)
-        except Exception as e:
-            print(f"Device move error: {e}")
         # Generate response
-        try:
-            with torch.no_grad():
-                # Clear cache to prevent memory issues
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                # Conservative generation parameters
-                generation_kwargs = {
-                    'input_ids': input_ids,
-                    'max_new_tokens': 150,
-                    'temperature': 0.7,
-                    'do_sample': True,
-                    'pad_token_id': tokenizer.pad_token_id,
-                    'eos_token_id': tokenizer.eos_token_id,
-                    'num_return_sequences': 1,
-                    'repetition_penalty': 1.1,
-                    'top_p': 0.9,
-                    'use_cache': True,
-                    'num_beams': 1,
-                }
-                # Add attention mask if available
-                if attention_mask is not None:
-                    generation_kwargs['attention_mask'] = attention_mask
-                print(f"Generating with input_ids shape: {input_ids.shape}")
-                outputs = model.generate(**generation_kwargs)
-                print(f"Generated output shape: {outputs.shape}")
-        except Exception as e:
-            print(f"Generation error: {e}")
-            # Try with minimal settings
-            try:
-                print("Trying with minimal settings...")
-                outputs = model.generate(
-                    input_ids,
-                    max_new_tokens=80,
-                    do_sample=False,  # Greedy decoding
-                    pad_token_id=tokenizer.pad_token_id,
-                    eos_token_id=tokenizer.eos_token_id,
-                )
-                print(f"Minimal generation output shape: {outputs.shape}")
-            except Exception as e2:
-                print(f"Minimal generation also failed: {e2}")
-                return f"Error generating response: {str(e)}"
-        # Decode response
-        try:
-            # Extract only the new tokens (response part)
-            if outputs.shape[1] > input_ids.shape[1]:
-                response_ids = outputs[0][input_ids.shape[1]:]
-                response = tokenizer.decode(response_ids, skip_special_tokens=True)
-            else:
-                # Fallback: decode full output and remove prompt
-                full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-                response = full_response.replace(prompt.replace("<s>", "").replace("</s>", ""), "").strip()
-        except Exception as e:
-            print(f"Decoding error: {e}")
-            try:
-                # Last resort: decode full output
-                full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-                response = full_response
-            except:
-                return f"Error decoding response: {str(e)}"
-        # Clean up the response based on tokenizer type
         response = response.strip()
-        # Remove prompt artifacts based on what we used
-        if "[/INST]" in response:
-            response = response.split("[/INST]")[-1].strip()
-        if "[INST]" in response:
-            response = response.split("[INST]")[0].strip()
-        if "Assistant:" in response:
-            response = response.split("Assistant:")[-1].strip()
-        if "Human:" in response:
-            response = response.split("Human:")[0].strip()
-        # Remove input message if it appears in response
-        if message.strip() in response:
-            response = response.replace(message.strip(), "").strip()
-        # Limit response length
-        if len(response) > 1000:
-            response = response[:1000] + "..."
-        # Ensure we have a meaningful response
-        if len(response.strip()) < 5:
-            response = "I understand your message. How can I help you with that?"
         return response
     except Exception as e:
-        print(f"Unexpected error: {e}")
-        return f"Sorry, I encountered an unexpected error: {str(e)}"
 def clear_chat():
     return ""

 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+print(f"Loading your model from {original_mistral_model}...")
 try:
+    # Load your model from Hugging Face
+    base_model = AutoModelForCausalLM.from_pretrained(
+        original_mistral_model,
         torch_dtype=torch.float16,
         device_map="auto",
+        low_cpu_mem_usage=True
     )
+    print("Your model loaded successfully!")
+    # Check if PEFT is available and try to load local adapter
+    if PeftModel is not None and PeftConfig is not None:
+        try:
+            print(f"Trying to load local LoRA adapter from {adapter_path}...")
+            model = PeftModel.from_pretrained(
+                base_model,
+                adapter_path,
+                torch_dtype=torch.float16
+            )
+            print("Local LoRA adapter loaded successfully!")
+        except Exception as adapter_error:
+            print(f"Could not load local adapter: {adapter_error}")
+            print("Using your base model without additional adapter")
+            model = base_model
+    else:
+        print("PEFT not available - using your base model")
+        model = base_model
 except Exception as e:
+    print(f"Error loading your model: {e}")
+    print("Trying to load original Mistral as fallback...")
     try:
+        # Fallback to original Mistral
+        base_model = AutoModelForCausalLM.from_pretrained(
+            "mistralai/Mistral-7B-Instruct-v0.1",
             torch_dtype=torch.float16,
             device_map="auto",
             low_cpu_mem_usage=True
         )
+        print("Fallback Mistral model loaded!")
+        model = base_model
     except Exception as e2:
+        print(f"Cannot load any model: {e2}")
+        print("Exiting - cannot proceed without model")
         exit(1)
 def chat_function(message):
     if not message or not message.strip():
+        return "Please enter a message."
+    # Clean and limit input
+    message = message.strip()
+    if len(message) > 500:
+        return "Message too long! Please keep it under 500 characters."
     try:
         # Use flexible prompt format based on tokenizer type
         if hasattr(tokenizer, 'chat_template') or 'mistral' in tokenizer.name_or_path.lower():
             # Use Mistral format if it's actually Mistral
+            prompt = f"<s>[INST] {message} [/INST]"
         else:
             # Use simple format for other tokenizers
+            prompt = f"User: {message}\nAssistant:"
         # Tokenize input
+        inputs = tokenizer(
+            prompt,
+            return_tensors='pt',
+            truncation=True,
+            max_length=400,
+            padding=True
+        )
+        input_ids = inputs['input_ids']
+        attention_mask = inputs.get('attention_mask', None)
         # Move to model device
+        device = next(model.parameters()).device
+        input_ids = input_ids.to(device)
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(device)
         # Generate response
+        with torch.no_grad():
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            outputs = model.generate(
+                input_ids,
+                max_new_tokens=200,
+                temperature=0.7,
+                do_sample=True,
+                top_p=0.9,
+                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                attention_mask=attention_mask,
+                repetition_penalty=1.1
+            )
+        # Extract and clean response
+        if outputs.shape[1] > input_ids.shape[1]:
+            response_ids = outputs[0][input_ids.shape[1]:]
+            response = tokenizer.decode(response_ids, skip_special_tokens=True)
+        else:
+            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            response = response.replace(prompt, "").strip()
+        # Clean up response
         response = response.strip()
+        # Remove prompt artifacts
+        for artifact in ["[/INST]", "[INST]", "Assistant:", "User:", "Human:"]:
+            if artifact in response:
+                response = response.split(artifact)[-1].strip()
+        # Remove input if it appears in response
+        if message.lower() in response.lower():
+            response = response.replace(message, "").strip()
+        # Ensure reasonable length
+        if len(response) > 800:
+            response = response[:800] + "..."
+        # Fallback if empty
+        if len(response.strip()) < 3:
+            response = "I understand. How can I help you?"
         return response
     except Exception as e:
+        return f"Error: {str(e)}"
 def clear_chat():
     return ""