llaa33219 committed on
Commit 2742dc4 · verified · 1 Parent(s): ff1e6ca

Update app.py

Files changed (1)
1. app.py +153 -112
app.py CHANGED
@@ -2,20 +2,44 @@ import spaces
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+import traceback
 
 # Try to import peft, if not available use base model only
 try:
     from peft import PeftModel
     PEFT_AVAILABLE = True
 except ImportError:
-    print("Warning: peft not available, will use base model only")
+    print("Warning: peft library not found. LoRA adapters will not be available.")
     PEFT_AVAILABLE = False
 
-# === List your models here ===
-BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"
-ADAPTER_MODELS = {
-    "Qwen-Finetuned": "llaa33219/Entrystory-Qwen2.5-3b-Instruct",
-    # Other adapters can also be added
+# === Define all your available models here ===
+# This new dictionary allows you to define both base models and LoRA adapters.
+# 'type': can be 'base' for a standalone model or 'lora' for an adapter.
+# 'id': the Hugging Face model/adapter ID.
+# 'base_model_id': for LoRA adapters, specifies which base model to use.
+
+AVAILABLE_MODELS = {
+    "BokantLM0.1-0.5b": {
+        "type": "base",
+        "id": "llaa33219/BokantLM0.1-0.5b",
+    },
+    "Entrystory-Qwen2.5-3b-Instruct": {
+        "type": "lora",
+        "id": "llaa33219/Entrystory-Qwen2.5-3b-Instruct",
+        "base_model_id": "Qwen/Qwen2.5-3B-Instruct"  # This LoRA is based on the Qwen model
+    },
+    # --- You can add more models here ---
+    # Example of another base model:
+    # "Another Base Model (e.g., Ko-LLaMA)": {
+    #     "type": "base",
+    #     "id": "beomi/KoAlpaca-Polyglot-5.8B"
+    # },
+    # Example of another LoRA adapter:
+    # "Another LoRA Finetune": {
+    #     "type": "lora",
+    #     "id": "path/to/your/other-lora-adapter",
+    #     "base_model_id": "Qwen/Qwen2.5-3B-Instruct"
+    # },
 }
 
 # Global variables for model caching
@@ -24,94 +48,98 @@ current_tokenizer = None
 current_model = None
 
 def load_model(name):
+    """
+    Loads a model based on the selection. It can load a base model directly
+    or load a base model and then apply a LoRA adapter to it.
+    """
     global current_model_name, current_tokenizer, current_model
+
+    if current_model_name == name:
+        # Model is already loaded, no need to do anything
+        return current_tokenizer, current_model
+
+    print(f"Switching to model: {name}")
 
-    if current_model_name != name:
-        print(f"Loading model: {name}")
-
-        # Clear previous model from memory
-        if current_model is not None:
-            del current_model
-            torch.cuda.empty_cache()
-
-        try:
-            if PEFT_AVAILABLE:
-                # LoRA adapter loading
-                adapter_model_id = ADAPTER_MODELS[name]
-
-                # Load tokenizer from adapter (has the right special tokens)
-                current_tokenizer = AutoTokenizer.from_pretrained(
-                    adapter_model_id,
-                    trust_remote_code=True
-                )
-
-                # Add padding token if not present
-                if current_tokenizer.pad_token is None:
-                    current_tokenizer.pad_token = current_tokenizer.eos_token
-
-                # Load base model
-                print(f"Loading base model: {BASE_MODEL}")
-                base_model = AutoModelForCausalLM.from_pretrained(
-                    BASE_MODEL,
-                    torch_dtype=torch.float16,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=True
-                )
-
-                # Resize token embeddings to match the adapter's vocabulary size
-                print(f"Original vocab size: {base_model.config.vocab_size}")
-                print(f"Tokenizer vocab size: {len(current_tokenizer)}")
-
-                if base_model.config.vocab_size != len(current_tokenizer):
-                    print(f"Resizing token embeddings from {base_model.config.vocab_size} to {len(current_tokenizer)}")
-                    base_model.resize_token_embeddings(len(current_tokenizer))
-
-                # Load LoRA adapter with error handling
-                print(f"Loading LoRA adapter: {adapter_model_id}")
-                try:
-                    current_model = PeftModel.from_pretrained(
-                        base_model,
-                        adapter_model_id,
-                        torch_dtype=torch.float16
-                    )
-
-                    # Merge adapter with base model for better performance
-                    current_model = current_model.merge_and_unload()
-                    print(f"Successfully merged LoRA adapter")
-
-                except Exception as adapter_error:
-                    print(f"Failed to load LoRA adapter: {adapter_error}")
-                    print("Falling back to base model only")
-                    current_model = base_model
-
-            else:
-                # Fallback to base model only
-                print(f"peft not available, using base model only: {BASE_MODEL}")
-                current_tokenizer = AutoTokenizer.from_pretrained(
-                    BASE_MODEL,
-                    trust_remote_code=True
-                )
-
-                # Add padding token if not present
-                if current_tokenizer.pad_token is None:
-                    current_tokenizer.pad_token = current_tokenizer.eos_token
-
-                current_model = AutoModelForCausalLM.from_pretrained(
-                    BASE_MODEL,
-                    torch_dtype=torch.float16,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=True
-                )
-
-            current_model_name = name
-            print(f"Successfully loaded model: {name}")
-
-        except Exception as e:
-            print(f"Failed to load model {name}: {e}")
-            import traceback
-            traceback.print_exc()
-            raise e
-
+    # Clear previous model from memory
+    if current_model is not None:
+        del current_model
+        del current_tokenizer
+        current_model = None
+        current_tokenizer = None
+        torch.cuda.empty_cache()
+        print("Cleared previous model from memory.")
+
+    try:
+        model_info = AVAILABLE_MODELS[name]
+        model_type = model_info["type"]
+        model_id = model_info["id"]
+
+        # --- Case 1: Load a LoRA adapter model ---
+        if model_type == 'lora' and PEFT_AVAILABLE:
+            base_model_id = model_info["base_model_id"]
+            adapter_id = model_id
+
+            print(f"Loading LoRA model. Base: '{base_model_id}', Adapter: '{adapter_id}'")
+
+            # Load tokenizer from the adapter (it might have special tokens)
+            current_tokenizer = AutoTokenizer.from_pretrained(adapter_id, trust_remote_code=True)
+
+            # Load base model
+            base_model = AutoModelForCausalLM.from_pretrained(
+                base_model_id,
+                torch_dtype=torch.float16,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+
+            # Resize token embeddings if the adapter's vocab differs from the base model's
+            if base_model.config.vocab_size != len(current_tokenizer):
+                print(f"Resizing token embeddings from {base_model.config.vocab_size} to {len(current_tokenizer)}")
+                base_model.resize_token_embeddings(len(current_tokenizer))
+
+            # Load and merge the LoRA adapter
+            print(f"Loading and merging LoRA adapter: {adapter_id}")
+            lora_model = PeftModel.from_pretrained(
+                base_model,
+                adapter_id,
+                torch_dtype=torch.float16
+            )
+            current_model = lora_model.merge_and_unload()
+            print("Successfully merged LoRA adapter.")
+
+        # --- Case 2: Load a base model directly ---
+        else:
+            if model_type == 'lora' and not PEFT_AVAILABLE:
+                print(f"PEFT not available. Cannot load LoRA adapter '{name}'. Falling back to its base model.")
+                # Fall back to the base model if PEFT is missing
+                model_id = model_info.get("base_model_id", list(AVAILABLE_MODELS.values())[0]['id'])
+
+            print(f"Loading base model: {model_id}")
+            current_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+            current_model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+
+        # Common post-processing for any loaded model
+        if current_tokenizer.pad_token is None:
+            current_tokenizer.pad_token = current_tokenizer.eos_token
+            print("Set pad_token to eos_token.")
+
+        current_model_name = name
+        print(f"✅ Successfully loaded model: {name}")
+
+    except Exception as e:
+        print(f"❌ Failed to load model {name}: {e}")
+        traceback.print_exc()
+        # Clean up on failure
+        current_model_name = None
+        current_model = None
+        current_tokenizer = None
+        raise e  # Re-raise the exception to be caught by the chat function
+
     return current_tokenizer, current_model
 
  @spaces.GPU()
@@ -119,18 +147,18 @@ def chat_fn(message, history, selected_model):
     try:
         tokenizer, model = load_model(selected_model)
 
-        # Move model to GPU
+        # Ensure model is on the correct device (GPU)
         if not next(model.parameters()).is_cuda:
             model = model.cuda()
 
-        # Build conversation history
+        # Build conversation history for the chat template
         conversation = []
         for user_msg, bot_msg in history:
             conversation.append({"role": "user", "content": user_msg})
             conversation.append({"role": "assistant", "content": bot_msg})
         conversation.append({"role": "user", "content": message})
 
-        # Apply chat template
+        # Apply the model's specific chat template
         try:
             input_ids = tokenizer.apply_chat_template(
                 conversation=conversation,
@@ -139,13 +167,15 @@ def chat_fn(message, history, selected_model):
                 return_tensors="pt"
             ).cuda()
         except Exception as e:
-            print(f"Chat template error: {e}")
-            # Fallback to simple tokenization
+            print(f"Chat template error: {e}. Falling back to simple encoding.")
             text = f"User: {message}\nAssistant:"
             input_ids = tokenizer.encode(text, return_tensors="pt").cuda()
 
         # Generate response
         with torch.no_grad():
+            # Create attention mask
+            attention_mask = torch.ones_like(input_ids)
+
             output_ids = model.generate(
                 input_ids,
                 max_new_tokens=512,
@@ -154,10 +184,10 @@ def chat_fn(message, history, selected_model):
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
                 use_cache=True,
-                attention_mask=torch.ones_like(input_ids)
+                attention_mask=attention_mask
             )
 
-        # Decode response
+        # Decode the generated tokens into text, skipping the prompt
         response = tokenizer.decode(
             output_ids[0][input_ids.shape[1]:],
             skip_special_tokens=True
@@ -167,56 +197,58 @@ def chat_fn(message, history, selected_model):
 
     except Exception as e:
         print(f"Error in chat_fn: {str(e)}")
-        import traceback
         traceback.print_exc()
         return f"죄송합니다. 오류가 발생했습니다: {str(e)}"
 
 def respond(message, chat_history, selected_model):
     if not message.strip():
+        # If the message is empty, do nothing
        return chat_history, ""
 
-    # Get bot response
+    # Get the bot's response
     bot_message = chat_fn(message, chat_history, selected_model)
 
     # Update chat history
     chat_history.append([message, bot_message])
 
-    return chat_history, ""
+    return chat_history, ""  # Return updated history and clear the input box
 
-# Create Gradio interface
-title = "Multi-Model Chatbot (LoRA Adapter Support)" if PEFT_AVAILABLE else "Multi-Model Chatbot (Base Model Only)"
+# --- Gradio Interface ---
+title = "Multi-Model Chatbot (with LoRA Support)" if PEFT_AVAILABLE else "Multi-Model Chatbot (Base Models Only)"
 with gr.Blocks(title="Multi-Model Chat", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(f"# 🗨️ {title}")
+    gr.Markdown(f"<h1><center>🗨️ {title}</center></h1>")
+    gr.Markdown("<center>Select a model from the dropdown and start chatting. The app will load the model on the first message.</center>")
 
     with gr.Row():
         model_select = gr.Dropdown(
-            choices=list(ADAPTER_MODELS.keys()),
-            value=list(ADAPTER_MODELS.keys())[0],
+            choices=list(AVAILABLE_MODELS.keys()),
+            value=list(AVAILABLE_MODELS.keys())[0],  # Default to the first model in the list
             label="Choose Model",
             interactive=True
         )
 
     chatbot = gr.Chatbot(
-        height=400,
+        height=500,
         label="Chat",
-        show_copy_button=True
+        show_copy_button=True,
+        bubble_full_width=False
     )
 
     with gr.Row():
         msg = gr.Textbox(
             label="Message",
-            placeholder="Type your message here...",
+            placeholder="여기에 메시지를 입력하세요...",
             scale=4
         )
         send_btn = gr.Button("Send", scale=1, variant="primary")
 
     clear_btn = gr.Button("Clear Chat", variant="secondary")
 
-    # Event handlers
+    # --- Event Handlers ---
     def clear_chat():
         return [], ""
 
-    # Send message on button click or enter
+    # Send message on button click or enter key press
     send_btn.click(
         respond,
         inputs=[msg, chatbot, model_select],
@@ -229,12 +261,21 @@ with gr.Blocks(title="Multi-Model Chat", theme=gr.themes.Soft()) as demo:
         outputs=[chatbot, msg]
     )
 
-    # Clear chat
+    # Clear chat button
     clear_btn.click(clear_chat, outputs=[chatbot, msg])
 
 if __name__ == "__main__":
+    # Pre-load the default model to speed up the first interaction
+    try:
+        print("Pre-loading the default model...")
+        default_model_name = list(AVAILABLE_MODELS.keys())[0]
+        load_model(default_model_name)
+        print("✅ Default model pre-loaded successfully.")
+    except Exception as e:
+        print(f"⚠️ Could not pre-load the default model: {e}")
+
     demo.launch(
-        share=False,
+        share=False,  # Set to True to get a public link (on Hugging Face Spaces or Colab)
         server_name="0.0.0.0",
         server_port=7860
     )
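For reviewers who want to sanity-check the new model registry before deploying, a minimal sketch (an illustration only, not part of the commit; it assumes app.py is importable, which builds the Gradio Blocks but neither launches the server nor downloads any weights):

# Sketch: validate AVAILABLE_MODELS entries without loading a model.
from app import AVAILABLE_MODELS

for name, info in AVAILABLE_MODELS.items():
    assert info["type"] in ("base", "lora"), f"{name}: unknown type {info['type']!r}"
    assert "id" in info, f"{name}: missing 'id'"
    if info["type"] == "lora":
        # load_model() reads base_model_id for LoRA entries, so require it here
        assert "base_model_id" in info, f"{name}: LoRA entries need 'base_model_id'"
print(f"{len(AVAILABLE_MODELS)} registry entries look well-formed.")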
+ print(f"Loading LoRA model. Base: '{base_model_id}', Adapter: '{adapter_id}'")
 
83
 
84
+ # Load tokenizer from the adapter (it might have special tokens)
85
+ current_tokenizer = AutoTokenizer.from_pretrained(adapter_id, trust_remote_code=True)
86
+
87
+ # Load base model
88
+ base_model = AutoModelForCausalLM.from_pretrained(
89
+ base_model_id,
90
+ torch_dtype=torch.float16,
91
+ trust_remote_code=True,
92
+ low_cpu_mem_usage=True
93
+ )
94
+
95
+ # Resize token embeddings if the adapter's vocab differs from the base model's
96
+ if base_model.config.vocab_size != len(current_tokenizer):
97
+ print(f"Resizing token embeddings from {base_model.config.vocab_size} to {len(current_tokenizer)}")
98
+ base_model.resize_token_embeddings(len(current_tokenizer))
99
+
100
+ # Load and merge the LoRA adapter
101
+ print(f"Loading and merging LoRA adapter: {adapter_id}")
102
+ lora_model = PeftModel.from_pretrained(
103
+ base_model,
104
+ adapter_id,
105
+ torch_dtype=torch.float16
106
+ )
107
+ current_model = lora_model.merge_and_unload()
108
+ print("Successfully merged LoRA adapter.")
109
+
110
+ # --- Case 2: Load a base model directly ---
111
+ else:
112
+ if model_type == 'lora' and not PEFT_AVAILABLE:
113
+ print(f"PEFT not available. Cannot load LoRA adapter '{name}'. Falling back to its base model.")
114
+ # Fallback to the base model if PEFT is missing
115
+ model_id = model_info.get("base_model_id", list(AVAILABLE_MODELS.values())[0]['id'])
116
+
117
+ print(f"Loading base model: {model_id}")
118
+ current_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
119
+ current_model = AutoModelForCausalLM.from_pretrained(
120
+ model_id,
121
+ torch_dtype=torch.float16,
122
+ trust_remote_code=True,
123
+ low_cpu_mem_usage=True
124
+ )
125
+
126
+ # Common post-processing for any loaded model
127
+ if current_tokenizer.pad_token is None:
128
+ current_tokenizer.pad_token = current_tokenizer.eos_token
129
+ print("Set pad_token to eos_token.")
130
+
131
+ current_model_name = name
132
+ print(f"โœ… Successfully loaded model: {name}")
133
+
134
+ except Exception as e:
135
+ print(f"โŒ Failed to load model {name}: {e}")
136
+ traceback.print_exc()
137
+ # Clean up on failure
138
+ current_model_name = None
139
+ current_model = None
140
+ current_tokenizer = None
141
+ raise e # Re-raise the exception to be caught by the chat function
142
+
143
  return current_tokenizer, current_model
144
 
145
  @spaces.GPU()
 
147
  try:
148
  tokenizer, model = load_model(selected_model)
149
 
150
+ # Ensure model is on the correct device (GPU)
151
  if not next(model.parameters()).is_cuda:
152
  model = model.cuda()
153
 
154
+ # Build conversation history for the chat template
155
  conversation = []
156
  for user_msg, bot_msg in history:
157
  conversation.append({"role": "user", "content": user_msg})
158
  conversation.append({"role": "assistant", "content": bot_msg})
159
  conversation.append({"role": "user", "content": message})
160
 
161
+ # Apply the model's specific chat template
162
  try:
163
  input_ids = tokenizer.apply_chat_template(
164
  conversation=conversation,
 
167
  return_tensors="pt"
168
  ).cuda()
169
  except Exception as e:
170
+ print(f"Chat template error: {e}. Falling back to simple encoding.")
 
171
  text = f"User: {message}\nAssistant:"
172
  input_ids = tokenizer.encode(text, return_tensors="pt").cuda()
173
 
174
  # Generate response
175
  with torch.no_grad():
176
+ # Create attention mask
177
+ attention_mask = torch.ones_like(input_ids)
178
+
179
  output_ids = model.generate(
180
  input_ids,
181
  max_new_tokens=512,
 
184
  pad_token_id=tokenizer.pad_token_id,
185
  eos_token_id=tokenizer.eos_token_id,
186
  use_cache=True,
187
+ attention_mask=attention_mask
188
  )
189
 
190
+ # Decode the generated tokens into text, skipping the prompt
191
  response = tokenizer.decode(
192
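One note on the generate() change: because the app falls back to pad_token = eos_token, transformers cannot tell padding apart from end-of-sequence on its own, so passing an explicit attention_mask (now built once inside the torch.no_grad() block) avoids the usual warning and keeps the behavior well-defined. A standalone sketch of the same pattern, using a tiny public model purely as a stand-in so it runs quickly on CPU:

# Sketch of the generation pattern used in chat_fn; "sshleifer/tiny-gpt2"
# is only a stand-in model for illustration, not one used by this app.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
if tok.pad_token is None:
    tok.pad_token = tok.eos_token  # same fallback as app.py

input_ids = tok.encode("User: Hi\nAssistant:", return_tensors="pt")
with torch.no_grad():
    out = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),  # explicit mask, as in the new code
        max_new_tokens=16,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )
print(tok.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True))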
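Finally, since load_model() now short-circuits when the requested name is already loaded, repeated calls are cheap and switching models frees the old one first. A quick sketch of that cache contract (this one downloads real weights, so it needs the Space's hardware budget rather than a CI machine):

# Sketch: the caching behavior of the new load_model().
from app import AVAILABLE_MODELS, load_model

first = list(AVAILABLE_MODELS.keys())[0]
tok1, model1 = load_model(first)  # loads and caches the default entry
tok2, model2 = load_model(first)  # cache hit: returns the same objects
assert tok1 is tok2 and model1 is model2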