jellewas committed
Commit 129e933 · verified · 1 Parent(s): 0173ebf

Upload hf://spaces/jellewas/mistral7b-chat with huggingface_hub

Files changed (1)
  1. hf://spaces/jellewas/mistral7b-chat +190 -0
hf://spaces/jellewas/mistral7b-chat ADDED
@@ -0,0 +1,190 @@
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Model configuration
TRAINED_MODEL = "AnythingSLM/mistral7b-qlora-output"  # Your trained model repository
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global variables for model and tokenizer
model = None
tokenizer = None
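
# The model and tokenizer are loaded lazily on the first request and cached
# in the globals above, so the Space starts quickly and every chat turn
# after the first reuses the same loaded copy.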

def load_model():
    """Load the trained model directly"""
    global model, tokenizer

    if model is None or tokenizer is None:
        print(f"Loading trained model from {TRAINED_MODEL}...")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
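
        # A brief note on the 4-bit load below (QLoRA-style inference):
        # "nf4" quantizes the base weights to 4-bit normal float, double
        # quantization also compresses the quantization constants, and
        # float16 is used as the compute dtype for the actual matmuls.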
        # Load the trained model with quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            TRAINED_MODEL,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )

        print("Trained model loaded successfully!")

    return model, tokenizer

def generate_response(message, max_length=512, temperature=0.7, top_p=0.9):
    """Generate a response using the fine-tuned model"""
    try:
        model, tokenizer = load_model()

        # Wrap raw messages in Mistral's instruction format. The tokenizer
        # inserts the <s> BOS token itself, so it is left out of the string
        # to avoid a duplicate.
        if not message.startswith("[INST]"):
            prompt = f"[INST] {message} [/INST]"
        else:
            prompt = message

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

        # Generate response; max_new_tokens bounds the generated text only,
        # so a long prompt cannot consume the whole generation budget
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_length),
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, skipping the echoed prompt
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)

        return response.strip()

    except Exception as e:
        return f"Error generating response: {str(e)}"

def create_interface():
    """Create the Gradio interface"""
    with gr.Blocks(title="Mistral 7B Fine-tuned Chat", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# 🤖 Mistral 7B Fine-tuned Chat")
        gr.Markdown("Chat with a fine-tuned Mistral 7B model using QLoRA adapters.")

        with gr.Row():
            with gr.Column(scale=4):
                chatbot = gr.Chatbot(
                    height=400,
                    show_label=False,
                    container=True,
                )
                msg = gr.Textbox(
                    label="Message",
                    placeholder="Type your message here...",
                    lines=2,
                )

            with gr.Column(scale=1):
                with gr.Accordion("Parameters", open=False):
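                    # Sampling controls: temperature rescales the logits
                    # (higher = more varied output); top_p keeps only the
                    # smallest token set whose probabilities sum to top_p.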
                    max_length = gr.Slider(
                        minimum=64,
                        maximum=2048,
                        value=512,
                        step=64,
                        label="Max New Tokens",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.1,
                        label="Top P",
                    )

                clear_btn = gr.Button("Clear Chat")
                submit_btn = gr.Button("Send", variant="primary")
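
        # The wiring below follows Gradio's two-step chat pattern:
        # user_message appends the user turn and clears the textbox right
        # away (queue=False keeps it snappy), then .then() runs bot_response
        # to fill in the model's reply for that turn.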
        def user_message(message, history):
            if not message.strip():
                return "", history

            history = history + [[message, None]]
            return "", history

        def bot_response(history, max_length, temperature, top_p):
            if not history or history[-1][1] is not None:
                return history

            user_msg = history[-1][0]
            bot_msg = generate_response(user_msg, max_length, temperature, top_p)

            history[-1][1] = bot_msg
            return history

        def clear_chat():
            return []

        msg.submit(
            user_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            bot_response,
            [chatbot, max_length, temperature, top_p],
            chatbot
        )

        submit_btn.click(
            user_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            bot_response,
            [chatbot, max_length, temperature, top_p],
            chatbot
        )

        clear_btn.click(clear_chat, outputs=chatbot)

        gr.Markdown("""
### About
This Space demonstrates a fine-tuned Mistral 7B model using QLoRA (4-bit quantization + LoRA adapters).

**Features:**
- 4-bit quantized base model for memory efficiency
- LoRA adapters for task-specific fine-tuning
- Adjustable generation parameters
- Real-time chat interface

**Model:** Mistral 7B Instruct v0.3 base + custom fine-tuning
""")

    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
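
# Note: running this Space assumes a requirements.txt alongside this file
# providing gradio, torch, transformers, accelerate, and bitsandbytes (the
# usual packages for 4-bit loading; exact pins are not part of this commit).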