aeb56 committed
Commit 29f5263 · 1 Parent(s): 2f60fd7

Add Evaluation tab with ARC-Challenge, TruthfulQA, and Winogrande benchmarks

Files changed (2)
  1. app.py +182 -35
  2. requirements.txt +3 -0
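
The new Evaluation tab shells out to EleutherAI's lm-evaluation-harness CLI (`lm_eval`) rather than reusing the model already loaded in the app process, so the benchmark subprocess loads the checkpoint a second time and needs its own VRAM headroom. For reference, the run it launches is equivalent to this standalone sketch; the flags mirror the `cmd` list built in `run_evaluation`, and the output path here is illustrative:

```python
# Headless equivalent of what the new Evaluation tab runs; assumes
# lm-eval>=0.4.0 is installed (see the requirements.txt change below).
import subprocess

MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"

subprocess.run(
    [
        "lm_eval",
        "--model", "hf",
        "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16",
        "--tasks", "arc_challenge,truthfulqa_mc2,winogrande",
        "--batch_size", "auto:4",
        "--output_path", "/tmp/eval_results_manual",  # illustrative path
        "--log_samples",
    ],
    check=True,  # raise CalledProcessError if lm_eval exits non-zero
)
```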
app.py CHANGED
@@ -2,6 +2,9 @@ import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import os
+import subprocess
+import json
+from datetime import datetime
 
 # Set environment variable for flash-linear-attention
 os.environ["FLA_USE_TRITON"] = "1"
@@ -32,11 +35,11 @@ class ChatBot:
         self.model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             torch_dtype=torch.bfloat16,
-            device_map="balanced",  # Distribute evenly
+            device_map="balanced",
             max_memory=max_memory,
             trust_remote_code=True,
             low_cpu_mem_usage=True,
-            attn_implementation="eager",  # Use eager attention instead of flash
+            attn_implementation="eager",
         )
 
         self.model.eval()
@@ -62,7 +65,7 @@ class ChatBot:
             else:
                 device_info = ""
 
-            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now start chatting below."
+            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use Chat or Evaluation tabs."
 
         except Exception as e:
             self.loaded = False
@@ -70,7 +73,7 @@ class ChatBot:
 
     def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        if not self.loaded:
-            return "❌ Please load the model first by clicking the 'Load Model' button."
+            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
 
        try:
            # Build prompt from history
@@ -92,7 +95,7 @@
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
-            # Generate with explicit attention settings
+            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
@@ -101,7 +104,7 @@
                    top_p=top_p,
                    do_sample=temperature > 0,
                    pad_token_id=self.tokenizer.eos_token_id,
-                    use_cache=True,  # Enable KV caching
+                    use_cache=True,
                )
 
            # Decode
@@ -115,17 +118,112 @@
 
        except Exception as e:
            return f"❌ Error: {str(e)}"
+
+    def run_evaluation(self, tasks_to_run):
+        """Run lm_eval on selected tasks"""
+        if not self.loaded:
+            yield "❌ Please load the model first!"
+            return
+
+        try:
+            # Map friendly names to lm_eval task names
+            task_map = {
+                "ARC-Challenge": "arc_challenge",
+                "TruthfulQA": "truthfulqa_mc2",
+                "Winogrande": "winogrande"
+            }
+
+            selected_tasks = [task_map[t] for t in tasks_to_run]
+            task_string = ",".join(selected_tasks)
+
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_dir = f"/tmp/eval_results_{timestamp}"
+
+            yield f"🔄 **Starting evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\nThis will take 30-60 minutes total.\n\n"
+
+            # Run lm_eval
+            cmd = [
+                "lm_eval",
+                "--model", "hf",
+                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16",
+                "--tasks", task_string,
+                "--batch_size", "auto:4",
+                "--output_path", output_dir,
+                "--log_samples"
+            ]
+
+            yield f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\nProgress will update below...\n\n"
+
+            # Run evaluation
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+                bufsize=1
+            )
+
+            output_lines = []
+            for line in process.stdout:
+                output_lines.append(line)
+                # Show last 20 lines
+                recent = ''.join(output_lines[-20:])
+                yield f"🔄 **Running evaluation...**\n\n```\n{recent}\n```"
+
+            process.wait()
+
+            if process.returncode != 0:
+                yield f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n\nLogs:\n```\n{''.join(output_lines[-50:])}\n```"
+                return
+
+            # Read results
+            results_file = os.path.join(output_dir, "results.json")
+            if os.path.exists(results_file):
+                with open(results_file, 'r') as f:
+                    results = json.load(f)
+
+                # Format results
+                result_text = "✅ **Evaluation Complete!**\n\n"
+                result_text += f"**Timestamp:** {timestamp}\n\n"
+                result_text += "## 📊 Results:\n\n"
+
+                for task in selected_tasks:
+                    if task in results['results']:
+                        task_results = results['results'][task]
+                        result_text += f"### {task}\n"
+                        for metric, value in task_results.items():
+                            if isinstance(value, float):
+                                result_text += f"- **{metric}:** {value:.4f}\n"
+                            else:
+                                result_text += f"- **{metric}:** {value}\n"
+                        result_text += "\n"
+
+                # Add summary if available
+                if 'summary' in results:
+                    result_text += "## 📈 Summary:\n\n"
+                    for metric, value in results['summary'].items():
+                        if isinstance(value, float):
+                            result_text += f"- **{metric}:** {value:.4f}\n"
+                        else:
+                            result_text += f"- **{metric}:** {value}\n"
+
+                result_text += f"\n\n**Full results saved to:** `{output_dir}`"
+
+                yield result_text
+            else:
+                yield f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
+
+        except Exception as e:
+            yield f"❌ **Evaluation error:**\n\n{str(e)}"
 
 # Initialize
 bot = ChatBot()
 
-# UI
+# UI with Tabs
 with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
     gr.Markdown("""
     # 🚀 Kimi Linear 48B A3B - Fine-tuned
 
-    Chat interface for the fine-tuned Kimi model.
-
     **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
     """)
 
@@ -136,35 +234,87 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
     total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
     gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")
 
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("### 🎛️ Controls")
-
+    with gr.Tabs():
+        # Tab 1: Controls (always visible)
+        with gr.Tab("🎛️ Controls"):
+            gr.Markdown("### Load Model First")
             load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
             status = gr.Markdown("**Status:** Model not loaded")
 
-            gr.Markdown("---")
-            gr.Markdown("### ⚙️ Settings")
-
-            system_prompt = gr.Textbox(
-                label="System Prompt",
-                placeholder="You are a helpful assistant...",
-                lines=2
-            )
-
-            max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
-            temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
-            top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
+            gr.Markdown("""
+            ### ℹ️ Instructions
+            1. **Click "Load Model"** - Takes 5-10 minutes
+            2. **Use Chat tab** - For conversations
+            3. **Use Evaluation tab** - To run benchmarks
+            """)
 
-        with gr.Column(scale=2):
-            gr.Markdown("### 💬 Chat")
-            chatbot = gr.Chatbot(height=500, show_copy_button=True)
+        # Tab 2: Chat
+        with gr.Tab("💬 Chat"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### ⚙️ Settings")
+
+                    system_prompt = gr.Textbox(
+                        label="System Prompt",
+                        placeholder="You are a helpful assistant...",
+                        lines=2
+                    )
+
+                    max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
+                    temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
+                    top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
+
+                with gr.Column(scale=2):
+                    chatbot = gr.Chatbot(height=500, show_copy_button=True)
+
+                    with gr.Row():
+                        msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
+                        send = gr.Button("Send", variant="primary", scale=1)
+
+                    clear = gr.Button("Clear Chat")
+
+        # Tab 3: Evaluation
+        with gr.Tab("📊 Evaluation"):
+            gr.Markdown("""
+            ### Run LM Evaluation Harness
+
+            Select benchmarks to evaluate your fine-tuned model. **Estimated time: 30-60 minutes total.**
+            """)
 
             with gr.Row():
-                msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
-                send = gr.Button("Send", variant="primary", scale=1)
+                with gr.Column(scale=1):
+                    gr.Markdown("### Select Benchmarks")
+
+                    tasks = gr.CheckboxGroup(
+                        choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
+                        value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
+                        label="Tasks to Run",
+                        info="Select one or more tasks"
+                    )
+
+                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")
+
+                    gr.Markdown("""
+                    ### ⏱️ Estimated Time:
+                    - **ARC-Challenge:** 15-30 min
+                    - **TruthfulQA:** 10-20 min
+                    - **Winogrande:** 15-30 min
+
+                    **Total:** ~40-80 minutes for all 3
+                    """)
+
+                with gr.Column(scale=2):
+                    eval_results = gr.Markdown("Results will appear here after evaluation completes.")
 
-            clear = gr.Button("Clear")
+            gr.Markdown("""
+            ---
+            **Note:** Evaluation requires the model to be loaded first. Results will be saved to `/tmp/eval_results_[timestamp]/`.
+            """)
+
+    gr.Markdown("""
+    ---
+    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
+    """)
 
     # Events
     load_btn.click(bot.load_model, outputs=status)
@@ -178,10 +328,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
     send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
     clear.click(lambda: None, None, chatbot)
 
-    gr.Markdown("""
-    ---
-    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
-    """)
+    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)
 
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
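
A note on the event wiring above: `load_btn.click(bot.load_model, outputs=status)` and `eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)` both bind generator methods. Gradio treats a generator handler as a stream and re-renders the output component on every `yield`, which is what makes the load and evaluation progress messages update live. A minimal self-contained sketch of that pattern (names here are illustrative, not from the commit):

```python
# Minimal sketch of the generator-streaming pattern used by
# load_model and run_evaluation: each yield replaces the output text.
import time

import gradio as gr

def slow_job(steps):
    for i in range(int(steps)):
        time.sleep(1)  # stand-in for real work (model load, lm_eval run, ...)
        yield f"🔄 Step {i + 1}/{int(steps)} done"
    yield "✅ All steps finished"

with gr.Blocks() as demo:
    n = gr.Slider(1, 10, 3, step=1, label="Steps")
    out = gr.Markdown("Idle")
    gr.Button("Run").click(slow_job, inputs=n, outputs=out)

if __name__ == "__main__":
    demo.launch()
```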
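One caveat worth flagging in `run_evaluation`: it expects `results.json` directly under `--output_path`, but some lm-eval 0.4.x releases write a per-model subdirectory with a timestamped filename instead, which would route the run into the "results file not found" branch even after a successful evaluation. A defensive lookup, sketched under that assumption (the directory name is illustrative):

```python
# Hedged sketch: find the newest results JSON anywhere under the output
# directory, covering both DIR/results.json and a nested
# DIR/<model>/results_<timestamp>.json layout.
import glob
import json
import os

def find_results(output_dir):
    candidates = sorted(
        glob.glob(os.path.join(output_dir, "**", "*.json"), recursive=True),
        key=os.path.getmtime,
    )
    return candidates[-1] if candidates else None  # newest JSON, if any

path = find_results("/tmp/eval_results_manual")  # illustrative path
if path:
    with open(path) as f:
        print(sorted(json.load(f)["results"].keys()))
```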
requirements.txt CHANGED
@@ -10,6 +10,9 @@ triton>=3.0.0
 # Flash Linear Attention (required by Kimi model)
 git+https://github.com/sustcsonglin/flash-linear-attention.git@main
 
+# Evaluation
+lm-eval>=0.4.0
+
 # UI
 gradio==4.19.2
 
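
After installing the updated requirements, a quick sanity check that the new dependency imports and that the `lm_eval` console script invoked by `run_evaluation` is on `PATH` (a hedged sketch; the `__version__` attribute may not exist in every release, hence the fallback):

```python
# Verify the new lm-eval dependency from requirements.txt.
import shutil

import lm_eval  # provided by lm-eval>=0.4.0

print("lm-eval version:", getattr(lm_eval, "__version__", "unknown"))
print("lm_eval CLI found at:", shutil.which("lm_eval") or "NOT on PATH")
```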