Spaces:

Joash2024
/

math-llm-demo

Sleeping

App Files Files Community

Joash2024 committed on Dec 3, 2024

Commit

0e7ff76

1 Parent(s): 081a250

Add initial demo files

Browse files

Files changed (6) hide show

DESCRIPTION.md +1 -0
README.md +59 -8
Spacefile +24 -0
app.py +144 -0
monitoring.py +97 -0
requirements.txt +11 -0

DESCRIPTION.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ Interactive demo comparing base (1B) and fine-tuned (1.7B) L

README.md CHANGED Viewed

@@ -1,14 +1,65 @@
 ---
-title: Math Llm Demo
-emoji: 📈
-colorFrom: purple
-colorTo: blue
 sdk: gradio
-sdk_version: 5.7.1
 app_file: app.py
 pinned: false
-license: mit
-short_description: Interactive demo comparing base (1B) and fine-tuned (1.7B) L
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Math Problem Solver Demo
+emoji: 🧮
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.0.0
 app_file: app.py
 pinned: false
 ---
+# Mathematics Problem Solver Demo
+This demo showcases a comparison between base and fine-tuned language models in solving mathematical problems. It features real-time performance monitoring and supports multiple types of math problems.
+## Models Used
+- Base Model: [Llama 3.2 1B](https://huggingface.co/Alexis-Az/Math-Problem-LlaMA-3.2-1B-GGUF)
+- Fine-tuned Model: [SmolLM2 1.7B](https://huggingface.co/Alexis-Az/Math-Problem-LlaMA-3.2-1.7B-GGUF)
+## Features
+- 🔢 Multiple problem types:
+  - Addition operations
+  - Root finding
+  - Derivatives
+  - Custom problems
+- 📊 Real-time performance metrics:
+  - Response times
+  - Success rates
+  - Problem type distribution
+- 🔄 Side-by-side model comparison
+- ⚡ Example problems included
+## How to Use
+1. Select a problem type from the dropdown menu
+2. Enter your math problem in the input field
+3. Click "Solve" to see solutions from both models
+4. Compare the results and view performance metrics
+## Example Problems
+Try these sample problems:
+- Derivative: "Find the derivative of x^2 + 3x"
+- Root Finding: "What is the square root of 144?"
+- Addition: "Calculate 235 + 567"
+## Performance Monitoring
+The interface includes a live dashboard showing:
+- Average response times for each model
+- Success rates comparison
+- Distribution of problem types solved
+- Real-time performance metrics
+## Project Details
+This demo is part of a larger project comparing LLM performance on mathematical problems. The models have been fine-tuned on a custom dataset of mathematical problems to improve their problem-solving capabilities.
+## Credits
+Models provided by [Alexis-Az](https://huggingface.co/Alexis-Az)

Spacefile ADDED Viewed

	@@ -0,0 +1,24 @@

+# Spacefile for math-llm-demo
+configuration:
+  name: math-llm-demo
+  organization: Joash2024
+  hardware:
+    cpu: 2
+    memory: 16
+  system:
+    python_version: "3.10"
+sdk: gradio
+sdk_version: 4.0.0
+python_packages:
+  - "torch>=2.0.0"
+  - "transformers>=4.30.0"
+  - "accelerate>=0.20.0"
+  - "numpy>=1.21.0"
+app_file: app.py
+app_port: 7860
+models:
+  - "Alexis-Az/Math-Problem-LlaMA-3.2-1B-GGUF"
+  - "Alexis-Az/Math-Problem-LlaMA-3.2-1.7B-GGUF"

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import gradio as gr
+from transformers import AutoTokenizer, pipeline
+import torch
+import numpy as np
+from monitoring import PerformanceMonitor, measure_time
+# Model IDs: Hugging Face Hub repositories of the two models that are compared.
+# Each ID is passed as the `model=` argument when a pipeline is created.
+BASE_MODEL_ID = "Alexis-Az/Math-Problem-LlaMA-3.2-1B-GGUF"
+FINETUNED_MODEL_ID = "Alexis-Az/Math-Problem-LlaMA-3.2-1.7B-GGUF"
+# Initialize performance monitor (module-level, shared by every request,
+# created with its default arguments).
+monitor = PerformanceMonitor()
+def format_prompt(problem):
+    """Format the input problem according to the model's expected format"""
+    return f"<|im_start|>user\nCan you help me solve this math problem? {problem}<|im_end|>\n"
+@measure_time
+def get_model_response(problem, model_id):
+    """Get response from a specific model"""
+    try:
+        # Initialize pipeline
+        pipe = pipeline(
+            "text-generation",
+            model=model_id,
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+        # Format prompt and generate response
+        prompt = format_prompt(problem)
+        response = pipe(
+            prompt,
+            max_new_tokens=100,
+            temperature=0.1,
+            top_p=0.95,
+            repetition_penalty=1.15
+        )[0]["generated_text"]
+        # Extract assistant's response
+        assistant_response = response.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0]
+        return assistant_response.strip()
+    except Exception as e:
+        return f"Error: {str(e)}"
+def solve_problem(problem, problem_type):
+    """Solve a math problem using both models"""
+    if not problem:
+        return "Please enter a problem", "Please enter a problem", None
+    # Record problem type
+    monitor.record_problem_type(problem_type)
+    # Add problem type context if provided
+    if problem_type != "Custom":
+        problem = f"{problem_type}: {problem}"
+    # Get responses from both models with timing
+    base_response, base_time = get_model_response(problem, BASE_MODEL_ID)
+    finetuned_response, finetuned_time = get_model_response(problem, FINETUNED_MODEL_ID)
+    # Record response times
+    monitor.record_response_time("base", base_time)
+    monitor.record_response_time("finetuned", finetuned_time)
+    # Record success (basic check - no error message)
+    monitor.record_success("base", not base_response.startswith("Error"))
+    monitor.record_success("finetuned", not finetuned_response.startswith("Error"))
+    # Get updated statistics
+    stats = monitor.get_statistics()
+    # Format statistics for display
+    stats_display = f"""
+### Performance Metrics
+#### Response Times (seconds)
+- Base Model: {stats.get('base_avg_response_time', 0):.2f} avg
+- Fine-tuned Model: {stats.get('finetuned_avg_response_time', 0):.2f} avg
+#### Success Rates
+- Base Model: {stats.get('base_success_rate', 0):.1f}%
+- Fine-tuned Model: {stats.get('finetuned_success_rate', 0):.1f}%
+#### Problem Type Distribution
+"""
+    for ptype, percentage in stats.get('problem_type_distribution', {}).items():
+        stats_display += f"- {ptype}: {percentage:.1f}%\n"
+    return base_response, finetuned_response, stats_display
+# Create Gradio interface
+with gr.Blocks(title="Mathematics Problem Solver") as demo:
+    gr.Markdown("# Mathematics Problem Solver")
+    gr.Markdown("Compare solutions between base (1B) and fine-tuned (1.7B) models")
+    with gr.Row():
+        with gr.Column():
+            problem_type = gr.Dropdown(
+                choices=["Addition", "Root Finding", "Derivative", "Custom"],
+                value="Custom",
+                label="Problem Type"
+            )
+            problem_input = gr.Textbox(
+                label="Enter your math problem",
+                placeholder="Example: Find the derivative of x^2 + 3x"
+            )
+            solve_btn = gr.Button("Solve", variant="primary")
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Base Model (1B)")
+            base_output = gr.Textbox(label="Base Model Solution", lines=5)
+        with gr.Column():
+            gr.Markdown("### Fine-tuned Model (1.7B)")
+            finetuned_output = gr.Textbox(label="Fine-tuned Model Solution", lines=5)
+    # Performance metrics display
+    with gr.Row():
+        metrics_display = gr.Markdown("### Performance Metrics\n*Solve a problem to see metrics*")
+    # Example problems
+    gr.Examples(
+        examples=[
+            ["Find the derivative of x^2 + 3x", "Derivative"],
+            ["What is the square root of 144?", "Root Finding"],
+            ["Calculate 235 + 567", "Addition"],
+        ],
+        inputs=[problem_input, problem_type],
+        outputs=[base_output, finetuned_output, metrics_display],
+        fn=solve_problem,
+        cache_examples=True,
+    )
+    # Connect the interface
+    solve_btn.click(
+        fn=solve_problem,
+        inputs=[problem_input, problem_type],
+        outputs=[base_output, finetuned_output, metrics_display]
+    )
+if __name__ == "__main__":
+    demo.launch()

monitoring.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import time
+from datetime import datetime
+import json
+import os
+from collections import defaultdict
+import threading
+import numpy as np
+class PerformanceMonitor:
+    def __init__(self, metrics_file="metrics.json"):
+        self.metrics_file = metrics_file
+        self.metrics = defaultdict(list)
+        self.lock = threading.Lock()
+        self._load_metrics()
+    def _load_metrics(self):
+        """Load existing metrics from file"""
+        if os.path.exists(self.metrics_file):
+            try:
+                with open(self.metrics_file, 'r') as f:
+                    self.metrics.update(json.load(f))
+            except json.JSONDecodeError:
+                pass
+    def _save_metrics(self):
+        """Save metrics to file"""
+        with self.lock:
+            with open(self.metrics_file, 'w') as f:
+                json.dump(dict(self.metrics), f)
+    def record_response_time(self, model_id, duration):
+        """Record response time for a model"""
+        with self.lock:
+            self.metrics[f"{model_id}_response_times"].append({
+                'timestamp': datetime.now().isoformat(),
+                'duration': duration
+            })
+            self._save_metrics()
+    def record_success(self, model_id, success):
+        """Record success/failure for a model"""
+        with self.lock:
+            self.metrics[f"{model_id}_success_rate"].append({
+                'timestamp': datetime.now().isoformat(),
+                'success': success
+            })
+            self._save_metrics()
+    def record_problem_type(self, problem_type):
+        """Record usage of different problem types"""
+        with self.lock:
+            self.metrics['problem_types'].append({
+                'timestamp': datetime.now().isoformat(),
+                'type': problem_type
+            })
+            self._save_metrics()
+    def get_statistics(self):
+        """Calculate and return performance statistics"""
+        stats = {}
+        # Response time statistics
+        for model in ['base', 'finetuned']:
+            times = [x['duration'] for x in self.metrics.get(f"{model}_response_times", [])]
+            if times:
+                stats[f"{model}_avg_response_time"] = np.mean(times)
+                stats[f"{model}_max_response_time"] = np.max(times)
+                stats[f"{model}_min_response_time"] = np.min(times)
+        # Success rate statistics
+        for model in ['base', 'finetuned']:
+            successes = [x['success'] for x in self.metrics.get(f"{model}_success_rate", [])]
+            if successes:
+                stats[f"{model}_success_rate"] = sum(successes) / len(successes) * 100
+        # Problem type distribution
+        problem_types = [x['type'] for x in self.metrics.get('problem_types', [])]
+        if problem_types:
+            type_counts = defaultdict(int)
+            for ptype in problem_types:
+                type_counts[ptype] += 1
+            total = len(problem_types)
+            stats['problem_type_distribution'] = {
+                ptype: (count / total) * 100
+                for ptype, count in type_counts.items()
+            }
+        return stats
+def measure_time(func):
+    """Decorator to measure function execution time"""
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        duration = time.time() - start_time
+        return result, duration
+    return wrapper

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+# Core dependencies
+gradio>=4.0.0
+torch>=2.0.0
+transformers>=4.30.0
+accelerate>=0.20.0
+numpy>=1.21.0
+# Testing dependencies
+pytest>=7.0.0
+pytest-cov>=4.0.0
+pytest-mock>=3.10.0