Spaces:

Prithvik-1
/

mistral-finetuning-interface

Paused

App Files Files Community

Prithvik-1 committed on Nov 24, 2025

Commit

3ba49d5

verified ·

1 Parent(s): eac8397

Upload models/msp/inference/inference_mistral7b.py with huggingface_hub

Browse files

Files changed (1) hide show

models/msp/inference/inference_mistral7b.py +402 -0

models/msp/inference/inference_mistral7b.py ADDED Viewed

	@@ -0,0 +1,402 @@

+#!/usr/bin/env python3
+"""
+Inference script for Mistral 7B
+Supports both Ollama and local fine-tuned models
+"""
+import os
+import sys
+import argparse
+import requests
+import json
+import time
+from typing import Optional, List
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+from peft import PeftModel
+import torch
+from threading import Thread
+# Configuration
+# Base URL of the Ollama HTTP API; used as the default for --ollama-url.
+DEFAULT_OLLAMA_URL = "http://localhost:11434"
+# Ollama model tag requested by default; used as the default for --ollama-model.
+OLLAMA_MODEL_NAME = "mistral:7b"
+def get_device_info():
+    """Detect and return available compute device"""
+    device_info = {
+        "device": "cpu",
+        "device_type": "cpu",
+        "use_quantization": False,
+        "dtype": torch.float32
+    }
+    if torch.cuda.is_available():
+        device_info["device"] = "cuda"
+        device_info["device_type"] = "cuda"
+        device_info["use_quantization"] = True
+        device_info["dtype"] = torch.float16
+        device_info["device_count"] = torch.cuda.device_count()
+        device_info["device_name"] = torch.cuda.get_device_name(0)
+        if device_info["device_count"] > 1:
+            print(f"✓ {device_info['device_count']} CUDA GPUs detected:")
+            for i in range(device_info["device_count"]):
+                print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
+            print(f"  Model will be automatically distributed across all GPUs")
+        else:
+            print(f"✓ CUDA GPU detected: {device_info['device_name']}")
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        device_info["device"] = "mps"
+        device_info["device_type"] = "mps"
+        device_info["use_quantization"] = False  # BitsAndBytes doesn't support MPS
+        device_info["dtype"] = torch.float16
+        print("✓ Apple Silicon GPU (MPS) detected")
+    else:
+        print("⚠ No GPU detected, using CPU (inference will be slow)")
+        device_info["dtype"] = torch.float32
+    return device_info
+def load_local_model(model_path: str, use_quantization: Optional[bool] = None):
+    """Load a fine-tuned model from local path"""
+    device_info = get_device_info()
+    print(f"\nLoading model from: {model_path}")
+    # Determine quantization based on device if not explicitly set
+    if use_quantization is None:
+        use_quantization = device_info["use_quantization"]
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # Check if it's a LoRA adapter
+    adapter_config_path = os.path.join(model_path, "adapter_config.json")
+    is_lora = os.path.exists(adapter_config_path)
+    # Prepare model loading kwargs
+    def get_model_kwargs(quantize=False):
+        kwargs = {"trust_remote_code": True}
+        if quantize and device_info["device_type"] == "cuda":
+            kwargs["quantization_config"] = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+            )
+            kwargs["device_map"] = "auto"
+        else:
+            kwargs["torch_dtype"] = device_info["dtype"]
+            if device_info["device_type"] == "mps":
+                kwargs["device_map"] = "auto"
+            elif device_info["device_type"] == "cuda":
+                kwargs["device_map"] = "auto"
+            else:
+                kwargs["device_map"] = "cpu"
+        return kwargs
+    if is_lora:
+        # Load base model - prefer local model to avoid cache issues
+        local_base_model = "/workspace/ftt/base_models/Mistral-7B-v0.1"
+        # Check if local model exists, otherwise use HuggingFace
+        if os.path.exists(local_base_model):
+            base_model_name = local_base_model
+            print(f"Loading base model from local: {base_model_name}")
+        else:
+            base_model_name = "mistralai/Mistral-7B-v0.1"
+            print(f"Loading base model from HuggingFace: {base_model_name}")
+        base_model = AutoModelForCausalLM.from_pretrained(
+            base_model_name,
+            local_files_only=os.path.exists(local_base_model),
+            **get_model_kwargs(use_quantization)
+        )
+        # Load LoRA adapter
+        print("Loading LoRA adapter...")
+        model = PeftModel.from_pretrained(base_model, model_path)
+        model = model.merge_and_unload()  # Merge adapter weights
+    else:
+        # Load full model
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            **get_model_kwargs(use_quantization)
+        )
+    model.eval()
+    # Report device placement for multi-GPU setups
+    if device_info["device_type"] == "cuda" and device_info.get("device_count", 1) > 1:
+        print(f"\nMulti-GPU Model Distribution:")
+        for name, module in model.named_modules():
+            if hasattr(module, 'weight') and module.weight is not None:
+                device = next(module.parameters()).device
+                if device.type == 'cuda':
+                    print(f"  {name[:50]:<50} -> GPU {device.index}")
+                    break  # Just show first layer's device
+        print(f"  (Model automatically split across {device_info['device_count']} GPUs)")
+    else:
+        print(f"Model loaded successfully on {device_info['device']}!")
+    return model, tokenizer
+def generate_with_local_model(model, tokenizer, prompt: str, max_length: int = 512, temperature: float = 0.7, stream: bool = False):
+    """Generate text using local model"""
+    # Use prompt as-is - don't reformat it
+    # The user should provide the prompt in the correct format for their model
+    formatted_prompt = prompt
+    inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
+    if stream:
+        # Streaming generation
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            **inputs,
+            max_new_tokens=max_length,  # Use max_new_tokens instead of max_length
+            temperature=temperature,
+            do_sample=True,
+            top_p=0.9,
+            repetition_penalty=1.1,  # Prevent repetition
+            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            streamer=streamer,
+        )
+        # Start generation in a separate thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        # Stream the output
+        generated_text = ""
+        token_count = 0
+        start_time = time.time()
+        for text in streamer:
+            generated_text += text
+            token_count += 1
+            print(text, end="", flush=True)
+        thread.join()
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+        tokens_per_second = token_count / elapsed_time if elapsed_time > 0 else 0
+        # Extract only the generated part (after the prompt)
+        if prompt in generated_text:
+            response = generated_text[len(prompt):].strip()
+        else:
+            response = generated_text.strip()
+        return response, token_count, elapsed_time, tokens_per_second
+    else:
+        # Non-streaming generation (original behavior)
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_length,  # Use max_new_tokens instead of max_length
+                temperature=temperature,
+                do_sample=True,
+                top_p=0.9,
+                repetition_penalty=1.1,  # Prevent repetition
+                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+            )
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Extract only the generated part (after the prompt)
+        if prompt in generated_text:
+            response = generated_text[len(prompt):].strip()
+        else:
+            response = generated_text.strip()
+        return response
+def generate_with_ollama(prompt: str, model_name: str = OLLAMA_MODEL_NAME, url: str = DEFAULT_OLLAMA_URL, max_tokens: int = 512, temperature: float = 0.7):
+    """Generate text using Ollama API"""
+    formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
+    try:
+        response = requests.post(
+            f"{url}/api/generate",
+            json={
+                "model": model_name,
+                "prompt": formatted_prompt,
+                "stream": False,
+                "options": {
+                    "temperature": temperature,
+                    "num_predict": max_tokens,
+                }
+            },
+            timeout=120
+        )
+        response.raise_for_status()
+        result = response.json()
+        generated_text = result.get("response", "")
+        # Extract only the response part
+        response_text = generated_text.split("### Response:\n")[-1].strip()
+        return response_text
+    except requests.exceptions.ConnectionError:
+        print(f"Error: Could not connect to Ollama at {url}")
+        print("Make sure Ollama is running. Start it with: ollama serve")
+        sys.exit(1)
+    except requests.exceptions.RequestException as e:
+        print(f"Error calling Ollama API: {e}")
+        sys.exit(1)
+def interactive_mode(use_ollama: bool, model_path: Optional[str] = None, ollama_model: str = OLLAMA_MODEL_NAME, ollama_url: str = DEFAULT_OLLAMA_URL, use_quantization: Optional[bool] = None):
+    """Run interactive inference session"""
+    model = None
+    tokenizer = None
+    if not use_ollama:
+        if not model_path:
+            print("Error: no model path provided for local mode")
+            sys.exit(1)
+        if not os.path.exists(model_path) and "/" not in model_path:
+            print(f"Error: Model path {model_path} does not exist")
+            sys.exit(1)
+        model, tokenizer = load_local_model(model_path, use_quantization)
+    print("\n" + "=" * 50)
+    print("Mistral 7B Interactive Inference")
+    print("Type 'quit' or 'exit' to stop")
+    print("=" * 50 + "\n")
+    while True:
+        try:
+            user_input = input("You: ").strip()
+            if user_input.lower() in ['quit', 'exit', 'q']:
+                print("Goodbye!")
+                break
+            if not user_input:
+                continue
+            print("\nAssistant: ", end="", flush=True)
+            if use_ollama:
+                start_time = time.time()
+                response = generate_with_ollama(user_input, ollama_model, ollama_url)
+                end_time = time.time()
+                inference_time = end_time - start_time
+                print(response)
+                print(f"\n⏱️  Inference time: {inference_time:.2f} seconds")
+            else:
+                # Use streaming for local model
+                response, token_count, elapsed_time, tokens_per_second = generate_with_local_model(
+                    model, tokenizer, user_input, stream=True
+                )
+                print(f"\n\n⏱️  Generation time: {elapsed_time:.2f}s | Tokens: {token_count} | Speed: {tokens_per_second:.2f} tokens/sec")
+            print()
+        except KeyboardInterrupt:
+            print("\n\nGoodbye!")
+            break
+        except Exception as e:
+            print(f"\nError: {e}")
+def single_inference(prompt: str, use_ollama: bool, model_path: Optional[str] = None, ollama_model: str = OLLAMA_MODEL_NAME, ollama_url: str = DEFAULT_OLLAMA_URL, use_quantization: Optional[bool] = None):
+    """Run a single inference"""
+    if use_ollama:
+        start_time = time.time()
+        response = generate_with_ollama(prompt, ollama_model, ollama_url)
+        end_time = time.time()
+        inference_time = end_time - start_time
+        print(response)
+        print(f"\n⏱️  Inference time: {inference_time:.2f} seconds")
+    else:
+        if not model_path:
+            print("Error: no model path provided for local mode")
+            sys.exit(1)
+        if not os.path.exists(model_path) and "/" not in model_path:
+            print(f"Error: Model path {model_path} does not exist")
+            sys.exit(1)
+        model, tokenizer = load_local_model(model_path, use_quantization)
+        # Use streaming for local model
+        response, token_count, elapsed_time, tokens_per_second = generate_with_local_model(
+            model, tokenizer, prompt, stream=True
+        )
+        print(f"\n\n⏱️  Generation time: {elapsed_time:.2f}s | Tokens: {token_count} | Speed: {tokens_per_second:.2f} tokens/sec")
+def main():
+    parser = argparse.ArgumentParser(description="Mistral 7B Inference Script")
+    parser.add_argument(
+        "--mode",
+        choices=["local", "ollama"],
+        default="ollama",
+        help="Inference mode: local (fine-tuned model) or ollama (Ollama API)"
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="./mistral7b-finetuned-ahb2apb",
+        help="Path to fine-tuned model (for local mode)"
+    )
+    parser.add_argument(
+        "--ollama-model",
+        type=str,
+        default=OLLAMA_MODEL_NAME,
+        help="Ollama model name (default: mistral:7b)"
+    )
+    parser.add_argument(
+        "--ollama-url",
+        type=str,
+        default=DEFAULT_OLLAMA_URL,
+        help="Ollama API URL (default: http://localhost:11434)"
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        help="Single prompt to process (if not provided, runs in interactive mode)"
+    )
+    parser.add_argument(
+        "--no-quantization",
+        action="store_true",
+        help="Disable quantization for local models (requires more memory)"
+    )
+    args = parser.parse_args()
+    use_ollama = args.mode == "ollama"
+    use_quantization = False if args.no_quantization else None  # Auto-detect based on device unless disabled
+    if args.prompt:
+        if use_ollama:
+            start_time = time.time()
+            response = generate_with_ollama(args.prompt, args.ollama_model, args.ollama_url)
+            end_time = time.time()
+            inference_time = end_time - start_time
+            print(response)
+            print(f"\n⏱️  Inference time: {inference_time:.2f} seconds")
+        else:
+            if not args.model_path:
+                print("Error: no model path provided for local mode")
+                sys.exit(1)
+            if not os.path.exists(args.model_path) and "/" not in args.model_path:
+                print(f"Error: Model path {args.model_path} does not exist")
+                sys.exit(1)
+            model, tokenizer = load_local_model(args.model_path, use_quantization)
+            # Use streaming for local model
+            response, token_count, elapsed_time, tokens_per_second = generate_with_local_model(
+                model, tokenizer, args.prompt, stream=True
+            )
+            print(f"\n\n⏱️  Generation time: {elapsed_time:.2f}s | Tokens: {token_count} | Speed: {tokens_per_second:.2f} tokens/sec")
+    else:
+        interactive_mode(
+            use_ollama,
+            args.model_path if not use_ollama else None,
+            args.ollama_model,
+            args.ollama_url,
+            use_quantization
+        )
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == "__main__":
+    main()