Molchevsky committed
Commit f0a920b · verified · 1 Parent(s): 15746d5

Upload 4 files

Files changed (4)
  1. README.md +5 -3
  2. build.sh +62 -0
  3. llama_chat_interface.py +433 -0
  4. merge_with_autopeft.py +28 -0
README.md CHANGED
@@ -1,3 +1,5 @@
- ---
- license: cc-by-nc-sa-4.0
- ---
+ # resume.llamafile v1.0
+
+ Fine-tuned and packaged by Alexander Molchevskyi
+ Model: LLaMA-3.2-3B, fine-tuned on career Q&A dataset
+ Purpose: Interactive resume and portfolio showcase
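
Once build.sh (below) has produced resume.llamafile, the packaged model is meant to run as a single self-contained executable. A minimal usage sketch, assuming llamafile's default behaviour of launching a local chat/server UI (the commands are illustrative and not part of this commit):

    chmod +x resume.llamafile
    ./resume.llamafile        # Linux/macOS: runs the bundled model with the defaults packed in .args
    # Windows: rename the file to resume.llamafile.exe and run it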
build.sh ADDED
@@ -0,0 +1,62 @@
+ #!/bin/bash
+
+ # Activate Python virtual environment with all required packages (torch, transformers, peft, etc.)
+ # This keeps dependencies isolated from your system Python.
+ source llm-finetune/bin/activate
+
+ # Step 1: Run the fine-tuning script (LoRA training)
+ # - llama_finetuning.py trains your LLaMA model using Q&A pairs.
+ # - The output will be a LoRA adapter stored in a subdirectory.
+ python3 llama_finetuning.py
+
+ # Step 2: Make sure the locally built llamafile launcher is available
+ # - We installed llamafile into ~/dev/tools/llamafile/bin
+ # - Add that directory to PATH so its binaries can be found automatically.
+ export PATH="$HOME/dev/tools/llamafile/bin:$PATH"
+
+ # Step 3: Merge the LoRA adapter with the base model
+ # - LoRA is efficient for training, but for deployment we want a single merged model.
+ # - merge_with_autopeft.py loads the base weights and adapter, merges them, and saves FP16 weights in ./merged-fp16
+ python3 merge_with_autopeft.py
+
+ # Step 4: Convert Hugging Face FP16 model -> GGUF (llama.cpp runtime format)
+ # - ./merged-fp16 is the Hugging Face directory created by the merge step.
+ # - --outfile sets the name of the GGUF file.
+ # - --outtype f16 ensures weights are saved in FP16 precision before quantization.
+ python3 ../llama.cpp/convert_hf_to_gguf.py merged-fp16 --outfile merged-fp16.gguf --outtype f16
+
+ # Step 5: Quantize FP16 GGUF -> Q6_K GGUF
+ # - Q6_K is a 6-bit quantization that balances speed, quality, and size.
+ # - merged-fp16.gguf is the input, merged-Q6_K.gguf is the output.
+ # - This step makes the model small enough to run efficiently on CPU/GPU.
+ ../llama.cpp/build/bin/llama-quantize merged-fp16.gguf merged-Q6_K.gguf q6_k
+
+ # Step 6: Copy the llamafile launcher
+ # - "llamafile" is the universal runtime that knows how to run GGUF models.
+ # - We copy it to resume.llamafile, which will become the final self-contained binary.
+ cp ~/dev/tools/llamafile/bin/llamafile resume.llamafile
+
+ # Step 7: Pack the model, args, and docs into the llamafile
+ # - zipalign appends files into the llamafile binary as an uncompressed ZIP archive.
+ # - merged-Q6_K.gguf is the quantized model.
+ # - .args contains default runtime arguments (e.g. -m model, --threads, --ctx-size).
+ # - README.md is included so end users have documentation directly inside the llamafile.
+ # - The -j0 option ensures "store only" (no compression) so llamafile can memory-map the model efficiently.
+ zipalign -j0 resume.llamafile merged-Q6_K.gguf .args README.md
+
+
+ # Key points for educational purposes
+
+ # Virtual environment keeps fine-tuning dependencies isolated.
+
+ # LoRA fine-tuning produces small adapter weights → later merged for simplicity.
+
+ # Merge step is critical: it creates a "normal" Hugging Face model again, which can be exported.
+
+ # convert_hf_to_gguf.py translates HF → GGUF (runtime format for llama.cpp + llamafile).
+
+ # Quantization (Q6_K) reduces model size by ~3–4× with minimal loss in quality, making it run fast on CPU.
+
+ # llamafile packaging produces a single executable that works on Linux/macOS directly; on Windows you just rename it to .exe.
+
+ # zipalign -j0 ensures files are stored uncompressed, which llamafile requires for mmap loading.
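
The .args file referenced in Step 7 is not included in this commit. For illustration only: llamafile expects one default argument per line, with a literal "..." marking where extra command-line arguments may be appended at run time. A plausible version for this build (the flags other than -m are assumptions) might look like:

    -m
    merged-Q6_K.gguf
    --host
    0.0.0.0
    ...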
llama_chat_interface.py ADDED
@@ -0,0 +1,433 @@
+ import os
+ import torch
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     BitsAndBytesConfig
+ )
+ from peft import PeftModel
+ import warnings
+ from datetime import datetime
+ import json
+
+ # Suppress warnings for cleaner output
+ warnings.filterwarnings("ignore", category=FutureWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ class LlamaChat:
+     def __init__(self, model_path, system_message=None, use_quantization=True, max_memory_gb=8):
+         """
+         Initialize the chat interface with the fine-tuned Llama model
+
+         Args:
+             model_path: Path to the fine-tuned model directory
+             system_message: System message to use for conversations (persona/context)
+             use_quantization: Whether to use 4-bit quantization (recommended for 8GB GPU)
+             max_memory_gb: Maximum GPU memory to use
+         """
+         self.model_path = model_path
+         self.use_quantization = use_quantization
+         self.max_memory_gb = max_memory_gb
+
+         # Default system message if none provided
+         self.system_message = system_message or (
+             "You are Alexander Molchevskyi, a senior software engineer with over 20 years "
+             "of professional experience across embedded, desktop, and server systems. "
+             "Skilled in C++, Rust, Python, AI infrastructure, compilers, WebAssembly, and "
+             "developer tooling. You answer interview questions clearly, professionally, and naturally."
+         )
+
+         print("🚀 Loading Llama Chat Interface...")
+         print(f"Model path: {model_path}")
+         print(f"System message: {self.system_message[:100]}{'...' if len(self.system_message) > 100 else ''}")
+
+         # Check CUDA availability
+         if torch.cuda.is_available():
+             print(f"✅ CUDA available: {torch.cuda.get_device_name()}")
+             print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
+         else:
+             print("⚠️ CUDA not available, using CPU (will be slow)")
+
+         self.tokenizer = None
+         self.model = None
+         self.conversation_history = []
+
+         self._load_model()
+
+     def _setup_quantization_config(self):
+         """Setup 4-bit quantization config for memory efficiency"""
+         if not self.use_quantization:
+             return None
+
+         return BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.bfloat16,
+         )
+
+     def _load_model(self):
+         """Load the tokenizer and model"""
+         try:
+             print("📚 Loading tokenizer...")
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 self.model_path,
+                 trust_remote_code=True,
+                 padding_side="left"  # For generation
+             )
+
+             # Add pad token if it doesn't exist
+             if self.tokenizer.pad_token is None:
+                 self.tokenizer.pad_token = self.tokenizer.eos_token
+                 self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+
+             print("🧠 Loading base model...")
+
+             # Setup quantization if requested
+             quantization_config = self._setup_quantization_config()
+
+             # Check if this is a PEFT model (has adapter_config.json)
+             adapter_config_path = os.path.join(self.model_path, "adapter_config.json")
+             is_peft_model = os.path.exists(adapter_config_path)
+
+             if is_peft_model:
+                 print("🔧 Detected PEFT (LoRA) model, loading base model first...")
+
+                 # Load adapter config to get base model name
+                 with open(adapter_config_path, 'r') as f:
+                     adapter_config = json.load(f)
+
+                 base_model_name = adapter_config.get('base_model_name_or_path', 'llama-3.2-3b')
+                 print(f"Base model: {base_model_name}")
+
+                 # Load base model
+                 base_model = AutoModelForCausalLM.from_pretrained(
+                     base_model_name,
+                     quantization_config=quantization_config,
+                     device_map="auto",
+                     torch_dtype=torch.bfloat16,
+                     trust_remote_code=True,
+                     use_cache=True,  # Enable cache for inference
+                 )
+
+                 # Load PEFT model (LoRA adapter)
+                 print("🎯 Loading LoRA adapter...")
+                 self.model = PeftModel.from_pretrained(base_model, self.model_path)
+
+             else:
+                 # Regular fine-tuned model (not PEFT)
+                 print("📦 Loading fine-tuned model...")
+                 self.model = AutoModelForCausalLM.from_pretrained(
+                     self.model_path,
+                     quantization_config=quantization_config,
+                     device_map="auto",
+                     torch_dtype=torch.bfloat16,
+                     trust_remote_code=True,
+                     use_cache=True,  # Enable cache for inference
+                 )
+
+             # Set model to evaluation mode
+             self.model.eval()
+             print("✅ Model loaded successfully!")
+
+             # Print model info
+             if hasattr(self.model, 'print_trainable_parameters'):
+                 self.model.print_trainable_parameters()
+
+         except Exception as e:
+             print(f"❌ Error loading model: {str(e)}")
+             raise
+
+     def _format_message(self, user_message):
+         """Format user message with system context using Llama's chat template"""
+         return f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{self.system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+     def generate_response(self, user_message, max_new_tokens=200, temperature=0.7,
+                           top_p=0.9, repetition_penalty=1.1, do_sample=True):
+         """
+         Generate a response to the user message
+
+         Args:
+             user_message: The user's input message
+             max_new_tokens: Maximum number of tokens to generate
+             temperature: Sampling temperature (higher = more random)
+             top_p: Nucleus sampling parameter
+             repetition_penalty: Penalty for repeating tokens
+             do_sample: Whether to use sampling or greedy decoding
+         """
+         try:
+             # Format the input
+             formatted_input = self._format_message(user_message)
+
+             # Tokenize input
+             inputs = self.tokenizer(
+                 formatted_input,
+                 return_tensors="pt",
+                 truncation=True,
+                 max_length=1024  # Increased to match training max_length
+             ).to(self.model.device)
+
+             # Generate response
+             print("🤔 Thinking...")
+
+             with torch.no_grad():
+                 outputs = self.model.generate(
+                     **inputs,
+                     max_new_tokens=max_new_tokens,
+                     temperature=temperature,
+                     top_p=top_p,
+                     do_sample=do_sample,
+                     repetition_penalty=repetition_penalty,
+                     pad_token_id=self.tokenizer.eos_token_id,
+                     eos_token_id=self.tokenizer.eos_token_id,
+                     num_return_sequences=1,
+                 )
+
+             # Decode the response (keep special tokens so the assistant header can be located below)
+             full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
+
+             # Extract only the assistant's response (after the last assistant header)
+             assistant_response = full_response.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+
+             # Clean up any remaining tokens
+             assistant_response = assistant_response.replace("<|eot_id|>", "").strip()
+
+             return assistant_response
+
+         except Exception as e:
+             return f"❌ Error generating response: {str(e)}"
+
+     def chat_loop(self):
+         """Main chat loop"""
+         print("\n" + "="*60)
+         print("🦙 LLAMA FINE-TUNED CHAT INTERFACE")
+         print("="*60)
+         print("Commands:")
+         print(" • Type your message and press Enter")
+         print(" • '/help' - Show this help")
+         print(" • '/system' - View or change system message")
+         print(" • '/settings' - Adjust generation settings")
+         print(" • '/history' - Show conversation history")
+         print(" • '/clear' - Clear conversation history")
+         print(" • '/save' - Save conversation to file")
+         print(" • '/quit' or '/exit' - Exit the chat")
+         print("="*60)
+
+         # Default generation settings
+         settings = {
+             'max_new_tokens': 200,
+             'temperature': 0.7,
+             'top_p': 0.9,
+             'repetition_penalty': 1.1,
+             'do_sample': True
+         }
+
+         while True:
+             try:
+                 # Get user input
+                 user_input = input("\n👤 You: ").strip()
+
+                 if not user_input:
+                     continue
+
+                 # Handle commands
+                 if user_input.lower() in ['/quit', '/exit']:
+                     print("👋 Goodbye!")
+                     break
+
+                 elif user_input.lower() == '/help':
+                     self._show_help()
+                     continue
+
+                 elif user_input.lower() == '/system':
+                     self._manage_system_message()
+                     continue
+
+                 elif user_input.lower() == '/settings':
+                     settings = self._adjust_settings(settings)
+                     continue
+
+                 elif user_input.lower() == '/history':
+                     self._show_history()
+                     continue
+
+                 elif user_input.lower() == '/clear':
+                     self.conversation_history.clear()
+                     print("🧹 Conversation history cleared!")
+                     continue
+
+                 elif user_input.lower() == '/save':
+                     self._save_conversation()
+                     continue
+
+                 # Generate response
+                 response = self.generate_response(user_input, **settings)
+
+                 # Display response
+                 print(f"\n🦙 Alexander: {response}")
+
+                 # Save to history
+                 self.conversation_history.append({
+                     'timestamp': datetime.now().isoformat(),
+                     'system': self.system_message,
+                     'user': user_input,
+                     'assistant': response
+                 })
+
+             except KeyboardInterrupt:
+                 print("\n\n👋 Chat interrupted. Goodbye!")
+                 break
+             except Exception as e:
+                 print(f"\n❌ Error: {str(e)}")
+
+     def _manage_system_message(self):
+         """Allow user to view or change the system message"""
+         print("\n🤖 SYSTEM MESSAGE MANAGEMENT:")
+         print("Current system message:")
+         print("-" * 60)
+         print(self.system_message)
+         print("-" * 60)
+
+         choice = input("\nOptions: [v]iew, [c]hange, or [Enter] to go back: ").strip().lower()
+
+         if choice == 'c' or choice == 'change':
+             print("\nEnter new system message (or press Enter to keep current):")
+             new_system = input("> ").strip()
+
+             if new_system:
+                 self.system_message = new_system
+                 print("✅ System message updated!")
+                 print("Note: This will affect all future conversations.")
+             else:
+                 print("System message unchanged.")
+
+         elif choice == 'v' or choice == 'view':
+             # Already displayed above
+             pass
+     def _show_help(self):
+         """Show help information"""
+         print("\n📋 HELP:")
+         print("This is a chat interface for your fine-tuned Llama model.")
+         print("The model has been trained with system messages to embody Alexander Molchevskyi's")
+         print("professional persona and expertise in software engineering.")
+         print("\nTips:")
+         print("• Ask technical questions about software engineering, AI, or development")
+         print("• The model maintains context of being Alexander throughout conversations")
+         print("• Use /system to view or modify the professional persona")
+         print("• Use /settings to adjust creativity (temperature) and response length")
+         print("• Higher temperature = more creative but less consistent")
+         print("• Lower temperature = more focused and consistent")
+
+     def _adjust_settings(self, current_settings):
+         """Allow user to adjust generation settings"""
+         print("\n⚙️ GENERATION SETTINGS:")
+         print("Current settings:")
+         for key, value in current_settings.items():
+             print(f"  {key}: {value}")
+
+         new_settings = current_settings.copy()
+
+         try:
+             # Max tokens
+             max_tokens = input(f"\nMax response length ({current_settings['max_new_tokens']}): ").strip()
+             if max_tokens:
+                 new_settings['max_new_tokens'] = max(1, min(500, int(max_tokens)))
+
+             # Temperature
+             temp = input(f"Temperature 0.1-2.0 ({current_settings['temperature']}): ").strip()
+             if temp:
+                 new_settings['temperature'] = max(0.1, min(2.0, float(temp)))
+
+             # Top-p
+             top_p = input(f"Top-p 0.1-1.0 ({current_settings['top_p']}): ").strip()
+             if top_p:
+                 new_settings['top_p'] = max(0.1, min(1.0, float(top_p)))
+
+             # Repetition penalty
+             rep_penalty = input(f"Repetition penalty 1.0-2.0 ({current_settings['repetition_penalty']}): ").strip()
+             if rep_penalty:
+                 new_settings['repetition_penalty'] = max(1.0, min(2.0, float(rep_penalty)))
+
+             print("✅ Settings updated!")
+             return new_settings
+
+         except ValueError:
+             print("❌ Invalid input. Settings unchanged.")
+             return current_settings
+
+     def _show_history(self):
+         """Show conversation history"""
+         if not self.conversation_history:
+             print("📝 No conversation history yet.")
+             return
+
+         print(f"\n📜 CONVERSATION HISTORY ({len(self.conversation_history)} exchanges):")
+         print("-" * 50)
+
+         for i, exchange in enumerate(self.conversation_history[-5:], 1):  # Show last 5
+             timestamp = exchange['timestamp'].split('T')[1].split('.')[0]  # Just time
+             print(f"\n[{timestamp}]")
+             print(f"👤 You: {exchange['user']}")
+             print(f"🦙 Alexander: {exchange['assistant'][:100]}{'...' if len(exchange['assistant']) > 100 else ''}")
+
+         if len(self.conversation_history) > 5:
+             print(f"\n... and {len(self.conversation_history) - 5} more exchanges")
+
+     def _save_conversation(self):
+         """Save conversation to a JSON file"""
+         if not self.conversation_history:
+             print("📝 No conversation to save.")
+             return
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         filename = f"llama_chat_{timestamp}.json"
+
+         try:
+             with open(filename, 'w', encoding='utf-8') as f:
+                 json.dump(self.conversation_history, f, indent=2, ensure_ascii=False)
+             print(f"💾 Conversation saved to: {filename}")
+         except Exception as e:
+             print(f"❌ Error saving conversation: {str(e)}")
+
+ def main():
+     """Main function to start the chat interface"""
+     # Configuration
+     MODEL_PATH = "llama-3.2-3b-finetuned"  # Path to your fine-tuned model
+
+     # Default system message (can be customized)
+     DEFAULT_SYSTEM_MESSAGE = (
+         "You are Alexander Molchevskyi, a senior software engineer with over 20 years "
+         "of professional experience across embedded, desktop, and server systems. "
+         "Skilled in C++, Rust, Python, AI infrastructure, compilers, WebAssembly, and "
+         "developer tooling. You answer interview questions clearly, professionally, and naturally."
+     )
+
+     # Check if model directory exists
+     if not os.path.exists(MODEL_PATH):
+         print(f"❌ Model directory not found: {MODEL_PATH}")
+         print("Please make sure you have run the fine-tuning script first.")
+         return
+
+     try:
+         # Initialize chat interface
+         chat = LlamaChat(
+             model_path=MODEL_PATH,
+             system_message=DEFAULT_SYSTEM_MESSAGE,
+             use_quantization=True,  # Set to False if you have plenty of GPU memory
+             max_memory_gb=8
+         )
+
+         # Start chat loop
+         chat.chat_loop()
+
+     except Exception as e:
+         print(f"❌ Failed to initialize chat interface: {str(e)}")
+         print("\nTroubleshooting tips:")
+         print("1. Make sure the model was trained successfully")
+         print("2. Check that all required libraries are installed")
+         print("3. Ensure you have sufficient GPU memory")
+         print("4. Try setting use_quantization=True to reduce memory usage")
+
+ if __name__ == "__main__":
+     main()
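
_format_message() above hand-writes the Llama 3 prompt format. An equivalent, less error-prone route is to let the tokenizer render the prompt from its own chat template. This is only a sketch; it assumes the fine-tuned tokenizer still carries the stock Llama 3.2 chat template, and the example question is invented:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("llama-3.2-3b-finetuned")
    messages = [
        {"role": "system", "content": "You are Alexander Molchevskyi, a senior software engineer."},
        {"role": "user", "content": "Tell me about your Rust experience."},  # illustrative question
    ]
    # Should render <|begin_of_text|>...<|start_header_id|>assistant<|end_header_id|>, much like _format_message()
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)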
merge_with_autopeft.py ADDED
@@ -0,0 +1,28 @@
+ # merge_with_autopeft.py
+ import torch, os
+ from peft import AutoPeftModelForCausalLM
+ from transformers import AutoTokenizer
+
+ # LORA_DIR is your *adapter* checkpoint dir produced by training
+ LORA_DIR = "llama-3.2-3b-finetuned"
+ OUT_DIR = "merged-fp16"
+ DTYPE = torch.float16
+
+ print("Loading LoRA with AutoPeft (this reads base_model_name_or_path from the adapter config)...")
+ model = AutoPeftModelForCausalLM.from_pretrained(
+     LORA_DIR,
+     torch_dtype=DTYPE,
+     device_map="cpu",
+ )
+
+ print("Merging and unloading adapters...")
+ model = model.merge_and_unload()  # <- this *actually* bakes the deltas into weights
+
+ os.makedirs(OUT_DIR, exist_ok=True)
+ print("Saving merged model...")
+ model.save_pretrained(OUT_DIR, safe_serialization=True)
+
+ tok = AutoTokenizer.from_pretrained(LORA_DIR, use_fast=False)  # works because tokenizer is same as base
+ tok.save_pretrained(OUT_DIR)
+
+ print("✅ Done")
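
A quick way to sanity-check the merge before handing ./merged-fp16 to convert_hf_to_gguf.py in build.sh is to load it as a plain Hugging Face model (no peft involved) and generate a short reply. A minimal sketch, assuming the saved tokenizer carries a chat template; the prompt text is illustrative:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Load the merged directory as a standalone model (adapters are already baked in)
    tok = AutoTokenizer.from_pretrained("merged-fp16")
    model = AutoModelForCausalLM.from_pretrained("merged-fp16", torch_dtype="auto", device_map="auto")

    messages = [{"role": "user", "content": "Briefly introduce yourself."}]  # illustrative prompt
    inputs = tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    out = model.generate(inputs, max_new_tokens=60, do_sample=False)
    print(tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))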