import gradio as gr import torch import gc from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel import os # --------------------------------------------------------------------------- # CONFIGURATION # --------------------------------------------------------------------------- # WARNING: On CPU, 8B models are very heavy. # If this crashes, switch to "unsloth/Llama-3.2-3B-Instruct" BASE_MODEL_ID = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit" LORA_ADAPTER_ID = "JPQ24/Natural-synthesis-llama-3.2-1b" # --------------------------------------------------------------------------- # LOAD MODEL (State Initialization) # --------------------------------------------------------------------------- print("System: Initializing CPU Load Sequence...") # 1. Load Tokenizer tokenizer = AutoTokenizer.from_pretrained( BASE_MODEL_ID, token=os.environ.get("HF_TOKEN") ) # 2. Load Base Model # low_cpu_mem_usage=True is critical here to load weights sequentially print("System: Loading Base Model into RAM...") base_model = AutoModelForCausalLM.from_pretrained( BASE_MODEL_ID, device_map="cpu", torch_dtype=torch.float32, # Safe default. Use torch.bfloat16 if your Space supports it for speed. low_cpu_mem_usage=True, trust_remote_code=True, token=os.environ.get("HF_TOKEN") ) # 3. Attach LoRA Adapter print("System: Attaching LoRA Adapter...") model = PeftModel.from_pretrained( base_model, LORA_ADAPTER_ID, token=os.environ.get("HF_TOKEN") ) print("System: Ready.") # --------------------------------------------------------------------------- # EXECUTION ENGINE # --------------------------------------------------------------------------- def run_inference(prompt, use_lora): """ Core computation unit. Accepts state configuration (use_lora) and executes transformation. """ # 1. Input Processing messages = [{"role": "user", "content": prompt}] try: inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") except Exception: inputs = tokenizer(prompt, return_tensors="pt").input_ids # Ensure inputs are on CPU inputs = inputs.to("cpu") # 2. Generation Config (Conservative for CPU) generate_kwargs = dict( input_ids=inputs, max_new_tokens=100, # Keep short to prevent timeouts do_sample=True, temperature=0.7, ) # 3. Execution (With Context Switching) if not use_lora: # CONTEXT A: BASE MODEL # We temporarily disable the LoRA connection with model.disable_adapter(): outputs = model.generate(**generate_kwargs) else: # CONTEXT B: LORA MODEL # We use the active adapter outputs = model.generate(**generate_kwargs) # 4. Output Decoding response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True) return response def sequential_generation(prompt): """ Orchestrator for Sequential Execution. Step 1 -> Cleanup -> Step 2 """ # --- PHASE 1: BASE MODEL --- yield "Generating Base Model response... (Please wait)", "Waiting for Base to finish..." base_result = run_inference(prompt, use_lora=False) # --- INTERMEDIATE: CLEANUP --- # This is a 'heuristic' step to help the CPU breathe. # We force a garbage collection to clear the computation graph from memory. gc.collect() # --- PHASE 2: LORA MODEL --- # We yield the first result so the user can read it while the second runs yield base_result, "Generating LoRA response... (Please wait)" lora_result = run_inference(prompt, use_lora=True) # --- FINAL: COMPLETE --- yield base_result, lora_result # --------------------------------------------------------------------------- # INTERFACE # --------------------------------------------------------------------------- custom_css = """ .container { max-width: 1100px; margin: auto; } .output-box { height: 400px; overflow-y: scroll; } """ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo: gr.Markdown("# Sequential Model Comparison (CPU)") gr.Markdown(f"**Architecture:** Base (`{BASE_MODEL_ID}`) + Adapter (`{LORA_ADAPTER_ID}`)") gr.Markdown("ℹ️ **Process:** This space runs the Base Model first, clears memory, and then runs the LoRA Model.") with gr.Row(): input_text = gr.Textbox(label="Prompt", placeholder="e.g. Write a poem about rust...", lines=2) submit_btn = gr.Button("Start Comparison", variant="primary") with gr.Row(): with gr.Column(): gr.Markdown("### 1. Base Model Output") output_base = gr.Textbox(label="Base Result", lines=10, interactive=False) with gr.Column(): gr.Markdown("### 2. LoRA Model Output") output_lora = gr.Textbox(label="Fine-Tuned Result", lines=10, interactive=False) submit_btn.click( fn=sequential_generation, inputs=input_text, outputs=[output_base, output_lora] ) if __name__ == "__main__": demo.launch()