import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# CPU-friendly model settings:
# use a smaller model with quantization for CPU compatibility.
model_name = "google/gemma-2-2b"  # Smaller 2B parameter model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure 4-bit quantization (float16 compute) to shrink the loaded model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Try the quantized load first and fall back to a plain float32 CPU load.
# The broad ``except Exception`` is a deliberate best-effort: any failure of
# the quantized path (whatever its exception type) should trigger the
# fallback rather than abort the script.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",  # Will use CPU if no GPU is available
    )
    using_quantization = True
except Exception as e:
    print(f"Quantization failed with error: {e}")
    print("Falling back to standard CPU loading...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use float32 for CPU
        device_map="cpu",  # Explicitly use CPU
    )
    using_quantization = False

# Report the device the model actually ended up on: with device_map="auto"
# the quantized path is not guaranteed to be on the CPU.
print(f"Model loaded on {model.device}. Using quantization: {using_quantization}")
print(f"Model size: {model_name}")


def generate_response(prompt, max_length=200):
    """Generate a greedy continuation of *prompt* and return only the new text.

    Args:
        prompt: Text to feed to the model.
        max_length: Maximum number of *new* tokens to generate (forwarded to
            ``model.generate`` as ``max_new_tokens``).

    Returns:
        The decoded text produced after the prompt, with special tokens
        removed. The prompt itself is not included.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    print("Generating response (this may take a while on CPU)...")
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()

    # Conservative generation settings for CPU.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=False,  # Deterministic generation is faster
        num_beams=1,  # No beam search for speed
    )

    elapsed = time.perf_counter() - start_time
    print(f"Generation completed in {elapsed:.2f} seconds")

    # Drop the prompt at the token level instead of slicing the decoded
    # string by character count: decoding the prompt tokens is not guaranteed
    # to reproduce a byte-exact prefix of the decoded output, so a character
    # offset can clip the response or leak part of the prompt.
    prompt_token_count = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# Test the model with simpler, shorter prompts for CPU evaluation
test_prompts = [
    "Explain what machine learning is in one paragraph.",
    "Write a haiku about computers.",
    "List three benefits of open-source software.",
]

# Imported before the slow evaluation loop so that a missing dependency
# fails immediately instead of after all prompts have been generated.
import psutil

# Run evaluation
print("\nEvaluating Gemma open source model on CPU\n")
print("=" * 50)

# Keep every response so the results file can be written without running
# the (slow, deterministic) generation a second time.
responses = []
for i, prompt in enumerate(test_prompts):
    print(f"\nPrompt {i+1}: {prompt}")
    print("-" * 50)

    start_time = time.perf_counter()
    response = generate_response(prompt)
    elapsed = time.perf_counter() - start_time
    responses.append(response)

    print(f"Response time: {elapsed:.2f} seconds")
    print(f"Response:\n{response}")
    print("=" * 50)

# Memory usage information: resident set size of this process, in MB.
process = psutil.Process()
memory_info = process.memory_info()
print(f"\nMemory Usage: {memory_info.rss / (1024 * 1024):.2f} MB")

# Save model output to a file for later analysis.
# The file name is kept unchanged so existing consumers of this path keep
# working. UTF-8 is set explicitly because model output may contain
# non-ASCII characters that the platform default encoding cannot represent.
with open("gemini_cpu_evaluation_results.txt", "w", encoding="utf-8") as f:
    f.write("GEMMA MODEL CPU EVALUATION RESULTS\n\n")
    for i, (prompt, response) in enumerate(zip(test_prompts, responses)):
        f.write(f"Prompt {i+1}: {prompt}\n")
        f.write(f"Response:\n{response}\n\n")