#!/usr/bin/env python3
"""
Stack 2.9 - Convert & Load (No progress bar)
"""
import os
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import torch
from pathlib import Path
import json
import sys
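# Paths: the exported Stack 2.9 model directory and a cache file for the converted weights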
model_path = Path("/Users/walidsobhi/stack-2.9-final-model")
cache_path = Path("/Users/walidsobhi/stack-2.9/weights_cache.pt")
print("Loading...", flush=True)
# Load tokenizer
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(model_path / "tokenizer.json"))
tokenizer.pad_token = "<|endoftext|>"
tokenizer.eos_token = "<|endoftext|>"
print("Tokenizer ready", flush=True)
# Check if cached conversion exists
if cache_path.exists():
    print("Loading cached weights...", flush=True)
    state_dict = torch.load(cache_path, map_location='cpu')
else:
    # One-time conversion: read the safetensors checkpoint and cache it as a .pt file
    print("Converting weights (one-time)...", flush=True)
    from safetensors.torch import load_file
    # load_file parses the safetensors format directly into a state dict
    # (torch.load cannot read safetensors, so no temp-file copy is needed)
    state_dict = load_file(str(model_path / "model.safetensors"))
    # Cache the converted weights so the next run skips this step
    torch.save(state_dict, cache_path)
print("Weights ready", flush=True)
# Load config
with open(model_path / "config.json") as f:
    config_dict = json.load(f)
# Build model
from transformers import Qwen2ForCausalLM, Qwen2Config
config = Qwen2Config()
for k, v in config_dict.items():
    setattr(config, k, v)
print("Building model...", flush=True)
model = Qwen2ForCausalLM(config)
model.load_state_dict(state_dict, strict=False)
model = model.to(torch.float16)
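# Note: fp16 roughly halves memory versus fp32, but some CPU ops are slow or unsupported in half precision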
if torch.cuda.is_available():
    model = model.to("cuda")
model.eval()  # inference only; disables dropout
print("Ready!\n", flush=True)
# Chat
print("=" * 40)
print("Stack 2.9 Ready! (Type 'quit' to exit)")
print("=" * 40)
while True:
    try:
        user_input = input("\nYou: ").strip()
        if not user_input:
            continue
        if user_input.lower() in ['quit', 'exit', 'q']:
            break
        prompt = f"You are Stack 2.9.\n\nUser: {user_input}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors='pt')
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        outputs = model.generate(**inputs, max_new_tokens=80, do_sample=True, temperature=0.4, pad_token_id=tokenizer.eos_token_id)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "Assistant:" in response:
            response = response.split("Assistant:")[-1].strip()
        print(f"AI: {response}")
    except KeyboardInterrupt:
        break
    except Exception as e:
        print(f"Error: {e}")
print("\nDone!") |