YAML Metadata Warning: empty or missing YAML metadata in repo card
Check out the documentation for more information.
# --- 1. RE-ESTABLISH ENVIRONMENT BASICS ---
import sys
import subprocess
# Best-effort upgrade of the runtime helper packages before the heavy imports.
# pip's output is captured to keep the console clean, but a failure is
# reported instead of being silently discarded; the script still continues
# with whatever versions are already installed.
print("Restoring pristine library states...")
pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "bitsandbytes", "accelerate"],
    capture_output=True,
    text=True,
)
if pip_result.returncode != 0:
    print(
        f"WARNING: pip exited with status {pip_result.returncode}; "
        f"continuing with the currently installed packages.\n{pip_result.stderr}",
        file=sys.stderr,
    )
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# --- 2. CONFIGURATION ---
MODEL_ID = "Phase-Technologies/Qwen2.5-Coder-1.5B-Instruct-Qubik-Merged"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pick the weight dtype for the detected device.
# The bf16 capability probe is only run when CUDA is actually available:
# on a CPU-only host it may query a non-existent CUDA device (and raise in
# some PyTorch releases). On CPU we fall back to float32 because
# half-precision CPU kernels are slow or missing in some PyTorch releases.
if device == "cuda":
    model_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    model_dtype = torch.float32

print(f"Downloading and loading standalone merged model: {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# device_map="auto" lets the loader decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=model_dtype,
    device_map="auto",
)
# --- 3. DEFINE THE MULTI-HOP REASONING PROMPT ---
# The user question, assembled from fragments joined by single spaces.
sample_prompt = " ".join(
    [
        "What is the revenue multiple implied by the last funding round valuation of the primary",
        "technology portfolio company owned by the lead investor of the ownership group that",
        "executed the highest-valued sports team acquisition in history, relative to the sector median?",
    ]
)

# The system instruction: a role statement followed by three numbered steps.
system_content = " ".join(
    [
        "You are a factual reasoning agent. Required steps:",
        "1. Generate exactly 2 search query objects using domain parameters.",
        "2. Verify cross-references exactly 1 times.",
        "3. Strictly avoid simulations or factual interpolation.",
    ]
)

# Chat transcript: system message first, then the user message.
messages = [
    {"role": role, "content": content}
    for role, content in (("system", system_content), ("user", sample_prompt))
]

# Render the transcript to plain text (ending with the generation prompt),
# then tokenize it as a batch of one and move the tensors to `device`.
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
encoded_batch = tokenizer([input_text], return_tensors="pt")
inputs = encoded_batch.to(device)
# --- 4. GENERATE AND PRINT HIGH-SPEED OUTPUT ---
print("\n" + "=" * 60)
print("RUNNING LIVE INFERENCE FROM STANDALONE MERGED REPOSITORY:")
print(f"Prompt: {sample_prompt}\n")

model.eval()
with torch.no_grad():
    # Greedy decoding (do_sample=False) is deterministic. No `temperature`
    # is passed: it only applies to sampling-based generation, so combining
    # it with do_sample=False has no effect on the output and only triggers
    # a configuration warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        use_cache=True,
        do_sample=False,
    )

# The returned sequences begin with the prompt tokens; slice them off so only
# the newly generated continuation is decoded.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.batch_decode(
    outputs[:, prompt_length:],
    skip_special_tokens=True,
)[0]
print(f"Output Trace Response:\n{response}")
print("=" * 60 + "\n")
- Downloads last month
- 1
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support