Ilke Ileri committed on
Commit c46fe44 · 1 Parent(s): 190133f

ULTRA SPEED: 8-bit quantization, greedy decoding, 40 tokens, inference_mode

Files changed (3)
  1. __pycache__/app.cpython-313.pyc +0 -0
  2. app.py +20 -12
  3. requirements.txt +1 -0
__pycache__/app.cpython-313.pyc ADDED
Binary file (6.24 kB).
 
app.py CHANGED
```diff
@@ -27,10 +27,10 @@ BASE_MODEL = "google/gemma-1.1-2b-it"
 print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=HF_TOKEN)
 
-print("Loading base model...")
+print("Loading base model with 8-bit quantization for speed...")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
-    torch_dtype=torch.float16,
+    load_in_8bit=True,  # 8-bit quantization for 2-3x speedup
     low_cpu_mem_usage=True,
     trust_remote_code=True,
     token=HF_TOKEN,
```
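Two caveats on this hunk, neither from the commit itself: recent transformers releases deprecate the bare `load_in_8bit=True` kwarg in favor of an explicit `BitsAndBytesConfig`, and LLM.int8() primarily saves memory; at batch size 1 it can decode slower than fp16, so the "2-3x speedup" comment is best verified with a timer. A minimal sketch of the config-based load, assuming transformers >= 4.30 with accelerate installed:

```python
# Sketch (assumes transformers >= 4.30 and accelerate installed): an explicit
# quantization_config replaces the deprecated load_in_8bit kwarg.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,  # "google/gemma-1.1-2b-it", as defined earlier in app.py
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",  # 8-bit loading needs a device map (handled by accelerate)
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    token=HF_TOKEN,
)
```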
```diff
@@ -41,6 +41,13 @@ print("Loading LoRA adapters...")
 model = PeftModel.from_pretrained(base_model, MODEL_NAME, token=HF_TOKEN)
 model.eval()
 
+# Enable torch.compile for faster inference (if available)
+try:
+    model = torch.compile(model, mode="reduce-overhead")
+    print("Torch compile enabled for faster inference")
+except Exception as e:
+    print(f"Torch compile not available: {e}")
+
 # Determine the device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
```
```diff
@@ -129,18 +136,19 @@ def chat_completions():
 import time
 start_time = time.time()
 
-outputs = model.generate(
-    **inputs,
-    max_new_tokens=80,  # Balanced: fast but complete (reduced from 150)
-    temperature=0.6,  # Slightly lower for faster convergence
-    do_sample=True,
-    top_p=0.85,  # Reduced for faster sampling
-    pad_token_id=tokenizer.pad_token_id,
-    eos_token_id=tokenizer.eos_token_id
-)
+# Ultra-fast generation settings for <1s response
+with torch.inference_mode():  # Disable gradient computation for speed
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=40,  # Minimal tokens for ultra-fast response
+        do_sample=False,  # Greedy decoding (fastest)
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        use_cache=True  # Enable KV cache for faster generation
+    )
 
 elapsed = time.time() - start_time
-print(f"Response generated in {elapsed:.2f}s")
+print(f"Response generated in {elapsed:.2f}s")
 
 full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
```
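One detail this hunk gets right: with `do_sample=False`, `temperature` and `top_p` are ignored anyway, so dropping them is correct rather than merely faster. To back the "<1s" claim with numbers, a small measurement sketch (not in the commit), assuming `inputs` is the tokenizer output dict used above:

```python
# Throughput sketch (hypothetical): report tokens/second, not just wall time,
# since max_new_tokens=40 caps latency but says nothing about speed per token.
import time

start_time = time.time()
with torch.inference_mode():
    outputs = model.generate(
        **inputs,
        max_new_tokens=40,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
elapsed = time.time() - start_time
new_tokens = outputs.shape[-1] - inputs["input_ids"].shape[-1]
print(f"Generated {new_tokens} tokens in {elapsed:.2f}s ({new_tokens / elapsed:.1f} tok/s)")
```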
 
 
requirements.txt CHANGED
```diff
@@ -7,3 +7,4 @@ torch>=2.0.0
 accelerate>=0.25.0
 sentencepiece>=0.1.99
 protobuf>=3.20.0
+bitsandbytes>=0.41.0
```
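Worth noting, though not in the commit: bitsandbytes 8-bit kernels require a CUDA GPU at this version, so `load_in_8bit=True` will fail on a CPU-only Space. A sketch of a startup fallback, using the names already defined in app.py:

```python
# Fallback sketch (hypothetical): quantize only when a CUDA device exists,
# since bitsandbytes 8-bit kernels need one; otherwise load unquantized fp32.
import torch
from transformers import AutoModelForCausalLM

load_kwargs = dict(low_cpu_mem_usage=True, trust_remote_code=True, token=HF_TOKEN)
if torch.cuda.is_available():
    load_kwargs["load_in_8bit"] = True  # GPU path: 8-bit quantized
else:
    load_kwargs["torch_dtype"] = torch.float32  # CPU path: unquantized
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **load_kwargs)
```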