druvx13 committed
Commit 9b561b2 · verified · 1 Parent(s): 980a21a

Update app.py

Files changed (1):
  1. app.py +14 -15
app.py CHANGED
@@ -1,10 +1,10 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import os
 
 # 🔧 CPU Optimization Suite
-os.environ["OMP_NUM_THREADS"] = "4"  # Match your physical core count
+os.environ["OMP_NUM_THREADS"] = "4"
 os.environ["MKL_NUM_THREADS"] = "4"
 torch.set_num_threads(4)
 torch.manual_seed(42)
@@ -13,38 +13,37 @@ torch.manual_seed(42)
 MODEL_NAME = "openai-community/openai-gpt"
 cache_dir = "./model_cache"
 
-# 🧠 Load Model with Surgical Precision
+# 🧠 Load Tokenizer with Padding Fix
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_NAME,
     cache_dir=cache_dir,
     padding_side="left"
 )
 
+# ✅ Add pad_token if missing (required for batched generation)
+if tokenizer.pad_token is None:
+    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+    tokenizer.pad_token = tokenizer.eos_token  # Fallback to EOS as pad
+
+# 🧠 Load Model with CPU-specific settings
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
-    torch_dtype=torch.float32,  # FP32 for CPU stability
+    torch_dtype=torch.float32,
     low_cpu_mem_usage=True,
     cache_dir=cache_dir
 ).eval()
 
-# 🚀 Create CPU-Optimized Pipeline
-text_generator = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    device=-1  # Explicit CPU usage
-)
-
 def generate_response(prompt, max_new_tokens=128, temperature=0.7, top_p=0.9, num_sequences=1):
     """Optimized for 18GB CPU with strict memory control"""
     try:
-        # 🛡️ Input Protection
+        # 🛡️ Input Protection with explicit padding
         inputs = tokenizer(
             prompt,
             return_tensors="pt",
             truncation=True,
             max_length=512,
-            padding="max_length"
+            padding="max_length",
+            pad_to_multiple_of=8
         )
 
         with torch.inference_mode():
@@ -55,7 +54,7 @@ def generate_response(prompt, max_new_tokens=128, temperature=0.7, top_p=0.9, num_sequences=1):
             top_p=float(top_p),
             do_sample=True,
             num_return_sequences=int(num_sequences),
-            pad_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
60