Nishan30 committed
Commit c57dc09 · verified · Parent(s): 3ef49c3

Update app

Files changed (1):
  1. app.py (+36 -7)
app.py CHANGED

@@ -17,6 +17,9 @@ import re
 MODEL_REPO = "Nishan30/n8n-workflow-generator"  # Update with your HF repo
 BASE_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
 
+# Memory optimization: Set to True for 8-bit quantization (uses less memory but slower)
+USE_8BIT = False  # Change to True if you get out-of-memory errors
+
 # ==============================================================================
 # MODEL LOADING
 # ==============================================================================
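A note on the new USE_8BIT flag: recent transformers releases deprecate passing load_in_8bit directly to from_pretrained in favor of a BitsAndBytesConfig. A minimal sketch of the equivalent quantized load (assumes the bitsandbytes package and a CUDA device; not part of this commit):

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(load_in_8bit=True)
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    quantization_config=quant_config,  # preferred over a bare load_in_8bit=True
    device_map="auto",
)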
@@ -25,17 +28,35 @@ def load_model():
     """Load model once and cache it"""
     print("Loading model...")
 
-    # Load base model
+    # Prepare model loading kwargs
+    model_kwargs = {
+        "device_map": "auto",
+        "trust_remote_code": True,
+        "low_cpu_mem_usage": True,
+        "offload_folder": "offload",
+        "offload_state_dict": True,
+    }
+
+    # Use 8-bit quantization if enabled (saves memory)
+    if USE_8BIT:
+        print("Using 8-bit quantization for memory efficiency...")
+        model_kwargs["load_in_8bit"] = True
+    else:
+        model_kwargs["torch_dtype"] = torch.float16
+
+    # Load base model with memory optimization
     base_model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
-        torch_dtype=torch.float16,
-        device_map="auto",
-        trust_remote_code=True
+        **model_kwargs
     )
 
     # Load LoRA adapter with error handling for unsupported parameters
     try:
-        model = PeftModel.from_pretrained(base_model, MODEL_REPO)
+        model = PeftModel.from_pretrained(
+            base_model,
+            MODEL_REPO,
+            offload_folder="offload",  # Enable disk offloading for adapter too
+        )
     except TypeError as e:
         if "unexpected keyword argument" in str(e):
             print(f"⚠️ Warning: {e}")
@@ -73,7 +94,11 @@ def load_model():
             continue
 
     # Load from temp directory
-    model = PeftModel.from_pretrained(base_model, temp_dir)
+    model = PeftModel.from_pretrained(
+        base_model,
+        temp_dir,
+        offload_folder="offload"
+    )
 
     # Cleanup
     shutil.rmtree(temp_dir)
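The temp-directory branch is the fallback for adapters saved with config keys the installed peft version does not accept. The filtering itself sits outside this hunk; below is a sketch of how such a fallback can be built (snapshot_download is real huggingface_hub API, but the key filtering is an assumption about the surrounding code, and MODEL_REPO is the module constant defined above):

import inspect, json, os, tempfile
from huggingface_hub import snapshot_download
from peft import LoraConfig

# Keys the locally installed peft actually accepts
known_keys = set(inspect.signature(LoraConfig.__init__).parameters) - {"self"}

temp_dir = tempfile.mkdtemp()
snapshot_download(repo_id=MODEL_REPO, local_dir=temp_dir)

# Rewrite adapter_config.json without unrecognized keys
config_path = os.path.join(temp_dir, "adapter_config.json")
with open(config_path) as f:
    config = json.load(f)
with open(config_path, "w") as f:
    json.dump({k: v for k, v in config.items() if k in known_keys}, f)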
@@ -82,6 +107,10 @@ def load_model():
 
     tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
 
+    # Set pad token if not present
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
     print("Model loaded successfully!")
     return model, tokenizer
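The pad-token fallback guards against tokenizers that ship without one: batched, padded tokenization raises otherwise. With eos reused as pad, uneven batches encode cleanly (illustrative prompts, not taken from the app):

batch = tokenizer(
    ["Create a webhook workflow", "Create a cron-triggered workflow"],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # (2, longest_prompt_len)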
 
@@ -488,4 +517,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False
-    )
+    )
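The launch settings are the usual ones for a Hugging Face Space: bind 0.0.0.0 so the container's port mapping works, port 7860 as Spaces expects, and no share tunnel. If generate() calls are slow, adding a queue before launch serializes requests; a hypothetical variant, with demo being the Gradio app object built in app.py:

demo.queue(max_size=16)  # serialize long generate() calls
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)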
 