anaspro committed on
Commit 6d60e00 · 1 Parent(s): 151da18
Files changed (1)
  1. app.py +51 -28
app.py CHANGED
@@ -3,7 +3,7 @@
 import os
 import torch
 import transformers
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import gradio as gr
 import spaces
 
@@ -24,36 +24,59 @@ hf_token = os.getenv("HF_TOKEN")
 
 # Initialize model and tokenizer separately for better control
 print("Loading model and tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(
-    model_path,
-    token=hf_token,
-    trust_remote_code=True
-)
+try:
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_path,
+        token=hf_token,
+        trust_remote_code=True
+    )
 
-model = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    device_map="auto",
-    token=hf_token,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    quantization_config={
-        "load_in_4bit": True,
-        "bnb_4bit_use_double_quant": True,
-        "bnb_4bit_quant_type": "nf4",
-        "bnb_4bit_compute_dtype": torch.bfloat16
-    }
-)
+    # Load model with proper quantization config
+    from transformers import BitsAndBytesConfig
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        quantization_config=bnb_config,
+        device_map="auto",
+        token=hf_token,
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+        low_cpu_mem_usage=True
+    )
 
-# Create pipeline with the loaded model
-pipeline_model = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    device_map="auto"
-)
+    # Create pipeline with the loaded model
+    pipeline_model = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer
+    )
 
-print("Model loaded successfully!")
+    print("Model loaded successfully!")
+
+except Exception as e:
+    print(f"Error loading model: {e}")
+    # Fallback to direct pipeline loading
+    print("Trying alternative loading method...")
+    pipeline_model = pipeline(
+        "text-generation",
+        model=model_path,
+        token=hf_token,
+        trust_remote_code=True,
+        model_kwargs={
+            "torch_dtype": torch.bfloat16,
+            "low_cpu_mem_usage": True,
+        }
+    )
+    tokenizer = pipeline_model.tokenizer
+    print("Model loaded with fallback method!")
+
 
 def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
     """Generate response using the pipeline with messages format"""