eesfeg committed
Commit 646620f · 1 Parent(s): 2d1bf43
Files changed (1):
  app.py  +24 -81
app.py CHANGED
@@ -10,16 +10,15 @@ import gradio as gr
 
 # =================== CONFIGURATION ===================
 MODEL_ID = "abdelac/Mistral_Test"
-USE_QUANTIZATION = True  # MUST be True for 16GB RAM
+USE_QUANTIZATION = True
 
 # =================== QUANTIZATION SETUP ===================
 if USE_QUANTIZATION:
     bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,  # Critical for memory
-        bnb_4bit_quant_type="nf4",  # 4-bit quantization
-        bnb_4bit_compute_dtype=torch.float16,  # Compute in float16
-        bnb_4bit_use_double_quant=True,  # Extra memory savings
-        llm_int8_enable_fp32_cpu_offload=True  # Offload to CPU if needed
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True,
     )
 else:
     bnb_config = None
@@ -33,7 +32,7 @@ def load_model():
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
-    # Configure model loading based on quantization
+    # Configure model loading
     load_kwargs = {
         "torch_dtype": torch.float16,
         "device_map": "auto",
@@ -42,10 +41,10 @@ def load_model():
 
     if USE_QUANTIZATION:
         load_kwargs["quantization_config"] = bnb_config
-        print("✅ Using 4-bit quantization (~4GB RAM)")
+        print("✅ Using 4-bit quantization")
     else:
         load_kwargs["device_map"] = "cpu"
-        print("⚠️ Using CPU only (slow but safe)")
+        print("⚠️ Using CPU only")
 
     # Load model
     model = AutoModelForCausalLM.from_pretrained(
@@ -53,20 +52,20 @@
         **load_kwargs
     )
 
-    # Set padding token if not present
+    # Set padding token
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
 
-    print("✅ Model loaded successfully!")
+    print("✅ Model loaded!")
     return tokenizer, model
 
-# =================== MEMORY-EFFICIENT GENERATION ===================
+# =================== GENERATION FUNCTION ===================
 def generate_text(prompt, max_tokens=100, temperature=0.7):
     """Generate text with memory constraints"""
     try:
         tokenizer, model = load_model()
 
-        # Tokenize with truncation
+        # Tokenize
         inputs = tokenizer(
             prompt,
             return_tensors="pt",
@@ -74,82 +73,33 @@ def generate_text(prompt, max_tokens=100, temperature=0.7):
             max_length=512
         ).to(model.device)
 
-        # Generate with conservative settings
+        # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=min(max_tokens, 150),  # Cap at 150
+                max_new_tokens=min(max_tokens, 150),
                 temperature=temperature,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
-                repetition_penalty=1.1,  # Prevent repetition
-                no_repeat_ngram_size=2,
-                early_stopping=True
             )
 
-        # Decode
-        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return result
+        return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    except torch.cuda.OutOfMemoryError:
-        return "❌ Out of memory! Try reducing max tokens or using CPU mode."
     except Exception as e:
         return f"❌ Error: {str(e)}"
 
-# =================== SIMPLIFIED INTERFACE ===================
+# =================== SIMPLE INTERFACE ===================
 def create_interface():
-    """Create memory-aware interface"""
-    with gr.Blocks(
-        title="🦅 Mistral Test Demo",
-        theme=gr.themes.Soft()
-    ) as demo:
+    with gr.Blocks(title="Mistral Demo") as demo:
+        gr.Markdown(f"# 🦅 {MODEL_ID}")
 
-        gr.Markdown(f"""
-        # 🦅 Mistral Test Demo
+        prompt = gr.Textbox(label="Prompt", lines=3, value="Hello")
+        max_tokens = gr.Slider(30, 150, value=80, label="Max Tokens")
+        temperature = gr.Slider(0.1, 1.0, value=0.7, label="Temperature")
 
-        **Model:** [{MODEL_ID}](https://huggingface.co/{MODEL_ID})
-        **Mode:** {'4-bit Quantized' if USE_QUANTIZATION else 'CPU'}
+        generate_btn = gr.Button("Generate", variant="primary")
+        output = gr.Textbox(label="Output", lines=6)
 
-        ⚠️ **Note:** Mistral 7B requires quantization to run in 16GB RAM
-        """)
-
-        with gr.Row():
-            prompt = gr.Textbox(
-                label="Prompt",
-                placeholder="Enter your text...",
-                lines=3,
-                value="What is artificial intelligence?"
-            )
-
-        with gr.Row():
-            max_tokens = gr.Slider(
-                30, 150, value=80,  # Reduced max for memory
-                label="Max Tokens",
-                info="Higher values use more memory"
-            )
-            temperature = gr.Slider(
-                0.1, 1.0, value=0.7,
-                label="Temperature"
-            )
-
-        generate_btn = gr.Button("Generate", variant="primary", size="lg")
-
-        output = gr.Textbox(
-            label="Generated Text",
-            lines=8,
-            show_copy_button=True
-        )
-
-        # Memory warning
-        gr.Markdown("""
-        ### 💡 Memory Optimization Tips:
-        1. **Max Tokens ≤ 100** for best results
-        2. **Temperature ~0.7** for balanced output
-        3. If OOM occurs, refresh the page
-        4. Close other tabs/applications
-        """)
-
-        # Connect button
         generate_btn.click(
             fn=generate_text,
             inputs=[prompt, max_tokens, temperature],
@@ -160,17 +110,10 @@ def create_interface():
 
 # =================== MAIN ===================
 if __name__ == "__main__":
-    # Suppress warnings
     warnings.filterwarnings("ignore")
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
-    # Create and launch
     demo = create_interface()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False,
-        quiet=True,
-        debug=False,
-        show_error=True
+        quiet=True
     )