jeanbaptdzd committed on
Commit
cf73c6e
·
verified ·
1 Parent(s): 0aef7e8

Upload app_config_hf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app_config_hf.py +340 -0
app_config_hf.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Spaces compatible configuration for Dragon-3B model
3
+ No Pydantic dependencies - pure Python dicts
4
+ """
5
+
6
+ import os
7
+ import torch
8
+ import gc
9
+ import logging
10
+ from typing import Dict, Any, Optional
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
12
+ from huggingface_hub import login
13
+
14
# Module-wide logger; handlers and levels are left to the hosting application.
logger = logging.getLogger(__name__)

# Shared inference state. Nothing is loaded at import time, so every handle
# starts out empty; the loader fills these in and the cleanup routine resets
# them.
model = tokenizer = pipe = current_model_name = None
model_loaded = False
22
+
23
# Static description of the Dragon model: hub identifier, tokenizer special
# tokens and default generation parameters.
#
# Per the original notes, the deployment is expected to run with these
# optional performance kernels:
#   - flash-attn: memory-efficient attention computation
#   - flash-linear-attention: Gated DeltaNet Triton kernels
#   - causal-conv1d: short convolution for the Gated DeltaNet layer
#   - attn_implementation="flash_attention_2": flash attention when available
DRAGON_CONFIG = {
    "model_id": "DragonLLM/Dragon-3B-Base-alpha",
    "display_name": "Dragon-3B-Base-alpha",
    "architecture": "DragonForCausalLM",
    # Special tokens; every special-token id listed here is 0.
    "tokenizer": {
        "eos_token": "<|endoftext|>",
        "bos_token": "<|beginoftext|>",
        "pad_token": "<|pad|>",
        "unk_token": "<|unk|>",
        "eos_token_id": 0,
        "bos_token_id": 0,
        "pad_token_id": 0,
        "eot_token_id": 0,
        "vocab_size": 196736,
        "model_max_length": 8192,
    },
    # Default text-generation parameters for this model.
    "generation": {
        "eos_tokens": [0],
        "bos_token_id": 0,
        "temperature": 0.6,
        "top_p": 0.9,
        "max_new_tokens": 150,
        "repetition_penalty": 1.05,
        "no_repeat_ngram_size": 2,
        "early_stopping": False,
        "min_length": 50,
        "do_sample": True,
        "use_cache": True,
        "pad_token_id": 0,
    },
}
60
+
61
def get_app_settings() -> Dict[str, Any]:
    """Return the application settings as a plain dict.

    The Hugging Face token is read from the ``HF_TOKEN_DRAGON`` environment
    variable on every call and is ``None`` when the variable is unset.
    """
    hf_token = os.environ.get("HF_TOKEN_DRAGON")
    settings: Dict[str, Any] = {
        "model_name": "dragon-3b-base-alpha",
        "hf_token_dragon": hf_token,
        "debug": False,
    }
    return settings
68
+
69
def get_model_config(model_name: str) -> Dict[str, Any]:
    """Return the model configuration dict.

    ``model_name`` is accepted but not consulted: the single ``DRAGON_CONFIG``
    mapping is returned for every name. The returned object is the
    module-level dict itself, not a copy.
    """
    config: Dict[str, Any] = DRAGON_CONFIG
    return config
72
+
73
def cleanup_model_memory():
    """Drop the loaded model, tokenizer and pipeline and reset the load state."""
    global model, tokenizer, pipe, model_loaded, current_model_name

    # Rebinding to None releases this module's references to the objects.
    model = None
    tokenizer = None
    pipe = None

    # Collect garbage first, then ask CUDA to release its cached blocks.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model_loaded, current_model_name = False, None
    logger.info("✅ Model memory cleaned")
96
+
97
def load_linguacustodia_model() -> bool:
    """Load the Dragon tokenizer, model and text-generation pipeline.

    The Hugging Face token and model id come from ``get_app_settings`` and
    ``get_model_config``. With CUDA the model is loaded in bfloat16 with
    ``device_map="auto"``; otherwise it is loaded on CPU in float32. The
    results are stored in the module globals ``tokenizer``, ``model`` and
    ``pipe``.

    Returns:
        True if the model is loaded (or was already loaded). False if the
        token is missing or any loading step raised; in the latter case
        ``cleanup_model_memory`` is called to release partial state.
    """
    global model, tokenizer, pipe, model_loaded, current_model_name

    # Function-scope import: only needed for the flash-attn probe below.
    from importlib.util import find_spec

    if model_loaded and model is not None:
        logger.info(f"✅ Model '{current_model_name}' already loaded")
        return True

    settings = get_app_settings()
    hf_token_dragon = settings["hf_token_dragon"]
    model_config = get_model_config(settings["model_name"])
    model_id = model_config["model_id"]

    if not hf_token_dragon:
        logger.error("❌ HF_TOKEN_DRAGON not found in environment")
        return False

    try:
        logger.info(f"🐉 Initializing {model_config['display_name']} model...")
        login(token=hf_token_dragon, add_to_git_credential=False)
        logger.info("✅ Authenticated with HuggingFace")

        cuda_available = torch.cuda.is_available()
        device_name = "CUDA" if cuda_available else "CPU"
        logger.info(f"🚀 Loading {model_id} on {device_name}...")

        # Pick dtype and placement: bfloat16 on GPU, float32 on CPU.
        if cuda_available:
            torch_dtype = torch.bfloat16
            device_map = "auto"  # let accelerate handle device placement
            logger.info(f"⚡ Using CUDA with {torch.cuda.get_device_name(0)} and bfloat16")
        else:
            torch_dtype = torch.float32
            device_map = None  # plain CPU load
            logger.warning("⚠️ CUDA not available, falling back to CPU with float32")

        # Request flash attention only when the package can actually be
        # imported; otherwise let the library choose its default instead of
        # failing the whole load.
        attn_implementation = None
        if cuda_available:
            if find_spec("flash_attn") is not None:
                attn_implementation = "flash_attention_2"
            else:
                logger.warning("⚠️ flash-attn not installed, using default attention implementation")

        # NOTE(review): HF_HOME is normally the parent of the hub cache
        # ($HF_HOME/hub); passing it as cache_dir stores files directly under
        # HF_HOME. Kept as in the original - confirm this is intended.
        hf_home = os.getenv("HF_HOME")
        if hf_home:
            logger.info(f"📁 Using HF_HOME cache: {hf_home}")
        else:
            logger.info("📁 Using default HF cache location")
        cache_dir = hf_home or None

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            token=hf_token_dragon,
            trust_remote_code=True,
            cache_dir=cache_dir,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=hf_token_dragon,
            dtype=torch_dtype,  # `dtype` is used instead of `torch_dtype`
            device_map=device_map,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            cache_dir=cache_dir,
            attn_implementation=attn_implementation,
        )

        # With device_map="auto" the model is already placed, so no device is
        # passed to the pipeline; on CPU the device is pinned explicitly.
        pipeline_kwargs = {
            "model": model,
            "tokenizer": tokenizer,
            "dtype": torch_dtype,
        }
        if device_map != "auto":
            pipeline_kwargs["device"] = -1  # CPU
        pipe = pipeline("text-generation", **pipeline_kwargs)

        model_loaded = True
        current_model_name = model_config["display_name"]
        logger.info(f"✅ Dragon model loaded successfully with {device_name}!")
        return True

    except Exception as e:
        # logger.exception keeps the traceback, which a bare error line loses.
        logger.exception(f"❌ Failed to load model: {e}")
        cleanup_model_memory()
        return False
185
+
186
def _sampling_kwargs(temperature: float) -> Dict[str, Any]:
    """Return the sampling arguments to pass to the pipeline for ``temperature``."""
    # Sampling needs a strictly positive temperature; for anything else fall
    # back to greedy decoding instead of letting generation raise.
    if temperature > 0:
        return {"do_sample": True, "temperature": temperature}
    return {"do_sample": False}


def run_inference(prompt: str, max_new_tokens: int = 150, temperature: float = 0.6) -> Dict[str, Any]:
    """Generate a completion for ``prompt`` with the loaded pipeline.

    Args:
        prompt: Text to complete.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature; values <= 0 select greedy decoding.

    Returns:
        A dict with ``response``, ``model_used``, ``success``,
        ``tokens_generated`` and ``generation_params``. On failure
        ``success`` is False, ``response`` is empty and an ``error`` string
        is included.

    Raises:
        RuntimeError: if the model, pipeline or tokenizer is not loaded.
    """
    global pipe, model, tokenizer, model_loaded, current_model_name

    if not model_loaded or pipe is None or tokenizer is None:
        raise RuntimeError("Model not loaded")

    try:
        logger.info(f"🧪 Generating inference for: '{prompt[:50]}...'")

        settings = get_app_settings()
        generation_config = get_model_config(settings["model_name"])["generation"]

        if hasattr(model, 'generation_config'):
            # Override the stop/length settings with the model-specific config.
            model.generation_config.eos_token_id = generation_config["eos_tokens"]
            model.generation_config.early_stopping = generation_config["early_stopping"]
            model.generation_config.min_length = generation_config["min_length"]

            logger.info(f"🔧 Using model-specific EOS tokens: {generation_config['eos_tokens']}")
            logger.info("🔧 Applied anti-truncation measures")

        # Only the prompt's token count is needed (to size max_length); the
        # pipeline tokenizes the prompt itself.
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        input_length = encoded['input_ids'].shape[1]

        model.eval()

        result = pipe(
            prompt,
            max_new_tokens=max_new_tokens,
            return_full_text=False,
            use_cache=False,
            truncation=True,
            max_length=input_length + max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            **_sampling_kwargs(temperature),
        )

        if not result:
            raise RuntimeError("No response generated")

        response_text = result[0]['generated_text']
        # Count only content tokens, not special tokens the tokenizer may add.
        tokens_generated = len(tokenizer.encode(response_text, add_special_tokens=False))

        generation_params = {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "eos_token_id": generation_config["eos_tokens"],
            "early_stopping": generation_config["early_stopping"],
            "min_length": generation_config["min_length"],
            "repetition_penalty": generation_config["repetition_penalty"],
            "respectful_approach": True,
            "storage_enabled": True,
            "model_specific_config": True
        }

        logger.info(f"✅ Generated {tokens_generated} tokens with RESPECTFUL official config")

        return {
            "response": response_text,
            "model_used": current_model_name,
            "success": True,
            "tokens_generated": tokens_generated,
            "generation_params": generation_params
        }

    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"❌ Inference error: {e}")

        # A block-mask failure gets one retry with a shorter sequence.
        if "block_mask" in str(e):
            logger.warning("🔧 Block mask error detected, trying with adjusted parameters...")
            try:
                result = pipe(
                    prompt,
                    max_new_tokens=min(max_new_tokens, 100),
                    return_full_text=False,
                    use_cache=False,
                    truncation=True,
                    max_length=1024,
                    pad_token_id=tokenizer.eos_token_id,
                    **_sampling_kwargs(temperature),
                )

                if result:
                    response_text = result[0]['generated_text']
                    tokens_generated = len(tokenizer.encode(response_text, add_special_tokens=False))
                    logger.info(f"✅ Generated {tokens_generated} tokens (retry)")
                    return {
                        "response": response_text,
                        "model_used": current_model_name,
                        "success": True,
                        "tokens_generated": tokens_generated,
                        "generation_params": {"retry": True, "reason": "block_mask_fix"}
                    }
            except Exception as retry_error:
                logger.exception(f"❌ Retry inference error: {retry_error}")

        # Either no retry applied or the retry failed: report the first error.
        return {
            "response": "",
            "model_used": current_model_name,
            "success": False,
            "tokens_generated": 0,
            "generation_params": {},
            "error": str(e)
        }
305
+
306
def get_gpu_memory_info() -> Dict[str, Any]:
    """Report memory usage of the current CUDA device.

    Returns:
        ``{"gpu_available": False}`` when CUDA is unavailable. Otherwise a
        dict with the device name and the total, allocated, reserved and
        free memory, each formatted as a string in GB with two decimals.
        If a query raises, ``gpu_available`` is False and ``error`` holds
        the message.
    """
    if not torch.cuda.is_available():
        return {"gpu_available": False}

    bytes_per_gb = 1024**3

    try:
        dev = torch.cuda.current_device()
        name = torch.cuda.get_device_name(dev)

        total_gb = torch.cuda.get_device_properties(dev).total_memory / bytes_per_gb
        allocated_gb = torch.cuda.memory_allocated(dev) / bytes_per_gb
        reserved_gb = torch.cuda.memory_reserved(dev) / bytes_per_gb

        # Approximation: total minus what this process has allocated.
        free_gb = total_gb - allocated_gb

        report: Dict[str, Any] = {"gpu_available": True, "gpu_name": name}
        report["gpu_memory_total"] = f"{total_gb:.2f} GB"
        report["gpu_memory_allocated"] = f"{allocated_gb:.2f} GB"
        report["gpu_memory_reserved"] = f"{reserved_gb:.2f} GB"
        report["gpu_memory_free"] = f"{free_gb:.2f} GB"
        return report
    except Exception as e:
        logger.error(f"Error getting GPU memory info: {e}")
        return {"gpu_available": False, "error": str(e)}