Aditi132 commited on
Commit
01dc3a8
·
verified ·
1 Parent(s): e29e481

Upload 2 files

Browse files
Files changed (2) hide show
  1. router_app.py +378 -0
  2. router_tracing.log +27 -0
router_app.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# router_app.py
import json
import logging
import math
import os
import time
from datetime import datetime, timezone
from typing import Optional

import psutil
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
19
+
20
# Configure logging: all traces (including per-request JSON trace records
# emitted by route_query) go to a single rotating-free flat file.
logging.basicConfig(
    filename='router_tracing.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Initialize FastAPI app
app = FastAPI(title="System1/System2 Router", version="1.0")

# Free & Open Models (Apache 2.0 / MIT licensed).
# "System1" = small/fast model for simple queries; "System2" = larger/slower
# model for semantically complex queries (Kahneman-style naming).
SYSTEM1_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1B proxy for distilled Llama3-1B
SYSTEM2_MODEL = "HuggingFaceH4/zephyr-7b-beta"  # 7B proxy for Llama3-8B
33
+
34
def estimate_model_memory(model_id, bits=4, is_system2=False):
    """Estimate the memory footprint of a model (in GB) under quantization.

    Args:
        model_id: Hugging Face model identifier.
        bits: Quantization width; 4 and 8 are handled explicitly, anything
            else is treated as 16-bit.
        is_system2: Selects the hard-coded fallback estimate (7B-class vs
            1.1B-class) when the config-based estimate fails.

    Returns:
        Estimated memory requirement in gigabytes.
    """
    try:
        config = AutoConfig.from_pretrained(model_id)
        # BUGFIX: the original summed config-dict *values* and called
        # .numel() on plain ints, which always raised and silently fell
        # through to the fallback (the log shows the 7B estimate pinned at
        # exactly 6.00GB). Derive an approximate parameter count from the
        # transformer dimensions instead.
        hidden = config.hidden_size
        layers = config.num_hidden_layers
        vocab = config.vocab_size
        intermediate = getattr(config, "intermediate_size", 4 * hidden)
        # Per layer: ~4*h^2 for the attention projections plus ~3*h*ffn for
        # a (gated) MLP; plus the embedding/LM-head matrix. Rough but far
        # closer than the previous always-failing path.
        total_params = layers * (4 * hidden * hidden + 3 * hidden * intermediate) + vocab * hidden

        if bits == 4:
            bytes_per_param = 0.5  # 4-bit = 0.5 bytes
        elif bits == 8:
            bytes_per_param = 1
        else:
            bytes_per_param = 2  # 16-bit weights

        # Add 20% overhead for activations and framework bookkeeping.
        total_memory = total_params * bytes_per_param * 1.2

        # Convert to GB
        return total_memory / (1024 ** 3)
    except Exception as e:  # narrowed from a bare except; keep best-effort fallback
        logging.warning(f"Config-based memory estimate failed for {model_id}: {e}")
        if is_system2:
            return 6.0 if bits == 4 else 14.0  # 7B-class model
        else:
            return 1.2 if bits == 8 else 2.5  # 1.1B-class model
60
+
61
def get_device_map(model_id, bits=4, is_system2=False):
    """Pick a device-placement strategy for a model given local resources.

    Returns one of:
        "cpu"  - no GPU available, or 8-bit model that overflows the GPU;
        "auto" - model fits on the GPU with ~20% headroom (accelerate places it);
        dict   - explicit per-module map that keeps most of a 4-bit model on
                 GPU 0 while offloading a few modules to CPU.
    """
    has_gpu = torch.cuda.is_available()
    system_ram_gb = psutil.virtual_memory().total / (1024 ** 3)  # GB
    estimated_size = estimate_model_memory(model_id, bits, is_system2)

    # Log resource situation before deciding.
    logging.info(f"System memory: {system_ram_gb:.2f}GB, Estimated model size: {estimated_size:.2f}GB")

    if not has_gpu:
        logging.warning("No GPU detected - using CPU only")
        return "cpu"

    gpu_capacity_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)  # GB

    # Fits comfortably (keep a 20% buffer): let accelerate place everything.
    if estimated_size < gpu_capacity_gb * 0.8:
        return "auto"

    logging.warning(f"Model size ({estimated_size:.2f}GB) exceeds GPU capacity ({gpu_capacity_gb:.2f}GB). Using CPU offloading.")

    if bits != 4:
        # 8-bit (or wider) and over capacity: offload the whole model.
        return "cpu"

    # 4-bit: default everything to GPU 0, push a handful of modules to CPU.
    # NOTE(review): module names assume a Llama-style layout — verify per model.
    return {
        "": 0,
        "model.layers.0": "cpu",
        "model.layers.1": "cpu",
        "model.norm": "cpu",
        "lm_head": "cpu"
    }
98
+
99
# Model loading with quantization
def load_quantized_model(model_id, is_system2=False):
    """Load a quantized causal-LM and its tokenizer.

    System2 models are loaded in 4-bit NF4; System1 models in 8-bit.
    On an out-of-memory failure the function retries once with a plain
    (unquantized) CPU load; any other failure is re-raised.

    Returns:
        (tokenizer, model) tuple.

    Raises:
        Exception: whatever from_pretrained raised, if both the primary
        load and (for OOM errors) the CPU fallback fail.
    """
    try:
        # Prefer bf16 compute when the GPU supports it; fp16 otherwise.
        compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
        # System2 -> 4-bit, System1 -> 8-bit; the map decides GPU/CPU split.
        device_map = get_device_map(model_id, bits=4 if is_system2 else 8, is_system2=is_system2)

        # Create quantization config
        if is_system2:
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=True,
                llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading
            )
        else:
            quant_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True,  # Enable CPU offloading
                llm_int8_threshold=6.0
            )

        # Load tokenizer first; some checkpoints ship without a pad token,
        # so reuse EOS as pad to keep generation/padding code happy.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        if not tokenizer.pad_token:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model with device mapping and CPU offloading
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quant_config,
            device_map=device_map,
            offload_folder="offload_folder",
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        logging.info(f"Successfully loaded {model_id} with device_map: {device_map}")
        return tokenizer, model

    except Exception as e:
        logging.error(f"Model load failed for {model_id}: {str(e)}")
        # Fallback to CPU only for OOM-shaped failures; other errors re-raise.
        if "out of memory" in str(e).lower() or "oom" in str(e).lower():
            logging.warning("GPU memory insufficient. Falling back to CPU loading.")
            try:
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                if not tokenizer.pad_token:
                    tokenizer.pad_token = tokenizer.eos_token

                # Unquantized CPU load — slower and larger, but robust.
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    device_map="cpu",
                    trust_remote_code=True
                )
                logging.info(f"Fallback CPU loading succeeded for {model_id}")
                return tokenizer, model
            except Exception as cpu_e:
                logging.error(f"CPU fallback also failed: {str(cpu_e)}")
                raise
        raise
161
+
162
# Load models at startup with better memory management
print("Loading quantized models with memory optimization...")
tokenizer1, model1 = load_quantized_model(SYSTEM1_MODEL)
tokenizer2, model2 = load_quantized_model(SYSTEM2_MODEL, is_system2=True)
print("Models loaded successfully!")

# Pipeline generators.
# BUGFIX: do not pass `device_map` (or `torch_dtype`) when handing an
# already-instantiated model to pipeline(). The models above were already
# dispatched by from_pretrained(device_map=...); re-specifying placement here
# causes transformers to warn/re-dispatch or raise, depending on the version,
# and a torch_dtype is meaningless for an already-quantized model.
system1_pipe = pipeline(
    "text-generation",
    model=model1,
    tokenizer=tokenizer1,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer1.eos_token_id
)

system2_pipe = pipeline(
    "text-generation",
    model=model2,
    tokenizer=tokenizer2,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.8,
    pad_token_id=tokenizer2.eos_token_id
)
191
+
192
# Router components: substring cues that mark a query as "System2-worthy".
COMPLEX_KEYWORDS = {
    'explain', 'why', 'how', 'compare', 'analyze', 'reason',
    'steps', 'detailed', 'difference', 'advantage', 'disadvantage',
    'calculate', 'derive', 'formula', 'math', 'equation'
}

def is_semantically_complex(query: str) -> bool:
    """Rule-based complexity check: keyword hit, length, or comparison cue.

    Note: cues are matched as raw substrings of the lowercased query, not as
    whole words (so e.g. 'show' matches 'how').
    """
    text = query.lower()

    # Any reasoning/analysis keyword routes to System2.
    for cue in COMPLEX_KEYWORDS:
        if cue in text:
            return True

    # Long queries (token count via whitespace split) are treated as complex.
    if len(text.split()) > 15:  # Lowered threshold for better routing
        return True

    # Explicit comparison phrasings.
    comparison_cues = ('vs', 'versus', 'pros and cons', 'advantages and disadvantages', 'compare and contrast')
    return any(cue in text for cue in comparison_cues)
217
+
218
def calculate_entropy(response):
    """Return the mean per-token entropy from a generation's score tensors.

    Accepts either a dict with a "scores" key or a list whose first element
    is such a dict; anything else (or any failure) yields 0. Each score entry
    may be a tuple (logits first), an object with a .logits attribute, or a
    raw tensor of logits.
    """
    try:
        # Normalize the various pipeline output shapes to a list of scores.
        if isinstance(response, dict):
            scores = response.get("scores", [])
        elif isinstance(response, list) and len(response) > 0:
            scores = response[0].get("scores", [])
        else:
            scores = []

        total = 0.0
        count = 0
        for entry in scores:
            # Unwrap whichever container this score entry uses.
            if isinstance(entry, tuple):
                logits = entry[0]
            elif hasattr(entry, 'logits'):
                logits = entry.logits
            else:
                logits = entry

            if isinstance(logits, torch.Tensor):
                probs = torch.softmax(logits, dim=-1)
                # Epsilon guards log(0) for zero-probability tokens.
                total += -torch.sum(probs * torch.log(probs + 1e-10)).item()
                count += 1

        return total / count if count else 0
    except Exception as e:
        logging.warning(f"Entropy calculation failed: {str(e)}")
        return 0  # Fallback if scores unavailable
249
+
250
# Request/Response models
class QueryRequest(BaseModel):
    # Raw user query text (whitespace is stripped by the endpoint).
    text: str

class RouterResponse(BaseModel):
    response: str
    model_used: str
    routing_reason: str
    latency_ms: float
    # BUGFIX: was `entropy: float = None` — a None default on a plain float
    # field is an invalid annotation/default pairing (rejected by type
    # checkers and pydantic v2 strict validation). Declare it Optional.
    entropy: Optional[float] = None
260
+
261
# Core routing logic
@app.post("/query", response_model=RouterResponse)
async def route_query(request: QueryRequest):
    """Route a query to System1 (small/fast) or System2 (large/slow).

    Complex queries (per is_semantically_complex) go straight to System2;
    everything else runs on System1 with its chat template. Every request is
    traced as a JSON line in the log. On any processing error, a canned
    apology response is returned instead of a 5xx.

    Raises:
        HTTPException: 400 when the query is empty after stripping.
    """
    start_time = time.time()
    query = request.text.strip()

    if not query:
        raise HTTPException(status_code=400, detail="Empty query")

    routing_reason = ""
    entropy = 0.0
    used_model = ""
    result_text = ""

    try:
        # Step 1: Semantic complexity check — complex queries skip System1.
        if is_semantically_complex(query):
            routing_reason = "semantic_complexity"
            response = system2_pipe(query, max_new_tokens=150)
            used_model = "system2"

        # Step 2: Simple query path with entropy tracking.
        else:
            # Format for TinyLlama chat template
            formatted_query = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\n{query}</s>\n<|assistant|>\n"

            response = system1_pipe(
                formatted_query,
                max_new_tokens=100,
                return_full_text=False,
                pad_token_id=tokenizer1.eos_token_id
            )

            # BUGFIX: entropy was initialized but never computed — the trace
            # log always showed 0.0. calculate_entropy is a safe no-op
            # (returns 0) when the pipeline output carries no scores.
            entropy = calculate_entropy(response)

            # Extract text response
            if isinstance(response, list) and len(response) > 0:
                generated_text = response[0]['generated_text'].strip()
                # Remove the prompt part if it's included
                result_text = generated_text.replace(formatted_query, "").strip()
            else:
                result_text = str(response).strip()

            used_model = "system1"
            routing_reason = "simple_query"

        # System2 path: strip the echoed prompt from the generation.
        if not result_text and used_model == "system2":
            if isinstance(response, list) and len(response) > 0:
                result_text = response[0]['generated_text'].replace(query, "", 1).strip()

        # Tracing. BUGFIX: datetime.utcnow() is deprecated (naive timestamp);
        # use an aware UTC timestamp instead.
        trace_data = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "query": query,
            "model_used": used_model,
            "routing_reason": routing_reason,
            "entropy": entropy,
            "response": result_text
        }
        logging.info(json.dumps(trace_data))

        # Calculate latency
        latency_ms = (time.time() - start_time) * 1000

        return RouterResponse(
            response=result_text,
            model_used=used_model,
            routing_reason=routing_reason,
            latency_ms=round(latency_ms, 2),
            entropy=round(entropy, 2)
        )

    except Exception as e:
        error_msg = f"Processing error for query '{query[:20]}...': {str(e)}"
        logging.error(error_msg)
        # Fallback to simple response
        return RouterResponse(
            response="I apologize, but I encountered an error processing your request. Please try again with a simpler query.",
            model_used="error_fallback",
            routing_reason="error_recovery",
            latency_ms=round((time.time() - start_time) * 1000, 2),
            entropy=0.0
        )
344
+
345
# Health check endpoint
@app.get("/health")
async def health_check():
    """Report configured models plus device and memory capacity."""
    cuda = torch.cuda.is_available()

    gpu_memory = 0
    if cuda:
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    cpu_memory = psutil.virtual_memory().total / (1024 ** 3)

    return {
        "status": "healthy",
        "system1": SYSTEM1_MODEL,
        "system2": SYSTEM2_MODEL,
        "device": "cuda" if cuda else "cpu",
        "gpu_memory_gb": round(gpu_memory, 2),
        "cpu_memory_gb": round(cpu_memory, 2)
    }
360
+
361
# Warmup endpoint to prepare models
@app.post("/warmup")
async def warmup_models():
    """Run one tiny generation through each pipeline to prime caches/weights."""
    warmup_prompts = (
        (system1_pipe, "Hello, how are you?"),
        (system2_pipe, "What is the capital of France?"),
    )
    try:
        # System1 first, then System2 — same order as the original startup path.
        for pipe, prompt in warmup_prompts:
            pipe(prompt, max_new_tokens=10)
        return {"status": "models warmed up successfully"}
    except Exception as e:
        return {"status": "warmup failed", "error": str(e)}
374
+
375
# Script entry point: run the API with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn
    print("Starting server with memory-optimized configuration...")
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
router_tracing.log ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-22 03:20:10,773 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
2
+ 2026-01-22 03:20:16,268 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
3
+ 2026-01-22 03:21:15,887 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
4
+ 2026-01-22 03:21:19,191 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
5
+ 2026-01-22 03:21:19,213 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
6
+ 2026-01-22 03:21:19,216 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
7
+ 2026-01-22 03:21:19,277 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
8
+ 2026-01-22 03:21:19,278 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
9
+ 2026-01-22 03:21:19,278 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
10
+ 2026-01-22 03:21:19,291 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
11
+ 2026-01-22 03:21:19,316 - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
12
+ 2026-01-22 03:27:18,906 - ERROR - Model load failed for HuggingFaceH4/zephyr-7b-beta: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details.
13
+ 2026-01-22 03:34:20,341 - ERROR - Model load failed for HuggingFaceH4/zephyr-7b-beta: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details.
14
+ 2026-01-22 03:34:41,119 - INFO - System memory: 15.52GB, Estimated model size: 1.20GB
15
+ 2026-01-22 03:34:41,120 - WARNING - No GPU detected - using CPU only
16
+ 2026-01-22 03:34:49,182 - INFO - Successfully loaded TinyLlama/TinyLlama-1.1B-Chat-v1.0 with device_map: cpu
17
+ 2026-01-22 03:34:49,354 - INFO - System memory: 15.52GB, Estimated model size: 6.00GB
18
+ 2026-01-22 03:34:49,354 - WARNING - No GPU detected - using CPU only
19
+ 2026-01-22 03:37:33,010 - INFO - System memory: 15.52GB, Estimated model size: 1.20GB
20
+ 2026-01-22 03:37:33,011 - WARNING - No GPU detected - using CPU only
21
+ 2026-01-22 03:37:48,312 - INFO - Successfully loaded TinyLlama/TinyLlama-1.1B-Chat-v1.0 with device_map: cpu
22
+ 2026-01-22 03:37:48,592 - INFO - System memory: 15.52GB, Estimated model size: 6.00GB
23
+ 2026-01-22 03:37:48,592 - WARNING - No GPU detected - using CPU only
24
+ 2026-01-22 03:54:42,741 - INFO - Successfully loaded HuggingFaceH4/zephyr-7b-beta with device_map: cpu
25
+ 2026-01-22 04:21:00,210 - INFO - {"timestamp": "2026-01-22T04:21:00.209239", "query": "What is 2+2?", "model_used": "system1", "routing_reason": "simple_query", "entropy": 0.0, "response": "2 + 2 = 4"}
26
+ 2026-01-22 05:04:22,917 - INFO - {"timestamp": "2026-01-22T05:04:22.914959", "query": "Compare the architectural differences between transformer and RNN models", "model_used": "system2", "routing_reason": "semantic_complexity", "entropy": 0.0, "response": ", and discuss their potential applications in various industries. Include specific examples of indus..."}
27
+ 2026-01-22 05:48:18,885 - INFO - {"timestamp": "2026-01-22T05:48:18.879276", "query": "Compare the architectural differences between transformer and RNN models", "model_used": "system2", "routing_reason": "semantic_complexity", "entropy": 0.0, "response": "in natural language processing. Provide examples of use cases where each model is more suitable. Add..."}