yousefabdallah031 committed on
Commit 23b0f0f · verified · 1 Parent(s): e6bc80a

Update main.py

Files changed (1)
  1. main.py +77 -28
main.py CHANGED
@@ -24,7 +24,7 @@ def load_model_and_tokenizer():
     print(f"Using device: {device}")
 
     try:
-        model_path = "best_model_final"
+        model_path = "./best_model_final"
         tokenizer = AutoTokenizer.from_pretrained(model_path)
         model = AutoModelForSequenceClassification.from_pretrained(model_path)
         model.to(device)
@@ -34,16 +34,80 @@ def load_model_and_tokenizer():
             model.half()
 
         print("Model and tokenizer loaded successfully!")
+
     except Exception as e:
         print(f"Error loading model/tokenizer: {e}")
-        model = None
-        tokenizer = None
+        raise e
 
 def cleanup_gpu_memory():
     if device and device.type == 'cuda':
         torch.cuda.empty_cache()
         gc.collect()
 
+def predict_single(code):
+    try:
+        inputs = tokenizer(
+            code,
+            padding=True,
+            truncation=True,
+            max_length=512,
+            return_tensors="pt"
+        )
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            if device.type == 'cuda':
+                with torch.cuda.amp.autocast():
+                    outputs = model(**inputs)
+            else:
+                outputs = model(**inputs)
+
+        preds = torch.sigmoid(outputs.logits).cpu().numpy()
+        cpu_time, memory_usage = preds[0]
+
+        return {
+            "cpu_time": round(float(cpu_time), 4),
+            "memory_usage": round(float(memory_usage), 4)
+        }
+
+    except Exception as e:
+        print(f"Single prediction error: {e}")
+        return {"cpu_time": 0.0, "memory_usage": 0.0}
+
+def predict_with_chunking(code, chunk_size=400, overlap=50):
+    try:
+        if not code or len(code.strip()) == 0:
+            return {"cpu_time": 0.0, "memory_usage": 0.0}
+
+        tokens = tokenizer.encode(code, add_special_tokens=False)
+        if len(tokens) <= 450:
+            return predict_single(code)
+
+        max_cpu_time = 0.0
+        max_memory_usage = 0.0
+
+        for start in range(0, len(tokens), chunk_size - overlap):
+            end = min(start + chunk_size, len(tokens))
+            chunk_tokens = tokens[start:end]
+            chunk_code = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+
+            if chunk_code.strip():
+                result = predict_single(chunk_code)
+                max_cpu_time = max(max_cpu_time, result["cpu_time"])
+                max_memory_usage = max(max_memory_usage, result["memory_usage"])
+
+            if end >= len(tokens):
+                break
+
+        return {
+            "cpu_time": round(max_cpu_time, 4),
+            "memory_usage": round(max_memory_usage, 4)
+        }
+
+    except Exception as e:
+        print(f"Chunking prediction error: {e}")
+        return {"cpu_time": 0.0, "memory_usage": 0.0}
+
 @app.route("/", methods=['GET'])
 def home():
     return jsonify({
@@ -51,7 +115,8 @@ def home():
         "status": "Model loaded" if model is not None else "Model not loaded",
         "device": str(device) if device else "unknown",
         "endpoints": {
-            "/predict": "POST with JSON body containing 'codes' array"
+            "/predict": "POST with JSON body containing 'codes' array",
+            "/health": "GET server health status"
         }
     })
 
@@ -88,31 +153,13 @@ def predict_batch():
 
     for i in range(0, len(validated_codes), batch_size):
         batch = validated_codes[i:i+batch_size]
-
-        inputs = tokenizer(
-            batch,
-            padding=True,
-            truncation=True,
-            max_length=512,
-            return_tensors="pt"
-        )
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            if device.type == 'cuda':
-                with torch.cuda.amp.autocast():
-                    outputs = model(**inputs)
+        for code in batch:
+            tokens = tokenizer.encode(code, add_special_tokens=False)
+            if len(tokens) > 450:
+                result = predict_with_chunking(code)
             else:
-                outputs = model(**inputs)
-
-        preds = torch.sigmoid(outputs.logits).cpu().numpy()
-
-        for pred in preds:
-            cpu_time, memory_usage = pred
-            results.append({
-                "cpu_time": round(float(cpu_time), 4),
-                "memory_usage": round(float(memory_usage), 4)
-            })
+                result = predict_single(code)
+            results.append(result)
 
     cleanup_gpu_memory()
 
@@ -131,6 +178,8 @@ def health_check():
         "device": str(device) if device else "unknown"
     })
 
+# Load model/tokenizer immediately when app starts (important for Spaces)
 load_model_and_tokenizer()
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=False, threaded=True)
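A note on the new chunking logic: because the loop steps by chunk_size - overlap, consecutive windows advance 350 tokens at a time and share a 50-token overlap, and the last window is clamped to the end of the token list. A standalone sketch of that window arithmetic (the 1000-token input length is hypothetical, chosen only for illustration):

# Window arithmetic used by predict_with_chunking, isolated for illustration.
# The 1000-token input length is hypothetical, not taken from the commit.
chunk_size, overlap, n_tokens = 400, 50, 1000
windows = []
for start in range(0, n_tokens, chunk_size - overlap):  # stride of 350 tokens
    end = min(start + chunk_size, n_tokens)
    windows.append((start, end))
    if end >= n_tokens:  # same early exit as in the committed code
        break
print(windows)  # [(0, 400), (350, 750), (700, 1000)]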
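For reference, a minimal client call against the updated /predict endpoint could look like the sketch below. The host and port follow the app.run line above; the exact JSON envelope that predict_batch wraps around results is not visible in this diff, so the printed shape is an assumption.

import requests

# Hypothetical client; assumes the app is reachable at localhost:7860
# (matching app.run above) and accepts a 'codes' array, per the home() docs.
payload = {"codes": ["for i in range(10):\n    print(i)"]}
resp = requests.post("http://localhost:7860/predict", json=payload, timeout=60)
resp.raise_for_status()
# Each prediction should carry rounded 'cpu_time' and 'memory_usage' floats,
# per results.append(...) in the diff; the outer envelope is assumed.
print(resp.json())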