Spaces:
Sleeping
Sleeping
Update inference.py
Browse files- inference.py +44 -33
inference.py
CHANGED
|
@@ -5,10 +5,11 @@ from transformers import AutoTokenizer
|
|
| 5 |
from evo_model import EvoTransformerV22
|
| 6 |
from search_utils import web_search
|
| 7 |
import openai
|
|
|
|
| 8 |
import psutil
|
| 9 |
import platform
|
| 10 |
|
| 11 |
-
# ๐
|
| 12 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 13 |
|
| 14 |
# ๐ฆ Constants
|
|
@@ -17,7 +18,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
|
| 17 |
model = None
|
| 18 |
last_mod_time = 0
|
| 19 |
|
| 20 |
-
# ๐
|
| 21 |
def load_model():
|
| 22 |
global model, last_mod_time
|
| 23 |
try:
|
|
@@ -29,71 +30,81 @@ def load_model():
|
|
| 29 |
last_mod_time = current_mod_time
|
| 30 |
print("โ
Evo model loaded.")
|
| 31 |
except Exception as e:
|
| 32 |
-
print(f"โ Error loading model: {e}")
|
| 33 |
model = None
|
| 34 |
return model
|
| 35 |
|
| 36 |
-
#
|
| 37 |
-
def evo_infer(
|
| 38 |
model = load_model()
|
| 39 |
if model is None:
|
| 40 |
-
return "
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
context = "\n".join(context_blobs)
|
| 45 |
|
| 46 |
-
# โ Format input pairs
|
| 47 |
-
inputs = [f"{question} [SEP] {opt} [CTX] {context}" for opt in [option1, option2]]
|
| 48 |
scores = []
|
| 49 |
-
|
| 50 |
-
for pair in inputs:
|
| 51 |
encoded = tokenizer(pair, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
|
| 52 |
with torch.no_grad():
|
| 53 |
logits = model(encoded["input_ids"])
|
| 54 |
score = torch.sigmoid(logits).item()
|
| 55 |
scores.append(score)
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
#
|
| 63 |
-
def gpt_infer(
|
| 64 |
try:
|
| 65 |
-
|
| 66 |
response = openai.chat.completions.create(
|
| 67 |
model="gpt-3.5-turbo",
|
| 68 |
-
messages=[{"role": "user", "content":
|
| 69 |
temperature=0.7,
|
| 70 |
)
|
| 71 |
return response.choices[0].message.content.strip()
|
| 72 |
except Exception as e:
|
| 73 |
-
return f"โ ๏ธ GPT error
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
# ๐
|
| 76 |
def get_model_config():
|
| 77 |
return {
|
| 78 |
"num_layers": 6,
|
| 79 |
"num_heads": 8,
|
| 80 |
"ffn_dim": 1024,
|
| 81 |
"memory_enabled": True,
|
| 82 |
-
"
|
|
|
|
| 83 |
}
|
| 84 |
|
| 85 |
-
#
|
| 86 |
def get_system_stats():
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
return {
|
| 91 |
"device": "GPU" if torch.cuda.is_available() else "CPU",
|
| 92 |
"cpu_usage_percent": psutil.cpu_percent(),
|
| 93 |
-
"memory_used_gb": round(
|
| 94 |
-
"memory_total_gb": round(
|
| 95 |
-
"gpu_name":
|
| 96 |
-
"gpu_memory_total_gb": round(
|
| 97 |
-
"gpu_memory_used_gb": round(torch.cuda.memory_allocated() / (1024 ** 3), 2) if
|
| 98 |
"platform": platform.platform()
|
| 99 |
}
|
|
|
|
| 5 |
from evo_model import EvoTransformerV22
|
| 6 |
from search_utils import web_search
|
| 7 |
import openai
|
| 8 |
+
import time
|
| 9 |
import psutil
|
| 10 |
import platform
|
| 11 |
|
| 12 |
+
# ๐ Secure API Key
|
| 13 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 14 |
|
| 15 |
# ๐ฆ Constants
|
|
|
|
| 18 |
model = None
|
| 19 |
last_mod_time = 0
|
| 20 |
|
| 21 |
+
# ๐ Reload Evo model if updated
|
| 22 |
def load_model():
|
| 23 |
global model, last_mod_time
|
| 24 |
try:
|
|
|
|
| 30 |
last_mod_time = current_mod_time
|
| 31 |
print("โ
Evo model loaded.")
|
| 32 |
except Exception as e:
|
| 33 |
+
print(f"โ Error loading Evo model: {e}")
|
| 34 |
model = None
|
| 35 |
return model
|
| 36 |
|
| 37 |
+
# ๐ฎ Evo prediction core
def evo_infer(query, options, user_context=""):
    """Score each candidate option for *query* with the Evo model.

    Args:
        query: The user's question.
        options: Sequence of candidate answer strings (typically two).
        user_context: Optional extra context appended to web-search results.

    Returns:
        Tuple of ``(best_option, best_score, reasoning, context_used)`` where
        *reasoning* is a human-readable per-option score comparison. On model
        load failure returns ``("Model Error", 0.0, "Model not available", "")``.
    """
    model = load_model()
    if model is None:
        return "Model Error", 0.0, "Model not available", ""

    # Combine live web-search snippets with any caller-supplied context.
    context_str = "\n".join(web_search(query) + ([user_context] if user_context else []))
    input_pairs = [f"{query} [SEP] {opt} [CTX] {context_str}" for opt in options]

    scores = []
    for pair in input_pairs:
        encoded = tokenizer(pair, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        with torch.no_grad():
            logits = model(encoded["input_ids"])
        score = torch.sigmoid(logits).item()
        scores.append(score)

    # BUG FIX: the original `int(scores[1] > scores[0])` only ever compared
    # the first two options, even though every option was scored above, and
    # the reasoning string hard-coded options[0]/options[1]. Selecting the
    # argmax and joining all per-option scores generalizes to any number of
    # options while producing byte-identical output for the two-option case.
    best_idx = scores.index(max(scores))
    reasoning = " vs ".join(f"{opt}: {s:.3f}" for opt, s in zip(options, scores))
    return (
        options[best_idx],
        max(scores),
        reasoning,
        context_str,
    )
|
| 61 |
|
| 62 |
+
# ๐ฌ GPT-3.5 fallback
def gpt_infer(query, user_context=""):
    """Answer *query* with gpt-3.5-turbo, optionally appending *user_context*.

    Returns the model's reply text, or a formatted error string if the
    OpenAI call fails for any reason.
    """
    try:
        if user_context:
            context_block = f"\n\nContext:\n{user_context}"
        else:
            context_block = ""
        prompt = query + context_block
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
        )
        message = response.choices[0].message
        return message.content.strip()
    except Exception as e:
        return f"โ ๏ธ GPT error:\n{str(e)}"
|
| 74 |
+
|
| 75 |
+
# ๐ง Used by app.py
def evo_chat_predict(history, query, options):
    """Run Evo inference for one chat turn and package the result for the UI.

    At most the last six history entries are joined (newline-separated) into
    the context passed to ``evo_infer``.
    """
    recent_turns = history[-6:] if history else []
    chat_context = "\n".join(recent_turns)
    answer, score, reasoning, used_context = evo_infer(query, options, chat_context)
    return {
        "answer": answer,
        "confidence": round(score, 3),
        "reasoning": reasoning,
        "context_used": used_context,
    }
|
| 85 |
|
| 86 |
+
# ๐ Architecture stats
def get_model_config():
    """Return the static EvoTransformer v2.2 architecture summary."""
    architecture = {
        "num_layers": 6,
        "num_heads": 8,
        "ffn_dim": 1024,
        "memory_enabled": True,
    }
    # Release metadata reported alongside the architecture numbers.
    architecture["phase"] = "v2.2"
    architecture["accuracy"] = "~64.5%"
    return architecture
|
| 96 |
|
| 97 |
+
# ๐ฅ๏ธ System diagnostics
def get_system_stats():
    """Snapshot host CPU/RAM/GPU usage for the diagnostics panel.

    GPU fields fall back to the string "N/A" when CUDA is unavailable.
    """
    has_cuda = torch.cuda.is_available()
    gpu_info = torch.cuda.get_device_properties(0) if has_cuda else None
    memory = psutil.virtual_memory()
    gib = 1024 ** 3  # bytes per GiB

    return {
        "device": "GPU" if has_cuda else "CPU",
        "cpu_usage_percent": psutil.cpu_percent(),
        "memory_used_gb": round(memory.used / gib, 2),
        "memory_total_gb": round(memory.total / gib, 2),
        "gpu_name": gpu_info.name if gpu_info else "N/A",
        "gpu_memory_total_gb": round(gpu_info.total_memory / gib, 2) if gpu_info else "N/A",
        "gpu_memory_used_gb": round(torch.cuda.memory_allocated() / gib, 2) if gpu_info else "N/A",
        "platform": platform.platform(),
    }
|