Update app.py
app.py
CHANGED
@@ -1,191 +1,21 @@
-
-import json
-from datasets import load_dataset
-import numpy as np
-from tqdm import tqdm
-import time
-import os
-
-# Set your Hugging Face API token (set it as an environment variable)
-HF_API_TOKEN = os.environ.get("HF_TOKEN", "")
-
-# Model and task configuration
-MODELS = {
-    "describeai-gemini": "describeai/gemini",
-    "deepseek-32b": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
-
-}
-TASK = "rte"  # SuperGLUE task: Recognizing Textual Entailment
-
-# Load SuperGLUE dataset
-print("Loading dataset...")
-dataset = load_dataset("super_glue", TASK, trust_remote_code=True)
-print(f"Dataset loaded: {len(dataset['validation'])} validation examples")
-
-def query_hf_api(model_id, inputs, api_token):
-    """Query the Hugging Face Inference API."""
-    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
-    headers = {"Authorization": f"Bearer {api_token}"}
-
-    payload = {
-        "inputs": inputs,
-        "options": {"wait_for_model": True}
-    }
-
-    max_retries = 5
-    for attempt in range(max_retries):
-        response = requests.post(API_URL, headers=headers, json=payload)
-        if response.status_code == 200:
-            return response.json()
-        elif response.status_code == 429:  # Too Many Requests
-            wait_time = 2 ** attempt
-            print(f"Rate limited. Waiting {wait_time} seconds...")
-            time.sleep(wait_time)
-        else:
-            print(f"Error: {response.status_code}, {response.text}")
-            break
-
-    return None
-
-def evaluate_model_with_api(model_name, model_path, dataset, api_token):
-    """Evaluate model using the Hugging Face Inference API."""
-    print(f"\nEvaluating {model_name} on {TASK} using Inference API...")
-
-    predictions = []
-    labels = []
-
-    eval_subset = dataset["validation"]
-    max_samples = min(10, len(eval_subset))  # Limit to 100 samples for API efficiency
-
-    for i in tqdm(range(max_samples), desc=f"Evaluating {model_name}"):
-        example = eval_subset[i]
-        input_text = f"Premise: {example['premise']}\nHypothesis: {example['hypothesis']}"
-
-        result = query_hf_api(model_path, input_text, api_token)
-
-        # Ensure pred is always assigned
-        pred = 0  # Default to 0 in case of an unexpected response
-
-        if result:
-            try:
-                if isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict):
-                    if "label" in result[0]:
-                        pred = 1 if result[0]["label"].lower() in ["entailment", "1", "true"] else 0
-                    elif "score" in result[0]:  # Handling a different API format
-                        scores = [item["score"] for item in result]
-                        pred = 0 if scores[0] > scores[1] else 1
-                else:
-                    pred = 1 if "entailment" in str(result).lower() else 0
-            except Exception as e:
-                print(f"Error parsing result: {e}, {result}")
-
-        predictions.append(pred)
-        labels.append(example["label"])
-
-        time.sleep(0.5)
-
-    correct = sum(1 for p, l in zip(predictions, labels) if p == l)
-    accuracy = correct / len(predictions) if predictions else 0
-
-    results = {
-        "eval_accuracy": accuracy,
-        "num_samples": len(predictions)
-    }
-
-    print(f"Results for {model_name}: Accuracy = {accuracy:.4f}")
-    return results
-
-
-# Ensure API token is set
-if not HF_API_TOKEN:
-    print("Error: HF_API_TOKEN not set. Please set your Hugging Face API token.")
-    exit(1)
-
-results = {}
-for model_name, model_path in MODELS.items():
-    results[model_name] = evaluate_model_with_api(model_name, model_path, dataset, HF_API_TOKEN)
-
-# Compare results
-print("\nComparison of Results:")
-for model_name, eval_results in results.items():
-    print(f"{model_name}: {eval_results['eval_accuracy']:.4f} accuracy on {TASK}")
-
-# Save results
-with open("deepseek_vs_tinyllama_rte_results.json", "w") as f:
-    json.dump(results, f, indent=4)'''
-
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from datasets import load_dataset
-import numpy as np
-from tqdm import tqdm
-
-# Define models
-MODELS = {
-    "describeai-gemini": "describeai/gemini",
-    "deepseek-32b": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
-}
-
-TASK = "rte"  # Recognizing Textual Entailment (RTE) task
-device = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available
-
-# Load dataset
-print("Loading dataset...")
-dataset = load_dataset("super_glue", TASK, trust_remote_code=True)
-print(f"Dataset loaded: {len(dataset['validation'])} validation examples")
-
-def load_model_and_tokenizer(model_name):
-    """Loads model and tokenizer."""
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
-    model.eval()  # Set model to evaluation mode
-    return model, tokenizer
-
-def predict(model, tokenizer, input_texts):
-    """Runs inference on input texts and returns predictions."""
-    inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt").to(device)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    logits = outputs.logits
-    preds = torch.argmax(logits, dim=1).cpu().numpy()  # Convert logits to class predictions
-    return preds
-
-def evaluate_model(model_name, model_path, dataset):
-    """Evaluates a model on the RTE dataset."""
-    print(f"\nEvaluating {model_name} on {TASK}...")
-
-    model, tokenizer = load_model_and_tokenizer(model_path)
-
-    predictions = []
-    labels = []
-
-    eval_subset = dataset["validation"]
-    max_samples = min(5, len(eval_subset))  # Limit to 10 samples for efficiency
-
-    for i in tqdm(range(max_samples), desc=f"Evaluating {model_name}"):
-        example = eval_subset[i]
-        input_text = f"Premise: {example['premise']}\nHypothesis: {example['hypothesis']}"
-
-        pred = predict(model, tokenizer, [input_text])[0]  # Get single prediction
-        predictions.append(pred)
-        labels.append(example["label"])
-
-    accuracy = np.mean(np.array(predictions) == np.array(labels))
-    print(f"Results for {model_name}: Accuracy = {accuracy:.4f}")
-
-
-
-#
-
-for model_name, model_path in MODELS.items():
-    results[model_name] = evaluate_model(model_name, model_path, dataset)
-
-#
-
-with open("direct_model_rte_results.json", "w") as f:
-    json.dump(results, f, indent=4)
-
-
-
-
-
+
+# Load the model and tokenizer
+model_name = "describeai/gemini"  # Replace with the actual Gemini model if available on HF
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+
+# Define input text
+input_text = "Explain the Python function below:\n\ndef add(a, b):\n return a + b"
+
+# Tokenize input
+inputs = tokenizer(input_text, return_tensors="pt")
+
+# Generate response
+with torch.no_grad():
+    output = model.generate(**inputs, max_length=100)
+
+# Decode and print result
+response = tokenizer.decode(output[0], skip_special_tokens=True)
+print("Model Output:", response)
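
For reference, below is a minimal sketch of how the generation flow in the updated app.py could be wrapped for reuse. It is a sketch under stated assumptions, not part of the commit: the helper name explain_code is illustrative, max_new_tokens is used instead of max_length only to bound the generated continuation, and the AutoModelForSeq2SeqLM fallback exists purely in case the describeai/gemini checkpoint turns out to load as an encoder-decoder model rather than a causal LM.

import torch
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "describeai/gemini"
tokenizer = AutoTokenizer.from_pretrained(model_name)
try:
    # Mirror the commit: try the causal-LM head first.
    model = AutoModelForCausalLM.from_pretrained(model_name)
except ValueError:
    # Assumption: fall back to a seq2seq head if the checkpoint is encoder-decoder.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.eval()

def explain_code(snippet: str, max_new_tokens: int = 100) -> str:
    """Illustrative helper (not in the original script): explain a code snippet."""
    prompt = "Explain the Python function below:\n\n" + snippet
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("Model Output:", explain_code("def add(a, b):\n    return a + b"))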