FATIMA_AMOS / app.py
casamN's picture
Create app.py
607e918 verified
# ==========================================================
# Amos' Experiment: Qwen2-1.5B Model Evaluation App
# Hugging Face Spaces Version
# ==========================================================
import torch
import gradio as gr
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
# -----------------------------
# 1. Load model
# -----------------------------
MODEL_NAME = "Qwen/Qwen2-1.5B"

print("Loading model...")

# Probe for CUDA once; both the target device and the weight precision
# are derived from this single answer.
_cuda_ready = torch.cuda.is_available()
device = "cuda" if _cuda_ready else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision is requested only when a GPU is present; otherwise the
# weights are loaded as float32.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if _cuda_ready else torch.float32,
)
model.to(device)

print("Model loaded successfully.")
# -----------------------------
# 2. Prompts
# -----------------------------
# Evaluation set: each entry is a (prompt, expected_answer) pair. The prompt
# is sent to the model verbatim; the expected answer is the reference string
# stored next to the model output in the exported dataset.
prompts = [
    # "No birds can fly" is a false claim, so "is it false that ...?" -> Yes.
    ("Is it false that no birds can fly?", "Yes"),
    ("All roses are flowers. Some flowers fade quickly. Does it follow that some roses fade quickly?", "Not necessarily."),
    ("If yesterday was tomorrow, what day would today be?", "Impossible or undefined."),
    ("What is 8 + 2 × 5?", "18"),
    ("A train travels 60 km in 1 hour. How far in 2.5 hours?", "150 km"),
    ("John told Mark that he failed the exam. Who failed?", "Ambiguous / cannot determine."),
    ("If the Earth had two moons, would tides necessarily be identical to today?", "No"),
    ("What is 0 divided by 5?", "0"),
    ("Reverse the string: reliability", "ytilibailer"),
    ("Answer with only one word: Is water wet?", "Yes"),
    ("Answer with exactly the number and nothing else: What is 12 squared?", "144"),
    ("Which statement is true?\nA) All mammals can fly.\nB) Some mammals cannot fly.\nAnswer with only A or B.", "B"),
    ("If I say that I am lying right now, is that statement true or false?", "Paradox / cannot determine."),
    # 5 × 5 = 25, 4 ÷ 2 = 2, so 25 - 2 + 7 = 30.
    ("Compute: 5 × (3 + 2) - 4 ÷ 2 + 7", "30"),
    ("If all bloops are razzies and some razzies are zogs, can a bloop be a zog?", "Yes, possibly"),
    ("Translate 'chien' from French to English and answer with only one word.", "dog"),
    ("Reverse the string 'algorithm' and answer in uppercase letters only.", "MHTIROGLA"),
    ("Alex told Sam that he was late. Who was late?", "Ambiguous / cannot determine."),
    ("If humans could breathe underwater, would fish still exist?", "Yes"),
    ("Answer with only the result: (10 + 2) × 3 ÷ 2 - 4", "14"),
    # "Some cats are not mammals" is a false claim, so the answer is Yes.
    ("Is it false that some cats are not mammals?", "Yes"),
    ("Consider the statement: 'This statement is false.' Is it true or false?", "Paradox / cannot determine."),
    # At 10 AM the first bus is 50 km ahead; the gap closes at 10 km/h,
    # so the second bus draws level 5 hours later, at 3 PM (300 km each).
    ("A bus leaves at 9 AM at 50 km/h. Another leaves at 10 AM at 60 km/h. When will the second bus catch up?", "3 PM"),
    ("Which color is a primary color?\nA) Green\nB) Red\nC) Purple\nAnswer with only the letter.", "B"),
]
# -----------------------------
# 3. Generate function
# -----------------------------
def generate(prompt, max_new_tokens=100):
    """Run the loaded model on a single prompt and return the decoded text.

    Args:
        prompt: Text passed to the tokenizer as-is.
        max_new_tokens: Forwarded to ``model.generate`` as the cap on newly
            generated tokens.

    Returns:
        The first generated sequence decoded to a string with special
        tokens stripped.
    """
    # Tokenize to PyTorch tensors and move them to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(device)

    # Sampling is disabled so repeated runs use the same decoding settings.
    generated = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # NOTE(review): for decoder-only models the returned sequence normally
    # starts with the prompt tokens, so the decoded text likely echoes the
    # prompt — confirm this is the intended "model output".
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
# -----------------------------
# 4. Run Evaluation
# -----------------------------
def run_evaluation():
    """Run every prompt through the model and export the results as JSON.

    Returns:
        A ``(rows, path)`` pair. ``rows`` is a list of
        ``[prompt, expected, model_output]`` lists and ``path`` is the name
        of the JSON file that was written.
    """
    # One row per prompt, in the same order as the prompt list.
    rows = [
        [question, reference, generate(question)]
        for question, reference in prompts
    ]

    # Save JSON: one object per row, keyed in column order.
    field_names = ("input", "expected_output", "model_output")
    records = [dict(zip(field_names, row)) for row in rows]

    export_path = "qwen2_failure_dataset.json"
    with open(export_path, "w") as handle:
        json.dump(records, handle, indent=4)

    return rows, export_path
# -----------------------------
# 5. Gradio UI
# -----------------------------
# Page layout: two Markdown headers, a trigger button, a results table and
# a file download slot, created in that order.
with gr.Blocks(title="Amos Qwen2 Evaluation App") as demo:
    gr.Markdown("# Amos' Qwen2-1.5B Evaluation Experiment")
    gr.Markdown("Run structured logical and reasoning tests on Qwen2-1.5B and export dataset.")

    run_button = gr.Button("Run Full Evaluation")

    # Three string columns, matching the rows returned by run_evaluation.
    output_table = gr.Dataframe(
        headers=["Prompt", "Expected Output", "Model Output"],
        datatype=["str"] * 3,
    )

    file_output = gr.File(label="Download JSON Dataset")

    # The click handler takes no inputs; its two return values fill the
    # table and the download slot respectively.
    run_button.click(fn=run_evaluation, outputs=[output_table, file_output])

demo.launch()