Spaces:
Sleeping
Sleeping
| # ========================================================== | |
| # Amos' Experiment: Qwen2-1.5B Model Evaluation App | |
| # Hugging Face Spaces Version | |
| # ========================================================== | |
| import torch | |
| import gradio as gr | |
| import json | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
# -----------------------------
# 1. Load model
# -----------------------------
MODEL_NAME = "Qwen/Qwen2-1.5B"

print("Loading model...")

# Pick the device first, then derive the dtype from it:
# fp16 halves memory on GPU; CPU stays fp32 (fp16 is slow/unsupported there).
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=dtype)
model.to(device)

print("Model loaded successfully.")
# -----------------------------
# 2. Prompts
# -----------------------------
# Each entry is (prompt, expected ground-truth answer). The expected answers
# are the reference keys exported in the JSON dataset, so they must be correct.
prompts = [
    # NOTE(review): "no birds can fly" is false, so "Is it false that ..."
    # arguably expects "Yes" — confirm the intended reading before relying on
    # this key; left as authored.
    ("Is it false that no birds can fly?", "No"),
    ("All roses are flowers. Some flowers fade quickly. Does it follow that some roses fade quickly?", "Not necessarily."),
    ("If yesterday was tomorrow, what day would today be?", "Impossible or undefined."),
    ("What is 8 + 2 × 5?", "18"),
    ("A train travels 60 km in 1 hour. How far in 2.5 hours?", "150 km"),
    ("John told Mark that he failed the exam. Who failed?", "Ambiguous / cannot determine."),
    ("If the Earth had two moons, would tides necessarily be identical to today?", "No"),
    ("What is 0 divided by 5?", "0"),
    ("Reverse the string: reliability", "ytilibailer"),
    ("Answer with only one word: Is water wet?", "Yes"),
    ("Answer with exactly the number and nothing else: What is 12 squared?", "144"),
    ("Which statement is true?\nA) All mammals can fly.\nB) Some mammals cannot fly.\nAnswer with only A or B.", "B"),
    ("If I say that I am lying right now, is that statement true or false?", "Paradox / cannot determine."),
    # Fixed ground truth: 5 × (3 + 2) − 4 ÷ 2 + 7 = 25 − 2 + 7 = 30 (was "28").
    ("Compute: 5 × (3 + 2) - 4 ÷ 2 + 7", "30"),
    ("If all bloops are razzies and some razzies are zogs, can a bloop be a zog?", "Yes, possibly"),
    ("Translate 'chien' from French to English and answer with only one word.", "dog"),
    ("Reverse the string 'algorithm' and answer in uppercase letters only.", "MHTIROGLA"),
    ("Alex told Sam that he was late. Who was late?", "Ambiguous / cannot determine."),
    ("If humans could breathe underwater, would fish still exist?", "Yes"),
    ("Answer with only the result: (10 + 2) × 3 ÷ 2 - 4", "14"),
    # NOTE(review): "some cats are not mammals" is false, so "Is it false
    # that ..." arguably expects "Yes" — confirm intent; left as authored.
    ("Is it false that some cats are not mammals?", "No"),
    ("Consider the statement: 'This statement is false.' Is it true or false?", "Paradox / cannot determine."),
    # Fixed ground truth: at 10 AM the first bus leads by 50 km; closing speed
    # is 60 − 50 = 10 km/h, so catch-up takes 5 h → 3 PM (was "11 AM").
    ("A bus leaves at 9 AM at 50 km/h. Another leaves at 10 AM at 60 km/h. When will the second bus catch up?", "3 PM"),
    ("Which color is a primary color?\nA) Green\nB) Red\nC) Purple\nAnswer with only the letter.", "B")
]
# -----------------------------
# 3. Generate function
# -----------------------------
def generate(prompt, max_new_tokens=100):
    """Greedy-decode a continuation for *prompt* and return only the new text.

    Args:
        prompt: Input text passed verbatim to the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The model's continuation as a string. The prompt tokens are sliced
        off before decoding — previously the full sequence was decoded, so
        the exported "model_output" field always echoed the prompt.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # inference_mode: skip autograd bookkeeping; outputs are unchanged.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            # Explicit pad token silences the "pad_token_id not set" warning.
            pad_token_id=tokenizer.eos_token_id,
        )
    # Keep only the newly generated tokens (everything past the prompt).
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
# -----------------------------
# 4. Run Evaluation
# -----------------------------
def run_evaluation():
    """Run every prompt through the model and export the results as JSON.

    Returns:
        (results, path): ``results`` is a list of
        ``[prompt, expected_output, model_output]`` rows for the Gradio
        table; ``path`` is the JSON dataset file written to disk.
    """
    results = []
    records = []
    for prompt, expected in prompts:
        output = generate(prompt)
        results.append([prompt, expected, output])
        records.append({
            "input": prompt,
            "expected_output": expected,
            "model_output": output,
        })

    out_path = "qwen2_failure_dataset.json"
    # utf-8 + ensure_ascii=False: the prompts contain non-ASCII characters
    # (×, ÷); the locale-default codec could fail or escape them unreadably.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

    return results, out_path
# -----------------------------
# 5. Gradio UI
# -----------------------------
# One button kicks off the full evaluation; results land in a three-column
# table and the exported JSON file is offered for download.
with gr.Blocks(title="Amos Qwen2 Evaluation App") as demo:
    gr.Markdown("# Amos' Qwen2-1.5B Evaluation Experiment")
    gr.Markdown("Run structured logical and reasoning tests on Qwen2-1.5B and export dataset.")

    eval_btn = gr.Button("Run Full Evaluation")
    results_table = gr.Dataframe(
        headers=["Prompt", "Expected Output", "Model Output"],
        datatype=["str", "str", "str"],
    )
    dataset_file = gr.File(label="Download JSON Dataset")

    # run_evaluation returns (rows, json_path), matching the two outputs.
    eval_btn.click(fn=run_evaluation, outputs=[results_table, dataset_file])

demo.launch()