Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -802,7 +802,100 @@ def create_gaia_app():
|
|
| 802 |
)
|
| 803 |
|
| 804 |
# ===============================
|
| 805 |
-
# TAB 4:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
# ===============================
|
| 807 |
with gr.Tab("ℹ️ Information"):
|
| 808 |
gr.Markdown("""
|
|
@@ -815,6 +908,15 @@ def create_gaia_app():
|
|
| 815 |
- **Web browsing**: Finding and using external information
|
| 816 |
- **Tool use**: Calculator, code execution, etc.
|
| 817 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
## 🎯 How to Use This Space
|
| 819 |
|
| 820 |
### 1. Model Setup
|
|
@@ -832,31 +934,89 @@ def create_gaia_app():
|
|
| 832 |
- Then try "GAIA Test Set" for real benchmark evaluation
|
| 833 |
- Download results in JSONL format for submission
|
| 834 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 835 |
## 📊 Model Recommendations
|
| 836 |
|
| 837 |
-
| Model | Best For | Memory | Speed | Quality |
|
| 838 |
-
|-------|----------|---------|-------|---------|
|
| 839 |
-
| Fast & Light | Quick testing | Low | Fast | Good |
|
| 840 |
-
| Balanced | General use | Medium | Medium | Better |
|
| 841 |
-
| High Quality | Best results | High | Slow | Best |
|
| 842 |
-
| Instruction Following | Complex reasoning | High | Medium | Excellent |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 843 |
|
| 844 |
## 🔗 Resources
|
| 845 |
-
- [GAIA Paper](https://arxiv.org/abs/2311.12983)
|
| 846 |
-
- [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
|
| 847 |
-
- [
|
|
|
|
| 848 |
|
| 849 |
-
##
|
| 850 |
-
Results are saved in GAIA leaderboard format:
|
| 851 |
```json
|
| 852 |
-
{"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[REASONING]"}
|
|
|
|
| 853 |
```
|
| 854 |
|
| 855 |
-
## ⚡ Tips for Best Results
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 860 |
""")
|
| 861 |
|
| 862 |
return app
|
|
|
|
| 802 |
)
|
| 803 |
|
| 804 |
# ===============================
|
| 805 |
+
# TAB 4: FULL BENCHMARK (NEW)
|
| 806 |
+
# ===============================
|
| 807 |
+
with gr.Tab("🏆 Full Benchmark"):
|
| 808 |
+
gr.Markdown("## Official GAIA Leaderboard Benchmark")
|
| 809 |
+
|
| 810 |
+
with gr.Row():
|
| 811 |
+
with gr.Column():
|
| 812 |
+
gr.Markdown(get_leaderboard_info())
|
| 813 |
+
|
| 814 |
+
with gr.Column():
|
| 815 |
+
# Test questions preview
|
| 816 |
+
test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
|
| 817 |
+
test_preview_output = gr.Markdown(
|
| 818 |
+
value="Click above to preview official test questions"
|
| 819 |
+
)
|
| 820 |
+
|
| 821 |
+
# Full benchmark
|
| 822 |
+
gr.Markdown("### 🚀 Run Complete Benchmark")
|
| 823 |
+
gr.Markdown("""
|
| 824 |
+
**Warning**: This will evaluate your model on all ~300 official GAIA test questions.
|
| 825 |
+
This process may take 1-3 hours depending on your model and hardware.
|
| 826 |
+
""")
|
| 827 |
+
|
| 828 |
+
full_benchmark_btn = gr.Button(
|
| 829 |
+
"🏆 Start Full Benchmark (300 Questions)",
|
| 830 |
+
variant="primary",
|
| 831 |
+
size="lg"
|
| 832 |
+
)
|
| 833 |
+
|
| 834 |
+
# Benchmark results
|
| 835 |
+
benchmark_status = gr.Textbox(
|
| 836 |
+
label="📊 Benchmark Status",
|
| 837 |
+
value="Ready to run benchmark",
|
| 838 |
+
interactive=False
|
| 839 |
+
)
|
| 840 |
+
|
| 841 |
+
with gr.Row():
|
| 842 |
+
with gr.Column():
|
| 843 |
+
benchmark_report = gr.Markdown(
|
| 844 |
+
label="📈 Benchmark Report",
|
| 845 |
+
value="Run benchmark to see detailed results"
|
| 846 |
+
)
|
| 847 |
+
|
| 848 |
+
with gr.Column():
|
| 849 |
+
# Download files
|
| 850 |
+
submission_file = gr.File(
|
| 851 |
+
label="💾 Download Submission File (JSONL)",
|
| 852 |
+
visible=False
|
| 853 |
+
)
|
| 854 |
+
|
| 855 |
+
metadata_file = gr.File(
|
| 856 |
+
label="📋 Download Metadata File",
|
| 857 |
+
visible=False
|
| 858 |
+
)
|
| 859 |
+
|
| 860 |
+
gr.Markdown("""
|
| 861 |
+
### 📤 Leaderboard Submission
|
| 862 |
+
1. Download the JSONL file above
|
| 863 |
+
2. Visit [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
|
| 864 |
+
3. Upload your submission file
|
| 865 |
+
4. View your model's ranking!
|
| 866 |
+
""")
|
| 867 |
+
|
| 868 |
+
# Event handlers
|
| 869 |
+
test_preview_btn.click(
|
| 870 |
+
fn=load_test_questions_interface,
|
| 871 |
+
outputs=[test_preview_output]
|
| 872 |
+
)
|
| 873 |
+
|
| 874 |
+
def full_benchmark_with_files(*args):
    """Run the full leaderboard benchmark and reveal the download widgets.

    Forwards ``*args`` unchanged to ``run_leaderboard_benchmark_interface``
    and unpacks its four results: status, report, submission file and
    metadata file. The click wiring below passes no ``inputs``, so the
    handler is invoked without arguments.

    Returns:
        A 4-tuple in the same order as the ``outputs`` list of
        ``full_benchmark_btn.click``: the status value, the report value,
        and one combined update per file component that sets its value
        and makes it visible.
    """
    status, report, sub_file, meta_file = run_leaderboard_benchmark_interface(*args)
    # Each file component appears once in `outputs`, so its value and its
    # visibility travel together in a single update instead of two.
    return (
        status,
        report,
        gr.update(value=sub_file, visible=True),
        gr.update(value=meta_file, visible=True),
    )

full_benchmark_btn.click(
    fn=full_benchmark_with_files,
    outputs=[
        benchmark_status,
        benchmark_report,
        submission_file,
        metadata_file,
    ],
)
|
| 896 |
+
|
| 897 |
+
# ===============================
|
| 898 |
+
# TAB 5: INFORMATION (UPDATED)
|
| 899 |
# ===============================
|
| 900 |
with gr.Tab("ℹ️ Information"):
|
| 901 |
gr.Markdown("""
|
|
|
|
| 908 |
- **Web browsing**: Finding and using external information
|
| 909 |
- **Tool use**: Calculator, code execution, etc.
|
| 910 |
|
| 911 |
+
## 🏆 GAIA Public Leaderboard
|
| 912 |
+
GAIA provides a **public leaderboard hosted on Hugging Face** where you can:
|
| 913 |
+
- Test your models against **300 official testing questions**
|
| 914 |
+
- Compare performance with state-of-the-art systems
|
| 915 |
+
- Track progress in AI reasoning capabilities
|
| 916 |
+
- Contribute to research community benchmarks
|
| 917 |
+
|
| 918 |
+
**Leaderboard URL**: [https://huggingface.co/spaces/gaia-benchmark/leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
|
| 919 |
+
|
| 920 |
## 🎯 How to Use This Space
|
| 921 |
|
| 922 |
### 1. Model Setup
|
|
|
|
| 934 |
- Then try "GAIA Test Set" for real benchmark evaluation
|
| 935 |
- Download results in JSONL format for submission
|
| 936 |
|
| 937 |
+
### 4. Full Benchmark (NEW!)
|
| 938 |
+
- Run complete evaluation on all 300 official test questions
|
| 939 |
+
- Get leaderboard-ready submission files
|
| 940 |
+
- Upload directly to GAIA leaderboard for ranking
|
| 941 |
+
|
| 942 |
## 📊 Model Recommendations
|
| 943 |
|
| 944 |
+
| Model | Best For | Memory | Speed | Quality | Leaderboard Ready |
|
| 945 |
+
|-------|----------|---------|-------|---------|------------------|
|
| 946 |
+
| Fast & Light | Quick testing | Low | Fast | Good | ✅ |
|
| 947 |
+
| Balanced | General use | Medium | Medium | Better | ✅ |
|
| 948 |
+
| High Quality | Best results | High | Slow | Best | ✅ |
|
| 949 |
+
| Instruction Following | Complex reasoning | High | Medium | Excellent | ✅ |
|
| 950 |
+
|
| 951 |
+
## 🏅 Benchmark Performance Expectations
|
| 952 |
+
|
| 953 |
+
Based on current leaderboard standings, expect these performance ranges:
|
| 954 |
+
|
| 955 |
+
| Difficulty Level | Top Models | Good Models | Baseline Models |
|
| 956 |
+
|------------------|------------|-------------|-----------------|
|
| 957 |
+
| **Level 1** (Basic) | 85-95% | 70-85% | 50-70% |
|
| 958 |
+
| **Level 2** (Intermediate) | 65-80% | 45-65% | 25-45% |
|
| 959 |
+
| **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
|
| 960 |
+
| **Overall Average** | 65-75% | 45-65% | 30-45% |
|
| 961 |
+
|
| 962 |
+
## 🚀 Continuous Benchmarking Workflow
|
| 963 |
+
|
| 964 |
+
1. **Development**: Test with sample questions
|
| 965 |
+
2. **Validation**: Run batch evaluation (10-50 questions)
|
| 966 |
+
3. **Benchmarking**: Full evaluation (300 questions)
|
| 967 |
+
4. **Submission**: Upload to leaderboard
|
| 968 |
+
5. **Analysis**: Compare with other models
|
| 969 |
+
6. **Iteration**: Improve and re-benchmark
|
| 970 |
|
| 971 |
## 🔗 Resources
|
| 972 |
+
- [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
|
| 973 |
+
- [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
|
| 974 |
+
- [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Validation and test data
|
| 975 |
+
- [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
|
| 976 |
|
| 977 |
+
## 📋 Submission Format
|
| 978 |
+
Results are saved in official GAIA leaderboard format:
|
| 979 |
```json
|
| 980 |
+
{"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
|
| 981 |
+
{"task_id": "gaia_002", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
|
| 982 |
```
|
| 983 |
|
| 984 |
+
## ⚡ Pro Tips for Best Results
|
| 985 |
+
|
| 986 |
+
### Performance Optimization
|
| 987 |
+
1. **Start Small**: Always test with sample questions first
|
| 988 |
+
2. **Choose Wisely**: Balance speed vs quality based on your goals
|
| 989 |
+
3. **Monitor Resources**: Use GPU acceleration for larger models
|
| 990 |
+
4. **Validate Format**: Ensure JSONL files are properly formatted
|
| 991 |
+
|
| 992 |
+
### Leaderboard Strategy
|
| 993 |
+
1. **Baseline First**: Get initial results with fast model
|
| 994 |
+
2. **Iterate Quickly**: Test improvements on small batches
|
| 995 |
+
3. **Full Benchmark**: Run complete evaluation when ready
|
| 996 |
+
4. **Compare Results**: Analyze performance across difficulty levels
|
| 997 |
+
5. **Document Approach**: Include model details and methodology
|
| 998 |
+
|
| 999 |
+
### Common Pitfalls to Avoid
|
| 1000 |
+
- Don't run full benchmark on untested models
|
| 1001 |
+
- Ensure stable internet connection for long evaluations
|
| 1002 |
+
- Verify submission file format before uploading
|
| 1003 |
+
- Check GPU memory usage for large models
|
| 1004 |
+
- Save intermediate results during long runs
|
| 1005 |
+
|
| 1006 |
+
## 🎯 Getting Started Checklist
|
| 1007 |
+
|
| 1008 |
+
- [ ] Load and test a model in "Model Setup"
|
| 1009 |
+
- [ ] Try example questions in "Single Question"
|
| 1010 |
+
- [ ] Run small batch in "Batch Evaluation"
|
| 1011 |
+
- [ ] Review test questions in "Full Benchmark"
|
| 1012 |
+
- [ ] Run complete benchmark when ready
|
| 1013 |
+
- [ ] Download submission files
|
| 1014 |
+
- [ ] Upload to GAIA leaderboard
|
| 1015 |
+
- [ ] Compare your results with others!
|
| 1016 |
+
|
| 1017 |
+
---
|
| 1018 |
+
|
| 1019 |
+
**Ready to start benchmarking?** Begin with the Model Setup tab and work your way through each stage. Good luck! 🚀
|
| 1020 |
""")
|
| 1021 |
|
| 1022 |
return app
|