| """ | |
| Integration test for AgLLM models. | |
| Tests each model by calling initialize_qa_chain() from app.py directly, | |
| exercising the same code path as the Gradio UI. | |
| Usage: | |
| python test_app_models.py # Test all models | |
| python test_app_models.py --model "GPT-4" # Test a single model | |
| python test_app_models.py --model "Gemini-2.5 Pro" | |
| Exit code 0 = all tested models passed, 1 = any failure. | |
| Models with missing API keys are skipped (not failures). | |
| """ | |
import os
import sys
import json
import time
import argparse
# Ensure working directory is project root (app.py uses relative paths)
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Load .env before anything else
from dotenv import load_dotenv

load_dotenv()
# Which API key each model needs
REQUIRED_KEYS = {
    "GPT-4": "OPENAI_API_KEY",
    "GPT-4.1 Mini": "OPENAI_API_KEY",
    "Llama-4 Maverick": "OPENROUTER_API_KEY",
    "Llama-4 Scout": "OPENROUTER_API_KEY",
    "Gemini-2.5 Pro": "OPENROUTER_API_KEY",
    "Claude Opus 4.6": "ANTHROPIC_API_KEY",
    "Claude Sonnet 4.6": "ANTHROPIC_API_KEY",
}
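# Models not listed above are assumed to need no extra key and are always attempted
# (see REQUIRED_KEYS.get() in run_single_test below).
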
TEST_QUESTION = "What is the common name of {species}? Answer briefly."

def get_active_models(app_module):
    """Extract model labels from initialize_qa_chain's if/elif branches."""
    import inspect

    source = inspect.getsource(app_module.initialize_qa_chain)
    models = []
    for line in source.splitlines():
        line = line.strip()
        # Match the dispatch lines whether or not "==" is surrounded by spaces.
        if line.startswith(("if model_name==", "elif model_name==",
                            "if model_name ==", "elif model_name ==")):
            label = line.split("==", 1)[1].strip().rstrip(":").strip().strip('"').strip("'")
            models.append(label)
    return models
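
# NOTE: the call below assumes app.py's initialize_qa_chain() returns a
# (qa_chain, availability_msg) tuple, where qa_chain is a LangChain
# ConversationalRetrievalChain-style callable that accepts
# {"question": ..., "chat_history": [...]} and returns a dict with an "answer" key.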

def test_model(app_module, model_label, species):
    """Test a model through the RAG path."""
    qa_chain, availability_msg = app_module.initialize_qa_chain(
        specie_selector=species,
        application_mode="Researcher",
        model_name=model_label,
        region="Midwest USA",
    )
    question = TEST_QUESTION.format(species=species)
    result = qa_chain({"question": question, "chat_history": []})
    answer = result["answer"]
    if not answer or not answer.strip():
        raise ValueError("Empty response from RAG chain")
    return answer.strip()
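
# run_single_test() returns one of three result shapes:
#   {"model": ..., "status": "SKIP", "reason": ...}
#   {"model": ..., "status": "PASS", "response": ..., "latency_seconds": ...}
#   {"model": ..., "status": "FAIL", "error": ..., "latency_seconds": ...}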

def run_single_test(app_module, model_label, species):
    """Run a single test. Returns a result dict."""
    required_key = REQUIRED_KEYS.get(model_label)
    if required_key and not os.getenv(required_key):
        return {
            "model": model_label,
            "status": "SKIP",
            "reason": f"Missing {required_key}",
        }

    start = time.time()
    try:
        response = test_model(app_module, model_label, species)
        elapsed = time.time() - start
        return {
            "model": model_label,
            "status": "PASS",
            "response": response[:200],
            "latency_seconds": round(elapsed, 2),
        }
    except Exception as e:
        elapsed = time.time() - start
        return {
            "model": model_label,
            "status": "FAIL",
            "error": str(e)[:300],
            "latency_seconds": round(elapsed, 2),
        }

def print_result(r):
    status = r["status"]
    label = r["model"]
    if status == "PASS":
        preview = r["response"][:80].replace("\n", " ")
        print(f" PASS {label:25s} ({r['latency_seconds']:.1f}s) \"{preview}\"")
    elif status == "SKIP":
        print(f" SKIP {label:25s} {r['reason']}")
    else:
        print(f" FAIL {label:25s} ({r['latency_seconds']:.1f}s) {r['error'][:100]}")

def main():
    parser = argparse.ArgumentParser(description="AgLLM integration test")
    parser.add_argument("--model", type=str, help="Test a single model by label")
    args = parser.parse_args()

    # Pre-flight check
    if not os.getenv("OPENAI_API_KEY"):
        print("FATAL: OPENAI_API_KEY is required (used for embeddings). Set it and retry.")
        sys.exit(1)

    print("\n Loading app.py (vector DBs, embeddings, data files)...")
    import app as app_module
    print(" App loaded successfully.\n")

    # Use first species in the DB for testing
    species = app_module.species_list[0]

    # Determine which models to test
    if args.model:
        models_to_test = [args.model]
    else:
        models_to_test = get_active_models(app_module)

    print(f"{'='*75}")
    print(f" AgLLM Integration Test")
    print(f" Species: {species}")
    print(f" Models: {', '.join(models_to_test)}")
    print(f"{'='*75}\n")
    results = []
    tested = 0
    passed = 0
    skipped = 0
    failed = 0
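    # "tested" counts only models actually attempted (PASS or FAIL); skips are
    # excluded, so a run where every model is skipped still exits non-zero.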
    for model_label in models_to_test:
        r = run_single_test(app_module, model_label, species)
        results.append(r)
        print_result(r)
        if r["status"] == "PASS":
            tested += 1
            passed += 1
        elif r["status"] == "SKIP":
            skipped += 1
        else:
            tested += 1
            failed += 1

    print(f"\n{'='*75}")
    print(f" Results: {passed}/{tested} passed, {failed} failed, {skipped} skipped")
    print(f"{'='*75}\n")
    # Write JSON results
    output = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "species_tested": species,
        "tested": tested,
        "passed": passed,
        "failed": failed,
        "skipped": skipped,
        "results": results,
    }
    output_file = "test_app_models_results.json"
    with open(output_file, "w") as f:
        json.dump(output, f, indent=2)
    print(f" Results written to {output_file}\n")

    sys.exit(0 if failed == 0 and tested > 0 else 1)
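
# Example excerpt of test_app_models_results.json (values illustrative only):
# {
#   "timestamp": "...",
#   "species_tested": "...",
#   "tested": 5, "passed": 5, "failed": 0, "skipped": 2,
#   "results": [
#     {"model": "GPT-4", "status": "PASS", "response": "...", "latency_seconds": 3.2},
#     {"model": "Claude Opus 4.6", "status": "SKIP", "reason": "Missing ANTHROPIC_API_KEY"}
#   ]
# }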

if __name__ == "__main__":
    main()