""" Integration test for AgLLM models. Tests each model by calling initialize_qa_chain() from app.py directly, exercising the same code path as the Gradio UI. Usage: python test_app_models.py # Test all models python test_app_models.py --model "GPT-4" # Test a single model python test_app_models.py --model "Gemini-2.5 Pro" Exit code 0 = all tested models passed, 1 = any failure. Models with missing API keys are skipped (not failures). """ import os import sys import json import time import argparse # Ensure working directory is project root (app.py uses relative paths) os.chdir(os.path.dirname(os.path.abspath(__file__))) # Load .env before anything else from dotenv import load_dotenv load_dotenv() # Which API key each model needs REQUIRED_KEYS = { "GPT-4": "OPENAI_API_KEY", "GPT-4.1 Mini": "OPENAI_API_KEY", "Llama-4 Maverick": "OPENROUTER_API_KEY", "Llama-4 Scout": "OPENROUTER_API_KEY", "Gemini-2.5 Pro": "OPENROUTER_API_KEY", "Claude Opus 4.6": "ANTHROPIC_API_KEY", "Claude Sonnet 4.6": "ANTHROPIC_API_KEY", } TEST_QUESTION = "What is the common name of {species}? Answer briefly." 
def get_active_models(app_module):
    """Extract model labels from initialize_qa_chain's if/elif branches.

    The source of ``app_module.initialize_qa_chain`` is scanned line by line
    for statements of the form ``if model_name == "<label>"`` or
    ``elif model_name == "<label>"``.  Whitespace around ``==`` is optional,
    either quote style is accepted, and anything after the closing quote
    (a colon, extra conditions, a trailing comment) is ignored.  Commented-out
    branches are not matched because the line must start with ``if``/``elif``.

    Args:
        app_module: The imported ``app`` module (or any object exposing an
            ``initialize_qa_chain`` function whose source is retrievable).

    Returns:
        list[str]: Model labels in the order they appear in the source.
    """
    import inspect
    import re

    # Anchored at the start of the stripped line so that "# elif ..." and
    # unrelated comparisons elsewhere in the function are not picked up.
    branch = re.compile(r"""^(?:if|elif)\s+model_name\s*==\s*(["'])(.*?)\1""")

    source = inspect.getsource(app_module.initialize_qa_chain)
    models = []
    for line in source.splitlines():
        match = branch.match(line.strip())
        if match:
            models.append(match.group(2))
    return models


def test_model(app_module, model_label, species):
    """Test a model through the RAG path.

    Builds the QA chain exactly as the UI does (mode "Researcher", region
    "Midwest USA") and asks one short question about ``species``.

    Args:
        app_module: The imported ``app`` module.
        model_label: Model label passed as ``model_name``.
        species: Species passed as ``specie_selector`` and used in the prompt.

    Returns:
        str: The stripped answer text.

    Raises:
        ValueError: If the chain returns an empty or whitespace-only answer.
        Exception: Anything raised while building or calling the chain is
            propagated unchanged; ``run_single_test`` turns it into a FAIL.
    """
    # The second return value (an availability message) is not needed here.
    qa_chain, _availability_msg = app_module.initialize_qa_chain(
        specie_selector=species,
        application_mode="Researcher",
        model_name=model_label,
        region="Midwest USA",
    )

    question = TEST_QUESTION.format(species=species)
    result = qa_chain({"question": question, "chat_history": []})
    answer = result["answer"]

    if not answer or not answer.strip():
        raise ValueError("Empty response from RAG chain")
    return answer.strip()


# This is a helper, not a pytest test.  The file name matches pytest's default
# "test_*.py" pattern and the function name matches "test_*", so without this
# flag pytest would collect it and fail looking for fixtures named after its
# parameters.
test_model.__test__ = False


def run_single_test(app_module, model_label, species):
    """Run a single test. Returns a result dict.

    Args:
        app_module: The imported ``app`` module.
        model_label: Model label to test.
        species: Species used for the test question.

    Returns:
        dict: Always contains ``model`` and ``status``.
            * ``SKIP``: plus ``reason`` (the required API key is not set).
            * ``PASS``: plus ``response`` (first 200 chars) and
              ``latency_seconds``.
            * ``FAIL``: plus ``error`` (exception type and message, first
              300 chars) and ``latency_seconds``.
    """
    required_key = REQUIRED_KEYS.get(model_label)
    if required_key and not os.getenv(required_key):
        return {
            "model": model_label,
            "status": "SKIP",
            "reason": f"Missing {required_key}",
        }

    # perf_counter is monotonic; wall-clock time can jump during a long call.
    start = time.perf_counter()
    try:
        response = test_model(app_module, model_label, species)
    except Exception as e:
        # Top-level boundary for one model: any failure becomes a FAIL record
        # so the remaining models still run.  The type name is included
        # because str(e) alone can be empty or meaningless (e.g. "'answer'").
        outcome = {
            "status": "FAIL",
            "error": f"{type(e).__name__}: {e}"[:300],
        }
    else:
        outcome = {
            "status": "PASS",
            "response": response[:200],
        }
    elapsed = time.perf_counter() - start

    return {
        "model": model_label,
        **outcome,
        "latency_seconds": round(elapsed, 2),
    }


def print_result(r):
    """Print a one-line summary for a result dict from ``run_single_test``.

    Args:
        r: Result dict; the keys read depend on ``r["status"]`` (PASS reads
            ``response`` and ``latency_seconds``, SKIP reads ``reason``, any
            other status reads ``error`` and ``latency_seconds``).
    """
    status = r["status"]
    label = r["model"]
    if status == "PASS":
        # Keep the preview on one line so the table stays readable.
        preview = r["response"][:80].replace("\n", " ")
        print(f" PASS {label:25s} ({r['latency_seconds']:.1f}s) \"{preview}\"")
    elif status == "SKIP":
        print(f" SKIP {label:25s} {r['reason']}")
    else:
        print(f" FAIL {label:25s} ({r['latency_seconds']:.1f}s) {r['error'][:100]}")


def main():
    """Command-line entry point: test the selected models and report.

    Steps: verify OPENAI_API_KEY, import ``app``, pick the first species,
    run every selected model, print a summary, write
    ``test_app_models_results.json`` and exit.

    Exit code is 0 only when at least one model was tested and none failed;
    otherwise (a failure, nothing tested, or a pre-flight problem) it is 1.
    """
    parser = argparse.ArgumentParser(description="AgLLM integration test")
    parser.add_argument("--model", type=str, help="Test a single model by label")
    args = parser.parse_args()

    # Pre-flight check
    if not os.getenv("OPENAI_API_KEY"):
        print("FATAL: OPENAI_API_KEY is required (used for embeddings). Set it and retry.")
        sys.exit(1)

    # Imported here, not at module top, so the pre-flight check runs first.
    print("\n Loading app.py (vector DBs, embeddings, data files)...")
    import app as app_module
    print(" App loaded successfully.\n")

    # Use first species in the DB for testing.  len() is used instead of a
    # truthiness test so array-like containers are handled as well.
    if len(app_module.species_list) == 0:
        print("FATAL: app.species_list is empty; no species available to test.")
        sys.exit(1)
    species = app_module.species_list[0]

    # Determine which models to test
    if args.model:
        models_to_test = [args.model]
    else:
        models_to_test = get_active_models(app_module)

    print(f"{'='*75}")
    print(" AgLLM Integration Test")
    print(f" Species: {species}")
    print(f" Models: {', '.join(models_to_test)}")
    print(f"{'='*75}\n")

    results = []
    for model_label in models_to_test:
        r = run_single_test(app_module, model_label, species)
        results.append(r)
        print_result(r)

    # Derive the counters from the results instead of tracking them by hand.
    passed = sum(1 for r in results if r["status"] == "PASS")
    skipped = sum(1 for r in results if r["status"] == "SKIP")
    failed = len(results) - passed - skipped
    tested = passed + failed

    print(f"\n{'='*75}")
    print(f" Results: {passed}/{tested} passed, {failed} failed, {skipped} skipped")
    print(f"{'='*75}\n")

    # Write JSON results
    output = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "species_tested": species,
        "tested": tested,
        "passed": passed,
        "failed": failed,
        "skipped": skipped,
        "results": results,
    }
    output_file = "test_app_models_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f" Results written to {output_file}\n")

    # A run in which nothing was tested is not reported as success.
    sys.exit(0 if failed == 0 and tested > 0 else 1)


if __name__ == "__main__":
    main()