"""
Integration test for AgLLM models.
Tests each model by calling initialize_qa_chain() from app.py directly,
exercising the same code path as the Gradio UI.
Usage:
python test_app_models.py # Test all models
python test_app_models.py --model "GPT-4" # Test a single model
python test_app_models.py --model "Gemini-2.5 Pro"
Exit code 0 = all tested models passed (and at least one model was actually tested); 1 otherwise.
Models with missing API keys are skipped (skips are not failures, but a run in which
every model is skipped still exits 1).
Results are also written to test_app_models_results.json.
"""
import os
import sys
import json
import time
import argparse
# Ensure working directory is project root (app.py uses relative paths)
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# Load .env before anything else
from dotenv import load_dotenv
load_dotenv()
# Which API key each model needs
REQUIRED_KEYS = {
"GPT-4": "OPENAI_API_KEY",
"GPT-4.1 Mini": "OPENAI_API_KEY",
"Llama-4 Maverick": "OPENROUTER_API_KEY",
"Llama-4 Scout": "OPENROUTER_API_KEY",
"Gemini-2.5 Pro": "OPENROUTER_API_KEY",
"Claude Opus 4.6": "ANTHROPIC_API_KEY",
"Claude Sonnet 4.6": "ANTHROPIC_API_KEY",
}
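# Note: OPENAI_API_KEY is also needed for embeddings regardless of the chat model; main() checks this up front.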
TEST_QUESTION = "What is the common name of {species}? Answer briefly."
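# {species} is filled at runtime with the first entry of app.py's species_list (see main()).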
def get_active_models(app_module):
"""Extract model labels from initialize_qa_chain's if/elif branches."""
    import inspect
    import re
    source = inspect.getsource(app_module.initialize_qa_chain)
    models = []
    # Match both `if model_name == "Label":` and `elif model_name=="Label":` spellings.
    branch = re.compile(r'^(?:el)?if\s+model_name\s*==\s*["\'](.+?)["\']\s*:')
    for line in source.splitlines():
        match = branch.match(line.strip())
        if match:
            models.append(match.group(1))
    return models
def test_model(app_module, model_label, species):
"""Test a model through the RAG path."""
qa_chain, availability_msg = app_module.initialize_qa_chain(
specie_selector=species,
application_mode="Researcher",
model_name=model_label,
region="Midwest USA",
)
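    # availability_msg is returned alongside the chain but is not checked by this smoke test.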
question = TEST_QUESTION.format(species=species)
result = qa_chain({"question": question, "chat_history": []})
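    # The chain returns a dict; only the answer text is validated below.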
answer = result["answer"]
    if not answer or not answer.strip():
raise ValueError("Empty response from RAG chain")
return answer.strip()
def run_single_test(app_module, model_label, species):
"""Run a single test. Returns a result dict."""
required_key = REQUIRED_KEYS.get(model_label)
if required_key and not os.getenv(required_key):
return {
"model": model_label,
"status": "SKIP",
"reason": f"Missing {required_key}",
}
start = time.time()
try:
response = test_model(app_module, model_label, species)
elapsed = time.time() - start
return {
"model": model_label,
"status": "PASS",
"response": response[:200],
"latency_seconds": round(elapsed, 2),
}
except Exception as e:
elapsed = time.time() - start
return {
"model": model_label,
"status": "FAIL",
"error": str(e)[:300],
"latency_seconds": round(elapsed, 2),
}
def print_result(r):
status = r["status"]
label = r["model"]
if status == "PASS":
preview = r["response"][:80].replace("\n", " ")
print(f" PASS {label:25s} ({r['latency_seconds']:.1f}s) \"{preview}\"")
elif status == "SKIP":
print(f" SKIP {label:25s} {r['reason']}")
else:
print(f" FAIL {label:25s} ({r['latency_seconds']:.1f}s) {r['error'][:100]}")
def main():
parser = argparse.ArgumentParser(description="AgLLM integration test")
parser.add_argument("--model", type=str, help="Test a single model by label")
args = parser.parse_args()
# Pre-flight check
if not os.getenv("OPENAI_API_KEY"):
print("FATAL: OPENAI_API_KEY is required (used for embeddings). Set it and retry.")
sys.exit(1)
print("\n Loading app.py (vector DBs, embeddings, data files)...")
import app as app_module
print(" App loaded successfully.\n")
# Use first species in the DB for testing
species = app_module.species_list[0]
# Determine which models to test
if args.model:
models_to_test = [args.model]
else:
models_to_test = get_active_models(app_module)
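        # Parsing app.py keeps this list in sync with the model choices the Gradio UI actually offers.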
print(f"{'='*75}")
print(f" AgLLM Integration Test")
print(f" Species: {species}")
print(f" Models: {', '.join(models_to_test)}")
print(f"{'='*75}\n")
results = []
tested = 0
passed = 0
skipped = 0
failed = 0
for model_label in models_to_test:
r = run_single_test(app_module, model_label, species)
results.append(r)
print_result(r)
if r["status"] == "PASS":
tested += 1
passed += 1
elif r["status"] == "SKIP":
skipped += 1
else:
tested += 1
failed += 1
print(f"\n{'='*75}")
print(f" Results: {passed}/{tested} passed, {failed} failed, {skipped} skipped")
print(f"{'='*75}\n")
# Write JSON results
output = {
"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
"species_tested": species,
"tested": tested,
"passed": passed,
"failed": failed,
"skipped": skipped,
"results": results,
}
output_file = "test_app_models_results.json"
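    # Overwrites any results file left over from a previous run.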
with open(output_file, "w") as f:
json.dump(output, f, indent=2)
print(f" Results written to {output_file}\n")
sys.exit(0 if failed == 0 and tested > 0 else 1)
if __name__ == "__main__":
main()