| """ |
| GIS-Coder Evaluation Script |
| ============================ |
| Tests the fine-tuned model on GIS code generation tasks. |
| Compares fine-tuned model vs base model. |
| |
| Usage: |
| python evaluate.py --adapter_id RhodWeo/GIS-Coder-7B |
| python evaluate.py --adapter_id ./gis-coder-7b-output/final --compare_base |
| """ |
|
|
import argparse
import json
import re
import time

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
# System message prepended to every evaluation request (see generate()).
# NOTE(review): presumably this mirrors the system prompt used during
# fine-tuning so eval conditions match training — confirm against the
# training script.
SYSTEM_PROMPT = (
    "You are GIS-Coder, an expert Python programmer specializing in Geographic Information Systems (GIS) "
    "and geospatial analysis. You write clean, efficient, well-documented Python code using libraries like "
    "GeoPandas, Shapely, Rasterio, GDAL, PyProj, OSMNX, Folium, H3, Fiona, xarray, and MovingPandas. "
    "You always explain your reasoning step-by-step before providing code, and you handle edge cases."
)
|
|
# Evaluation prompts covering the GIS libraries named in SYSTEM_PROMPT.
# Each entry carries a "category" (used by score_response to verify the
# expected library actually appears in the answer) and a rough "difficulty".
EVAL_PROMPTS = [

    # OSMnx: street-network download and analysis.
    {"category": "OSMnx", "difficulty": "medium",
     "prompt": "Write a Python function using OSMnx to download the street network of a city and find all dead-end streets (nodes with degree 1)."},
    {"category": "OSMnx", "difficulty": "hard",
     "prompt": "Use OSMnx to create a 10-minute walking isochrone from a given point, then find all restaurants within that isochrone using OSM POI data."},

    # Rasterio: raster/DEM processing.
    {"category": "Rasterio", "difficulty": "medium",
     "prompt": "Write a function to calculate slope and aspect from a DEM raster file using rasterio and numpy."},
    {"category": "Rasterio", "difficulty": "hard",
     "prompt": "Write a function that takes a multispectral satellite image and computes NDVI, NDWI, and NDBI indices, saving each as a separate GeoTIFF with proper nodata handling."},

    # MovingPandas: trajectory analysis.
    {"category": "MovingPandas", "difficulty": "medium",
     "prompt": "Use MovingPandas to create trajectories from GPS tracking data, compute speed along each trajectory, and detect stops where the object was stationary for more than 5 minutes."},

    # GeoPandas: vector operations.
    {"category": "GeoPandas", "difficulty": "easy",
     "prompt": "Perform a spatial join between a points GeoDataFrame and a polygons GeoDataFrame to count how many points fall within each polygon."},
    {"category": "GeoPandas", "difficulty": "medium",
     "prompt": "Write a function that reads a shapefile, buffers all geometries by 500 meters, dissolves overlapping buffers, and calculates the total area covered in square kilometers."},

    # PyProj: coordinate reference system handling.
    {"category": "PyProj", "difficulty": "medium",
     "prompt": "Write a function that takes two GeoDataFrames with potentially different CRS, automatically detects the mismatch, reprojects them to a common CRS, and computes their spatial intersection."},

    # Multi-library: end-to-end workflows combining several tools.
    {"category": "Multi-library", "difficulty": "hard",
     "prompt": "Write a Python script that downloads building footprints from OpenStreetMap using OSMnx, projects them to UTM for accurate area calculation, filters buildings larger than 500 sq meters, and creates an interactive Folium choropleth map colored by building area."},
    {"category": "Multi-library", "difficulty": "hard",
     "prompt": "Create a complete site suitability analysis for wind turbine placement: compute slope from DEM, extract distance to roads, buffer protected areas, and combine with a weighted overlay to produce a suitability score raster."},

    # H3: hexagonal spatial indexing.
    {"category": "H3", "difficulty": "medium",
     "prompt": "Use H3 hexagonal indexing to aggregate 10,000 crime incident points into resolution-9 hexagonal bins, compute density per square kilometer, and identify the top 10 hotspot hexagons."},

    # Folium: interactive web mapping.
    {"category": "Folium", "difficulty": "medium",
     "prompt": "Create an interactive Folium map with three layers: clustered markers for schools, a heatmap for population density, and a choropleth of median income by census tract. Add layer controls."},
]
|
|
|
|
def load_model(base_model_id, adapter_id=None, quantize=True):
    """Load a causal LM (optionally 4-bit quantized) plus its tokenizer.

    Args:
        base_model_id: Hub id or local path of the base model.
        adapter_id: Optional LoRA adapter id/path to stack on the base model.
        quantize: When True and CUDA is present, load weights as 4-bit NF4.

    Returns:
        (model, tokenizer) with the model switched to eval mode.
    """
    cuda_ok = torch.cuda.is_available()

    load_kwargs = {"trust_remote_code": True, "attn_implementation": "eager"}
    if quantize and cuda_ok:
        # Only import bitsandbytes config when quantization is actually used.
        from transformers import BitsAndBytesConfig

        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        load_kwargs["dtype"] = torch.bfloat16
    else:
        load_kwargs["dtype"] = torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        device_map="auto" if cuda_ok else None,
        **load_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

    if adapter_id:
        # Wrap the base model with the LoRA weights.
        model = PeftModel.from_pretrained(model, adapter_id)
        print(f"Loaded adapter: {adapter_id}")

    model.eval()
    return model, tokenizer
|
|
|
|
def generate(model, tokenizer, prompt, max_new_tokens=768):
    """Run one chat-formatted generation.

    Args:
        model: Loaded causal LM.
        tokenizer: Matching tokenizer (must provide a chat template).
        prompt: User message text.
        max_new_tokens: Generation length cap.

    Returns:
        (response_text, seconds_elapsed) where response_text excludes the prompt.
    """
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(rendered, return_tensors="pt", truncation=True, max_length=2048)
    if torch.cuda.is_available():
        encoded = {name: tensor.cuda() for name, tensor in encoded.items()}

    started = time.time()
    with torch.no_grad():
        output_ids = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
        )
    elapsed = time.time() - started

    # Decode only the newly generated tokens, skipping the rendered prompt.
    prompt_len = encoded["input_ids"].shape[1]
    response = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    return response, elapsed
|
|
|
|
def score_response(response, category):
    """Score a generated response on binary quality dimensions.

    Args:
        response: Raw generated text from the model.
        category: Prompt category (e.g. "OSMnx"); used to check that the
            library the prompt asked for actually appears in the answer.

    Returns:
        Dict of 0/1 flags plus "total" (sum of flags), "max_possible"
        (number of flags) and "pct" (total as a percentage, 0-100).
    """
    r = response.lower()
    scores = {
        "has_code": 1 if ("```python" in response or "import " in response) else 0,
        "has_function": 1 if "def " in response else 0,
        "has_gis_lib": 1 if any(lib in r for lib in [
            "geopandas", "gpd.", "shapely", "rasterio", "osmnx", "ox.", "gdal",
            "pyproj", "folium", "h3.", "fiona", "xarray", "movingpandas", "mpd."
        ]) else 0,
        "has_cot": 1 if any(p in r for p in [
            "step by step", "first", "then", "i need to", "let me think"
        ]) else 0,
        "has_docstring": 1 if '"""' in response or "'''" in response else 0,
        # BUG FIX: "if.*none" is a regex pattern, but the original tested it
        # as a literal substring ("if.*none" in r), which could never match
        # real code like "if x is None:". Use re.search for that pattern.
        "has_error_handling": 1 if (
            any(k in r for k in ["try:", "except", "raise", "is_valid"])
            or re.search(r"if\s+.*\bnone\b", r)
        ) else 0,
        "has_crs_handling": 1 if any(k in r for k in ["crs", "epsg", "to_crs", "reproject"]) else 0,
    }

    # Category-specific check: did the answer use the library the prompt asked for?
    cat_libs = {
        "OSMnx": ["osmnx", "ox."],
        "Rasterio": ["rasterio"],
        "MovingPandas": ["movingpandas", "mpd."],
        "GeoPandas": ["geopandas", "gpd."],
        "PyProj": ["pyproj", "crs", "epsg"],
        "H3": ["h3."],
        "Folium": ["folium"],
    }
    if category in cat_libs:
        scores["correct_library"] = 1 if any(lib in r for lib in cat_libs[category]) else 0
    else:
        # Unknown category (e.g. "Multi-library"): any GIS library counts.
        scores["correct_library"] = scores["has_gis_lib"]

    # Compute totals AFTER all flags are set; "total" itself is excluded
    # from max_possible via the len(scores) - 1 adjustment.
    scores["total"] = sum(scores.values())
    scores["max_possible"] = len(scores) - 1
    scores["pct"] = scores["total"] / scores["max_possible"] * 100
    return scores
|
|
|
|
def main():
    """CLI entry point.

    Evaluates the fine-tuned adapter on every prompt in EVAL_PROMPTS, prints
    per-prompt and aggregate scores, optionally repeats the run with the
    un-adapted base model for comparison, and writes all results to JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_id", default="Qwen/Qwen2.5-Coder-7B-Instruct")
    parser.add_argument("--adapter_id", default="RhodWeo/GIS-Coder-7B")
    parser.add_argument("--compare_base", action="store_true", help="Also evaluate base model for comparison")
    parser.add_argument("--max_new_tokens", type=int, default=768)
    parser.add_argument("--output", type=str, default="eval_results.json")
    parser.add_argument("--no_quantize", action="store_true")
    args = parser.parse_args()

    results = {"fine_tuned": [], "base": []}

    # ---- Pass 1: base model + LoRA adapter ----
    print(f"\n{'='*60}")
    print(f"EVALUATING: {args.base_model_id} + {args.adapter_id}")
    print(f"{'='*60}")

    model, tokenizer = load_model(args.base_model_id, args.adapter_id, not args.no_quantize)

    for i, item in enumerate(EVAL_PROMPTS):
        print(f"\n[{i+1}/{len(EVAL_PROMPTS)}] {item['category']} ({item['difficulty']})")
        print(f" Prompt: {item['prompt'][:80]}...")

        response, gen_time = generate(model, tokenizer, item["prompt"], args.max_new_tokens)
        scores = score_response(response, item["category"])

        print(f" Time: {gen_time:.1f}s | Score: {scores['pct']:.0f}% | Code: {'✓' if scores['has_code'] else '✗'} | "
              f"Lib: {'✓' if scores['correct_library'] else '✗'} | CoT: {'✓' if scores['has_cot'] else '✗'}")

        results["fine_tuned"].append({
            **item, "response": response, "gen_time": gen_time, "scores": scores,
        })

    # ---- Fine-tuned summary ----
    ft_scores = [r["scores"]["pct"] for r in results["fine_tuned"]]
    print(f"\n{'='*60}")
    print("FINE-TUNED MODEL SUMMARY")  # no placeholders: plain string, not f-string
    print(f" Avg score: {sum(ft_scores)/len(ft_scores):.1f}%")
    print(f" Code blocks: {sum(r['scores']['has_code'] for r in results['fine_tuned'])}/{len(EVAL_PROMPTS)}")
    print(f" Correct library: {sum(r['scores']['correct_library'] for r in results['fine_tuned'])}/{len(EVAL_PROMPTS)}")
    print(f" Chain-of-thought: {sum(r['scores']['has_cot'] for r in results['fine_tuned'])}/{len(EVAL_PROMPTS)}")

    # Per-category average scores. Loop variable renamed so it no longer
    # shadows the per-prompt `scores` dict above.
    by_cat = {}
    for r in results["fine_tuned"]:
        by_cat.setdefault(r["category"], []).append(r["scores"]["pct"])
    print("\n By category:")
    for cat, cat_scores in sorted(by_cat.items()):
        print(f" {cat}: {sum(cat_scores)/len(cat_scores):.0f}%")

    # ---- Pass 2 (optional): plain base model for comparison ----
    if args.compare_base:
        print(f"\n{'='*60}")
        print(f"EVALUATING BASE: {args.base_model_id}")
        print(f"{'='*60}")

        # Free the adapter-augmented model before loading a second copy.
        # (Was a conditional expression used as a statement; now a plain if.)
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Tokenizer is reused: both passes share the same base model id.
        base_model, _ = load_model(args.base_model_id, None, not args.no_quantize)

        for item in EVAL_PROMPTS:  # index was unused; iterate items directly
            response, gen_time = generate(base_model, tokenizer, item["prompt"], args.max_new_tokens)
            scores = score_response(response, item["category"])
            results["base"].append({**item, "response": response, "gen_time": gen_time, "scores": scores})

        base_scores = [r["scores"]["pct"] for r in results["base"]]
        ft_avg = sum(ft_scores) / len(ft_scores)
        base_avg = sum(base_scores) / len(base_scores)
        print(f"\n BASE: {base_avg:.1f}% vs FINE-TUNED: {ft_avg:.1f}%")
        print(f" Improvement: +{ft_avg - base_avg:.1f}pp")

    # default=str keeps any non-JSON-serializable values from crashing the dump.
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nResults saved to {args.output}")
|
|
|
|
# Guarded entry point: lets the module be imported (e.g. for its prompts or
# score_response) without triggering a full evaluation run.
if __name__ == "__main__":
    main()
|
|