"""
GIS-Coder Evaluation Script
============================
Evaluates a fine-tuned model on GIS code-generation tasks and optionally
compares it against the base model (pass --compare_base).

Usage:
  python evaluate.py --adapter_id RhodWeo/GIS-Coder-7B
  python evaluate.py --adapter_id ./gis-coder-7b-output/final --compare_base
"""

import argparse
import torch
import time
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

SYSTEM_PROMPT = (
    "You are GIS-Coder, an expert Python programmer specializing in Geographic Information Systems (GIS) "
    "and geospatial analysis. You write clean, efficient, well-documented Python code using libraries like "
    "GeoPandas, Shapely, Rasterio, GDAL, PyProj, OSMNX, Folium, H3, Fiona, xarray, and MovingPandas. "
    "You always explain your reasoning step-by-step before providing code, and you handle edge cases."
)

EVAL_PROMPTS = [
    # OSMNX (Tier 1 — models score 0%)
    {"category": "OSMnx", "difficulty": "medium",
     "prompt": "Write a Python function using OSMnx to download the street network of a city and find all dead-end streets (nodes with degree 1)."},
    {"category": "OSMnx", "difficulty": "hard",
     "prompt": "Use OSMnx to create a 10-minute walking isochrone from a given point, then find all restaurants within that isochrone using OSM POI data."},
    
    # Rasterio (Tier 1)
    {"category": "Rasterio", "difficulty": "medium",
     "prompt": "Write a function to calculate slope and aspect from a DEM raster file using rasterio and numpy."},
    {"category": "Rasterio", "difficulty": "hard",
     "prompt": "Write a function that takes a multispectral satellite image and computes NDVI, NDWI, and NDBI indices, saving each as a separate GeoTIFF with proper nodata handling."},
    
    # MovingPandas (Tier 1 — models score 0%)
    {"category": "MovingPandas", "difficulty": "medium",
     "prompt": "Use MovingPandas to create trajectories from GPS tracking data, compute speed along each trajectory, and detect stops where the object was stationary for more than 5 minutes."},
    
    # GeoPandas (Tier 2)
    {"category": "GeoPandas", "difficulty": "easy",
     "prompt": "Perform a spatial join between a points GeoDataFrame and a polygons GeoDataFrame to count how many points fall within each polygon."},
    {"category": "GeoPandas", "difficulty": "medium",
     "prompt": "Write a function that reads a shapefile, buffers all geometries by 500 meters, dissolves overlapping buffers, and calculates the total area covered in square kilometers."},
    
    # CRS / PyProj (common failure)
    {"category": "PyProj", "difficulty": "medium",
     "prompt": "Write a function that takes two GeoDataFrames with potentially different CRS, automatically detects the mismatch, reprojects them to a common CRS, and computes their spatial intersection."},
    
    # Multi-library workflow
    {"category": "Multi-library", "difficulty": "hard",
     "prompt": "Write a Python script that downloads building footprints from OpenStreetMap using OSMnx, projects them to UTM for accurate area calculation, filters buildings larger than 500 sq meters, and creates an interactive Folium choropleth map colored by building area."},
    {"category": "Multi-library", "difficulty": "hard",
     "prompt": "Create a complete site suitability analysis for wind turbine placement: compute slope from DEM, extract distance to roads, buffer protected areas, and combine with a weighted overlay to produce a suitability score raster."},
    
    # H3
    {"category": "H3", "difficulty": "medium",
     "prompt": "Use H3 hexagonal indexing to aggregate 10,000 crime incident points into resolution-9 hexagonal bins, compute density per square kilometer, and identify the top 10 hotspot hexagons."},
    
    # Folium
    {"category": "Folium", "difficulty": "medium",
     "prompt": "Create an interactive Folium map with three layers: clustered markers for schools, a heatmap for population density, and a choropleth of median income by census tract. Add layer controls."},
]
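

# Optional sketch (hypothetical helper, not wired into main()): the suite
# could be extended from a JSON file holding a list of entries with the same
# schema as EVAL_PROMPTS, i.e. {"category": ..., "difficulty": ..., "prompt": ...}.
def load_extra_prompts(path):
    """Load additional eval prompts (same schema as EVAL_PROMPTS) from JSON."""
    with open(path) as f:
        return json.load(f)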


def load_model(base_model_id, adapter_id=None, quantize=True):
    """Load model with optional LoRA adapter."""
    model_kwargs = {"trust_remote_code": True, "attn_implementation": "eager"}
    
    if quantize and torch.cuda.is_available():
        from transformers import BitsAndBytesConfig
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model_kwargs["dtype"] = torch.bfloat16
    else:
        model_kwargs["dtype"] = torch.float32
    
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id, device_map="auto" if torch.cuda.is_available() else None,
        **model_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
    
    if adapter_id:
        model = PeftModel.from_pretrained(model, adapter_id)
        print(f"Loaded adapter: {adapter_id}")
    
    model.eval()
    return model, tokenizer
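

# Optional helper (a sketch, not called above): for runs without 4-bit
# quantization, PEFT's standard merge_and_unload() can fold the LoRA weights
# into the base model for faster inference. Merging is not supported on
# quantized weights, so only use this together with --no_quantize.
def merge_adapter_for_speed(model):
    """Fold LoRA weights into the base model (unquantized runs only)."""
    if isinstance(model, PeftModel):
        return model.merge_and_unload()
    return model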


def generate(model, tokenizer, prompt, max_new_tokens=768):
    """Generate response."""
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048)
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    
    t0 = time.time()
    with torch.no_grad():
        out = model.generate(
            **inputs, max_new_tokens=max_new_tokens,
            temperature=0.7, top_p=0.9, do_sample=True, repetition_penalty=1.1,
        )
    gen_time = time.time() - t0
    response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response, gen_time
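

# Note: generate() samples (do_sample=True), so scores can vary run to run.
# For reproducible comparisons, seed the RNGs before each call; a sketch
# using transformers' set_seed():
#
#   from transformers import set_seed
#   set_seed(42)
#   response, gen_time = generate(model, tokenizer, prompt)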


def score_response(response, category):
    """Score response quality on multiple dimensions."""
    r = response.lower()
    scores = {
        "has_code": 1 if ("```python" in response or "import " in response) else 0,
        "has_function": 1 if "def " in response else 0,
        "has_gis_lib": 1 if any(lib in r for lib in [
            "geopandas", "gpd.", "shapely", "rasterio", "osmnx", "ox.", "gdal",
            "pyproj", "folium", "h3.", "fiona", "xarray", "movingpandas", "mpd."
        ]) else 0,
        "has_cot": 1 if any(p in r for p in [
            "step by step", "first", "then", "i need to", "let me think"
        ]) else 0,
        "has_docstring": 1 if '"""' in response or "'''" in response else 0,
        "has_error_handling": 1 if any(k in r for k in ["try:", "except", "raise", "if.*none", "is_valid"]) else 0,
        "has_crs_handling": 1 if any(k in r for k in ["crs", "epsg", "to_crs", "reproject"]) else 0,
    }
    
    # Category-specific scoring
    cat_libs = {
        "OSMnx": ["osmnx", "ox."],
        "Rasterio": ["rasterio"],
        "MovingPandas": ["movingpandas", "mpd."],
        "GeoPandas": ["geopandas", "gpd."],
        "PyProj": ["pyproj", "crs", "epsg"],
        "H3": ["h3."],
        "Folium": ["folium"],
    }
    if category in cat_libs:
        scores["correct_library"] = 1 if any(lib in r for lib in cat_libs[category]) else 0
    else:
        scores["correct_library"] = scores["has_gis_lib"]
    
    scores["total"] = sum(scores.values())
    scores["max_possible"] = len(scores) - 1  # exclude total
    scores["pct"] = scores["total"] / scores["max_possible"] * 100
    return scores
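

# Example of how the heuristic scoring behaves on a hypothetical response
# (every check is a substring match; the generated code is never executed):
#
#   >>> demo = ('First, load the data.\n```python\nimport geopandas as gpd\n'
#   ...         'def count_points(pts, polys): ...\n```')
#   >>> s = score_response(demo, "GeoPandas")
#   >>> (s["has_code"], s["has_cot"], s["correct_library"])
#   (1, 1, 1)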


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_id", default="Qwen/Qwen2.5-Coder-7B-Instruct")
    parser.add_argument("--adapter_id", default="RhodWeo/GIS-Coder-7B")
    parser.add_argument("--compare_base", action="store_true", help="Also evaluate base model for comparison")
    parser.add_argument("--max_new_tokens", type=int, default=768)
    parser.add_argument("--output", type=str, default="eval_results.json")
    parser.add_argument("--no_quantize", action="store_true")
    args = parser.parse_args()
    
    results = {"fine_tuned": [], "base": []}
    
    # Evaluate fine-tuned model
    print(f"\n{'='*60}")
    print(f"EVALUATING: {args.base_model_id} + {args.adapter_id}")
    print(f"{'='*60}")
    
    model, tokenizer = load_model(args.base_model_id, args.adapter_id, not args.no_quantize)
    
    for i, item in enumerate(EVAL_PROMPTS):
        print(f"\n[{i+1}/{len(EVAL_PROMPTS)}] {item['category']} ({item['difficulty']})")
        print(f"  Prompt: {item['prompt'][:80]}...")
        
        response, gen_time = generate(model, tokenizer, item["prompt"], args.max_new_tokens)
        scores = score_response(response, item["category"])
        
        print(f"  Time: {gen_time:.1f}s | Score: {scores['pct']:.0f}% | Code: {'✓' if scores['has_code'] else '✗'} | "
              f"Lib: {'✓' if scores['correct_library'] else '✗'} | CoT: {'✓' if scores['has_cot'] else '✗'}")
        
        results["fine_tuned"].append({
            **item, "response": response, "gen_time": gen_time, "scores": scores,
        })
    
    # Summary
    ft_scores = [r["scores"]["pct"] for r in results["fine_tuned"]]
    print(f"\n{'='*60}")
    print(f"FINE-TUNED MODEL SUMMARY")
    print(f"  Avg score: {sum(ft_scores)/len(ft_scores):.1f}%")
    print(f"  Code blocks: {sum(r['scores']['has_code'] for r in results['fine_tuned'])}/{len(EVAL_PROMPTS)}")
    print(f"  Correct library: {sum(r['scores']['correct_library'] for r in results['fine_tuned'])}/{len(EVAL_PROMPTS)}")
    print(f"  Chain-of-thought: {sum(r['scores']['has_cot'] for r in results['fine_tuned'])}/{len(EVAL_PROMPTS)}")
    
    by_cat = {}
    for r in results["fine_tuned"]:
        cat = r["category"]
        by_cat.setdefault(cat, []).append(r["scores"]["pct"])
    print(f"\n  By category:")
    for cat, scores in sorted(by_cat.items()):
        print(f"    {cat}: {sum(scores)/len(scores):.0f}%")
    
    # Optionally compare with base model
    if args.compare_base:
        print(f"\n{'='*60}")
        print(f"EVALUATING BASE: {args.base_model_id}")
        print(f"{'='*60}")
        
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        base_model, _ = load_model(args.base_model_id, None, not args.no_quantize)
        
        for i, item in enumerate(EVAL_PROMPTS):
            response, gen_time = generate(base_model, tokenizer, item["prompt"], args.max_new_tokens)
            scores = score_response(response, item["category"])
            results["base"].append({**item, "response": response, "gen_time": gen_time, "scores": scores})
        
        base_scores = [r["scores"]["pct"] for r in results["base"]]
        print(f"\n  BASE: {sum(base_scores)/len(base_scores):.1f}% vs FINE-TUNED: {sum(ft_scores)/len(ft_scores):.1f}%")
        print(f"  Improvement: +{sum(ft_scores)/len(ft_scores) - sum(base_scores)/len(base_scores):.1f}pp")
    
    # Save results
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()