prana_env

Sleeping

App Files Files Community

pbanavara commited on Mar 8

Commit

9cc586f

verified ·

1 Parent(s): 75a4eab

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +1 -1
prana_grpo_qwen35_9b.ipynb +542 -0
server/requirements.txt +1 -1
setup.py +11 -0

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: purple
 colorTo: indigo
 sdk: docker
 pinned: false
-app_port: 8000
 base_path: /web
 tags:
   - openenv

 colorTo: indigo
 sdk: docker
 pinned: false
+app_port: 7860
 base_path: /web
 tags:
   - openenv

prana_grpo_qwen35_9b.ipynb ADDED Viewed

	@@ -0,0 +1,542 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PRANA-Env: Reinforcement Learning with Qwen3.5-9B\n",
+    "\n",
+    "Fine-tune **Qwen3.5-9B** using **GRPO** on the PRANA kidney transplant administration environment.\n",
+    "\n",
+    "The agent must:\n",
+    "1. Query fragmented clinical datastores\n",
+    "2. Detect stale lab values (90-day KARS recency window)\n",
+    "3. Detect anomalous measurements (>25% change within 14 days)\n",
+    "4. File a complete KARS-compliant SRTR report\n",
+    "\n",
+    "Reward signal comes from the deterministic KARS validator in prana_env.\n",
+    "\n",
+    "**Hardware**: H100 80GB recommended. BF16 LoRA, no 4-bit quantization.\n",
+    "\n",
+    "**Baseline**: Qwen3:8b untuned scores **0.71 Pass@1** on temporal/anomaly tasks.  \n",
+    "**Target**: ≥ 0.90 Pass@1 after GRPO fine-tuning."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Installation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "import os, importlib.util\n",
+    "!pip install --upgrade -qqq uv\n",
+    "if importlib.util.find_spec('torch') is None or 'COLAB_' in ''.join(os.environ.keys()):\n",
+    "    try: import numpy; get_numpy = f'numpy=={numpy.__version__}'\n",
+    "    except: get_numpy = 'numpy'\n",
+    "    !uv pip install -qqq \\\n",
+    "        'torch>=2.8.0' 'triton>=3.4.0' {get_numpy} torchvision bitsandbytes 'transformers==4.56.2' \\\n",
+    "        'unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo' \\\n",
+    "        'unsloth[base] @ git+https://github.com/unslothai/unsloth'\n",
+    "elif importlib.util.find_spec('unsloth') is None:\n",
+    "    !uv pip install -qqq unsloth\n",
+    "!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers trl==0.22.2 unsloth unsloth_zoo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "# Clone prana_env and install dependencies\n",
+    "!git clone https://github.com/pbanavara/prana_env.git\n",
+    "!uv pip install -qqq fastapi uvicorn websockets pydantic openenv requests\n",
+    "%cd prana_env\n",
+    "!uv pip install -qqq -e .\n",
+    "\n",
+    "import sys, os\n",
+    "sys.path.insert(0, '.')\n",
+    "working_directory = os.path.abspath('.')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Load Qwen3.5-9B with LoRA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from unsloth import FastLanguageModel\n",
+    "import torch\n",
+    "\n",
+    "max_seq_length = 2048  # Multi-turn clinical dialogue needs longer context\n",
+    "lora_rank = 16         # Higher rank than 2048-game notebook — more complex reasoning task\n",
+    "\n",
+    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "    model_name = 'unsloth/Qwen3.5-9B',\n",
+    "    load_in_4bit = False,   # BF16 — QLoRA not recommended for Qwen3.5\n",
+    "    max_seq_length = max_seq_length,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = FastLanguageModel.get_peft_model(\n",
+    "    model,\n",
+    "    r = lora_rank,\n",
+    "    target_modules = [\n",
+    "        'q_proj', 'k_proj', 'v_proj', 'o_proj',\n",
+    "        'gate_proj', 'up_proj', 'down_proj',\n",
+    "    ],\n",
+    "    lora_alpha = lora_rank * 2,\n",
+    "    use_gradient_checkpointing = 'unsloth',\n",
+    "    random_state = 3407,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Launch prana_env server\n",
+    "\n",
+    "Start the FastAPI + WebSocket server as a local subprocess — same pattern as the OpenEnv 2048 notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess, time, requests\n",
+    "\n",
+    "PRANA_PORT = 8000\n",
+    "PRANA_BASE_URL = f'http://localhost:{PRANA_PORT}'\n",
+    "_server_proc = None\n",
+    "\n",
+    "def launch_prana_server():\n",
+    "    \"\"\"\n",
+    "    Start the prana_env uvicorn server unless one already answers on PRANA_BASE_URL.\n",
+    "\n",
+    "    The subprocess handle is kept in the module-level ``_server_proc``. If a handle\n",
+    "    exists but ``/health`` does not answer, the old process is killed and reaped and\n",
+    "    a new one is spawned. Startup is polled once per second for up to 20 attempts.\n",
+    "\n",
+    "    Raises:\n",
+    "        RuntimeError: if uvicorn exits during startup, or never answers ``/health``.\n",
+    "    \"\"\"\n",
+    "    global _server_proc\n",
+    "\n",
+    "    def _responds():\n",
+    "        # Any HTTP answer counts as ready; connection errors/timeouts do not.\n",
+    "        try:\n",
+    "            requests.get(f'{PRANA_BASE_URL}/health', timeout=2)\n",
+    "            return True\n",
+    "        except Exception:\n",
+    "            return False\n",
+    "\n",
+    "    if _server_proc is not None:\n",
+    "        if _responds():\n",
+    "            return  # already running\n",
+    "        # Stale handle: kill it and wait() so the child is reaped, not left as a zombie.\n",
+    "        _server_proc.kill()\n",
+    "        _server_proc.wait()\n",
+    "        _server_proc = None\n",
+    "\n",
+    "    _server_proc = subprocess.Popen(\n",
+    "        ['uvicorn', 'server.app:app', '--host', '0.0.0.0', '--port', str(PRANA_PORT)],\n",
+    "        cwd=working_directory,\n",
+    "        stdout=subprocess.DEVNULL,\n",
+    "        stderr=subprocess.DEVNULL,\n",
+    "    )\n",
+    "    # Wait for server to be ready\n",
+    "    for _ in range(20):\n",
+    "        if _responds():\n",
+    "            print(f'prana_env server ready at {PRANA_BASE_URL}')\n",
+    "            return\n",
+    "        exit_code = _server_proc.poll()\n",
+    "        if exit_code is not None:\n",
+    "            # uvicorn already died (import error, port in use, ...): fail fast\n",
+    "            # instead of sleeping through the remaining attempts.\n",
+    "            _server_proc = None\n",
+    "            raise RuntimeError(f'prana_env server exited during startup (exit code {exit_code})')\n",
+    "        time.sleep(1)\n",
+    "    raise RuntimeError('prana_env server failed to start')\n",
+    "\n",
+    "launch_prana_server()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. PRANA-Env client helpers\n",
+    "\n",
+    "Thin wrappers around the WebSocket client for use in the reward function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "from prana_env.client import PranaEnv\n",
+    "from prana_env.models import PranaAction\n",
+    "\n",
+    "PATIENTS = ['P001', 'P002', 'P003']\n",
+    "\n",
+    "def run_episode(action_sequence: list[dict], patient_id=None) -> tuple[float, str]:\n",
+    "    \"\"\"\n",
+    "    Execute a list of parsed actions against prana_env and return (reward, kars_result).\n",
+    "\n",
+    "    action_sequence: list of dicts with keys matching PranaAction fields.\n",
+    "    patient_id: patient to reset the environment with. When omitted, the first\n",
+    "        non-empty 'patient_id' found in the actions is used, so the episode runs\n",
+    "        against the patient the plan was written for; a random patient is chosen\n",
+    "        only when the actions name none.\n",
+    "    Returns (cumulative_reward, 'PASSED'|'FAILED'|'INCOMPLETE').\n",
+    "    \"\"\"\n",
+    "    launch_prana_server()\n",
+    "    if patient_id is None:\n",
+    "        patient_id = next(\n",
+    "            (a['patient_id'] for a in action_sequence\n",
+    "             if isinstance(a, dict) and a.get('patient_id')),\n",
+    "            None,\n",
+    "        ) or random.choice(PATIENTS)\n",
+    "    cumulative_reward = 0.0\n",
+    "    kars_result = 'INCOMPLETE'\n",
+    "\n",
+    "    with PranaEnv(base_url=PRANA_BASE_URL) as env:\n",
+    "        obs = env.reset(patient_id=patient_id)\n",
+    "        for action_dict in action_sequence:\n",
+    "            try:\n",
+    "                action = PranaAction(**action_dict)\n",
+    "                result = env.step(action)\n",
+    "                # A step without a reward must not be mistaken for a malformed action.\n",
+    "                if result.reward is not None:\n",
+    "                    cumulative_reward += result.reward\n",
+    "                if result.done:\n",
+    "                    kars_result = result.observation.kars_result or 'FAILED'\n",
+    "                    break\n",
+    "            except Exception:\n",
+    "                cumulative_reward -= 1.0  # malformed action penalty\n",
+    "                continue\n",
+    "\n",
+    "    return cumulative_reward, kars_result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Action parser\n",
+    "\n",
+    "The model outputs a structured action sequence in its response. We parse it into PranaAction dicts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json, re\n",
+    "\n",
+    "def extract_actions(response: str) -> list[dict]:\n",
+    "    \"\"\"\n",
+    "    Extract a JSON array of actions from the model response.\n",
+    "    The model is instructed to output actions inside ```json ... ``` blocks.\n",
+    "\n",
+    "    Returns the dict elements of the first matching array, or [] when no array\n",
+    "    is found or it is not valid JSON. Non-dict elements are dropped.\n",
+    "    \"\"\"\n",
+    "    # Try fenced JSON block first\n",
+    "    match = re.search(r'```json\\s*(\\[.*?\\])\\s*```', response, re.DOTALL)\n",
+    "    if not match:\n",
+    "        # Fallback: any JSON array in the response\n",
+    "        match = re.search(r'(\\[\\s*\\{.*?\\}\\s*\\])', response, re.DOTALL)\n",
+    "    if not match:\n",
+    "        return []\n",
+    "    try:\n",
+    "        parsed = json.loads(match.group(1))\n",
+    "    except json.JSONDecodeError:\n",
+    "        return []\n",
+    "    # Callers use each element as a dict (``a.get(...)``, ``PranaAction(**a)``);\n",
+    "    # scalars or nested lists in an otherwise valid array would crash them.\n",
+    "    return [item for item in parsed if isinstance(item, dict)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. GRPO prompt\n",
+    "\n",
+    "The prompt instructs the model to output a full action plan for KARS filing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SYSTEM_PROMPT = \"\"\"\n",
+    "You are a clinical administrative agent for a kidney transplant center.\n",
+    "Your task is to file a KARS-compliant SRTR report for a patient.\n",
+    "\n",
+    "Today's date is 2026-03-07 (filing date T5).\n",
+    "The patient has a record from approximately 4 months ago (T1). Some fields may be stale.\n",
+    "\n",
+    "KARS Required Fields:\n",
+    "- hba1c, gfr, creatinine (PatientDB) — time-sensitive, must be within 90 days of filing\n",
+    "- blood_type (PatientDB) — stable, no recency requirement\n",
+    "\n",
+    "OPTN Clinical Integrity Policy:\n",
+    "- If two measurements of the same field within 14 days differ by >25%, do NOT file.\n",
+    "  Communicate the anomaly and recommend a confirmatory test.\n",
+    "\n",
+    "Actions available:\n",
+    "- query_db: {action_type: query_db, target: PatientDB, field: <field>, patient_id: <id>}\n",
+    "- record_value: {action_type: record_value, field: <field>, value: <value>}\n",
+    "- file_report: {action_type: file_report}\n",
+    "\n",
+    "Output your complete action plan as a JSON array inside ```json ... ``` tags.\n",
+    "Reason step by step before outputting actions.\n",
+    "\"\"\".strip()\n",
+    "\n",
+    "USER_PROMPT_TEMPLATE = \"\"\"\n",
+    "File a KARS-compliant SRTR report for patient {patient_id}.\n",
+    "The T1 snapshot from ~4 months ago is pre-loaded in the record.\n",
+    "Check which fields are stale or anomalous, re-query only what is needed, and file.\n",
+    "\"\"\".strip()\n",
+    "\n",
+    "def make_prompt(patient_id: str) -> list[dict]:\n",
+    "    \"\"\"Build the chat prompt (system message, then user message) for one patient.\"\"\"\n",
+    "    system_message = {\"role\": \"system\", \"content\": SYSTEM_PROMPT}\n",
+    "    user_message = {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": USER_PROMPT_TEMPLATE.format(patient_id=patient_id),\n",
+    "    }\n",
+    "    return [system_message, user_message]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Reward functions\n",
+    "\n",
+    "Two reward signals fed to GRPOTrainer:\n",
+    "1. `actions_parseable` — model output is valid JSON with recognizable actions\n",
+    "2. `kars_reward` — KARS validator reward from prana_env (+15 first pass, +10 after correction, -5 fail)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def actions_parseable(completions, **kwargs):\n",
+    "    \"\"\"Score each completion 1.0 if it yields at least one parsed action, else -1.0.\"\"\"\n",
+    "    return [\n",
+    "        1.0 if extract_actions(candidate[0]['content']) else -1.0\n",
+    "        for candidate in completions\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "def kars_reward(completions, prompts, **kwargs):\n",
+    "    \"\"\"\n",
+    "    Execute the action sequence in prana_env and return the KARS reward.\n",
+    "    Reward scale mirrors prana_env:\n",
+    "      +15  KARS PASSED first attempt\n",
+    "      +10  KARS PASSED after correction\n",
+    "       -1  re-query of already-fresh field\n",
+    "       -5  KARS FAILED\n",
+    "      -10  unrecoverable (3 attempts)\n",
+    "    Normalized to [-1, 1] for GRPO stability.\n",
+    "\n",
+    "    Returns one score per completion; unparseable output and any error while\n",
+    "    running the episode both score -1.0.\n",
+    "    \"\"\"\n",
+    "    scores = []\n",
+    "    for completion, prompt in zip(completions, prompts):\n",
+    "        response = completion[0]['content']\n",
+    "        # Keep only dict-shaped actions: a scalar or nested list inside the JSON\n",
+    "        # array would raise on ``.get`` below, outside the try block, and abort\n",
+    "        # the whole training step instead of just scoring this completion.\n",
+    "        actions = [a for a in extract_actions(response) if isinstance(a, dict)]\n",
+    "\n",
+    "        if not actions:\n",
+    "            scores.append(-1.0)\n",
+    "            continue\n",
+    "\n",
+    "        # Extract patient_id from the user message\n",
+    "        patient_id = 'P001'\n",
+    "        for msg in prompt:\n",
+    "            if msg['role'] == 'user':\n",
+    "                m = re.search(r'P00\\d', msg['content'])\n",
+    "                if m:\n",
+    "                    patient_id = m.group(0)\n",
+    "\n",
+    "        # Inject patient_id into query_db actions if missing\n",
+    "        for a in actions:\n",
+    "            if a.get('action_type') == 'query_db' and 'patient_id' not in a:\n",
+    "                a['patient_id'] = patient_id\n",
+    "\n",
+    "        try:\n",
+    "            raw_reward, kars_result = run_episode(actions)\n",
+    "            # Normalize: divide by the best single reward (+15), then clamp to [-1, 1]\n",
+    "            # because accumulated penalties can push the raw total below -15.\n",
+    "            normalized = max(-1.0, min(1.0, raw_reward / 15.0))\n",
+    "            scores.append(normalized)\n",
+    "            print(f'[KARS] patient={patient_id} result={kars_result} raw={raw_reward:.1f} normalized={normalized:.2f}')\n",
+    "        except Exception as e:\n",
+    "            print(f'[KARS] error: {e}')\n",
+    "            scores.append(-1.0)\n",
+    "\n",
+    "    return scores"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Dataset\n",
+    "\n",
+    "Rotate across all 3 patients so the model generalizes rather than memorizing a single case."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "# 1000 episodes cycling through all patients\n",
+    "records = []\n",
+    "for i in range(1000):\n",
+    "    pid = PATIENTS[i % len(PATIENTS)]\n",
+    "    records.append({\n",
+    "        'prompt': make_prompt(pid),\n",
+    "        'answer': 0,\n",
+    "        'enable_thinking': False,   # Qwen3.5 thinking flag (vs reasoning_effort in gpt-oss)\n",
+    "    })\n",
+    "\n",
+    "dataset = Dataset.from_list(records)\n",
+    "\n",
+    "maximum_length = len(tokenizer.apply_chat_template(\n",
+    "    make_prompt('P001'),\n",
+    "    add_generation_prompt=True,\n",
+    "))\n",
+    "print(f'Prompt token length: {maximum_length}')\n",
+    "dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. GRPO Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_prompt_length = maximum_length + 1\n",
+    "max_completion_length = max_seq_length - max_prompt_length\n",
+    "\n",
+    "from trl import GRPOConfig, GRPOTrainer\n",
+    "\n",
+    "training_args = GRPOConfig(\n",
+    "    temperature = 1.0,\n",
+    "    learning_rate = 5e-5,\n",
+    "    weight_decay = 0.001,\n",
+    "    warmup_ratio = 0.1,\n",
+    "    lr_scheduler_type = 'linear',\n",
+    "    optim = 'adamw_8bit',\n",
+    "    logging_steps = 1,\n",
+    "    per_device_train_batch_size = 1,\n",
+    "    gradient_accumulation_steps = 4,\n",
+    "    num_generations = 8,        # Full GRPO batch — H100 80GB can handle this at 9B BF16\n",
+    "    max_prompt_length = max_prompt_length,\n",
+    "    max_completion_length = max_completion_length,\n",
+    "    max_steps = 600,\n",
+    "    save_steps = 100,\n",
+    "    report_to = 'none',\n",
+    "    output_dir = 'outputs',\n",
+    ")\n",
+    "\n",
+    "trainer = GRPOTrainer(\n",
+    "    model = model,\n",
+    "    processing_class = tokenizer,\n",
+    "    reward_funcs = [\n",
+    "        actions_parseable,\n",
+    "        kars_reward,\n",
+    "    ],\n",
+    "    args = training_args,\n",
+    "    train_dataset = dataset,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 10. Inference — test the fine-tuned model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import TextStreamer\n",
+    "\n",
+    "test_patient = 'P002'  # has anomalous GFR/creatinine — hardest case\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    make_prompt(test_patient),\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True,\n",
+    "    enable_thinking=False,\n",
+    ")\n",
+    "\n",
+    "_ = model.generate(\n",
+    "    **tokenizer(text, return_tensors='pt').to('cuda'),\n",
+    "    temperature=1.0,\n",
+    "    max_new_tokens=1024,\n",
+    "    streamer=TextStreamer(tokenizer, skip_prompt=False),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 11. Save model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save LoRA adapters\n",
+    "model.save_pretrained('prana_qwen35_9b_lora')\n",
+    "tokenizer.save_pretrained('prana_qwen35_9b_lora')\n",
+    "\n",
+    "# Push to Hub (optional)\n",
+    "if False:\n",
+    "    model.push_to_hub_merged(\n",
+    "        'pbanavara/prana-qwen35-9b-grpo',\n",
+    "        tokenizer,\n",
+    "        save_method='merged_16bit',\n",
+    "        token='hf_...',\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

server/requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-openenv[core]>=0.2.0
 fastapi>=0.115.0
 uvicorn>=0.24.0

+openenv-core[core]>=0.2.0
 fastapi>=0.115.0
 uvicorn>=0.24.0

setup.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from setuptools import setup
+from setuptools.command.editable_wheel import editable_wheel
+class CompatEditableWheel(editable_wheel):
+    """editable_wheel command that always runs with ``mode`` set to "compat"."""
+    def run(self):
+        # Force the mode before delegating to the stock editable_wheel.run().
+        self.mode = "compat"
+        super().run()
+setup(cmdclass={"editable_wheel": CompatEditableWheel})