Upload folder using huggingface_hub
Browse files- Dockerfile +83 -0
- README.md +194 -5
- __init__.py +24 -0
- client.py +33 -0
- download_data.sh +71 -0
- models.py +38 -0
- openenv.yaml +6 -0
- openenv_finqa_env.egg-info/PKG-INFO +15 -0
- openenv_finqa_env.egg-info/SOURCES.txt +16 -0
- openenv_finqa_env.egg-info/dependency_links.txt +1 -0
- openenv_finqa_env.egg-info/entry_points.txt +2 -0
- openenv_finqa_env.egg-info/requires.txt +11 -0
- openenv_finqa_env.egg-info/top_level.txt +1 -0
- pyproject.toml +34 -0
- server/__init__.py +13 -0
- server/app.py +35 -0
- server/finqa_environment.py +277 -0
- server/rewards.py +282 -0
- server/tools.py +218 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build using openenv-base
|
| 2 |
+
# This Dockerfile is flexible and works for both:
|
| 3 |
+
# - In-repo environments (with local src/core)
|
| 4 |
+
# - Standalone environments (with openenv-core from pip)
|
| 5 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 6 |
+
|
| 7 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 8 |
+
FROM ${BASE_IMAGE} AS builder
|
| 9 |
+
|
| 10 |
+
WORKDIR /app
|
| 11 |
+
|
| 12 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 13 |
+
ARG BUILD_MODE=in-repo
|
| 14 |
+
|
| 15 |
+
# Copy environment code (always at root of build context)
|
| 16 |
+
COPY . /app/env
|
| 17 |
+
|
| 18 |
+
# For in-repo builds, openenv-core is already in the pyproject.toml dependencies
|
| 19 |
+
# For standalone builds, openenv-core will be installed from pip via pyproject.toml
|
| 20 |
+
WORKDIR /app/env
|
| 21 |
+
|
| 22 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 23 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 24 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 25 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 26 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 27 |
+
fi
|
| 28 |
+
|
| 29 |
+
# Install git for building from git repos (build-time only)
|
| 30 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 31 |
+
git \
|
| 32 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 33 |
+
|
| 34 |
+
# Install dependencies using uv sync
|
| 35 |
+
# First pass: install dependencies without the project (for better caching)
|
| 36 |
+
# Second pass: install the project itself
|
| 37 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 38 |
+
if [ -f uv.lock ]; then \
|
| 39 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 40 |
+
else \
|
| 41 |
+
uv sync --no-install-project --no-editable; \
|
| 42 |
+
fi
|
| 43 |
+
|
| 44 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 45 |
+
if [ -f uv.lock ]; then \
|
| 46 |
+
uv sync --frozen --no-editable; \
|
| 47 |
+
else \
|
| 48 |
+
uv sync --no-editable; \
|
| 49 |
+
fi
|
| 50 |
+
|
| 51 |
+
# Final runtime stage
|
| 52 |
+
FROM ${BASE_IMAGE}
|
| 53 |
+
|
| 54 |
+
WORKDIR /app
|
| 55 |
+
|
| 56 |
+
# Copy the virtual environment from builder
|
| 57 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 58 |
+
|
| 59 |
+
# Copy the environment code
|
| 60 |
+
COPY --from=builder /app/env /app/env
|
| 61 |
+
|
| 62 |
+
# Set PATH to use the virtual environment
|
| 63 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 64 |
+
|
| 65 |
+
# Set PYTHONPATH so imports work correctly
|
| 66 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 67 |
+
|
| 68 |
+
# Environment variables with defaults
|
| 69 |
+
ENV FINQA_DATA_PATH="/app/env/data"
|
| 70 |
+
ENV FINQA_MAX_STEPS="50"
|
| 71 |
+
ENV FINQA_TASK="finqa"
|
| 72 |
+
|
| 73 |
+
# Download data from HuggingFace at build time (requires network)
|
| 74 |
+
RUN pip install --no-cache-dir huggingface_hub[cli] && \
|
| 75 |
+
bash /app/env/download_data.sh snorkelai/finqa-data /app/env/data
|
| 76 |
+
|
| 77 |
+
# Health check using Python (more portable than curl/wget)
|
| 78 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 79 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 80 |
+
|
| 81 |
+
# Run the FastAPI server
|
| 82 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 83 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,199 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: FinQA Environment Server
|
| 3 |
+
emoji: 🔊
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
datasets:
|
| 11 |
+
- snorkelai/finqa-data
|
| 12 |
+
tags:
|
| 13 |
+
- openenv
|
| 14 |
---
|
| 15 |
|
| 16 |
+
# FinQA Environment
|
| 17 |
+
|
| 18 |
+
A financial question-answering environment for RL training. Evaluates LLMs on their ability to answer complex financial questions using tool calls on SEC 10-K filing data.
|
| 19 |
+
|
| 20 |
+
Based on [FinQABenchmark](https://github.com/snorkel-ai/FinQABenchmark) from Snorkel AI.
|
| 21 |
+
|
| 22 |
+
## Overview
|
| 23 |
+
|
| 24 |
+
FinQA tests an agent's ability to:
|
| 25 |
+
- Explore available financial tables for a company
|
| 26 |
+
- Query table metadata and execute SQL queries
|
| 27 |
+
- Perform calculations on extracted data
|
| 28 |
+
- Submit final answers to financial questions
|
| 29 |
+
|
| 30 |
+
**Dataset**: 290 questions from SEC 10-K filings across multiple companies (Alphabet, Amazon, Apple, AT&T, etc.)
|
| 31 |
+
|
| 32 |
+
**Reward**: Binary (1.0 for correct answer, 0.0 for incorrect) using fuzzy numerical matching with 1% tolerance.
|
| 33 |
+
|
| 34 |
+
> **Note**: This dataset is for evaluation only. Do not train on it.
|
| 35 |
+
|
| 36 |
+
## Quick Start
|
| 37 |
+
|
| 38 |
+
### Using Docker
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
# Build the image (from OpenEnv repo root)
|
| 42 |
+
docker build -t finqa-env:latest -f envs/finqa_env/server/Dockerfile .
|
| 43 |
+
|
| 44 |
+
# Run the server
|
| 45 |
+
docker run -p 8000:8000 finqa-env:latest
|
| 46 |
+
|
| 47 |
+
# To run evaluation script (example model gpt-5)
|
| 48 |
+
API_BASE_URL=https://api.openai.com/v1 API_KEY=$OPENAI_API_KEY MODEL=gpt-5 python examples/finqa_inference.py
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### Local Development
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
# Install dependencies
|
| 55 |
+
uv pip install pandas
|
| 56 |
+
|
| 57 |
+
# Download data from HuggingFace
|
| 58 |
+
cd envs/finqa_env
|
| 59 |
+
./download_data.sh
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
### Using the Client
|
| 63 |
+
|
| 64 |
+
The client uses the MCP protocol and is async by default:
|
| 65 |
+
|
| 66 |
+
```python
|
| 67 |
+
import asyncio
|
| 68 |
+
from envs.finqa_env import FinQAEnv, CallToolAction
|
| 69 |
+
|
| 70 |
+
async def main():
|
| 71 |
+
async with FinQAEnv(base_url="http://localhost:8000") as env:
|
| 72 |
+
# Reset to get a question
|
| 73 |
+
obs = await env.reset()
|
| 74 |
+
question = obs.metadata["question"]
|
| 75 |
+
company = obs.metadata["company"]
|
| 76 |
+
print(f"Question: {question}")
|
| 77 |
+
print(f"Company: {company}")
|
| 78 |
+
|
| 79 |
+
# Discover available tools
|
| 80 |
+
tools = await env.list_tools()
|
| 81 |
+
print([t.name for t in tools])
|
| 82 |
+
|
| 83 |
+
# Use tools via call_tool (convenience method)
|
| 84 |
+
result = await env.call_tool("get_descriptions", company_name=company)
|
| 85 |
+
print(f"Available tables: {result}")
|
| 86 |
+
|
| 87 |
+
# Or use step() with CallToolAction for full observation access
|
| 88 |
+
step_result = await env.step(CallToolAction(
|
| 89 |
+
tool_name="sql_query",
|
| 90 |
+
arguments={
|
| 91 |
+
"company_name": "alphabet",
|
| 92 |
+
"table_name": "us_gaap_ScheduleOfIncomeBeforeIncomeTaxDomesticAndForeignTableTextBlock",
|
| 93 |
+
"query": "SELECT * FROM data WHERE year = '2022'"
|
| 94 |
+
}
|
| 95 |
+
))
|
| 96 |
+
print(f"Done: {step_result.done}, Reward: {step_result.reward}")
|
| 97 |
+
|
| 98 |
+
# Submit answer
|
| 99 |
+
result = await env.call_tool("submit_answer", answer="6.118")
|
| 100 |
+
|
| 101 |
+
asyncio.run(main())
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
## Available Tools
|
| 105 |
+
|
| 106 |
+
Tools are auto-discovered via MCP. Use `await env.list_tools()` to see all available tools at runtime.
|
| 107 |
+
|
| 108 |
+
| Tool | Description | Arguments |
|
| 109 |
+
|------|-------------|-----------|
|
| 110 |
+
| `get_descriptions` | Get list of available table names for a company | `company_name: str` |
|
| 111 |
+
| `get_table_info` | Get table metadata (columns, dtypes, unique values) | `company_name: str, table_name: str` |
|
| 112 |
+
| `sql_query` | Execute SQL query on a table (requires filters) | `company_name: str, table_name: str, query: str` |
|
| 113 |
+
| `submit_answer` | Submit final answer (ends episode) | `answer: str` |
|
| 114 |
+
|
| 115 |
+
### Tool Constraints
|
| 116 |
+
|
| 117 |
+
- **sql_query**: Must include filters (`WHERE`, `HAVING`, etc.). `SELECT *` is not allowed.
|
| 118 |
+
|
| 119 |
+
## Environment Variables
|
| 120 |
+
|
| 121 |
+
| Variable | Default | Description |
|
| 122 |
+
|----------|---------|-------------|
|
| 123 |
+
| `FINQA_DATA_PATH` | `/app/env/data` | Path to data directory |
|
| 124 |
+
| `FINQA_MAX_STEPS` | `50` | Maximum tool calls per episode |
|
| 125 |
+
| `FINQA_TASK` | `finqa` | Task name |
|
| 126 |
+
|
| 127 |
+
## Reward Computation
|
| 128 |
+
|
| 129 |
+
Rewards use fuzzy numerical matching:
|
| 130 |
+
|
| 131 |
+
- Extracts numbers from `\boxed{...}` format
|
| 132 |
+
- Handles percentages, fractions, and decimals
|
| 133 |
+
- 1% relative tolerance or 0.01 absolute tolerance
|
| 134 |
+
- Returns `1.0` for correct, `0.0` for incorrect
|
| 135 |
+
|
| 136 |
+
## Local Development
|
| 137 |
+
|
| 138 |
+
```bash
|
| 139 |
+
# From OpenEnv repo root
|
| 140 |
+
cd envs/finqa_env
|
| 141 |
+
|
| 142 |
+
# Run server locally
|
| 143 |
+
FINQA_DATA_PATH=./data uvicorn server.app:app --reload --port 8000
|
| 144 |
+
|
| 145 |
+
# Test with curl
|
| 146 |
+
curl http://localhost:8000/health
|
| 147 |
+
curl -X POST http://localhost:8000/reset
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
## Integration with RL Frameworks
|
| 151 |
+
|
| 152 |
+
### TRL (GRPO)
|
| 153 |
+
|
| 154 |
+
```python
|
| 155 |
+
import asyncio
|
| 156 |
+
from trl import GRPOTrainer
|
| 157 |
+
from envs.finqa_env import FinQAEnv
|
| 158 |
+
|
| 159 |
+
async def rollout_func(prompts, trainer):
|
| 160 |
+
async with FinQAEnv(base_url="http://localhost:8000") as env:
|
| 161 |
+
obs = await env.reset()
|
| 162 |
+
# Your agent logic here using await env.call_tool(...)
|
| 163 |
+
return {"reward": obs.reward, "completion": completion}
|
| 164 |
+
|
| 165 |
+
trainer = GRPOTrainer(
|
| 166 |
+
model=model,
|
| 167 |
+
rollout_func=rollout_func,
|
| 168 |
+
...
|
| 169 |
+
)
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
## Project Structure
|
| 173 |
+
|
| 174 |
+
```
|
| 175 |
+
finqa_env/
|
| 176 |
+
├── __init__.py # Exports FinQAEnv, CallToolAction, ListToolsAction
|
| 177 |
+
├── models.py # FinQAState and tool name constants
|
| 178 |
+
├── client.py # MCP client (subclasses MCPToolClient)
|
| 179 |
+
├── pyproject.toml # Dependencies
|
| 180 |
+
├── README.md # This file
|
| 181 |
+
├── data/ # Benchmark data (run download_data.sh)
|
| 182 |
+
│ ├── benchmark_questions/
|
| 183 |
+
│ │ └── finqa.csv
|
| 184 |
+
│ └── input_companies/
|
| 185 |
+
│ └── [company folders]
|
| 186 |
+
├── download_data.sh # Downloads data from HuggingFace
|
| 187 |
+
└── server/
|
| 188 |
+
├── __init__.py
|
| 189 |
+
├── finqa_environment.py # MCPEnvironment subclass with @mcp.tool decorators
|
| 190 |
+
├── tools.py # Tool implementations
|
| 191 |
+
├── rewards.py # Reward computation
|
| 192 |
+
├── app.py # FastAPI server
|
| 193 |
+
└── Dockerfile
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
## References
|
| 197 |
+
|
| 198 |
+
- [HuggingFace Dataset](https://huggingface.co/datasets/snorkelai/agent-finance-reasoning)
|
| 199 |
+
- [Leaderboard](https://leaderboard.snorkel.ai/category/snorkelfinance)
|
__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/__init__.py
|
| 2 |
+
"""
|
| 3 |
+
FinQA Environment for OpenEnv.
|
| 4 |
+
|
| 5 |
+
A financial question-answering environment that evaluates LLMs on their ability
|
| 6 |
+
to answer complex financial questions using tool calls on SEC 10-K filing data.
|
| 7 |
+
|
| 8 |
+
Example:
|
| 9 |
+
>>> from envs.finqa_env import FinQAEnv
|
| 10 |
+
>>>
|
| 11 |
+
>>> async with FinQAEnv(base_url="http://localhost:8000") as env:
|
| 12 |
+
... await env.reset()
|
| 13 |
+
... tools = await env.list_tools()
|
| 14 |
+
... result = await env.call_tool("get_descriptions", company_name="alphabet")
|
| 15 |
+
... result = await env.call_tool("submit_answer", answer="6.118")
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from .client import FinQAEnv
|
| 19 |
+
from .models import FinQAState
|
| 20 |
+
|
| 21 |
+
# Re-export MCP types for convenience
|
| 22 |
+
from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
|
| 23 |
+
|
| 24 |
+
__all__ = ["FinQAEnv", "FinQAState", "CallToolAction", "ListToolsAction"]
|
client.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/client.py
|
| 2 |
+
"""
|
| 3 |
+
Client for the FinQA environment.
|
| 4 |
+
|
| 5 |
+
This client connects to a running FinQA environment server and provides
|
| 6 |
+
a Python interface for interacting with it via MCP tools. Async by default.
|
| 7 |
+
|
| 8 |
+
Example:
|
| 9 |
+
>>> from envs.finqa_env import FinQAEnv
|
| 10 |
+
>>>
|
| 11 |
+
>>> async with FinQAEnv(base_url="http://localhost:8000") as env:
|
| 12 |
+
... await env.reset()
|
| 13 |
+
... tools = await env.list_tools()
|
| 14 |
+
... result = await env.call_tool("get_descriptions", company_name="alphabet")
|
| 15 |
+
... print(result)
|
| 16 |
+
... result = await env.call_tool("submit_answer", answer="6.118")
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from openenv.core.mcp_client import MCPToolClient
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class FinQAEnv(MCPToolClient):
|
| 23 |
+
"""
|
| 24 |
+
Client for the FinQA environment.
|
| 25 |
+
|
| 26 |
+
Inherits all functionality from MCPToolClient:
|
| 27 |
+
- list_tools(): Discover available tools
|
| 28 |
+
- call_tool(name, **kwargs): Call a tool by name
|
| 29 |
+
- reset(**kwargs): Reset the environment
|
| 30 |
+
- step(action): Execute an action
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
pass # MCPToolClient provides all needed functionality
|
download_data.sh
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Download FinQA data from HuggingFace
|
| 3 |
+
#
|
| 4 |
+
# This script downloads all FinQA data from HuggingFace:
|
| 5 |
+
# 1. Benchmark questions CSV
|
| 6 |
+
# 2. Company financial documents (preprocessed SEC 10-K filings)
|
| 7 |
+
#
|
| 8 |
+
# Usage:
|
| 9 |
+
# ./download_data.sh <hf_repo_or_url> [output_dir]
|
| 10 |
+
|
| 11 |
+
set -e
|
| 12 |
+
|
| 13 |
+
HF_REPO_OR_URL="${1}"
|
| 14 |
+
OUTPUT_DIR="${2:-./data}"
|
| 15 |
+
|
| 16 |
+
if [ -z "$HF_REPO_OR_URL" ]; then
|
| 17 |
+
echo "Usage: $0 <hf_repo_or_url> [output_dir]"
|
| 18 |
+
echo "Example: $0 snorkelai/finqa-data ./data"
|
| 19 |
+
exit 1
|
| 20 |
+
fi
|
| 21 |
+
|
| 22 |
+
echo "========================================"
|
| 23 |
+
echo "FinQA Data Download"
|
| 24 |
+
echo "========================================"
|
| 25 |
+
echo "Output directory: $OUTPUT_DIR"
|
| 26 |
+
echo ""
|
| 27 |
+
|
| 28 |
+
# Create output directory
|
| 29 |
+
mkdir -p "$OUTPUT_DIR"
|
| 30 |
+
|
| 31 |
+
# Check if data already exists
|
| 32 |
+
if [ -f "$OUTPUT_DIR/benchmark_questions/finqa.csv" ] && [ -d "$OUTPUT_DIR/input_companies" ]; then
|
| 33 |
+
echo "Data already exists in $OUTPUT_DIR, skipping download."
|
| 34 |
+
exit 0
|
| 35 |
+
fi
|
| 36 |
+
|
| 37 |
+
# Check for huggingface-cli
|
| 38 |
+
if ! command -v huggingface-cli &> /dev/null; then
|
| 39 |
+
echo "Error: huggingface-cli not found"
|
| 40 |
+
echo "Install it with: uv pip install huggingface_hub[cli]"
|
| 41 |
+
exit 1
|
| 42 |
+
fi
|
| 43 |
+
|
| 44 |
+
# Download from HuggingFace
|
| 45 |
+
echo "Downloading from HuggingFace: $HF_REPO_OR_URL"
|
| 46 |
+
if ! huggingface-cli download "$HF_REPO_OR_URL" --repo-type dataset --local-dir "$OUTPUT_DIR"; then
|
| 47 |
+
echo "Error: Failed to download dataset"
|
| 48 |
+
exit 1
|
| 49 |
+
fi
|
| 50 |
+
|
| 51 |
+
# Verify downloaded data
|
| 52 |
+
if [ ! -f "$OUTPUT_DIR/benchmark_questions/finqa.csv" ]; then
|
| 53 |
+
echo "Error: benchmark_questions/finqa.csv not found in downloaded data"
|
| 54 |
+
exit 1
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
if [ ! -d "$OUTPUT_DIR/input_companies" ]; then
|
| 58 |
+
echo "Error: input_companies/ directory not found in downloaded data"
|
| 59 |
+
exit 1
|
| 60 |
+
fi
|
| 61 |
+
|
| 62 |
+
echo ""
|
| 63 |
+
echo "========================================"
|
| 64 |
+
echo "Download complete!"
|
| 65 |
+
echo "========================================"
|
| 66 |
+
echo "Data location: $OUTPUT_DIR"
|
| 67 |
+
echo ""
|
| 68 |
+
|
| 69 |
+
# Export data path
|
| 70 |
+
export FINQA_DATA_PATH="$OUTPUT_DIR"
|
| 71 |
+
echo "Exported: FINQA_DATA_PATH=$FINQA_DATA_PATH"
|
models.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/models.py
|
| 2 |
+
"""
|
| 3 |
+
State types for the FinQA environment.
|
| 4 |
+
|
| 5 |
+
FinQA is a financial question-answering benchmark that evaluates LLMs on their
|
| 6 |
+
ability to answer complex financial questions using tool calls (SQL queries,
|
| 7 |
+
calculations, etc.) on SEC 10-K filing data.
|
| 8 |
+
|
| 9 |
+
This environment uses the MCP protocol for tool interactions. Use
|
| 10 |
+
``CallToolAction`` and ``ListToolsAction`` from ``openenv.core.env_server.mcp_types``
|
| 11 |
+
to interact with the environment.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from openenv.core.env_server import State
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Tool names - defined statically to avoid circular imports
|
| 18 |
+
AVAILABLE_TOOLS = ["get_descriptions", "get_table_info", "sql_query", "submit_answer"]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class FinQAState(State):
|
| 22 |
+
"""
|
| 23 |
+
Internal environment state for tracking the current episode.
|
| 24 |
+
|
| 25 |
+
All fields are set during reset() and are essential for episode tracking.
|
| 26 |
+
|
| 27 |
+
Attributes:
|
| 28 |
+
current_question: The question being asked
|
| 29 |
+
current_company: The company the question is about
|
| 30 |
+
ground_truth: The expected answer for reward computation
|
| 31 |
+
question_id: Identifier for the current question
|
| 32 |
+
# Inherited from State: episode_id, step_count
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
current_question: str = ""
|
| 36 |
+
current_company: str = ""
|
| 37 |
+
ground_truth: str = ""
|
| 38 |
+
question_id: str = ""
|
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: finqa_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
openenv_finqa_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-finqa-env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: FinQA Environment for OpenEnv - financial question-answering on SEC 10-K filing data
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.1
|
| 7 |
+
Requires-Dist: fastapi>=0.115.0
|
| 8 |
+
Requires-Dist: fastmcp>=2.0.0
|
| 9 |
+
Requires-Dist: pydantic>=2.0.0
|
| 10 |
+
Requires-Dist: uvicorn>=0.24.0
|
| 11 |
+
Requires-Dist: requests>=2.31.0
|
| 12 |
+
Requires-Dist: pandas>=2.0.0
|
| 13 |
+
Provides-Extra: dev
|
| 14 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 15 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_finqa_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
./__init__.py
|
| 4 |
+
./client.py
|
| 5 |
+
./models.py
|
| 6 |
+
openenv_finqa_env.egg-info/PKG-INFO
|
| 7 |
+
openenv_finqa_env.egg-info/SOURCES.txt
|
| 8 |
+
openenv_finqa_env.egg-info/dependency_links.txt
|
| 9 |
+
openenv_finqa_env.egg-info/entry_points.txt
|
| 10 |
+
openenv_finqa_env.egg-info/requires.txt
|
| 11 |
+
openenv_finqa_env.egg-info/top_level.txt
|
| 12 |
+
server/__init__.py
|
| 13 |
+
server/app.py
|
| 14 |
+
server/finqa_environment.py
|
| 15 |
+
server/rewards.py
|
| 16 |
+
server/tools.py
|
openenv_finqa_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_finqa_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = finqa_env.server.app:main
|
openenv_finqa_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.1
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
fastmcp>=2.0.0
|
| 4 |
+
pydantic>=2.0.0
|
| 5 |
+
uvicorn>=0.24.0
|
| 6 |
+
requests>=2.31.0
|
| 7 |
+
pandas>=2.0.0
|
| 8 |
+
|
| 9 |
+
[dev]
|
| 10 |
+
pytest>=8.0.0
|
| 11 |
+
pytest-cov>=4.0.0
|
openenv_finqa_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
finqa_env
|
pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-finqa-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "FinQA Environment for OpenEnv - financial question-answering on SEC 10-K filing data"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
# Core OpenEnv dependencies (required for server functionality)
|
| 12 |
+
"openenv-core[core]>=0.2.1",
|
| 13 |
+
"fastapi>=0.115.0",
|
| 14 |
+
"fastmcp>=2.0.0",
|
| 15 |
+
"pydantic>=2.0.0",
|
| 16 |
+
"uvicorn>=0.24.0",
|
| 17 |
+
"requests>=2.31.0",
|
| 18 |
+
# FinQA environment specific dependencies
|
| 19 |
+
"pandas>=2.0.0",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
[project.optional-dependencies]
|
| 23 |
+
dev = [
|
| 24 |
+
"pytest>=8.0.0",
|
| 25 |
+
"pytest-cov>=4.0.0",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
[project.scripts]
|
| 29 |
+
server = "finqa_env.server.app:main"
|
| 30 |
+
|
| 31 |
+
[tool.setuptools]
|
| 32 |
+
packages = ["finqa_env", "finqa_env.server"]
|
| 33 |
+
package-dir = { "finqa_env" = ".", "finqa_env.server" = "server" }
|
| 34 |
+
|
server/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/server/__init__.py
|
| 2 |
+
"""Server-side components for the FinQA environment."""
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def __getattr__(name):
|
| 6 |
+
if name == "FinQAEnvironment":
|
| 7 |
+
from .finqa_environment import FinQAEnvironment
|
| 8 |
+
|
| 9 |
+
return FinQAEnvironment
|
| 10 |
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
__all__ = ["FinQAEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/server/app.py
|
| 2 |
+
"""
|
| 3 |
+
FastAPI server for the FinQA environment.
|
| 4 |
+
|
| 5 |
+
Environment Variables:
|
| 6 |
+
FINQA_DATA_PATH: Path to data directory (default: /app/env/data)
|
| 7 |
+
FINQA_MAX_STEPS: Maximum tool calls per episode (default: 50)
|
| 8 |
+
FINQA_TASK: Task name (default: finqa)
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
from openenv.core.env_server.http_server import create_app
|
| 14 |
+
from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
|
| 15 |
+
from .finqa_environment import FinQAEnvironment
|
| 16 |
+
|
| 17 |
+
DATA_PATH = os.environ.get("FINQA_DATA_PATH", "/app/env/data")
|
| 18 |
+
MAX_STEPS = int(os.environ.get("FINQA_MAX_STEPS", "50"))
|
| 19 |
+
TASK = os.environ.get("FINQA_TASK", "finqa")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _env_factory():
|
| 23 |
+
"""Create a new FinQAEnvironment instance for each session."""
|
| 24 |
+
return FinQAEnvironment(
|
| 25 |
+
data_path=DATA_PATH,
|
| 26 |
+
max_steps=MAX_STEPS,
|
| 27 |
+
task=TASK,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Pass the class (factory) instead of instance for WebSocket session support
|
| 32 |
+
# Use MCP types for action/observation since this is a pure MCP environment
|
| 33 |
+
app = create_app(
|
| 34 |
+
_env_factory, CallToolAction, CallToolObservation, env_name="finqa_env"
|
| 35 |
+
)
|
server/finqa_environment.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/server/finqa_environment.py
|
| 2 |
+
"""
|
| 3 |
+
FinQA Environment Implementation.
|
| 4 |
+
|
| 5 |
+
A financial question-answering environment that evaluates LLMs on their ability
|
| 6 |
+
to answer complex financial questions using tool calls on SEC 10-K filing data.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import os
|
| 11 |
+
import random
|
| 12 |
+
import uuid
|
| 13 |
+
from typing import Any, Dict, List, Optional
|
| 14 |
+
|
| 15 |
+
import pandas as pd
|
| 16 |
+
from fastmcp import FastMCP
|
| 17 |
+
|
| 18 |
+
from openenv.core.env_server.mcp_environment import MCPEnvironment
|
| 19 |
+
from openenv.core.env_server.mcp_types import CallToolAction
|
| 20 |
+
from openenv.core.env_server.types import Action, Observation
|
| 21 |
+
|
| 22 |
+
from ..models import FinQAState, AVAILABLE_TOOLS
|
| 23 |
+
from .rewards import compute_reward
|
| 24 |
+
from .tools import FinQATools
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class FinQAEnvironment(MCPEnvironment):
    """
    Financial QA environment for RL training.

    Evaluates agents on their ability to answer financial questions by:
    - Exploring available tables for a company
    - Querying table metadata and executing SQL queries
    - Performing calculations
    - Submitting final answers

    Episodes terminate either when the agent calls the ``submit_answer``
    tool (reward computed against the ground truth) or when ``max_steps``
    actions have been taken (reward 0.0).

    Args:
        data_path: Path to the data directory containing benchmark_questions/ and input_companies/
        max_steps: Maximum number of tool calls per episode (default: 50)
        task: Task name - currently only 'finqa' supported (default: 'finqa')
    """

    def __init__(
        self,
        data_path: str = "./data",
        max_steps: int = 50,
        task: str = "finqa",
    ) -> None:
        # Create MCP server and define tools inline. The tool closures
        # capture `self`, so they must be defined after the FinQATools
        # helper is constructed below.
        mcp = FastMCP("finqa_env")

        self.data_path = data_path
        self.max_steps = max_steps
        self.task = task

        assert task == "finqa", "Only finqa task is supported"

        self.questions = self._load_questions()
        logger.info(f"Loaded {len(self.questions)} questions for task '{task}'")

        self._finqa_tools = FinQATools(data_path)

        # Register tools with FastMCP.
        # NOTE: these docstrings are exposed to the agent as the MCP tool
        # descriptions, so they are part of the environment's interface.
        @mcp.tool
        def get_descriptions(company_name: str) -> str:
            """
            Get a list of available table names for a company.

            Args:
                company_name: The name of the company

            Returns:
                JSON list of table names
            """
            return self._finqa_tools.get_descriptions(company_name)

        @mcp.tool
        def get_table_info(company_name: str, table_name: str) -> str:
            """
            Get table metadata: description, columns, types, unique values.

            Args:
                company_name: The name of the company
                table_name: The name of the table

            Returns:
                JSON string with table metadata
            """
            return self._finqa_tools.get_table_info(company_name, table_name)

        @mcp.tool
        def sql_query(company_name: str, table_name: str, query: str) -> str:
            """
            Execute a SQL query on a table. Select * not allowed.

            Filters are required: WHERE, HAVING, IN, NOT IN, EXISTS, NOT EXISTS,
            ANY, SOME, ALL, LIKE, NOT LIKE, BETWEEN, NOT BETWEEN, IS NULL,
            IS NOT NULL, CASE, FILTER.

            Args:
                company_name: The name of the company
                table_name: The name of the table
                query: SQL query to execute (must include filters)

            Returns:
                JSON string with query results
            """
            return self._finqa_tools.sql_query(company_name, table_name, query)

        @mcp.tool
        def submit_answer(answer: str) -> str:
            """
            Submit a final answer for the question.

            Args:
                answer: The final answer to submit

            Returns:
                Confirmation message
            """
            return self._finqa_tools.submit_answer(answer)

        # Pass the MCP server to the base class
        super().__init__(mcp)

        # Shuffle dataset for sequential selection so consecutive episodes
        # see questions in random order without repeats until exhaustion.
        self._shuffled_questions = self.questions.copy()
        random.shuffle(self._shuffled_questions)
        self._question_index = 0

        self._state = FinQAState()
        self._history: List[Dict[str, Any]] = []

    def _load_questions(self) -> List[Dict[str, Any]]:
        """Load questions from the benchmark CSV as a list of plain dicts.

        Raises:
            FileNotFoundError: if ``{data_path}/benchmark_questions/{task}.csv``
                does not exist.
        """
        csv_path = os.path.join(self.data_path, "benchmark_questions", f"{self.task}.csv")

        if not os.path.isfile(csv_path):
            raise FileNotFoundError(f"Benchmark file not found: {csv_path}")

        df = pd.read_csv(csv_path)

        questions = []
        for _, row in df.iterrows():
            # `id`, `question_type`, and `explanation` are optional columns;
            # the rest will raise KeyError if missing from the CSV.
            questions.append({
                "id": str(row.get("id", "")),
                "user_query": row["user_query"],
                "company": row["company"],
                "question": row["question"],
                "answer": row["answer"],
                "question_type": row.get("question_type", ""),
                "explanation": row.get("explanation", ""),
            })

        return questions

    def _get_next_question(self) -> Dict[str, Any]:
        """Get the next question using sequential shuffle selection."""
        # Reshuffle and restart once all questions have been served.
        if self._question_index >= len(self._shuffled_questions):
            random.shuffle(self._shuffled_questions)
            self._question_index = 0

        question = self._shuffled_questions[self._question_index]
        self._question_index += 1
        return question

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> Observation:
        """
        Reset the environment for a new episode.

        Note: `seed` is accepted for API compatibility but is not used to
        seed question selection here.

        Returns:
            Initial observation with the question
        """
        question = self._get_next_question()
        self._state = FinQAState(
            episode_id=episode_id or str(uuid.uuid4()),
            step_count=0,
            current_question=question["user_query"],
            current_company=question["company"],
            ground_truth=question["answer"],
            question_id=question["id"],
        )
        self._history = []

        # Logs the raw `question` column; the observation exposes `user_query`.
        logger.info(f"Reset episode {self._state.episode_id} with question: {question['question'][:200]}...")

        return Observation(
            done=False,
            reward=0.0,
            metadata={
                "question": question["user_query"],
                "company": question["company"],
                "tool_result": "",
                "history": [],
                "step_count": 0,
                "available_tools": AVAILABLE_TOOLS.copy(),
            },
        )

    def _step_impl(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """
        Handle non-MCP actions. Returns an error since this env is MCP-only.
        """
        return Observation(
            done=False,
            reward=0.0,
            metadata={
                "error": f"Unknown action type: {type(action).__name__}. "
                "Use ListToolsAction or CallToolAction for MCP interactions."
            },
        )

    def step(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """
        Execute a step in the environment.

        Delegates to base class for MCP actions. Handles submit_answer
        reward computation and max-step termination.
        """
        # NOTE(review): every action counts against max_steps, including
        # non-tool-call actions handled by the base class — confirm intended.
        self._state.step_count += 1

        # Let the base class handle MCP actions
        obs = super().step(action, timeout_s=timeout_s, **kwargs)

        # Check if submit_answer was called — this ends the episode with a
        # reward computed against the stored ground truth.
        if isinstance(action, CallToolAction) and action.tool_name == "submit_answer":
            submitted_answer = action.arguments.get("answer", "")
            reward = compute_reward(submitted_answer, self._state.ground_truth)
            logger.info(
                f"Episode {self._state.episode_id} ended: "
                f"submitted='{submitted_answer}', truth='{self._state.ground_truth}', reward={reward}"
            )
            return Observation(
                done=True,
                reward=reward,
                metadata={
                    **obs.metadata,
                    "ground_truth": self._state.ground_truth,
                    "submitted_answer": submitted_answer,
                },
            )

        # Check for max steps — episode ends with zero reward.
        if self._state.step_count >= self.max_steps:
            logger.info(f"Episode {self._state.episode_id} terminated: max steps reached")
            return Observation(
                done=True,
                reward=0.0,
                metadata={
                    **obs.metadata,
                    "error": f"Max steps ({self.max_steps}) reached without submitting answer.",
                },
            )

        return obs

    @property
    def state(self) -> FinQAState:
        """Get the current environment state."""
        return self._state
|
server/rewards.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/server/rewards.py
|
| 2 |
+
"""
|
| 3 |
+
Reward computation for the FinQA environment.
|
| 4 |
+
|
| 5 |
+
Uses fuzzy numerical matching to compare predicted answers against ground truth.
|
| 6 |
+
Handles various formats: \boxed{}, percentages, fractions, decimals.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
from fractions import Fraction
|
| 11 |
+
from typing import Optional, Tuple
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def extract_boxed_answer(text: str) -> Optional[str]:
    r"""
    Extract the answer from the first \boxed{...} occurrence.

    Args:
        text: Text potentially containing \boxed{answer}

    Returns:
        The stripped answer payload, or None when no \boxed{} is present.
    """
    found = re.search(r"\\boxed\{([^}]+)\}", text)
    return found.group(1).strip() if found else None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def extract_all_boxed_answers(text: str) -> list:
    r"""
    Extract every \boxed{...} payload from *text*, in order of appearance.

    Args:
        text: Text potentially containing multiple \boxed{answer}

    Returns:
        List of stripped answer strings (empty when none are found).
    """
    return [payload.strip() for payload in re.findall(r"\\boxed\{([^}]+)\}", text)]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def parse_number(text: str, convert_percent: bool = True) -> Optional[float]:
    r"""
    Parse a string into a float, tolerating common financial formats.

    Accepted inputs include plain numbers ("6.118", "-3.14"), percentages
    ("20.9%", "20.9 %"), fractions ("1/2", "-3/4"), thousands separators
    ("1,234.56"), parenthesized negatives ("(100)"), currency symbols, and
    LaTeX annotations such as \text{million} or \%.

    Args:
        text: String to parse.
        convert_percent: When True, percentages are divided by 100;
            when False, the % sign is merely stripped.

    Returns:
        The parsed float, or None when the string is not a number.
    """
    if text is None:
        return None

    text = text.strip()
    if not text:
        return None

    try:
        # Drop LaTeX \text{...} annotations and currency markers first.
        text = re.sub(r"\\text\{[^}]*\}", "", text)
        text = text.replace("\\$", "").replace("$", "").strip()

        # Percentages (plain or LaTeX-escaped): strip the sign, then
        # optionally rescale by 1/100.
        if "%" in text or "\\%" in text:
            stripped = text.replace("\\%", "").replace("%", "").strip()
            value = float(stripped.replace(",", ""))
            return value / 100 if convert_percent else value

        # Accounting-style negatives: "(100)" means -100.
        if text.startswith("(") and text.endswith(")"):
            text = "-" + text[1:-1]

        # Positive fractions such as "1/2" or "3/4".
        if "/" in text and not text.startswith("-"):
            try:
                return float(Fraction(text))
            except (ValueError, ZeroDivisionError):
                pass

        # Negative fractions such as "-3/4".
        if text.startswith("-") and "/" in text:
            try:
                return -float(Fraction(text[1:]))
            except (ValueError, ZeroDivisionError):
                pass

        # Plain number, possibly with thousands separators.
        return float(text.replace(",", ""))

    except (ValueError, TypeError):
        return None
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def normalize_answer(answer: str, convert_percent: bool = True) -> Tuple[Optional[float], str]:
    r"""
    Normalize an answer string to a comparable (number, string) pair.

    If the answer wraps its payload in \boxed{...}, only the first boxed
    payload is considered.

    Args:
        answer: Raw answer string (may be None).
        convert_percent: When True, percentages are divided by 100;
            when False, the % sign is merely stripped.

    Returns:
        Tuple of (parsed_number_or_None, lowercased_cleaned_string).
    """
    if answer is None:
        return None, ""

    # Prefer the \boxed{} payload when present.
    boxed = extract_boxed_answer(answer)
    candidate = (boxed if boxed else answer).strip()

    return parse_number(candidate, convert_percent), candidate.lower()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def extract_numbers_from_multi_value(text: str) -> list:
    """
    Extract all numbers from a comma/semicolon separated string.

    Handles formats like "2022: 0.933, 2023: 0.930" or "0.933, 0.931, 0.930";
    the optional year labels are discarded, only the numeric values remain.
    """
    return [value for _, value in _split_multi_value(text)]
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _split_multi_value(text: str) -> list:
    """
    Extract (key, number) pairs from a comma/semicolon separated string.

    Returns list of (key, float) tuples. Key is a year string like "2022"
    (or a range like "2022 to 2023") if found, otherwise None. Parts that
    do not contain a parseable number are silently dropped.
    """
    # Split by comma or semicolon (with optional LaTeX spacing like \; or \ )
    parts = re.split(r'[,;]\s*|\\[;,]\s*', text)
    results = []
    for part in parts:
        # Strip LaTeX whitespace commands (\ , \;, \,)
        part = re.sub(r'\\[;, ]', ' ', part).strip()
        if not part:
            continue
        # Try to extract a year label (e.g. "2022:", "2022 to 2023:", "2022→2023:")
        # Normalize \rightarrow and similar arrows to "to" before matching,
        # so ranges written either way produce the same key.
        part_normalized = re.sub(r'\\rightarrow|→|->|−>', ' to ', part)
        year_match = re.search(r'(20\d{2}(?:\s*to\s*20\d{2})?)', part_normalized)
        key = year_match.group(1) if year_match else None
        # Remove label prefix like "2022:" or "2022:\" — everything up to
        # the first colon (plus an optional trailing backslash) is the label.
        cleaned = re.sub(r'^[^:]*:\s*\\?\s*', '', part)
        num = parse_number(cleaned)
        if num is not None:
            results.append((key, num))
    return results
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def compare_single_values(pred_num: Optional[float], truth_num: Optional[float],
                          pred_str: str, truth_str: str,
                          tolerance: float = 0.01, max_absolute_diff: float = 1.0) -> bool:
    """
    Compare one predicted value against one ground-truth value.

    Numeric vs numeric uses a dual tolerance: the prediction must pass
    BOTH the relative (``tolerance``) and absolute (``max_absolute_diff``)
    checks. A zero ground truth accepts predictions within 1e-3 absolute.
    A numeric value never matches a non-numeric one; two non-numbers fall
    back to exact string equality.
    """
    pred_is_number = pred_num is not None
    truth_is_number = truth_num is not None

    if pred_is_number and truth_is_number:
        if truth_num == 0:
            # Relative error is undefined at zero — use a tight absolute band.
            return abs(pred_num) < 0.001

        gap = abs(pred_num - truth_num)
        return gap / abs(truth_num) <= tolerance and gap <= max_absolute_diff

    # Exactly one side numeric → mismatch.
    if pred_is_number != truth_is_number:
        return False

    # Neither side numeric → exact string comparison.
    return pred_str == truth_str
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def compute_reward(predicted: str, ground_truth: str, tolerance: float = 0.01, max_absolute_diff: float = 1.0) -> float:
    r"""
    Compute reward based on answer correctness.

    Uses fuzzy numerical matching with BOTH relative and absolute tolerance
    checks. A prediction is correct only if it passes BOTH conditions.

    Handles multiple values (e.g., ground truth with multiple \boxed{} values),
    and falls back to a "percentage points" interpretation (4.5% vs 4.5)
    when the scaled comparison fails.

    Args:
        predicted: The predicted answer from the agent
        ground_truth: The expected correct answer
        tolerance: Relative tolerance for numerical comparison (default 1%)
        max_absolute_diff: Maximum absolute difference allowed (default 1.0)

    Returns:
        1.0 if correct, 0.0 if incorrect
    """
    # Check for multiple boxed answers in ground truth
    truth_boxed = extract_all_boxed_answers(ground_truth)

    if len(truth_boxed) > 1:
        # Multiple ground truth values - split prediction by comma/semicolon
        pred_values = re.split(r'[,;]\s*', predicted.strip())

        if len(pred_values) != len(truth_boxed):
            return 0.0  # Different number of values

        # Compare each pair positionally (order matters in this branch)
        for pred_val, truth_val in zip(pred_values, truth_boxed):
            # Strip year/label prefix (e.g. "2024: -4" -> "-4")
            pred_val_cleaned = re.sub(r'^[^:]*:\s*', '', pred_val) if ':' in pred_val else pred_val
            pred_num, pred_str = normalize_answer(pred_val_cleaned)
            truth_num, truth_str = normalize_answer(truth_val)

            if not compare_single_values(pred_num, truth_num, pred_str, truth_str, tolerance, max_absolute_diff):
                # Fallback: try without % conversion (for percentage points like "4.5%" vs "4.5")
                pred_num_no_pct, _ = normalize_answer(pred_val, convert_percent=False)
                if not compare_single_values(pred_num_no_pct, truth_num, pred_str, truth_str, tolerance, max_absolute_diff):
                    return 0.0

        return 1.0  # All values matched

    # Single value comparison
    pred_num, pred_str = normalize_answer(predicted)
    truth_num, truth_str = normalize_answer(ground_truth)

    if compare_single_values(pred_num, truth_num, pred_str, truth_str, tolerance, max_absolute_diff):
        return 1.0

    # Percentage-points fallback for the single-value case as well.
    pred_num_no_pct, _ = normalize_answer(predicted, convert_percent=False)
    if compare_single_values(pred_num_no_pct, truth_num, pred_str, truth_str, tolerance, max_absolute_diff):
        return 1.0

    # Fallback: multi-value inside single \boxed{} (only if truth didn't parse as single number)
    if len(truth_boxed) == 1 and truth_num is None:
        truth_pairs = _split_multi_value(truth_boxed[0])
        pred_pairs = _split_multi_value(predicted)
        if len(truth_pairs) > 1 and len(pred_pairs) == len(truth_pairs):
            # If both sides have year keys, match by key (order-independent)
            truth_keys = {k for k, _ in truth_pairs if k is not None}
            pred_keys = {k for k, _ in pred_pairs if k is not None}
            if truth_keys and pred_keys and truth_keys == pred_keys:
                truth_map = {k: v for k, v in truth_pairs}
                pred_map = {k: v for k, v in pred_pairs}
                for key in truth_map:
                    p, t = pred_map[key], truth_map[key]
                    abs_diff = abs(p - t)
                    # rel_err mirrors compare_single_values but inline, with
                    # an exact-zero special case for t == 0.
                    rel_err = abs_diff / abs(t) if t != 0 else (0 if p == 0 else float('inf'))
                    if not (rel_err <= tolerance and abs_diff <= max_absolute_diff):
                        return 0.0
                return 1.0

            # Otherwise fall back to positional matching
            for (_, p), (_, t) in zip(pred_pairs, truth_pairs):
                abs_diff = abs(p - t)
                rel_err = abs_diff / abs(t) if t != 0 else (0 if p == 0 else float('inf'))
                if not (rel_err <= tolerance and abs_diff <= max_absolute_diff):
                    return 0.0
            return 1.0

    return 0.0
|
server/tools.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# envs/finqa_env/server/tools.py
|
| 2 |
+
"""
|
| 3 |
+
Tool implementations for the FinQA environment.
|
| 4 |
+
|
| 5 |
+
Ported from FinQABenchmark with simplifications:
|
| 6 |
+
- Removed LangChain dependencies
|
| 7 |
+
- Added submit_answer tool for episode termination
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import re
|
| 13 |
+
import sqlite3
|
| 14 |
+
from typing import Any, Dict, List, Tuple
|
| 15 |
+
|
| 16 |
+
import pandas as pd
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class FinQATools:
    """
    Tool implementations for financial QA tasks.

    Tables live on disk under ``{data_path}/input_companies/{company}/*.json``
    with shared metadata in ``tables_cleaned_all_companies.json``; SQL queries
    run against an in-memory SQLite database populated from the table JSON.

    Args:
        data_path: Path to the data directory containing benchmark_questions/ and input_companies/
    """

    def __init__(self, data_path: str):
        self.data_path = data_path
        self.companies_path = os.path.join(data_path, "input_companies")
        # Metadata is loaded lazily on first access (see `tables_cleaned`).
        self._tables_cleaned = None

    @property
    def tables_cleaned(self) -> Dict:
        """Lazy load the cleaned tables metadata (cached after first read)."""
        if self._tables_cleaned is None:
            tables_path = os.path.join(self.companies_path, "tables_cleaned_all_companies.json")
            with open(tables_path, "r") as f:
                self._tables_cleaned = json.load(f)
        return self._tables_cleaned

    def get_available_companies(self) -> List[str]:
        """Get list of available company names (subdirectories of input_companies/)."""
        return [
            d for d in os.listdir(self.companies_path)
            if os.path.isdir(os.path.join(self.companies_path, d))
        ]

    def execute_tool(self, tool_name: str, tool_args: Dict[str, Any]) -> Tuple[str, bool]:
        """
        Execute a tool by name and return its result.

        Args:
            tool_name: Name of the tool to execute
            tool_args: Arguments for the tool

        Returns:
            Tuple of (result_string, is_final_answer); is_final_answer is
            True only for submit_answer.
        """
        if tool_name == "get_descriptions":
            return self.get_descriptions(**tool_args), False
        elif tool_name == "get_table_info":
            return self.get_table_info(**tool_args), False
        elif tool_name == "sql_query":
            return self.sql_query(**tool_args), False
        elif tool_name == "submit_answer":
            return self.submit_answer(**tool_args), True
        else:
            return f"Error: Unknown tool '{tool_name}'", False

    def get_descriptions(self, company_name: str) -> str:
        """
        Get a list of available table names for a company.

        Args:
            company_name: The name of the company

        Returns:
            JSON list of table names, or an error string listing available
            companies when the company directory does not exist.
        """
        company_path = os.path.join(self.companies_path, company_name)

        if not os.path.isdir(company_path):
            available = self.get_available_companies()
            return f"Error: '{company_name}' not found. Available companies: {available}"

        # Each *.json file in the company directory is one table.
        tables = []
        for f in os.listdir(company_path):
            if f.endswith(".json"):
                tables.append(f.replace(".json", ""))

        return json.dumps(tables)

    def get_table_info(self, company_name: str, table_name: str) -> str:
        """
        Get table metadata: description, columns, types, unique values.

        Args:
            company_name: The name of the company
            table_name: The name of the table

        Returns:
            JSON string with table metadata (description, columns, dtypes,
            unique values), or an error string when company/table is unknown.
        """
        company_path = os.path.join(self.companies_path, company_name)

        if not os.path.isdir(company_path):
            available = self.get_available_companies()
            return f"Error: '{company_name}' not found. Available companies: {available}"

        # Clean table name (remove .json or .txt if present)
        cleaned_table_name = table_name.replace(".json", "").replace(".txt", "")
        table_key = f"{company_name}/{cleaned_table_name}"

        if table_key not in self.tables_cleaned:
            return f"Error: Table '{table_name}' not found for company '{company_name}'"

        table_info = self.tables_cleaned[table_key].copy()

        # Load the actual table to get column info
        cleaned_table = pd.DataFrame(json.loads(table_info["table"]))

        # Identify all-numeric columns: listing their unique values would be
        # noise, so they are excluded from the unique-values hint below.
        cols_to_drop = []
        for col in cleaned_table.columns.tolist()[1:]:  # Skip first (label) column
            vals = cleaned_table[col].tolist()[1:]
            cleaned_vals = [
                "".join(char for char in str(x) if char.isalnum()).strip()
                for x in vals
            ]
            all_numeric = all(
                v.isnumeric() or len(v) == 0 for v in cleaned_vals
            )
            if all_numeric:
                cols_to_drop.append(col)

        table_info["column_dtypes"] = {
            col: str(cleaned_table[col].dtype)
            for col in cleaned_table.columns.tolist()
        }

        # Only show unique values for non-numeric columns
        cleaned_table_filtered = cleaned_table.drop(cols_to_drop, axis=1)
        table_info["unique_vals_per_col"] = {
            col: list(cleaned_table_filtered[col].unique())
            for col in cleaned_table_filtered.columns.tolist()
        }

        # Remove the raw table data from the response to keep it compact.
        del table_info["table"]

        return json.dumps(table_info, indent=0).replace("\n", "")

    def sql_query(self, company_name: str, table_name: str, query: str) -> str:
        """
        Execute a SQL query on a table. Select * not allowed (too inefficient).

        Filters are required to query: WHERE, HAVING, IN, NOT IN, EXISTS, NOT EXISTS, ANY, SOME, ALL, LIKE, NOT LIKE, BETWEEN, NOT BETWEEN, IS NULL, IS NOT NULL, CASE, FILTER.

        Args:
            company_name: The name of the company
            table_name: The name of the table
            query: SQL query to execute (must include filters)

        Returns:
            JSON string with query results (records orientation), or an
            error string on validation/executions failures.
        """
        # Validate query has filters (prevent full table scans)
        if "select *" in query.lower():
            return "Error: SELECT * is not allowed (too inefficient)"

        sql_filters = [
            "WHERE", "HAVING", "IN", "NOT IN", "EXISTS", "NOT EXISTS",
            "ANY", "SOME", "ALL", "LIKE", "NOT LIKE", "BETWEEN",
            "NOT BETWEEN", "IS NULL", "IS NOT NULL", "CASE", "FILTER"
        ]

        query_upper = re.sub(r"(\r|\n|\t)+", " ", query).upper()
        # Word-boundary-ish match that also rejects bracketed identifiers
        # like [IN] so column names cannot satisfy the filter requirement.
        pattern = r"(?<!\w|\[)(" + "|".join([re.escape(f) for f in sql_filters]) + r")(?!\w|\])"

        has_filter = (
            any(f" {filt} " in query_upper for filt in sql_filters) or
            len(re.findall(pattern, query_upper)) > 0
        )

        if not has_filter:
            return "Error: Query must include filters (WHERE, HAVING, etc.)"

        # Clean table name
        cleaned_table_name = table_name.replace(".txt", "").replace(".json", "")
        table_path = os.path.join(self.companies_path, company_name, f"{cleaned_table_name}.json")

        if not os.path.isfile(table_path):
            return f"Error: Table file not found at {table_path}"

        # Load table into an in-memory SQLite DB and execute the query.
        # The connection is closed in `finally` so it cannot leak when
        # read_json/to_sql/read_sql_query raises.
        conn = sqlite3.connect(":memory:")
        try:
            df = pd.read_json(table_path)
            df.to_sql(cleaned_table_name, conn, index=False, if_exists="replace")
            result = pd.read_sql_query(query, conn)
            return result.to_json(orient="records")
        except Exception as e:
            return f"Error executing query: {str(e)}"
        finally:
            conn.close()

    def submit_answer(self, answer: str) -> str:
        """
        Submit a final answer for the question.

        This is a pure acknowledgement; reward computation happens in the
        environment's step handler, not here.

        Args:
            answer: The final answer to submit

        Returns:
            Confirmation message
        """
        return f"Answer submitted: {answer}"
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|