Spaces:

Mohammed-Altaf
/

DataAnalysis_Env

Sleeping

App Files Files Community

Mohammed-Altaf committed on Apr 5

Commit

7b9dfc1

1 Parent(s): aca1396

fixed issues in SQL schema not available to agent and code parsing issues

Browse files

Files changed (4) hide show

helpers/__init__.py +0 -0
helpers/response_parser.py +211 -0
inference.py +9 -106
server/data_analysis_env.py +27 -2

helpers/__init__.py ADDED Viewed

File without changes

helpers/response_parser.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import json
+import re
+from typing import Any
# Last-resort action, kept as a JSON string; it is decoded with json.loads and
# returned when every parsing strategy has failed.
_FALLBACK_PAYLOAD = {"action": "submit_answer", "answer": "unknown"}
FALLBACK_ACTION = json.dumps(_FALLBACK_PAYLOAD)
+# ── Layer 1: Sanitize special characters inside string values ──────────────────
# Characters that may legally follow a backslash inside a JSON string.
_VALID_JSON_ESCAPE_CHARS = frozenset('"\\/bfnrtu')

# Raw characters that are illegal inside a JSON string, mapped to their escapes.
_RAW_CHAR_ESCAPES = {"\n": "\\n", "\r": "\\r", "\t": "\\t", '"': '\\"'}

# ("key": ")  VALUE  (")
# The value consumes backslash pairs as units, so an already-escaped quote can
# never be mistaken for the terminator.  The closing quote must be followed by
# something that can follow a JSON string value inside an object: `}`, a comma
# plus the next `"key":`, an optional dangling comma, or the end of the text.
# A quote followed by anything else (e.g. the ones in df["col"]) is therefore
# treated as part of the value.
_STRING_VALUE_RE = re.compile(
    r'("\w+"\s*:\s*")'
    r'((?:\\.|[^\\])*?)'
    r'("(?=\s*(?:,\s*"[^"\\]*"\s*:|,?\s*(?:\}|\Z))))',
    re.DOTALL,
)


def _sanitize_string_value(match: re.Match) -> str:
    """Return one ``"key": "value"`` match with the value made JSON-safe.

    Only the value (group 2) is rewritten; the opening (group 1) and closing
    (group 3) parts are passed through untouched.

    The value is scanned left to right so that backslashes are interpreted in
    pairs:

    * a backslash followed by a valid JSON escape character is kept verbatim
      (so ``\\"`` and ``\\\\`` keep their meaning);
    * any other backslash is doubled, turning an invalid escape such as
      ``\\d`` into a literal backslash instead of a parse error;
    * raw newlines, carriage returns and tabs are replaced by their escapes;
    * every remaining (unescaped) double quote is escaped.

    Args:
        match: A match object exposing groups 1, 2 and 3 as described above.

    Returns:
        The reassembled ``opening + sanitized value + closing`` text.
    """
    opening, value, closing = match.group(1), match.group(2), match.group(3)
    pieces = []
    i, n = 0, len(value)
    while i < n:
        ch = value[i]
        if ch == "\\":
            if i + 1 < n and value[i + 1] in _VALID_JSON_ESCAPE_CHARS:
                # Existing, valid escape sequence: copy both characters as-is.
                pieces.append(value[i : i + 2])
                i += 2
            else:
                # Lone or invalid backslash: keep it as a literal backslash.
                pieces.append("\\\\")
                i += 1
            continue
        pieces.append(_RAW_CHAR_ESCAPES.get(ch, ch))
        i += 1
    return opening + "".join(pieces) + closing


def _sanitize_all_string_values(text: str) -> str:
    """Apply ``_sanitize_string_value`` to every ``"key": "value"`` pair in *text*.

    Values may span several lines.  A value ends at the first unescaped double
    quote that is followed by ``}``, by a comma and another ``"key":``, or by
    the end of the text; earlier quotes are considered part of the value and
    get escaped.  This is a heuristic: a value that itself contains such a
    sequence (for example ``"b"}`` inside a dict literal) is still cut short.

    Args:
        text: JSON-like text, possibly with unescaped characters in values.

    Returns:
        The text with every recognised string value sanitized; text without
        any recognisable pair is returned unchanged.
    """
    return _STRING_VALUE_RE.sub(_sanitize_string_value, text)
+# ── Layer 2: Pre-parse text fixes ─────────────────────────────────────────────
# ```json ... ``` or ``` ... ``` fences.
_CODE_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", re.DOTALL)

# A Python literal sitting where a JSON value is expected: right after `:`,
# `[` or `,` and right before `,`, `}`, `]` or the end of the text.
_PY_LITERAL_RE = re.compile(r"(?<=[:\[,])(\s*)(True|False|None)(?=\s*(?:[,}\]]|\Z))")
_PY_TO_JSON_LITERAL = {"True": "true", "False": "false", "None": "null"}


def _preprocess(text: str) -> str:
    """Fix common LLM response quirks before attempting JSON parsing.

    Steps, in order:

    1. If a markdown code fence is present, keep only the first fence's body.
    2. Remove an outer single-quote wrapper (``'{"k": "v"}'``) and unescape
       ``\\'`` inside it.
    3. Collapse doubled braces (``{{"k": "v"}}``), but only when the text
       itself starts with ``{{``.  Replacing ``}}`` unconditionally would
       destroy the closing of legitimately nested objects.
    4. Turn Python literals into JSON literals, but only in value position.
       A blanket word replacement would also rewrite string contents such as
       ``ascending=False`` in code or the answer ``"None"``.  A literal inside
       a string that happens to sit in value position (e.g. ``{'a': True}`` in
       code) is still rewritten.
    5. Drop trailing commas before ``}`` or ``]``.

    Args:
        text: Raw or partially cleaned model output.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    fenced = _CODE_FENCE_RE.search(text)
    if fenced:
        text = fenced.group(1).strip()

    if text.startswith("'") and text.endswith("'"):
        text = text[1:-1].replace("\\'", "'")

    if text.lstrip().startswith("{{"):
        text = text.replace("{{", "{").replace("}}", "}")

    text = _PY_LITERAL_RE.sub(
        lambda m: m.group(1) + _PY_TO_JSON_LITERAL[m.group(2)],
        text,
    )

    text = re.sub(r",\s*([}\]])", r"\1", text)
    return text.strip()
+# ── Layer 3: Extract first JSON blob from surrounding prose ───────────────────
def _extract_json_blob(text: str) -> str:
    """Pull a JSON-looking blob out of text that has prose around it.

    Resolution order:

    1. If the first ``{`` or ``[`` in the text starts a complete, valid JSON
       value, return exactly that value.  This keeps trailing prose (or a
       second JSON object) out of the blob.
    2. Otherwise fall back to the widest ``{...}`` / ``[...]`` span, which
       keeps malformed-but-repairable JSON intact for the later layers.
    3. If there is no closing bracket at all (truncated output), return the
       text from the first ``{`` onward so a prose prefix does not defeat the
       later repair step.
    4. With no ``{`` either, return the text unchanged.

    Args:
        text: Model output that may contain JSON surrounded by other text.

    Returns:
        The extracted blob, or the original text when nothing was found.
    """
    opener = re.search(r"[{\[]", text)
    if opener is None:
        return text

    start = opener.start()
    try:
        _, end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        pass
    else:
        return text[start:end]

    widest = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL)
    if widest:
        return widest.group(1)

    brace = text.find("{")
    return text[brace:] if brace != -1 else text
+# ── Layer 4: parse_partial_json — LangChain's stack-based closer ──────────────
def _parse_partial_json(s: str) -> Any:
    """Parse JSON that may be truncated or missing its closing brackets.

    The text is first parsed as-is.  If that fails, it is scanned once to find
    which containers are still open and whether the text ends inside a string;
    the missing terminators are appended and the result is parsed again.

    String state is tracked with an explicit escape flag, so a string ending
    in an escaped backslash (``"C:\\\\"``) is recognised as closed.  Only
    brackets are kept on the stack, so completed strings never block the
    matching of later ``}`` / ``]``.

    Args:
        s: JSON text, possibly cut off mid-value.

    Returns:
        The decoded JSON value (not necessarily a dict).

    Raises:
        json.JSONDecodeError: If the text is still invalid after closing the
            open string and containers.
    """
    s = s.strip()
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        pass

    pending_closers = []  # closing bracket for every container still open
    in_string = False
    escaped = False  # previous character was an unconsumed backslash
    for char in s:
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == "{":
            pending_closers.append("}")
        elif char == "[":
            pending_closers.append("]")
        elif char in "}]" and pending_closers and pending_closers[-1] == char:
            pending_closers.pop()

    completed = s
    if in_string:
        if escaped:
            # A dangling backslash would escape the quote we are about to add.
            completed = completed[:-1]
        completed += '"'
    elif completed.endswith(","):
        # Cut off right after a separator: drop it so the container can close.
        completed = completed[:-1]

    completed += "".join(reversed(pending_closers))
    return json.loads(completed)
+# ── Layer 5: Direct greedy extraction — last resort for unescaped inner quotes ──
# Which payload field carries the value for each supported action type.
_FIELD_FOR_ACTION = {"execute_code": "code", "submit_answer": "answer"}


def _unescape_json_fragment(raw: str) -> str:
    """Decode *raw* as the body of a JSON string, tolerating unescaped quotes.

    Unescaped double quotes are escaped and invalid backslash sequences are
    turned into literal backslashes before decoding, so ``\\n`` becomes a real
    newline exactly as it would in the ``json.loads`` based layers.  If the
    fragment still cannot be decoded it is returned unchanged.
    """
    pieces = []
    i, n = 0, len(raw)
    while i < n:
        ch = raw[i]
        if ch == "\\":
            if i + 1 < n and raw[i + 1] in '"\\/bfnrtu':
                pieces.append(raw[i : i + 2])
                i += 2
            else:
                pieces.append("\\\\")
                i += 1
            continue
        pieces.append('\\"' if ch == '"' else ch)
        i += 1
    try:
        # strict=False lets raw newlines/tabs inside the value through.
        return json.loads('"' + "".join(pieces) + '"', strict=False)
    except json.JSONDecodeError:
        return raw


def _extract_fields_direct(text: str) -> dict:
    """Extract the action fields with a greedy regex anchored to the last quote.

    Handles the case where the model emits unescaped double-quote characters
    inside a "code" or "answer" value (e.g. df["col"]).  The value is captured
    greedily up to the last ``"`` that is followed by ``}``, so inner quotes do
    not end it early.  The captured text is then decoded like a JSON string so
    that escape sequences (``\\n``, ``\\t``, ``\\"`` ...) yield the same
    characters they would when the response is parsed by ``json.loads``.

    Args:
        text: Pre-processed JSON-like string.

    Returns:
        Dict with 'action' and 'code'/'answer' keys.

    Raises:
        ValueError: If the action field cannot be found or the value cannot be
            extracted for the detected action type.
    """
    action_match = re.search(r'"action"\s*:\s*"(\w+)"', text)
    if not action_match:
        raise ValueError("No 'action' field found")

    action_type = action_match.group(1)
    field = _FIELD_FOR_ACTION.get(action_type)
    if field is not None:
        pattern = '"' + field + r'"\s*:\s*"(.*)"(?=\s*})'
        value_match = re.search(pattern, text, re.DOTALL)
        if value_match:
            return {"action": action_type, field: _unescape_json_fragment(value_match.group(1))}

    raise ValueError(f"Could not extract value for action_type={action_type!r}")
+# ── Public API ─────────────────────────────────────────────────────────────────
def parse_model_action(response_text: str) -> dict:
    """Parse a raw LLM response into an action dict.

    Building blocks, combined in several orders below:
      1. _preprocess                 – markdown fences, double braces, Python literals …
      2. _sanitize_all_string_values – escape unescaped quotes/newlines inside values
      3. _extract_json_blob          – strip surrounding prose
      4. _parse_partial_json         – close truncated JSON
      5. _extract_fields_direct      – greedy regex extraction as a last resort

    Strategies are tried in order and independently: one that raises, or that
    yields something other than a dict (a bare number, string or list is valid
    JSON but not an action), is skipped.

    Args:
        response_text: Raw text returned by the model.  ``None`` is treated
            like an empty response.

    Returns:
        The first dict produced by a strategy, or the decoded FALLBACK_ACTION
        when none succeeds.
    """
    text = (response_text or "").strip()
    strategies = (
        # as-is
        _parse_partial_json,
        # preprocess, then sanitize
        lambda t: _parse_partial_json(_sanitize_all_string_values(_preprocess(t))),
        # extract blob first, then preprocess + sanitize
        lambda t: _parse_partial_json(_sanitize_all_string_values(_preprocess(_extract_json_blob(t)))),
        # preprocess + extract blob, then sanitize
        lambda t: _parse_partial_json(_sanitize_all_string_values(_extract_json_blob(_preprocess(t)))),
        # sanitize raw text, skip preprocess — rare fallback
        lambda t: _parse_partial_json(_sanitize_all_string_values(t)),
        # greedy extraction — handles unescaped inner quotes in code/answer values
        lambda t: _extract_fields_direct(_preprocess(_extract_json_blob(t))),
        lambda t: _extract_fields_direct(_extract_json_blob(t)),
    )
    for strategy in strategies:
        try:
            parsed = strategy(text)
        except (json.JSONDecodeError, ValueError):
            continue
        # Callers use dict methods on the result; keep looking otherwise.
        if isinstance(parsed, dict):
            return parsed
    print(f"JSON Decoding Error while parsing action in response text: {response_text}")
    return json.loads(FALLBACK_ACTION)

inference.py CHANGED Viewed

@@ -7,6 +7,7 @@ from dotenv import load_dotenv
 from openai import OpenAI
 from client import DataAnalysisClient
 from models import DataAction
 load_dotenv()
@@ -20,13 +21,17 @@ ENV_SERVER_URL = os.getenv("ENV_SERVER_URL") or "https://mohammed-altaf-dataanal
 SYSTEM_PROMPT = """
 <ROLE>
-You are a data analyst. You are given a dataset loaded as a pandas DataFrame called `df`.
-You can execute Python/pandas code to explore the dataset and answer the question.
 </ROLE>
 <RULES>
-- Use `print()` to see results of your code
-- The DataFrame `df` is pre-loaded with pandas as `pd` and numpy as `np`
 - When you have the answer, submit it in the exact format requested
 - Be precise with numbers and formatting
 </RULES>
@@ -42,8 +47,6 @@ Respond with ONLY the JSON, no other text.
 </NOTE>
 """
-FALLBACK_ACTION = json.dumps({"action": "submit_answer", "answer": "unknown"})
 def log_start(task: str, env: str, model: str) -> None:
     """Log the start of a task episode.
@@ -87,105 +90,6 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
     print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
-def parse_model_action(response_text: str) -> dict:
-    """Parse the model's raw text response into an action dict.
-    Handles multiple LLM response edge cases:
-    - Markdown code blocks (```json ... ``` or ``` ... ```)
-    - Double curly braces e.g. {{"key": "value"}}
-    - Single quotes instead of double quotes e.g. {'key': 'value'}
-    - Python literals: True/False/None → true/false/null
-    - Trailing commas in objects/arrays e.g. {"key": "value",}
-    - Extra text/prose before or after the JSON blob
-    - Escaped single quotes inside single-quoted strings
-    - Whitespace and newline noise
-    Args:
-        response_text: Raw string returned by the model.
-    Returns:
-        Parsed action dict, or a fallback submit_answer on failure.
-    """
-    def attempt_parse(text: str) -> dict:
-        return json.loads(text)
-    def apply_fixes(text: str) -> str:
-        # Strip markdown code blocks
-        if text.startswith("```"):
-            parts = text.split("```")
-            if len(parts) >= 2:
-                text = parts[1]
-                if text.startswith("json"):
-                    text = text[4:]
-                text = text.strip()
-        # Double curly braces → single
-        text = text.replace("{{", "{").replace("}}", "}")
-        # Python literals → JSON literals
-        text = re.sub(r"\bTrue\b", "true", text)
-        text = re.sub(r"\bFalse\b", "false", text)
-        text = re.sub(r"\bNone\b", "null", text)
-        # Trailing commas before } or ]
-        text = re.sub(r",\s*([}\]])", r"\1", text)
-        # Single quote handling — two distinct cases:
-        #
-        # Case 1: Entire JSON is wrapped in outer single quotes
-        #   e.g. '{"action": "x", "code": "df[\'col\']"}'
-        #   → strip the outer quotes and unescape internal \'
-        if text.startswith("'") and text.endswith("'"):
-            text = text[1:-1].replace("\\'", "'")
-        # Case 2: JSON itself uses single quotes as delimiters
-        #   e.g. {'action': 'execute_code', 'code': 'print()'}
-        #   → only apply when structure looks single-quote delimited
-        #   → avoids corrupting double-quoted values that contain bracket notation
-        elif text.startswith("{'") or ("': " in text and '": ' not in text):
-            text = re.sub(
-                r"'((?:\\'|[^'])*)'", lambda m: '"' + m.group(1).replace("\\'", "'").replace('"', '\\"') + '"', text
-            )
-        return text
-    def extract_json_blob(text: str) -> str:
-        """Extract the first {...} or [...] blob from text with surrounding prose."""
-        match = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL)
-        if match:
-            return match.group(1)
-        return text
-    text = response_text.strip()
-    try:
-        return attempt_parse(text)
-    except json.JSONDecodeError:
-        pass
-    try:
-        return attempt_parse(apply_fixes(text))
-    except json.JSONDecodeError:
-        pass
-    try:
-        blob = extract_json_blob(text)
-        return attempt_parse(apply_fixes(blob))
-    except json.JSONDecodeError:
-        pass
-    try:
-        fixed = apply_fixes(text)
-        blob = extract_json_blob(fixed)
-        return attempt_parse(blob)
-    except json.JSONDecodeError:
-        pass
-    print(f"JSON Decoding Error while parsing action in response text: {response_text}")
-    return json.loads(FALLBACK_ACTION)
 def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
     """Run a single task episode using the language model as the agent.
@@ -234,7 +138,6 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
         except Exception as exc:
             print(f"[DEBUG] Model request failed: {exc}", flush=True)
             response_text = FALLBACK_ACTION
         action = parse_model_action(response_text)
         action_type = action.get("action", "")

 from openai import OpenAI
 from client import DataAnalysisClient
+from helpers.response_parser import FALLBACK_ACTION, parse_model_action
 from models import DataAction
 load_dotenv()
 SYSTEM_PROMPT = """
 <ROLE>
+You are a data analyst. You have two data sources available:
+1. `df` — a pandas DataFrame (sales CSV, pre-loaded)
+2. A SQLite database at `db_path` — contains additional tables (e.g. customer_profiles, product_catalog)
 </ROLE>
 <RULES>
+- Use `print()` to output results
+- `pd`, `np`, `sqlite3`, and `db_path` are already in scope — NEVER use import statements (they will fail)
+- `df` is a pandas DataFrame — use pandas operations on it, NEVER SQL
+- To query the SQLite database use: `conn = sqlite3.connect(db_path)` then `pd.read_sql(query, conn)`
+- For cross-source tasks: query SQLite for the extra data, then merge with `df` using pandas
 - When you have the answer, submit it in the exact format requested
 - Be precise with numbers and formatting
 </RULES>
 </NOTE>
 """
 def log_start(task: str, env: str, model: str) -> None:
     """Log the start of a task episode.
     print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
 def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
     """Run a single task episode using the language model as the agent.
         except Exception as exc:
             print(f"[DEBUG] Model request failed: {exc}", flush=True)
             response_text = FALLBACK_ACTION
         action = parse_model_action(response_text)
         action_type = action.get("action", "")

server/data_analysis_env.py CHANGED Viewed

@@ -35,6 +35,7 @@ class DataAnalysisEnv(Environment):
     """
     MAX_STEPS = 20
     def __init__(self):
         """Initialize the environment with default state."""
@@ -79,14 +80,38 @@ class DataAnalysisEnv(Environment):
     def _dataset_info(self) -> str:
         """Generate a summary of the dataset schema for the agent.
         Returns:
-            A string describing column names, dtypes, row count, and a sample.
         """
         buf = io.StringIO()
         self._df.info(buf=buf)
         info_str = buf.getvalue()
         sample = self._df.head(3).to_string()
-        return f"Dataset shape: {self._df.shape}\n\n{info_str}\nSample rows:\n{sample}"
     def reset(
         self,

     """
     MAX_STEPS = 20
+    SUPPORTS_CONCURRENT_SESSIONS = True
     def __init__(self):
         """Initialize the environment with default state."""
     def _dataset_info(self) -> str:
         """Generate a summary of the available data sources for the agent.

         The summary has two sections: the pre-loaded DataFrame (shape,
         ``DataFrame.info()`` output and the first three rows) and the SQLite
         database at ``DB_PATH`` (one line per user table with its row count,
         column names and declared column types).

         Returns:
             The DataFrame section followed by the database section.  If the
             database cannot be inspected for any reason, the database section
             is a single "schema unavailable" line instead.
         """
         buf = io.StringIO()
         self._df.info(buf=buf)
         info_str = buf.getvalue()
         sample = self._df.head(3).to_string()
         df_section = f"=== df (pandas DataFrame, pre-loaded from sales CSV) ===\nShape: {self._df.shape}\n{info_str}\nSample rows:\n{sample}"
         conn = None
         try:
             conn = sqlite3.connect(DB_PATH)
             cursor = conn.cursor()
             # Skip SQLite's internal bookkeeping tables (sqlite_sequence, ...).
             cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
             tables = [row[0] for row in cursor.fetchall()]
             db_lines = ["\n=== SQLite database (accessible via sqlite3.connect(db_path)) ==="]
             for table in tables:
                 # Identifiers cannot be bound as parameters, so quote the name
                 # to survive spaces, keywords and embedded double quotes.
                 quoted = '"' + table.replace('"', '""') + '"'
                 cursor.execute(f"PRAGMA table_info({quoted})")
                 cols = [(row[1], row[2]) for row in cursor.fetchall()]
                 cursor.execute(f"SELECT COUNT(*) FROM {quoted}")
                 count = cursor.fetchone()[0]
                 col_str = ", ".join(f"{c} ({t})" for c, t in cols)
                 db_lines.append(f"  Table '{table}' ({count} rows): {col_str}")
             db_section = "\n".join(db_lines)
         except Exception:
             # Best effort by design: the DataFrame section is still useful
             # to the agent when the database cannot be read.
             db_section = "\n=== SQLite database: schema unavailable ==="
         finally:
             # Close the connection on the failure path as well.
             if conn is not None:
                 conn.close()
         return f"{df_section}\n{db_section}"
     def reset(
         self,