Spaces:

prthm11
/

Scratch_Vision_Game

Sleeping

App Files Files Community

prthm11 committed on Sep 18, 2025

Commit

31c7d88

verified ·

1 Parent(s): b3ddef3

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -86

app.py CHANGED Viewed

@@ -222,21 +222,36 @@ Your task is to process OCR-extracted text from images of Scratch 3.0 code block
 """
 SYSTEM_PROMPT_JSON_CORRECTOR = """
-You are an assistant that outputs JSON responses strictly following the given schema.
-If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
-Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
-If you receive an invalid or incomplete JSON response, fix it by:
-- Adding any missing required fields with appropriate values.
-- Correcting syntax errors such as missing commas, brackets, or quotes.
-- Ensuring the JSON structure matches the schema exactly.
-- Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
-- Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
-- Never output explanations, comments, or extra text — only the corrected JSON.
-- **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
-Remember: Your output must be valid JSON only, ready to be parsed without errors.
 """
 # debugger and resolver agent for Scratch 3.0
@@ -602,69 +617,101 @@ stack_opcodes_functionalities = "\n".join([
 ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
 # Helper function to extract JSON from LLM response
-def extract_json_from_llm_response(raw_response: str) -> dict:
-    # --- 1) Pull out the JSON code‑block if present ---
-    md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
-    json_string = md.group(1).strip() if md else raw_response
-    # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
-    first, last = json_string.find('{'), json_string.rfind('}')
-    if 0 <= first < last:
-        json_string = json_string[first:last+1]
-    # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
-    json_string = re.sub(r'\b\w+\s*{', '{', json_string)
-    json_string = re.sub(r'"assistant"\s*:', '', json_string)
-    json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
-    logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
-    # --- 3.1) Fix stray inner quotes at start of name/list values ---
-    # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
-    json_string = re.sub(
-        r'("name"\s*:\s*")\s*"',
-        r'\1',
-        json_string
-    )
-    # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
-    def _esc(m):
-        prefix, body = m.group(1), m.group(2)
-        return prefix + body.replace('"', r'\"')
-    json_string = re.sub(
-        r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
-        _esc,
-        json_string
-    )
-    logger.debug("Escaped embedded quotes in logic fields.")
-    logger.debug("Quoted unquoted keys.")
-    # --- 6) Remove trailing commas before } or ] ---
-    json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
-    json_string = re.sub(r',\s*,', ',', json_string)
-    logger.debug("Removed trailing commas.")
-    # --- 7) Balance braces: drop extra } at end if needed ---
-    ob, cb = json_string.count('{'), json_string.count('}')
-    if cb > ob:
-        excess = cb - ob
-        json_string = json_string.rstrip()[:-excess]
-        logger.debug(f"Stripped {excess} extra closing brace(s).")
-    # --- 8) Escape literal newlines in *all* string values ---
-    json_string = re.sub(
-        r'"((?:[^"\\]|\\.)*?)"',
-        lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
-        json_string,
-        flags=re.DOTALL
-    )
-    logger.debug("Escaped newlines in strings.")
-    # --- 9) Final parse attempt ---
     try:
-        return json.loads(json_string)
-    except json.JSONDecodeError:
-        logger.error("Sanitized JSON still invalid:\n%s", json_string)
         raise
 def reduce_image_size_to_limit(clean_b64_str: str, max_kb: int = 4000) -> str:
@@ -1096,18 +1143,29 @@ If you find any "Code-Blocks" then,
     except json.JSONDecodeError as error_json:
         # If JSON parsing fails, use the json resolver agent
-        correction_prompt = (
-            "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
-            "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
-            f"- **Error Details**: {error_json}\n\n"
-            "**Strict Instructions for your response:**\n"
-            "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
-            "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
-            "3. No trailing commas. Correct nesting.\n\n"
-            "Here is the problematic JSON string to correct:\n"
-            f"```json\n{llm_output_raw}\n```\n"
-            "Corrected JSON:\n"
-        )
         try:
             correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
             corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)

 """
+# SYSTEM_PROMPT_JSON_CORRECTOR = """
+# You are an assistant that outputs JSON responses strictly following the given schema.
+# If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
+# Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
+# If you receive an invalid or incomplete JSON response, fix it by:
+# - Adding any missing required fields with appropriate values.
+# - Correcting syntax errors such as missing commas, brackets, or quotes.
+# - Ensuring the JSON structure matches the schema exactly.
+# - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
+# - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
+# - Never output explanations, comments, or extra text — only the corrected JSON.
+# - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
+# Remember: Your output must be valid JSON only, ready to be parsed without errors.
+# """
 SYSTEM_PROMPT_JSON_CORRECTOR = """
+You are a JSON correction assistant. Your ONLY task is to fix malformed JSON and return it in the correct format.
+REQUIRED OUTPUT FORMAT:
+{
+  "refined_logic": {
+    "name_variable": "sprite_name_here",
+    "pseudocode": "pseudocode_string_here"
+  }
+}
+RULES:
+1. Extract the sprite name and pseudocode from the input
+2. Return ONLY valid JSON in the exact format above
+3. No explanations, no extra text, no other fields
+4. If you can't find the data, use "Unknown" for name_variable and "No pseudocode found" for pseudocode
 """
 # debugger and resolver agent for Scratch 3.0
 ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
 # Helper function to extract JSON from LLM response
+# def extract_json_from_llm_response(raw_response: str) -> dict:
+#     # --- 1) Pull out the JSON code‑block if present ---
+#     md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
+#     json_string = md.group(1).strip() if md else raw_response
+#     # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
+#     first, last = json_string.find('{'), json_string.rfind('}')
+#     if 0 <= first < last:
+#         json_string = json_string[first:last+1]
+#     # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
+#     json_string = re.sub(r'\b\w+\s*{', '{', json_string)
+#     json_string = re.sub(r'"assistant"\s*:', '', json_string)
+#     json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
+#     logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
+#     # --- 3.1) Fix stray inner quotes at start of name/list values ---
+#     # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
+#     json_string = re.sub(
+#         r'("name"\s*:\s*")\s*"',
+#         r'\1',
+#         json_string
+#     )
+#     # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
+#     def _esc(m):
+#         prefix, body = m.group(1), m.group(2)
+#         return prefix + body.replace('"', r'\"')
+#     json_string = re.sub(
+#         r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
+#         _esc,
+#         json_string
+#     )
+#     logger.debug("Escaped embedded quotes in logic fields.")
+#     logger.debug("Quoted unquoted keys.")
+#     # --- 6) Remove trailing commas before } or ] ---
+#     json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
+#     json_string = re.sub(r',\s*,', ',', json_string)
+#     logger.debug("Removed trailing commas.")
+#     # --- 7) Balance braces: drop extra } at end if needed ---
+#     ob, cb = json_string.count('{'), json_string.count('}')
+#     if cb > ob:
+#         excess = cb - ob
+#         json_string = json_string.rstrip()[:-excess]
+#         logger.debug(f"Stripped {excess} extra closing brace(s).")
+#     # --- 8) Escape literal newlines in *all* string values ---
+#     json_string = re.sub(
+#         r'"((?:[^"\\]|\\.)*?)"',
+#         lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
+#         json_string,
+#         flags=re.DOTALL
+#     )
+#     logger.debug("Escaped newlines in strings.")
+#     # --- 9) Final parse attempt ---
+#     try:
+#         return json.loads(json_string)
+#     except json.JSONDecodeError:
+#         logger.error("Sanitized JSON still invalid:\n%s", json_string)
+#         raise
def extract_json_from_llm_response(raw_response: str) -> dict:
    """
    Extract and parse a JSON object from a raw LLM response string.

    Two strategies are tried in order:
      1. The first ``` / ```json fenced markdown block whose content starts
         with '{' and ends with '}'.
      2. The JSON object that begins at the first '{' in the response. Only
         that one object is decoded, so explanatory text after it (even text
         containing braces) does not break parsing.

    Args:
        raw_response: The raw text returned by the LLM.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        json.JSONDecodeError: If the response is not a string, contains no
            '{' ... '}' pair, or the candidate object cannot be parsed.
            Callers catch this to invoke the JSON corrector agent.
    """
    if not isinstance(raw_response, str):
        # Surface bad input as JSONDecodeError (not TypeError) so that callers'
        # existing `except json.JSONDecodeError` recovery path still runs.
        logger.error(
            "LLM response is not a string (got %s).",
            type(raw_response).__name__,
        )
        raise json.JSONDecodeError("No valid JSON object found in the response.", "", 0)

    logger.debug("Attempting to extract JSON from raw LLM response...")

    # 1. Look for a JSON markdown block first
    match = re.search(r"```(?:json)?\s*({[\s\S]*?})\s*```", raw_response)
    if match:
        logger.debug("Found JSON inside a markdown block.")
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError as e:
            logger.warning("Failed to parse JSON from markdown block: %s", e)
            # Fall through to the next method if parsing fails

    # 2. If no block is found (or it failed), decode from the first brace
    logger.debug("Markdown block not found or failed. Searching for outermost braces.")
    first_brace = raw_response.find('{')
    if first_brace == -1 or raw_response.rfind('}') < first_brace:
        logger.error("Could not find a valid JSON structure (outermost braces).")
        raise json.JSONDecodeError("No valid JSON object found in the response.", raw_response, 0)

    try:
        # raw_decode stops at the end of the first complete JSON value, so any
        # trailing text after the object is ignored instead of being parsed.
        parsed, _end = json.JSONDecoder().raw_decode(raw_response, first_brace)
    except json.JSONDecodeError as e:
        logger.error("Final JSON parsing attempt failed: %s", e)
        # Re-raise the exception to be caught by the calling logic (to invoke the corrector agent)
        raise
    return parsed
 def reduce_image_size_to_limit(clean_b64_str: str, max_kb: int = 4000) -> str:
     except json.JSONDecodeError as error_json:
         # If JSON parsing fails, use the json resolver agent
+        # correction_prompt = (
+        #     "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
+        #     "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
+        #     f"- **Error Details**: {error_json}\n\n"
+        #     "**Strict Instructions for your response:**\n"
+        #     "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
+        #     "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
+        #     "3. No trailing commas. Correct nesting.\n\n"
+        #     "Here is the problematic JSON string to correct:\n"
+        #     f"```json\n{llm_output_raw}\n```\n"
+        #     "Corrected JSON:\n"
+        # )
+        correction_prompt = f"""
+Fix this malformed response and return only the corrected JSON:
+Input: {llm_output_raw if 'llm_output_raw' in locals() else 'No response available'}
+Extract the sprite name and pseudocode, then return in this exact format:
+{{
+  "refined_logic": {{
+    "name_variable": "sprite_name",
+    "pseudocode": "pseudocode_here"
+  }}
+}}
+"""
         try:
             correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
             corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)