prthm11 committed on
Commit
31c7d88
·
verified ·
1 Parent(s): b3ddef3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -86
app.py CHANGED
@@ -222,21 +222,36 @@ Your task is to process OCR-extracted text from images of Scratch 3.0 code block
222
 
223
  """
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  SYSTEM_PROMPT_JSON_CORRECTOR = """
226
- You are an assistant that outputs JSON responses strictly following the given schema.
227
- If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
228
- Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
229
-
230
- If you receive an invalid or incomplete JSON response, fix it by:
231
- - Adding any missing required fields with appropriate values.
232
- - Correcting syntax errors such as missing commas, brackets, or quotes.
233
- - Ensuring the JSON structure matches the schema exactly.
234
- - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
235
- - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
236
- - Never output explanations, comments, or extra text — only the corrected JSON.
237
- - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
238
-
239
- Remember: Your output must be valid JSON only, ready to be parsed without errors.
240
  """
241
 
242
  # debugger and resolver agent for Scratch 3.0
@@ -602,69 +617,101 @@ stack_opcodes_functionalities = "\n".join([
602
  ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
603
 
604
  # Helper function to extract JSON from LLM response
605
- def extract_json_from_llm_response(raw_response: str) -> dict:
606
- # --- 1) Pull out the JSON code‑block if present ---
607
- md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
608
- json_string = md.group(1).strip() if md else raw_response
609
-
610
- # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
611
- first, last = json_string.find('{'), json_string.rfind('}')
612
- if 0 <= first < last:
613
- json_string = json_string[first:last+1]
614
-
615
- # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
616
- json_string = re.sub(r'\b\w+\s*{', '{', json_string)
617
- json_string = re.sub(r'"assistant"\s*:', '', json_string)
618
- json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
619
- logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
620
-
621
- # --- 3.1) Fix stray inner quotes at start of name/list values ---
622
- # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
623
- json_string = re.sub(
624
- r'("name"\s*:\s*")\s*"',
625
- r'\1',
626
- json_string
627
- )
628
 
629
- # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
630
- def _esc(m):
631
- prefix, body = m.group(1), m.group(2)
632
- return prefix + body.replace('"', r'\"')
633
- json_string = re.sub(
634
- r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
635
- _esc,
636
- json_string
637
- )
638
- logger.debug("Escaped embedded quotes in logic fields.")
639
-
640
- logger.debug("Quoted unquoted keys.")
641
-
642
- # --- 6) Remove trailing commas before } or ] ---
643
- json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
644
- json_string = re.sub(r',\s*,', ',', json_string)
645
- logger.debug("Removed trailing commas.")
646
-
647
- # --- 7) Balance braces: drop extra } at end if needed ---
648
- ob, cb = json_string.count('{'), json_string.count('}')
649
- if cb > ob:
650
- excess = cb - ob
651
- json_string = json_string.rstrip()[:-excess]
652
- logger.debug(f"Stripped {excess} extra closing brace(s).")
653
-
654
- # --- 8) Escape literal newlines in *all* string values ---
655
- json_string = re.sub(
656
- r'"((?:[^"\\]|\\.)*?)"',
657
- lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
658
- json_string,
659
- flags=re.DOTALL
660
- )
661
- logger.debug("Escaped newlines in strings.")
 
 
 
 
 
 
 
 
 
 
 
 
662
 
663
- # --- 9) Final parse attempt ---
 
 
 
 
 
 
 
 
 
 
 
 
664
  try:
665
- return json.loads(json_string)
666
- except json.JSONDecodeError:
667
- logger.error("Sanitized JSON still invalid:\n%s", json_string)
 
 
 
 
 
 
 
 
668
  raise
669
 
670
  def reduce_image_size_to_limit(clean_b64_str: str, max_kb: int = 4000) -> str:
@@ -1096,18 +1143,29 @@ If you find any "Code-Blocks" then,
1096
 
1097
  except json.JSONDecodeError as error_json:
1098
  # If JSON parsing fails, use the json resolver agent
1099
- correction_prompt = (
1100
- "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
1101
- "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
1102
- f"- **Error Details**: {error_json}\n\n"
1103
- "**Strict Instructions for your response:**\n"
1104
- "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
1105
- "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
1106
- "3. No trailing commas. Correct nesting.\n\n"
1107
- "Here is the problematic JSON string to correct:\n"
1108
- f"```json\n{llm_output_raw}\n```\n"
1109
- "Corrected JSON:\n"
1110
- )
 
 
 
 
 
 
 
 
 
 
 
1111
  try:
1112
  correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
1113
  corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)
 
222
 
223
  """
224
 
225
+ # SYSTEM_PROMPT_JSON_CORRECTOR = """
226
+ # You are an assistant that outputs JSON responses strictly following the given schema.
227
+ # If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
228
+ # Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
229
+
230
+ # If you receive an invalid or incomplete JSON response, fix it by:
231
+ # - Adding any missing required fields with appropriate values.
232
+ # - Correcting syntax errors such as missing commas, brackets, or quotes.
233
+ # - Ensuring the JSON structure matches the schema exactly.
234
+ # - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
235
+ # - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
236
+ # - Never output explanations, comments, or extra text — only the corrected JSON.
237
+ # - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
238
+
239
+ # Remember: Your output must be valid JSON only, ready to be parsed without errors.
240
+ # """
241
  SYSTEM_PROMPT_JSON_CORRECTOR = """
242
+ You are a JSON correction assistant. Your ONLY task is to fix malformed JSON and return it in the correct format.
243
+ REQUIRED OUTPUT FORMAT:
244
+ {
245
+ "refined_logic": {
246
+ "name_variable": "sprite_name_here",
247
+ "pseudocode": "pseudocode_string_here"
248
+ }
249
+ }
250
+ RULES:
251
+ 1. Extract the sprite name and pseudocode from the input
252
+ 2. Return ONLY valid JSON in the exact format above
253
+ 3. No explanations, no extra text, no other fields
254
+ 4. If you can't find the data, use "Unknown" for name_variable and "No pseudocode found" for pseudocode
 
255
  """
256
 
257
  # debugger and resolver agent for Scratch 3.0
 
617
  ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
618
 
619
  # Helper function to extract JSON from LLM response
620
+ # def extract_json_from_llm_response(raw_response: str) -> dict:
621
+ # # --- 1) Pull out the JSON code‑block if present ---
622
+ # md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
623
+ # json_string = md.group(1).strip() if md else raw_response
624
+
625
+ # # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
626
+ # first, last = json_string.find('{'), json_string.rfind('}')
627
+ # if 0 <= first < last:
628
+ # json_string = json_string[first:last+1]
629
+
630
+ # # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
631
+ # json_string = re.sub(r'\b\w+\s*{', '{', json_string)
632
+ # json_string = re.sub(r'"assistant"\s*:', '', json_string)
633
+ # json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
634
+ # logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
635
+
636
+ # # --- 3.1) Fix stray inner quotes at start of name/list values ---
637
+ # # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
638
+ # json_string = re.sub(
639
+ # r'("name"\s*:\s*")\s*"',
640
+ # r'\1',
641
+ # json_string
642
+ # )
643
 
644
+ # # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
645
+ # def _esc(m):
646
+ # prefix, body = m.group(1), m.group(2)
647
+ # return prefix + body.replace('"', r'\"')
648
+ # json_string = re.sub(
649
+ # r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
650
+ # _esc,
651
+ # json_string
652
+ # )
653
+ # logger.debug("Escaped embedded quotes in logic fields.")
654
+
655
+ # logger.debug("Quoted unquoted keys.")
656
+
657
+ # # --- 6) Remove trailing commas before } or ] ---
658
+ # json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
659
+ # json_string = re.sub(r',\s*,', ',', json_string)
660
+ # logger.debug("Removed trailing commas.")
661
+
662
+ # # --- 7) Balance braces: drop extra } at end if needed ---
663
+ # ob, cb = json_string.count('{'), json_string.count('}')
664
+ # if cb > ob:
665
+ # excess = cb - ob
666
+ # json_string = json_string.rstrip()[:-excess]
667
+ # logger.debug(f"Stripped {excess} extra closing brace(s).")
668
+
669
+ # # --- 8) Escape literal newlines in *all* string values ---
670
+ # json_string = re.sub(
671
+ # r'"((?:[^"\\]|\\.)*?)"',
672
+ # lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
673
+ # json_string,
674
+ # flags=re.DOTALL
675
+ # )
676
+ # logger.debug("Escaped newlines in strings.")
677
+
678
+ # # --- 9) Final parse attempt ---
679
+ # try:
680
+ # return json.loads(json_string)
681
+ # except json.JSONDecodeError:
682
+ # logger.error("Sanitized JSON still invalid:\n%s", json_string)
683
+ # raise
684
+ def extract_json_from_llm_response(raw_response: str) -> dict:
685
+ """
686
+ Finds and parses the first valid JSON object from a raw LLM response string.
687
+ """
688
+ logger.debug("Attempting to extract JSON from raw LLM response...")
689
 
690
+ # 1. Look for a JSON markdown block first
691
+ match = re.search(r"```(?:json)?\s*({[\s\S]*?})\s*```", raw_response)
692
+ if match:
693
+ json_string = match.group(1)
694
+ logger.debug("Found JSON inside a markdown block.")
695
+ try:
696
+ return json.loads(json_string)
697
+ except json.JSONDecodeError as e:
698
+ logger.warning(f"Failed to parse JSON from markdown block: {e}")
699
+ # Fall through to the next method if parsing fails
700
+
701
+ # 2. If no block is found (or it failed), find the outermost braces
702
+ logger.debug("Markdown block not found or failed. Searching for outermost braces.")
703
  try:
704
+ first_brace = raw_response.find('{')
705
+ last_brace = raw_response.rfind('}')
706
+ if first_brace != -1 and last_brace != -1 and first_brace < last_brace:
707
+ json_string = raw_response[first_brace : last_brace + 1]
708
+ return json.loads(json_string)
709
+ else:
710
+ logger.error("Could not find a valid JSON structure (outermost braces).")
711
+ raise json.JSONDecodeError("No valid JSON object found in the response.", raw_response, 0)
712
+ except json.JSONDecodeError as e:
713
+ logger.error(f"Final JSON parsing attempt failed: {e}")
714
+ # Re-raise the exception to be caught by the calling logic (to invoke the corrector agent)
715
  raise
716
 
717
  def reduce_image_size_to_limit(clean_b64_str: str, max_kb: int = 4000) -> str:
 
1143
 
1144
  except json.JSONDecodeError as error_json:
1145
  # If JSON parsing fails, use the json resolver agent
1146
+ # correction_prompt = (
1147
+ # "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
1148
+ # "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
1149
+ # f"- **Error Details**: {error_json}\n\n"
1150
+ # "**Strict Instructions for your response:**\n"
1151
+ # "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
1152
+ # "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
1153
+ # "3. No trailing commas. Correct nesting.\n\n"
1154
+ # "Here is the problematic JSON string to correct:\n"
1155
+ # f"```json\n{llm_output_raw}\n```\n"
1156
+ # "Corrected JSON:\n"
1157
+ # )
1158
+ correction_prompt = f"""
1159
+ Fix this malformed response and return only the corrected JSON:
1160
+ Input: {llm_output_raw if 'llm_output_raw' in locals() else 'No response available'}
1161
+ Extract the sprite name and pseudocode, then return in this exact format:
1162
+ {{
1163
+ "refined_logic": {{
1164
+ "name_variable": "sprite_name",
1165
+ "pseudocode": "pseudocode_here"
1166
+ }}
1167
+ }}
1168
+ """
1169
  try:
1170
  correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
1171
  corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)