Spaces:

ReviewGrounder
/

GradioDemo

Sleeping

GradioDemo / shared /utils /json_parser.py

eigentom

Initial Update

90c099b about 1 month ago

15.7 kB

	"""
	Robust JSON parsing utilities for LLM responses
	"""
	import json
	import re
	from typing import Any, Dict, List, Optional


	def extract_json_from_text(text: str) -> Optional[str]:
	    """
	    Extract JSON from text by removing markdown code block markers

	    Candidates are tried in order and the first one accepted by
	    ``json.loads`` is returned:

	    1. the whole text (stripped);
	    2. the text with a leading fence (optionally labelled ``json``, any
	       case) removed, plus the trailing fence when one is present;
	    3. the body of each fenced code block found anywhere in the text, so
	       that prose before or after the block does not prevent extraction.

	    Args:
	        text: Text that may contain JSON in markdown code blocks or plain JSON

	    Returns:
	        Extracted JSON string (always parseable by ``json.loads``) or None
	        if not found
	    """
	    if not text:
	        return None

	    stripped = text.strip()
	    candidates = [stripped]

	    # The whole response is a single fenced block; the closing fence is
	    # optional because truncated LLM output frequently omits it.
	    if stripped.startswith('```'):
	        body = stripped[3:]
	        if body[:4].lower() == 'json':
	            body = body[4:]
	        if body.endswith('```'):
	            body = body[:-3]
	        candidates.append(body.strip())

	    # Fenced blocks embedded in surrounding prose. The optional prefix is
	    # either a full info line ("json\n", "javascript\n", "\n") or a bare
	    # "json" label directly followed by the payload.
	    fenced_block = r'```(?:[\w+-]*[ \t]*\r?\n|json)?(.*?)```'
	    candidates.extend(
	        match.group(1).strip()
	        for match in re.finditer(fenced_block, stripped, re.DOTALL | re.IGNORECASE)
	    )

	    for candidate in candidates:
	        if not candidate:
	            continue
	        try:
	            json.loads(candidate)
	        except json.JSONDecodeError:
	            continue
	        return candidate

	    return None


	def parse_json_response(text: str, fallback: Any = None) -> Any:
	    """
	    Parse JSON from LLM response with robust error handling

	    The response is first handed to ``extract_json_from_text``. When that
	    yields nothing usable, the most likely payload (the body of the first
	    fenced code block, or else the whole text) is passed through
	    ``fix_json_common_issues`` and parsed again, so near-JSON such as
	    ``{"a": 1,}`` is still recovered.

	    Args:
	        text: LLM response text
	        fallback: Fallback value if parsing fails

	    Returns:
	        Parsed JSON object or fallback
	    """
	    if not text:
	        return fallback

	    json_str = extract_json_from_text(text)

	    if json_str is not None:
	        try:
	            return json.loads(json_str)
	        except json.JSONDecodeError:
	            candidate = json_str
	    else:
	        # Nothing parsed cleanly: isolate the payload ourselves so the
	        # repair step below actually gets a chance to run.
	        stripped = text.strip()
	        fenced = re.search(
	            r'```(?:[\w+-]*[ \t]*\r?\n|json)?(.*?)(?:```|\Z)',
	            stripped,
	            re.DOTALL | re.IGNORECASE,
	        )
	        candidate = fenced.group(1).strip() if fenced else stripped

	    # Try to fix common JSON issues before giving up
	    try:
	        return json.loads(fix_json_common_issues(candidate))
	    except json.JSONDecodeError:
	        return fallback


	def fix_json_common_issues(json_str: str) -> str:
	    """
	    Fix common JSON formatting issues

	    Repairs, applied in this order and never inside double-quoted string
	    literals (so URLs such as ``"http://..."`` are left intact):

	    1. remove ``// line`` and ``/* block */`` comments;
	    2. turn single-quoted keys (``'key':``) into double-quoted keys;
	    3. remove trailing commas before ``}`` or ``]``.

	    Comments are removed first because a comment may sit between a
	    trailing comma and its closing bracket.

	    Args:
	        json_str: JSON string that may have issues

	    Returns:
	        Fixed JSON string
	    """
	    string_literal = r'"(?:\\.|[^"\\])*"'

	    def rewrite_outside_strings(pattern: str, replacement: str, source: str) -> str:
	        # String literals are listed first in the alternation so they are
	        # consumed whole and echoed back unchanged.
	        combined = re.compile(string_literal + '|(?:' + pattern + ')', re.DOTALL)

	        def substitute(match):
	            token = match.group(0)
	            if token.startswith('"'):
	                return token
	            return match.expand(replacement)

	        return combined.sub(substitute, source)

	    # Remove comments (basic)
	    json_str = rewrite_outside_strings(r'//[^\n]*|/\*.*?\*/', '', json_str)

	    # Fix single quotes to double quotes (basic, keys only)
	    json_str = rewrite_outside_strings(r"'(\w+)':", r'"\1":', json_str)

	    # Remove trailing commas
	    json_str = rewrite_outside_strings(r',(?=\s*[}\]])', '', json_str)

	    return json_str


	def parse_keywords_json(response: str, max_keywords: int = 5) -> List[str]:
	    """
	    Parse keywords from JSON response

	    Expected format:
	        {"keywords": ["keyword1", "keyword2", ...]}
	    or
	        ["keyword1", "keyword2", ...]

	    Entries that are not strings, or that are blank after stripping
	    whitespace, are dropped so the result really is a list of strings.

	    Args:
	        response: LLM response text
	        max_keywords: Maximum number of keywords to return (default 5)

	    Returns:
	        List of keywords, or empty list if parsing fails
	    """
	    if response is None:
	        return []

	    parsed = parse_json_response(response, fallback=None)

	    # Handle dict format: {"keywords": [...]}
	    if isinstance(parsed, dict):
	        parsed = parsed.get("keywords")

	    # Handle list format: ["keyword1", "keyword2", ...]
	    if not isinstance(parsed, list):
	        return []

	    keywords = [
	        item.strip()
	        for item in parsed
	        if isinstance(item, str) and item.strip()
	    ]
	    # Clamp so that a negative limit yields an empty list, not a tail slice
	    return keywords[:max(max_keywords, 0)]


	def parse_summary_json(response: str) -> str:
	    """
	    Parse summary from JSON response

	    Expected format:
	        {"summary": "summary text"}
	    or
	        {"text": "summary text", "summary": "summary text"}

	    The keys "summary", "text", "content" and "description" are tried in
	    that order; the first one holding a non-null, non-blank value wins.
	    When the response is not a JSON object, or none of the keys is usable,
	    the stripped raw response is returned instead.

	    Args:
	        response: LLM response text

	    Returns:
	        Summary text (empty string when response is None)
	    """
	    if response is None:
	        return ""

	    parsed = parse_json_response(response, fallback=None)

	    if isinstance(parsed, dict):
	        # Try different possible keys
	        for key in ("summary", "text", "content", "description"):
	            value = parsed.get(key)
	            if value is None:
	                # A JSON null must not be returned as the text "None"
	                continue
	            summary = str(value).strip()
	            if summary:
	                return summary

	    # Fallback to text parsing
	    return response.strip()


	def parse_review_json(response: str, review_format: str = "detailed") -> Dict[str, Any]:
	    """
	    Parse review from JSON or markdown response

	    Expected formats:
	    - JSON: {"summary": "...", "soundness": 5, ...}
	    - Markdown: ## Summary\\n\\n...\\n## Soundness\\n\\n...

	    Args:
	        response: LLM response text (JSON or markdown)
	        review_format: Review format type (detailed, summary, structured).
	            Accepted for interface compatibility; this function does not
	            currently read it.

	    Returns:
	        Review dictionary with parsed fields. It always contains a
	        "review" key; for JSON input that lacks one, and for plain text,
	        it holds the stripped raw response.
	    """
	    if response is None:
	        return {"review": ""}

	    # First try to parse as JSON
	    parsed = parse_json_response(response, fallback=None)

	    if isinstance(parsed, dict):
	        # JSON format - ensure it has required fields
	        parsed.setdefault("review", response.strip())
	        return parsed

	    # If not JSON, try to parse as markdown
	    if "##" in response:
	        markdown_parsed = parse_review_markdown(response)
	        if len(markdown_parsed) > 1:  # More than just "review" field
	            return markdown_parsed

	    # Fallback to text parsing
	    return {"review": response.strip()}


	def parse_review_markdown(markdown_text: str) -> Dict[str, Any]:
	    """
	    Parse review from markdown format with sections like:
	        ## Summary
	        ...
	        ## Soundness
	        ...
	    etc.

	    A section starts at a line beginning with two or more ``#`` and runs
	    until the next such line (or the end of the text). The header is
	    matched case-insensitively by substring against the rules below; the
	    first matching rule wins. Text sections store their stripped content;
	    score sections store a float and are omitted when no number is found.

	    Args:
	        markdown_text: Markdown formatted review text

	    Returns:
	        Review dictionary with parsed fields. "review" always holds the
	        stripped input; the other possible keys are "summary",
	        "soundness", "presentation", "contribution", "strengths",
	        "weaknesses", "questions", "rating", "confidence" and "decision".
	    """
	    markdown_text = markdown_text or ""
	    review_dict: Dict[str, Any] = {"review": markdown_text.strip()}

	    # (substring looked for in the header, output key, is a numeric score).
	    # Order is significant: it is the precedence when a header matches
	    # several substrings.
	    section_rules = (
	        ("summary", "summary", False),
	        ("soundness", "soundness", True),
	        ("presentation", "presentation", True),
	        ("contribution", "contribution", True),
	        ("strength", "strengths", False),
	        ("weakness", "weaknesses", False),
	        ("question", "questions", False),
	        ("rating", "rating", True),
	        ("confidence", "confidence", True),
	        ("decision", "decision", False),
	    )

	    headers = list(
	        re.finditer(r'^#{2,}[ \t]*([^\n]*?)[ \t]*$', markdown_text, re.MULTILINE)
	    )

	    for index, header in enumerate(headers):
	        if index + 1 < len(headers):
	            content_end = headers[index + 1].start()
	        else:
	            content_end = len(markdown_text)

	        # Normalize section name (case-insensitive, remove extra spaces)
	        section_name = header.group(1).strip().lower()
	        section_content = markdown_text[header.end():content_end].strip()

	        for keyword, key, is_score in section_rules:
	            if keyword not in section_name:
	                continue
	            # A header naming both "rating" and "confidence" is the
	            # confidence section, not the overall rating.
	            if keyword == "rating" and "confidence" in section_name:
	                continue

	            if is_score:
	                score_val = _extract_section_score(section_content)
	                if score_val is not None:
	                    review_dict[key] = score_val  # Keep as float
	            else:
	                review_dict[key] = section_content
	            break

	    return review_dict


	def _extract_section_score(section_content: str) -> Optional[float]:
	    """
	    Extract a numeric score from the body of a review section

	    Looks for a number at the start of the first line once backticks and
	    asterisks are removed; this covers both a bare value ("3.5") and the
	    numerator of a fraction ("3 / 5"). Failing that, looks anywhere in the
	    section for "score" or "rating" followed by ":" or "=" and a number.

	    Args:
	        section_content: Text of the section, without its header line

	    Returns:
	        The score as a float, or None if no number was found
	    """
	    first_line = section_content.split('\n', 1)[0]
	    first_line_clean = re.sub(r'[`*]', '', first_line).strip()

	    leading_number = re.match(r'\d+(?:\.\d+)?', first_line_clean)
	    if leading_number:
	        return float(leading_number.group(0))

	    labelled_number = re.search(
	        r'(?:score|rating)\s*[:=]\s*(\d+(?:\.\d+)?)',
	        section_content,
	        re.IGNORECASE,
	    )
	    if labelled_number:
	        return float(labelled_number.group(1))

	    return None