Spaces:

C2MV
/

letxinet

Runtime error

App Files Files Community

letxinet / backend /utils.py

C2MV

Initial upload for Build Small Hackathon

68fb5e2 verified 17 days ago

Raw

History Blame Contribute Delete

7.03 kB

	"""
	Robust utilities for the research pipeline
	- JSON parsing with multiple fallback layers
	- Retry with exponential backoff
	- Model fallback chain
	- Content cleaning
	"""

	import json
	import re
	import asyncio
	import time
	from typing import Optional, Any


	def robust_json_parse(text: str) -> Optional[dict]:
	    """Parse a JSON object out of free-form model output, layer by layer.

	    Layers, tried in order (numbering kept from the original implementation):

	    1. Parse the whole text directly.
	    2. Strip a surrounding markdown code fence and parse the remainder.
	    3. Parse the span from the first ``{`` to the last ``}``.
	    4. Parse the span from the first ``[`` to the last ``]``; a list is
	       wrapped as ``{"plan": <list>}``.
	    6. Scan for the first balanced ``{...}`` span that parses.
	    7. Wrap the raw text as a single "Research Report" section.

	    The former layer 5 (alias lookup) re-parsed text that had already failed
	    to yield a dict in layers 1-2, so it could never return and is omitted.

	    Args:
	        text: Raw response text. When it starts with a code fence, the fence
	            is stripped before the later layers run.

	    Returns:
	        ``None`` for empty or whitespace-only input, otherwise a dict. Only
	        dicts are returned from the parsing layers; a fenced JSON list falls
	        through to layer 4 and is wrapped.
	    """
	    if not text or not text.strip():
	        return None

	    text = text.strip()

	    # Layer 1: direct parse.
	    parsed = _try_json_loads(text)
	    if isinstance(parsed, dict):
	        return parsed

	    # Layer 2: strip markdown code fences.
	    if text.startswith("```"):
	        # Drop the opening fence line (it may carry a language tag). A fence
	        # without any newline has no such line, so only the backticks go;
	        # indexing the split result there used to raise IndexError.
	        _, newline, remainder = text.partition("\n")
	        fenced_body = remainder if newline else text[3:]
	        text = fenced_body.rsplit("```", 1)[0].strip()
	        parsed = _try_json_loads(text)
	        if isinstance(parsed, dict):
	            return parsed

	    # Layer 3: first "{" to last "}" (the main JSON object).
	    start = text.find("{")
	    end = text.rfind("}") + 1
	    if start >= 0 and end > start:
	        parsed = _try_json_loads(text[start:end])
	        if isinstance(parsed, dict):
	            return parsed

	    # Layer 4: first "[" to last "]"; a bare list is treated as the plan.
	    start = text.find("[")
	    end = text.rfind("]") + 1
	    if start >= 0 and end > start:
	        parsed = _try_json_loads(text[start:end])
	        if isinstance(parsed, list):
	            return {"plan": parsed}

	    # Layer 6: first balanced {...} span that parses.
	    depth = 0
	    json_start = -1
	    for i, c in enumerate(text):
	        if c == "{":
	            if depth == 0:
	                json_start = i
	            depth += 1
	        elif c == "}" and depth > 0:
	            # Unmatched "}" are ignored so depth never goes negative, which
	            # would stop any later "{" from being seen as a span start.
	            depth -= 1
	            if depth == 0 and json_start >= 0:
	                parsed = _try_json_loads(text[json_start:i + 1])
	                if isinstance(parsed, dict):
	                    return parsed
	                json_start = -1

	    # Layer 7: last resort - wrap the text as a single section.
	    return {"plan": [{"section": "Research Report", "content": text[:5000]}], "summary": text[:500]}


	def _try_json_loads(candidate: str) -> Any:
	    """Return the decoded JSON value of *candidate*, or None if decoding fails."""
	    try:
	        return json.loads(candidate)
	    except (ValueError, RecursionError):
	        # json.JSONDecodeError is a ValueError; RecursionError covers
	        # pathologically nested input. Both mean "this layer did not match".
	        return None


	def clean_agent_content(text: str) -> str:
	    """Remove think tags, monologue prefixes, and repeated lines from AI output.

	    Steps, in order:

	    1. Delete every ``<think>...</think>`` block (contents included).
	    2. Strip a known conversational lead-in (English or Spanish) from the
	       start of the text, together with a following colon and whitespace.
	    3. Drop any non-blank line whose stripped, lower-cased form has already
	       appeared earlier; blank lines are always kept.

	    Args:
	        text: Raw agent output; a falsy value yields ``""``.

	    Returns:
	        The cleaned text with leading and trailing whitespace removed.
	    """
	    if not text:
	        return ""

	    # Remove think tags. Repeat until stable: deleting one block can splice
	    # the surrounding characters into a new <think>...</think> pair.
	    while True:
	        without_think = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
	        if without_think == text:
	            break
	        text = without_think

	    # Remove monologue prefixes
	    prefixes = [
	        "Here is", "Here's", "Below is", "The following", "I'll",
	        "Let me", "Sure", "Okay", "Alright", "Certainly",
	        "Claro", "Aquí está", "A continuación", "Voy a"
	    ]
	    for prefix in prefixes:
	        if not text.startswith(prefix):
	            continue
	        # Only strip whole words: without this check "Surely ..." would be
	        # truncated to "ly ..." and "Okayama" to "ama".
	        if text[len(prefix):len(prefix) + 1].isalnum():
	            continue
	        text = text[len(prefix):].lstrip(":").lstrip()

	    # Remove repeated lines (loops)
	    unique_lines = []
	    seen = set()
	    for line in text.split("\n"):
	        normalized = line.strip().lower()
	        if normalized and normalized in seen:
	            continue
	        seen.add(normalized)
	        unique_lines.append(line)

	    return "\n".join(unique_lines).strip()


	def strip_latex(text: str) -> str:
	    """Remove LaTeX math and commands from text.

	    Removal order matters and is:

	    1. Display math ``$$...$$`` (may span lines).
	    2. Inline math ``$...$``.
	    3. Commands with one brace argument, ``\\command{arg}`` (argument
	       included).
	    4. Remaining bare commands, ``\\command``.

	    Math goes first so that commands inside it cannot be stripped on their
	    own and leave an empty ``$$`` behind, and display math precedes inline
	    math so the inline pattern cannot consume the inner half of a ``$$``
	    pair.

	    Args:
	        text: Text that may contain LaTeX markup.

	    Returns:
	        The text with the markup removed and outer whitespace stripped.
	    """
	    text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)  # Remove display math
	    text = re.sub(r'\$[^$]+\$', '', text)  # Remove inline math
	    # A single escaped backslash: the previous pattern required two literal
	    # backslashes and so never matched an ordinary \command{arg}.
	    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{arg}
	    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove \command
	    return text.strip()


	def sanitize_latex(text: str) -> str:
	    """Escape LaTeX special characters so the text can be embedded in LaTeX.

	    Each of ``& % $ # _ { }`` is prefixed with a backslash. Backslashes
	    already present in the input are left untouched.
	    """
	    # One translation table instead of seven chained replace() calls; the
	    # result is the same because no replacement introduces another special
	    # character from the set.
	    escape_table = str.maketrans({char: "\\" + char for char in "&%$#_{}"})
	    return text.translate(escape_table)


	def normalize_boolean(val: Any) -> Any:
	    """Map boolean-like strings to ``True``/``False``.

	    Non-string values are returned unchanged. Strings are stripped and
	    lower-cased first: "true", "yes", "1", "on" give ``True``; "false",
	    "no", "0", "off" and the empty string give ``False``. Any other string
	    is returned in its stripped, lower-cased form.
	    """
	    if not isinstance(val, str):
	        return val

	    token = val.strip().lower()
	    if token in {"true", "yes", "1", "on"}:
	        return True
	    if token in {"false", "no", "0", "off", ""}:
	        return False
	    return token


	def clean_stop_words(text: str, stop_words: Optional[list] = None) -> str:
	    """Remove stop words from whitespace-separated text.

	    Matching is case-insensitive on both sides: each token and each stop
	    word is lower-cased before comparison. Tokens are compared whole, so
	    attached punctuation ("the,") prevents a match.

	    Args:
	        text: Input text; it is split on any whitespace.
	        stop_words: Words to remove. ``None`` or an empty list selects the
	            built-in English/Spanish list.

	    Returns:
	        The remaining tokens joined by single spaces.
	    """
	    default_stops = [
	        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
	        "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
	        "el", "la", "los", "las", "un", "una", "y", "o", "pero", "en", "de",
	        "del", "con", "por", "para", "como", "que", "se", "su", "al"
	    ]
	    words = stop_words or default_stops
	    # Lower-case the stop words too (tokens are lower-cased below), and use a
	    # set so each membership test is O(1). Non-string entries can never
	    # equal a token, so they are skipped.
	    blocked = {word.lower() for word in words if isinstance(word, str)}
	    return " ".join(token for token in text.split() if token.lower() not in blocked)


	async def with_retry(func, retries: int = 2, delay: float = 1.0, backoff: float = 2.0):
	    """Await ``func()`` and retry on failure with exponential backoff.

	    Args:
	        func: Zero-argument callable returning an awaitable.
	        retries: Number of extra attempts after the first one. Negative
	            values are treated as 0 so at least one attempt is always made.
	        delay: Seconds to wait before the first retry.
	        backoff: Multiplier applied to the wait per retry; the wait before
	            retry ``k`` (0-based) is ``delay * backoff ** k``.

	    Returns:
	        Whatever the first successful ``func()`` call returns.

	    Raises:
	        Exception: The error from the final attempt once all attempts fail.
	    """
	    # Clamp so the loop always runs; with a negative count the loop body was
	    # skipped entirely and the function ended in ``raise None`` (TypeError).
	    max_retries = max(retries, 0)
	    last_error = None
	    for attempt in range(max_retries + 1):
	        try:
	            return await func()
	        except Exception as e:
	            last_error = e
	            # No sleep after the final attempt; the error is re-raised below.
	            if attempt < max_retries:
	                await asyncio.sleep(delay * (backoff ** attempt))
	    raise last_error


	def extract_research_plan(text: str) -> dict:
	    """Build a ``{"plan": ..., "summary": ...}`` dict from a model response.

	    The response is parsed with ``robust_json_parse``. A parsed result that
	    already has a "plan" key is returned as is. Otherwise, the first alias
	    key (sections, structure, outline, ...) holding a list is promoted to
	    "plan". If neither applies, the raw text is wrapped as a single
	    "Research Report" section.
	    """
	    parsed = robust_json_parse(text)

	    # Preferred shape: the parser already produced a plan.
	    if parsed and "plan" in parsed:
	        return parsed

	    # Otherwise look for a list stored under one of the known alias keys.
	    if isinstance(parsed, dict):
	        alias_keys = ("sections", "structure", "outline", "document", "research", "chapters", "content")
	        for alias in alias_keys:
	            candidate = parsed.get(alias)
	            if isinstance(candidate, list):
	                return {"plan": candidate, "summary": parsed.get("summary", "")}

	    # Nothing usable: wrap the (truncated) text as one section.
	    fallback_section = {"section": "Research Report", "content": text[:5000]}
	    return {"plan": [fallback_section], "summary": text[:500]}


	def is_plan_weak(plan: dict) -> bool:
	    """Return True when a plan is too thin and should be regenerated.

	    A plan is weak when any of the following holds:

	    - ``plan`` is not a dict, or ``plan["plan"]`` is not a list;
	    - it has fewer than 2 items;
	    - an item is not a dict;
	    - an item's "section" is missing, empty, not a string, or shorter than
	      5 characters.

	    Args:
	        plan: Dict expected to hold a list of section dicts under "plan".

	    Returns:
	        ``True`` if the plan is weak, ``False`` otherwise.
	    """
	    if not isinstance(plan, dict):
	        return True

	    items = plan.get("plan", [])
	    if not isinstance(items, list) or len(items) < 2:
	        return True

	    for item in items:
	        if not isinstance(item, dict):
	            return True
	        section = item.get("section", "")
	        # A non-string name (e.g. an int) is treated as weak; it used to
	        # reach len() and raise TypeError.
	        if not isinstance(section, str) or len(section) < 5:
	            return True
	    return False