letxinet / backend /utils.py
C2MV's picture
Initial upload for Build Small Hackathon
68fb5e2 verified
Raw
History Blame Contribute Delete
7.03 kB
"""
Robust utilities for the research pipeline
- JSON parsing with multiple fallback layers
- Retry with exponential backoff
- Model fallback chain
- Content cleaning
"""
import json
import re
import asyncio
import time
from typing import Optional, Any
def _try_json(candidate: str) -> Any:
    """Decode *candidate* as JSON, returning None when it does not parse."""
    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError (malformed input);
        # RecursionError covers pathologically deep nesting.
        return None


def robust_json_parse(text: str) -> Optional[dict]:
    """Parse a JSON object out of free-form model output.

    Strategies are tried in order; the first one that yields a result wins:

    1. Parse the whole (stripped) text directly.
    2. Strip a leading Markdown code fence and parse what is left.
    3. Parse the span from the first ``{`` to the last ``}``.
    4. Parse the span from the first ``[`` to the last ``]``; a list is
       wrapped as ``{"plan": <list>}``.
    5. Scan for the first brace-balanced ``{...}`` span that parses.
    6. Last resort: wrap the raw text as a single "Research Report" section.

    An earlier "plan aliases" step re-parsed the same text that had already
    failed to produce a dict, so it could never match; it has been dropped.
    Alias keys are handled by ``extract_research_plan``.

    Args:
        text: Raw model output.

    Returns:
        ``None`` when *text* is empty or whitespace only, otherwise a dict.
        Because of the last-resort wrapper, any non-empty input yields a dict.
    """
    if not text or not text.strip():
        return None
    text = text.strip()

    # Layer 1: direct parse. Only a dict is accepted here; a top-level array
    # or scalar falls through to the later layers.
    result = _try_json(text)
    if isinstance(result, dict):
        return result

    # Layer 2: strip Markdown code fences. The stripped text replaces `text`
    # for all following layers, including the last-resort wrapper.
    if text.startswith("```"):
        header, newline, body = text.partition("\n")
        if not newline:
            # Single-line fence such as ```{"a": 1}``` has no header line to
            # drop; remove just the opening backticks instead of failing.
            body = header[3:]
        text = body.rsplit("```", 1)[0].strip()
        result = _try_json(text)
        if isinstance(result, dict):
            return result

    # Layer 3: first "{" to last "}" (the main JSON object).
    start = text.find("{")
    end = text.rfind("}") + 1
    if start >= 0 and end > start:
        result = _try_json(text[start:end])
        if isinstance(result, dict):
            return result

    # Layer 4: first "[" to last "]"; a bare array is treated as the plan.
    start = text.find("[")
    end = text.rfind("]") + 1
    if start >= 0 and end > start:
        result = _try_json(text[start:end])
        if isinstance(result, list):
            return {"plan": result}

    # Layer 5: extract JSON from a monologue by scanning for brace-balanced
    # spans. The scan is purely character based, so braces inside JSON string
    # literals can still confuse it.
    depth = 0
    json_start = -1
    for i, c in enumerate(text):
        if c == "{":
            if depth == 0:
                json_start = i
            depth += 1
        elif c == "}" and depth > 0:
            # A stray "}" at depth 0 is ignored; letting depth go negative
            # would prevent every later object from being recognised.
            depth -= 1
            if depth == 0:
                result = _try_json(text[json_start:i + 1])
                if isinstance(result, dict):
                    return result
                json_start = -1

    # Layer 6: last resort - wrap the text as a single section.
    return {
        "plan": [{"section": "Research Report", "content": text[:5000]}],
        "summary": text[:500],
    }
def clean_agent_content(text: str) -> str:
    """Strip reasoning tags, chatty lead-ins and repeated lines from model output.

    Processing order:
      1. ``<think>...</think>`` blocks are deleted (two passes).
      2. Each known lead-in phrase is removed if the text starts with it,
         together with any colons and whitespace that follow.
      3. Any non-blank line that repeats an earlier line (compared stripped
         and case-insensitively) is dropped.

    Returns an empty string for falsy input; otherwise the cleaned text with
    surrounding whitespace removed.
    """
    if not text:
        return ""

    # Two passes: removing one block can splice a new <think>...</think> pair
    # together out of the surrounding fragments.
    think_block = re.compile(r'<think>.*?</think>', re.DOTALL)
    for _ in range(2):
        text = think_block.sub('', text)

    # Lead-ins are checked in this order against the progressively trimmed
    # text, so one removal can expose another lead-in further down the list.
    lead_ins = (
        "Here is", "Here's", "Below is", "The following", "I'll",
        "Let me", "Sure", "Okay", "Alright", "Certainly",
        "Claro", "Aquí está", "A continuación", "Voy a",
    )
    for lead_in in lead_ins:
        if text.startswith(lead_in):
            remainder = text[len(lead_in):]
            text = remainder.lstrip(":").lstrip()

    # Loop removal: keep the first occurrence of every non-blank line.
    kept = []
    seen_keys = set()
    for raw_line in text.split("\n"):
        key = raw_line.strip().lower()
        is_repeat = bool(key) and key in seen_keys
        if not is_repeat:
            seen_keys.add(key)
            kept.append(raw_line)

    return "\n".join(kept).strip()
def strip_latex(text: str) -> str:
    """Remove LaTeX commands and math from text.

    Removed, in this order:
      1. ``\\command{arg}`` including its argument (one or more leading
         backslashes, so JSON-escaped ``\\\\command{arg}`` is covered too).
      2. Bare ``\\command`` tokens.
      3. Display math ``$$...$$`` (may span lines).
      4. Inline math ``$...$``.

    Display math is stripped before inline math: the inline pattern would
    otherwise consume the inner ``$...$`` of ``$$...$$`` and leave a stray
    ``$$`` behind. Nested braces inside a command argument are not balanced;
    the argument ends at the first ``}``.

    Args:
        text: Input text; empty or ``None`` yields ``""``.

    Returns:
        The text with LaTeX removed and surrounding whitespace stripped.
    """
    if not text:
        return ""
    text = re.sub(r'\\+[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{arg}
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove \command
    text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)  # Remove display math
    text = re.sub(r'\$[^$]+\$', '', text)  # Remove inline math
    return text.strip()
def sanitize_latex(text: str) -> str:
    """Backslash-escape the characters ``& % $ # _ { }`` for LaTeX output.

    All other characters, including existing backslashes, pass through
    unchanged.
    """
    # None of the replacements introduces a character that is itself escaped,
    # so a single translation pass gives the same result as chained replaces.
    escape_table = str.maketrans({
        '&': '\\&',
        '%': '\\%',
        '$': '\\$',
        '#': '\\#',
        '_': '\\_',
        '{': '\\{',
        '}': '\\}',
    })
    return text.translate(escape_table)
def normalize_boolean(val: Any) -> Any:
    """Map boolean-like strings to ``True`` / ``False``.

    Non-string values are returned untouched. Strings are stripped and
    lowercased first; ``"true"``, ``"yes"``, ``"1"``, ``"on"`` become ``True``
    and ``"false"``, ``"no"``, ``"0"``, ``"off"``, ``""`` become ``False``.
    Any other string is returned in its stripped, lowercased form.
    """
    if not isinstance(val, str):
        return val

    token = val.strip().lower()
    if token in ("true", "yes", "1", "on"):
        return True
    if token in ("false", "no", "0", "off", ""):
        return False
    return token
# Built-in English and Spanish stop words used when the caller supplies none.
_DEFAULT_STOP_WORDS = frozenset({
    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
    "el", "la", "los", "las", "un", "una", "y", "o", "pero", "en", "de",
    "del", "con", "por", "para", "como", "que", "se", "su", "al",
})


def clean_stop_words(text: str, stop_words: Optional[list] = None) -> str:
    """Remove stop words from text.

    The text is split on whitespace and each token is compared
    case-insensitively against the stop-word list. Punctuation attached to a
    token is part of the token, so ``"the,"`` is not treated as ``"the"``.

    Args:
        text: Input text.
        stop_words: Custom stop words. ``None`` or an empty list selects the
            built-in English/Spanish defaults. Entries are lowercased before
            matching, so ``["The"]`` filters ``the``, ``The`` and ``THE``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words:
        # Tokens are lowercased before lookup, so the stop words must be
        # lowercased as well or mixed-case entries could never match.
        blocked = {w.lower() for w in stop_words if isinstance(w, str)}
    else:
        blocked = _DEFAULT_STOP_WORDS
    return " ".join(tok for tok in text.split() if tok.lower() not in blocked)
async def with_retry(func, retries: int = 2, delay: float = 1.0, backoff: float = 2.0):
    """Await ``func()`` with retries and exponential backoff.

    Args:
        func: Zero-argument callable returning an awaitable; it is called
            afresh for every attempt.
        retries: Number of additional attempts after the first one. Negative
            values are treated as 0, so ``func`` is always tried once.
        delay: Seconds to wait before the first retry.
        backoff: Multiplier applied to the wait after each failed attempt
            (wait before retry ``k`` is ``delay * backoff ** k``, k from 0).

    Returns:
        Whatever the first successful ``await func()`` returns.

    Raises:
        Exception: The exception from the final attempt, re-raised with its
            original traceback. Only ``Exception`` subclasses trigger a
            retry; anything else propagates immediately.
    """
    # Clamp so a negative `retries` still makes one attempt instead of
    # falling out of the loop with nothing to raise.
    total_attempts = max(retries, 0) + 1
    for attempt in range(total_attempts):
        try:
            return await func()
        except Exception:
            if attempt == total_attempts - 1:
                raise
            await asyncio.sleep(delay * (backoff ** attempt))
def extract_research_plan(text: str) -> dict:
    """Extract a research plan from various response formats.

    The text is parsed with ``robust_json_parse``. A parsed dict that already
    has a ``"plan"`` key is returned unchanged. Otherwise the first alias key
    (``sections``, ``structure``, ``outline``, ``document``, ``research``,
    ``chapters``, ``content``) whose value is a list is promoted to
    ``{"plan": <list>, "summary": <parsed summary or "">}``. If nothing
    matches, the raw text is wrapped as a single "Research Report" section.

    Args:
        text: Raw model response; ``None`` is treated as an empty string.

    Returns:
        A dict that always contains a ``"plan"`` key.
    """
    # A missing response must not crash the fallback slicing below.
    text = text or ""

    # Try JSON first
    parsed = robust_json_parse(text)

    # Only a dict can carry a plan; guarding on the type also keeps a
    # non-dict parse result from being returned as the plan.
    if isinstance(parsed, dict):
        if "plan" in parsed:
            return parsed
        # Try to find plan-like content under an alias key
        for key in ["sections", "structure", "outline", "document", "research", "chapters", "content"]:
            val = parsed.get(key)
            if isinstance(val, list):
                return {"plan": val, "summary": parsed.get("summary", "")}

    # Fallback: wrap text as single section
    return {
        "plan": [{"section": "Research Report", "content": text[:5000]}],
        "summary": text[:500],
    }
def is_plan_weak(plan: dict) -> bool:
    """Check if a plan needs a retry (too few sections or short names).

    A plan is weak when any of the following holds:
      * ``plan`` is not a dict, or ``plan["plan"]`` is missing or not a list;
      * it has fewer than two items;
      * any item is not a dict;
      * any item's ``"section"`` is missing, not a string, or shorter than
        five characters after stripping whitespace.

    Args:
        plan: Parsed plan, expected in the form ``{"plan": [{"section": ...}, ...]}``.

    Returns:
        ``True`` if the plan should be regenerated, ``False`` otherwise.
    """
    if not isinstance(plan, dict):
        return True
    items = plan.get("plan", [])
    if not isinstance(items, list) or len(items) < 2:
        return True
    for item in items:
        if not isinstance(item, dict):
            return True
        section = item.get("section", "")
        # Model output is untyped JSON: a numeric or null section name counts
        # as weak instead of raising, and padding does not count as length.
        if not isinstance(section, str) or len(section.strip()) < 5:
            return True
    return False