Spaces:

mdAmin313
/

news

Sleeping

App Files Files Community

news / _utils.py

mdAmin313

Update _utils.py

5d9234c verified 2 months ago

raw

history blame contribute delete

1.95 kB

	import re
	import json
	import logging
	from typing import Optional, Dict
	from urllib.parse import urlparse

	# Module-level logger shared by the helpers below; it uses a fixed name
	# ("fact_checker_utils") rather than __name__, so logging configuration
	# must target that name.
	logger = logging.getLogger("fact_checker_utils")

	def sanitize_text(text: Optional[str]) -> str:
	    """Strip HTML-like tags from ``text`` and collapse whitespace runs.

	    Anything matching ``<...>`` is replaced by a single space, every run of
	    whitespace is then squeezed to one space, and the result is trimmed.

	    Args:
	        text: The raw text, or ``None``.

	    Returns:
	        The cleaned text; an empty string when ``text`` is ``None`` or empty.
	    """
	    if text:
	        # Tags become a space (not nothing) so adjacent words stay separated.
	        without_tags = re.compile(r"<[^>]+>").sub(" ", text)
	        single_spaced = re.compile(r"\s+").sub(" ", without_tags)
	        return single_spaced.strip()
	    return ""

	def extract_json_from_text(text: str) -> Optional[str]:
	    """Return the first JSON-object-looking substring found in ``text``.

	    Two strategies are tried in order:

	    1. A fenced code block (```json ... ``` or ``` ... ```) whose content
	       is a single ``{...}`` object.
	    2. The first brace-balanced ``{...}`` span anywhere in the text.

	    The result is not validated as JSON; callers are expected to parse it.

	    Args:
	        text: Arbitrary text that may embed a JSON object.

	    Returns:
	        The extracted substring including its outer braces, or ``None``
	        when ``text`` is empty or holds no balanced object.
	    """
	    if not text:
	        return None

	    # 1. Fenced block. The lazy ``.*?`` combined with the trailing fence
	    # makes the capture grow until a ``}`` that is directly followed by the
	    # closing backticks, so nested objects are captured whole.
	    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S | re.I)
	    if fenced:
	        return fenced.group(1)

	    # 2. Brace counting for an unfenced object. Double-quoted strings inside
	    # the object are skipped so that braces within string values (for
	    # example {"a": "}"}) do not unbalance the count.
	    start = None
	    depth = 0
	    in_string = False
	    escaped = False
	    for i, ch in enumerate(text):
	        if in_string:
	            if escaped:
	                # Character after a backslash is literal, even a quote.
	                escaped = False
	            elif ch == "\\":
	                escaped = True
	            elif ch == '"':
	                in_string = False
	            continue
	        if ch == "{":
	            if start is None:
	                start = i
	            depth += 1
	        elif ch == '"' and depth > 0:
	            # Quotes only matter once we are inside a candidate object;
	            # quotes in the surrounding prose are ignored.
	            in_string = True
	        elif ch == "}" and depth > 0:
	            depth -= 1
	            if depth == 0:
	                return text[start:i + 1]
	    return None

	def safe_parse_gemini_json(raw_text: str) -> Optional[dict]:
	    """Pull a JSON object out of a model text response and decode it.

	    The candidate substring is located with ``extract_json_from_text`` and
	    then handed to ``json.loads``.

	    Args:
	        raw_text: The raw text response to search.

	    Returns:
	        The decoded value, or ``None`` when no candidate was found or the
	        candidate could not be parsed (a warning with the first 200
	        characters of the candidate is logged in that case).
	    """
	    candidate = extract_json_from_text(raw_text)
	    if candidate:
	        try:
	            parsed = json.loads(candidate)
	        except Exception:
	            # Best-effort: a malformed payload is reported, never raised.
	            logger.warning("Failed to parse extracted JSON: %s", candidate[:200])
	        else:
	            return parsed
	    return None

	def domain_from_url(url: str) -> str:
	    """Return the lowercased hostname of ``url`` without a leading ``www.``.

	    Args:
	        url: An absolute URL. Without a scheme (``//`` prefix) ``urlparse``
	            finds no hostname, so the result is an empty string.

	    Returns:
	        The hostname in lowercase with one leading ``"www."`` label removed,
	        or ``""`` when the URL has no hostname or cannot be parsed.
	    """
	    try:
	        host = (urlparse(url).hostname or "").lower()
	    except Exception:
	        # Best-effort helper: malformed input (e.g. a bad IPv6 literal)
	        # yields an empty domain instead of propagating.
	        return ""
	    # Remove the literal prefix only. str.lstrip("www.") would strip any
	    # leading run of the characters 'w' and '.', mangling hosts such as
	    # "web.archive.org" or "www.wikipedia.org".
	    if host.startswith("www."):
	        host = host[len("www."):]
	    return host