Spaces:

asoni9
/

canHeal

Sleeping

canHeal / preprocess.py

Anirudha Soni

Basic changes

597fb81 4 months ago

1.37 kB

	import json
	import re
	from pathlib import Path

	# Directory that load_json() reads its input files from. The path is
	# relative, so it is resolved against the process's current working directory.
	DATA_DIR = Path("data")

	def load_json(filename):
	    """Load a JSON file from ``DATA_DIR`` and return its records as a list.

	    If the top-level JSON value is an object with a ``"results"`` key, the
	    value under that key is used as the payload; otherwise the top-level
	    value itself is the payload.

	    Args:
	        filename: Name of the file, relative to ``DATA_DIR``.

	    Returns:
	        The payload when it is a list, otherwise an empty list. The result
	        is therefore always a list, even when ``"results"`` holds ``null``
	        or some other non-list value.

	    Raises:
	        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    with open(DATA_DIR / filename, "r", encoding="utf-8") as f:
	        data = json.load(f)
	    # Unwrap the {"results": [...]} envelope first so that the single
	    # list check below also covers the wrapped payload.
	    if isinstance(data, dict) and "results" in data:
	        data = data["results"]
	    return data if isinstance(data, list) else []

	def extract_text(item):
	    """Collect the text-bearing values of one JSON record.

	    The keys ``text``, ``description``, ``body``, ``content`` and ``name``
	    are checked in that order; every truthy value is converted with
	    ``str()`` and kept. If ``content_json`` is a dict, each of its values
	    that is a non-blank string is appended afterwards (nested containers
	    and non-string values inside it are ignored).

	    Args:
	        item: A decoded JSON record. Anything that is not a dict yields an
	            empty result.

	    Returns:
	        A list of strings, possibly empty.
	    """
	    # A decoded JSON list may hold strings, numbers or null. Without this
	    # guard a str record would turn ``k in item`` into a substring test
	    # and ``item[k]`` into a TypeError.
	    if not isinstance(item, dict):
	        return []
	    texts = [
	        str(item[k])
	        for k in ("text", "description", "body", "content", "name")
	        if item.get(k)
	    ]
	    nested = item.get("content_json")
	    if isinstance(nested, dict):
	        texts.extend(
	            v for v in nested.values() if isinstance(v, str) and v.strip()
	        )
	    return texts

	def chunk_text(text, max_words=80, min_words=5):
	    """Split text into chunks of at most ``max_words`` words.

	    The text is cut into sentences wherever ``.``, ``!`` or ``?`` is
	    followed by one or more spaces. Sentences with fewer than
	    ``min_words`` words are discarded. The remaining sentences are packed
	    greedily, in order, into chunks joined by a single space.

	    A single sentence longer than ``max_words`` is split into consecutive
	    windows of ``max_words`` words so the limit holds for every chunk;
	    whitespace inside such a sentence is normalised to single spaces.

	    Args:
	        text: The text to split. Empty or ``None`` input yields ``[]``.
	        max_words: Upper bound on the number of words per chunk. The
	            bound is only enforced on oversized sentences when it is
	            positive.
	        min_words: Sentences with fewer words than this are dropped.

	    Returns:
	        A list of chunk strings, possibly empty.
	    """
	    if not text:
	        return []
	    chunks, current, count = [], [], 0
	    for sentence in re.split(r'(?<=[.!?]) +', text):
	        words = sentence.split()
	        if len(words) < min_words:
	            continue
	        if 0 < max_words < len(words):
	            # The sentence alone exceeds the limit: close the chunk being
	            # built (to preserve order) and emit fixed-size word windows.
	            if current:
	                chunks.append(" ".join(current))
	                current, count = [], 0
	            chunks.extend(
	                " ".join(words[i:i + max_words])
	                for i in range(0, len(words), max_words)
	            )
	            continue
	        if current and count + len(words) > max_words:
	            chunks.append(" ".join(current))
	            current, count = [], 0
	        current.append(sentence)
	        count += len(words)
	    if current:
	        chunks.append(" ".join(current))
	    return chunks