Spaces:

Nihal2000
/

CarAssistanceQA

Sleeping

App Files Files Community

CarAssistanceQA / src /utils /text_cleaning.py

Nihal2000

inferencing using gemma 270 model

f05e8f9 3 months ago

raw

history blame contribute delete

4.41 kB

	# src/utils/text_cleaning.py
	from __future__ import annotations
	import re
	import unicodedata
	from typing import List, Dict, Tuple, Optional

	# Control chars except \n and \t
	_CONTROL_CHARS = ''.join(map(chr, list(range(0,9)) + [11,12] + list(range(14,32)) + [127]))
	_CTRL_RE = re.compile(f'[{re.escape(_CONTROL_CHARS)}]')

	# Bullet glyphs and long dashes, each mapped to an ASCII hyphen.
	BULLET_MAP = {
	    '•': '-', '◦': '-', '·': '-', '●': '-',
	    # NOTE(review): this key showed up as an empty string. An empty key is
	    # destructive (str.replace('', '-') inserts '-' between every
	    # character), so it is spelled as an explicit escape instead. U+F0B7
	    # (the private-use bullet commonly produced by Symbol-font PDFs) is
	    # presumably the glyph that was intended - confirm against the
	    # documents being cleaned.
	    '\uf0b7': '-',
	    '*': '-',
	    '–': '-', '—': '-',  # normalize dashes
	}
	# Curly / low quotation marks mapped to their straight ASCII equivalents.
	QUOTE_MAP = {
	    '“': '"', '”': '"', '‟': '"', '„': '"',
	    '’': "'", '‘': "'", '‚': "'",
	}
	# Typographic ligatures expanded to plain letter sequences.
	LIG_MAP = {
	    'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'st', 'ﬆ': 'st',
	}

	def unicode_normalize(text: str) -> str:
	    """Return *text* in Unicode NFC form; falsy input (e.g. ``''`` or ``None``) yields ``''``."""
	    return unicodedata.normalize('NFC', text) if text else ''

	def remove_control_chars(text: str) -> str:
	    """Replace every character matched by ``_CTRL_RE`` with a single space."""
	    cleaned = _CTRL_RE.sub(' ', text)
	    return cleaned

	def normalize_quotes_dashes_ligatures(text: str) -> str:
	    """Apply the QUOTE_MAP, BULLET_MAP and LIG_MAP substitutions to *text*.

	    The maps are applied in that order, and every occurrence of a key is
	    replaced wherever it appears in the text (not only at line starts).

	    Args:
	        text: The string to normalize.

	    Returns:
	        The text with all mapped characters replaced.
	    """
	    for mapping in (QUOTE_MAP, BULLET_MAP, LIG_MAP):
	        for src, dst in mapping.items():
	            # An empty key must never reach str.replace: ''.replace-style
	            # matching inserts the replacement between every character.
	            if src:
	                text = text.replace(src, dst)
	    return text

	def normalize_spaces(text: str) -> str:
	    """Normalize horizontal and vertical whitespace in *text*.

	    Steps, in order:
	      1. every character in a Unicode separator category (Z*) becomes ' ';
	      2. runs of spaces/tabs collapse to one space;
	      3. spaces/tabs directly before a newline are removed;
	      4. runs of three or more newlines collapse to two;
	      5. leading/trailing whitespace of the whole text is stripped.

	    Args:
	        text: The string to normalize.

	    Returns:
	        The whitespace-normalized text.
	    """
	    # Map all unicode separators to a normal space
	    text = ''.join(' ' if unicodedata.category(ch).startswith('Z') else ch for ch in text)
	    # Collapse multiple spaces/tabs
	    text = re.sub(r'[ \t]+', ' ', text)
	    # Strip trailing blanks BEFORE collapsing newlines: lines holding only
	    # blanks would otherwise hide a run of newlines from the next step.
	    text = re.sub(r'[ \t]+\n', '\n', text)
	    # Collapse 3+ newlines to 2
	    text = re.sub(r'\n{3,}', '\n\n', text)
	    return text.strip()

	def normalize_bullets_and_lists(text: str) -> str:
	    """Normalize the leading marker of bullet and numbered-list lines.

	    * A line starting with optional blanks, a hyphen and at least one blank
	      is rewritten to start with exactly ``"- "``.
	    * A line starting with ``<digits>.`` gets exactly one space after the
	      dot, unless the dot is followed by a digit (so ``1.5`` is untouched).

	    Only same-line whitespace is consumed; newlines are never removed.

	    Args:
	        text: Possibly multi-line text.

	    Returns:
	        The text with list markers normalized.
	    """
	    # [^\S\r\n] = any whitespace except line breaks, so a match can never
	    # swallow a blank line or join two lines together.
	    text = re.sub(r'(?m)^[^\S\r\n]*-[^\S\r\n]+', '- ', text)
	    # The (?!\d) guard keeps decimals such as "1.5 liters" intact.
	    text = re.sub(r'(?m)^(\d+)\.(?!\d)[^\S\r\n]*', r'\1. ', text)
	    return text

	def basic_clean(text: str) -> str:
	    """Run *text* through the module's cleaning steps and return the result.

	    Order: Unicode normalization, control-character removal, quote/dash/
	    ligature mapping, bullet/list normalization, whitespace normalization.
	    """
	    pipeline = (
	        unicode_normalize,
	        remove_control_chars,
	        normalize_quotes_dashes_ligatures,
	        normalize_bullets_and_lists,
	        normalize_spaces,
	    )
	    cleaned = text
	    for step in pipeline:
	        cleaned = step(cleaned)
	    return cleaned

	def learn_header_footer_patterns(pages: List[str], sample: int = 8) -> Tuple[Optional[re.Pattern], Optional[re.Pattern]]:
	    """Learn regexes for a repeated page header and footer.

	    Heuristic: among the first *sample* pages, count how often each first
	    and each last non-blank line (stripped) occurs. If the most frequent
	    one occurs at least ``max(3, sample // 2)`` times, build a multiline
	    regex that matches that line on its own, optionally followed by a page
	    number or page range (e.g. ``12`` or ``3-4``).

	    Args:
	        pages: Page texts, one string per page.
	        sample: How many leading pages to inspect.

	    Returns:
	        ``(header_pattern, footer_pattern)``; either element is ``None``
	        when no line repeats often enough.
	    """
	    first_lines: Dict[str, int] = {}
	    last_lines: Dict[str, int] = {}
	    for page in pages[:sample]:
	        lines = [ln.strip() for ln in page.splitlines() if ln.strip()]
	        if not lines:
	            continue
	        # Key on the line itself (a str); the list is unhashable.
	        first_lines[lines[0]] = first_lines.get(lines[0], 0) + 1
	        last_lines[lines[-1]] = last_lines.get(lines[-1], 0) + 1

	    def build(d: Dict[str, int]) -> Optional[re.Pattern]:
	        """Return a whole-line pattern for the most frequent key, or None."""
	        if not d:
	            return None
	        top, cnt = max(d.items(), key=lambda kv: kv[1])
	        if cnt < max(3, sample // 2):
	            return None
	        esc = re.escape(top)
	        # [^\S\n] is "whitespace but not newline": indentation and trailing
	        # blanks are tolerated without the match spilling onto other lines.
	        return re.compile(
	            rf'(?m)^[^\S\n]*{esc}([^\S\n]+\d+(?:[-–]\d+)?)?[^\S\n]*$'
	        )

	    return build(first_lines), build(last_lines)

	def strip_headers_footers(text: str, head_re: Optional[re.Pattern], foot_re: Optional[re.Pattern]) -> str:
	    """Delete header/footer matches from *text* and tidy the leftover blank lines.

	    Each supplied pattern (header first, then footer) has its matches
	    replaced by the empty string; a missing pattern is skipped. Runs of
	    three or more newlines are then collapsed to two and the result is
	    stripped.
	    """
	    for pattern in (head_re, foot_re):
	        if pattern:
	            text = pattern.sub('', text)
	    collapsed = re.sub(r'\n{3,}', '\n\n', text)
	    return collapsed.strip()

	def looks_like_toc_or_schedule(text: str, section_title: str = '') -> bool:
	    """Guess whether a section is a table of contents, index or schedule.

	    Returns True when the lower-cased title contains one of the known
	    keywords, or when enough non-blank lines end in a dotted leader
	    followed by a page number: at least 4, and at least a third of the
	    non-blank lines.
	    """
	    lowered_title = (section_title or '').lower()
	    keywords = ['table of contents','contents','maintenance schedule','scheduled maintenance','index']
	    for keyword in keywords:
	        if keyword in lowered_title:
	            return True

	    # Count non-blank lines and those ending in "... <page>" leaders
	    leader_pattern = r'\.{3,}\s*\d{1,4}([.-]\d{1,4})?$'
	    total = 0
	    dotted = 0
	    for raw in text.splitlines():
	        line = raw.strip()
	        if not line:
	            continue
	        total += 1
	        if re.search(leader_pattern, line):
	            dotted += 1
	    return dotted >= max(4, total // 3)

	def bullet_density(text: str) -> float:
	    """Return the fraction of lines in *text* that look like list items.

	    A line counts when, after stripping, it starts with ``-`` or
	    ``<digits>.`` followed by whitespace. Blank lines count toward the
	    total, so they lower the density.

	    Args:
	        text: Possibly multi-line text.

	    Returns:
	        A value in ``[0.0, 1.0]``; ``0.0`` for empty text.
	    """
	    lines = [ln.strip() for ln in text.splitlines()]
	    if not lines:
	        return 0.0
	    # Alternation between a hyphen bullet and a numbered item. The pipe must
	    # be unescaped: r'\|' would demand a literal '|' character.
	    marker = re.compile(r'^(-|\d+\.)\s+')
	    bullets = sum(1 for ln in lines if marker.match(ln))
	    return bullets / len(lines)

	def truncate_nicely(text: str, max_len: int = 600) -> str:
	    """Shorten *text* to roughly *max_len* characters at a natural break.

	    Text that already fits is returned unchanged. Otherwise the first
	    *max_len* characters are cut back to:

	    * the last sentence terminator (``.``, ``!`` or ``?``) if it lies past
	      60% of *max_len*; else
	    * the last newline or space, but never earlier than 90% of *max_len*.

	    ``'...'`` is appended to whatever is kept, so the result can be up to
	    three characters longer than *max_len*.

	    Args:
	        text: The string to shorten.
	        max_len: Maximum number of characters kept before the ellipsis.

	    Returns:
	        The original text, or the truncated text followed by ``'...'``.
	    """
	    if len(text) <= max_len:
	        return text
	    cut = text[:max_len]
	    m = re.search(r'([.!?])[^.!?]*$', cut)
	    # end(1) is the position just after the terminator itself; the whole
	    # match is anchored at '$', so m.end() would always be len(cut).
	    sentence_end = m.end(1) if m else -1
	    if sentence_end > int(max_len * 0.6):
	        end = sentence_end
	    else:
	        end = max(cut.rfind('\n'), cut.rfind(' '), int(max_len * 0.9))
	    return cut[:end].rstrip() + '...'