Spaces:

ICA-PUC
/

beta-NORM

Sleeping

beta-NORM / scripts / normalize_md_titles.py

GitHub Actions

Snapshot from GitHub master for HF Space

6f54a86 23 days ago

3.67 kB

	import os
	import glob

	# Directory that main() searches (non-recursively) for "*.md" files.
	DOCS_DIR = "Docs"
	# normalize_md_file() only inspects this many leading lines of each file
	# when looking for a line to use as the title.
	MAX_SCAN_LINES = 30

	def guess_title_from_filename(filename: str) -> str:
	    """Build a human-readable title from a file name.

	    The directory part and the extension are dropped, underscores and
	    hyphens become spaces, and runs of whitespace are collapsed to a single
	    space. A name written entirely in upper case is converted to title case.

	    Args:
	        filename: File name or path, e.g. ``"Docs/MY_GUIDE.md"``.

	    Returns:
	        The derived title, without leading or trailing whitespace.
	    """
	    base = os.path.splitext(os.path.basename(filename))[0]

	    # split()/join collapses the double spaces produced by names such as
	    # "a__b" or "a_-_b" and also trims both ends.
	    title = " ".join(base.replace("_", " ").replace("-", " ").split())

	    if title.isupper():
	        title = title.title()
	    return title


	def normalize_text_for_match(text: str) -> str:
	    """Normalize text for approximate comparison.

	    Underscores, hyphens, parentheses, commas and periods are replaced by
	    spaces, whitespace runs are collapsed to one space, and the result is
	    lower-cased.

	    Args:
	        text: Arbitrary text (a title candidate or a file-name title).

	    Returns:
	        The normalized, lower-case string; empty if nothing remains.
	    """
	    separators = "_-(),."
	    # One translate() pass instead of six chained replace() calls.
	    table = str.maketrans(separators, " " * len(separators))
	    return " ".join(text.translate(table).split()).lower()


	def looks_like_title(
	    line: str,
	    *,
	    max_len: int = 200,
	    max_digit_ratio: float = 0.4,
	) -> bool:
	    """Heuristically decide whether a line could be a document title.

	    A line is rejected when, after stripping surrounding whitespace, it is
	    empty, contains ``http://`` or ``https://``, is longer than ``max_len``
	    characters, or has more than ``max_digit_ratio`` of its characters as
	    digits.

	    Args:
	        line: The text to examine.
	        max_len: Maximum accepted length of the stripped line.
	        max_digit_ratio: Maximum accepted fraction of digit characters.

	    Returns:
	        True if the line passes every check, False otherwise.
	    """
	    txt = line.strip()
	    if not txt:
	        return False

	    # Lines carrying an explicit URL are treated as links, not titles.
	    if "http://" in txt or "https://" in txt:
	        return False

	    if len(txt) > max_len:
	        return False

	    digits = sum(c.isdigit() for c in txt)
	    return digits <= len(txt) * max_digit_ratio

	def normalize_md_file(path: str) -> None:
	    """Ensure a Markdown file has a normalized level-1 title, in place.

	    The first ``MAX_SCAN_LINES`` lines are scanned for the title-like line
	    whose words overlap most with the title derived from the file name;
	    heading lines (starting with ``#``) get a small bonus. If the best
	    score reaches 0.3, that line is replaced by ``# <title>`` (underscores
	    and hyphens turned into spaces, all-caps text title-cased; a heading
	    of any level becomes level 1). Otherwise a ``# <file-name title>``
	    line followed by a blank line is inserted before the first non-blank
	    line. Empty files are left untouched; any other file is rewritten.

	    Args:
	        path: Path of the UTF-8 encoded Markdown file to rewrite.
	    """
	    with open(path, "r", encoding="utf-8") as f:
	        lines = f.readlines()

	    if not lines:
	        return

	    filename_title = guess_title_from_filename(path)
	    # The file-name word set does not change per line, so build it once.
	    fname_words = set(normalize_text_for_match(filename_title).split())

	    best_idx = None
	    best_title = None
	    best_score = 0.0

	    for i, line in enumerate(lines[:MAX_SCAN_LINES]):
	        stripped = line.strip()
	        if not stripped:
	            continue

	        is_heading = stripped.startswith("#")
	        candidate = stripped.lstrip("#").strip() if is_heading else stripped

	        if not looks_like_title(candidate):
	            continue

	        cand_words = set(normalize_text_for_match(candidate).split())
	        if not cand_words:
	            continue

	        # Fraction of the file-name words that also occur in the candidate.
	        if fname_words:
	            score = len(fname_words & cand_words) / len(fname_words)
	        else:
	            score = 0.0

	        # Existing headings are slightly preferred over plain text lines.
	        if is_heading:
	            score += 0.1

	        # Strict comparison: on ties the earliest line wins.
	        if score > best_score:
	            best_score = score
	            best_idx = i
	            best_title = candidate

	    # Threshold: without a reasonable match, fall back to the file name.
	    if best_title is None or best_score < 0.3:
	        # Insert before the first non-blank line (index 0 if all are blank).
	        insert_idx = next(
	            (i for i, line in enumerate(lines) if line.strip()),
	            0,
	        )
	        lines.insert(insert_idx, f"# {filename_title}\n" + "\n")
	    else:
	        raw_norm = best_title.replace("_", " ").replace("-", " ")
	        if raw_norm.isupper():
	            raw_norm = raw_norm.title()
	        lines[best_idx] = f"# {raw_norm}\n"

	    with open(path, "w", encoding="utf-8") as f:
	        f.writelines(lines)

	def main():
	    """Normalize the title of every ``*.md`` file directly inside DOCS_DIR.

	    Progress is printed per file. A failure on one file is reported and
	    does not stop the processing of the remaining files.
	    """
	    pattern = os.path.join(DOCS_DIR, "*.md")
	    # glob returns matches in arbitrary order; sort so that runs (and the
	    # [i/N] progress output) are reproducible.
	    files = sorted(glob.glob(pattern))
	    print(f"Encontrados {len(files)} arquivos .md em {DOCS_DIR}")

	    for i, path in enumerate(files, start=1):
	        print(f"[{i}/{len(files)}] Normalizando título de: {os.path.basename(path)}")
	        try:
	            normalize_md_file(path)
	        except Exception as e:
	            # Top-level boundary: report the error and continue with the
	            # next file instead of aborting the whole batch.
	            print(f" -> Erro ao processar {path}: {e}")

	    print("Normalização de títulos concluída.")

	# Run the batch normalization only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()