# beta-NORM/scripts/normalize_md_titles.py
# GitHub Actions
# Snapshot from GitHub master for HF Space
# 6f54a86
import os
import glob
# Directory (relative to the working directory) whose top-level *.md files
# are processed by main().
DOCS_DIR = "Docs"
# Only this many leading lines of each file are examined when looking for an
# existing title line in normalize_md_file().
MAX_SCAN_LINES = 30
def guess_title_from_filename(filename: str) -> str:
    """Build a readable title from a file name.

    The directory part and the extension are dropped, underscores and
    hyphens become spaces, and a name written entirely in upper case is
    converted to title case. Surrounding whitespace is removed.

    Args:
        filename: File name or path to derive the title from.

    Returns:
        The derived title (may be empty if the stem contains only
        separators).
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    spaced = " ".join(stem.split("_"))
    spaced = " ".join(spaced.split("-"))
    # Only shouting names (e.g. "README") are re-cased; mixed-case names
    # are assumed to be intentional and kept as they are.
    readable = spaced.title() if spaced.isupper() else spaced
    return readable.strip()
def normalize_text_for_match(text: str) -> str:
    """Reduce text to a canonical form for approximate comparison.

    Underscores, hyphens, parentheses, commas and periods are treated as
    word separators, runs of whitespace collapse to a single space, and the
    result is lower-cased.

    Args:
        text: Arbitrary text (a title candidate or a file-name title).

    Returns:
        The lower-case, single-spaced text with the separator symbols
        removed; an empty string when nothing but separators remains.
    """
    # One translation pass maps every separator symbol to a space.
    separators_to_space = str.maketrans({symbol: " " for symbol in "_-(),."})
    spaced = text.translate(separators_to_space)
    words = spaced.split()
    return " ".join(words).lower()
def looks_like_title(line: str) -> bool:
    """Heuristically decide whether a line could be a document title.

    A line qualifies when, after trimming whitespace, it is non-empty,
    contains no ``http://`` / ``https://`` URL, is at most 200 characters
    long, and no more than 40% of its characters are digits.

    Args:
        line: The candidate line (heading markers already removed by the
            caller, if any).

    Returns:
        True if the line passes every check, False otherwise.
    """
    text = line.strip()
    if not text:
        return False

    # Lines carrying an obvious URL or that are very long are not titles.
    contains_url = any(scheme in text for scheme in ("http://", "https://"))
    if contains_url or len(text) > 200:
        return False

    # Reject lines dominated by digits (dates, IDs, numeric tables, ...).
    digit_count = len([ch for ch in text if ch.isdigit()])
    return digit_count <= len(text) * 0.4
def normalize_md_file(path: str) -> None:
    """Ensure the Markdown file at ``path`` has a normalized ``# `` title.

    The first ``MAX_SCAN_LINES`` lines are scored by the fraction of the
    filename-derived title's words that they contain; lines that start with
    ``#`` receive a 0.1 bonus. If the best line scores at least 0.3 it is
    rewritten in place as a level-1 heading (underscores and hyphens turned
    into spaces, all-upper-case text title-cased). Otherwise a heading built
    from the file name is inserted before the first non-blank line.

    The operation is idempotent: a fallback heading that is already present
    as the first non-blank line is not inserted again, and the file is not
    rewritten when the chosen line already equals the normalized heading.

    Args:
        path: Path of the UTF-8 encoded Markdown file to update in place.
            An empty file is left untouched.
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if not lines:
        return

    filename_title = guess_title_from_filename(path)
    # The filename's word set is identical for every candidate line, so it
    # is computed once instead of on each loop iteration.
    fname_words = set(normalize_text_for_match(filename_title).split())

    best_idx = None
    best_title = None
    best_score = 0.0

    for i, line in enumerate(lines[:MAX_SCAN_LINES]):
        stripped = line.strip()
        if not stripped:
            continue

        is_heading = stripped.startswith("#")
        candidate = stripped.lstrip("#").strip() if is_heading else stripped
        if not looks_like_title(candidate):
            continue

        cand_words = set(normalize_text_for_match(candidate).split())
        if not cand_words:
            continue

        # Share of the filename's words that appear in the candidate line.
        if fname_words:
            score = len(fname_words & cand_words) / len(fname_words)
        else:
            score = 0.0
        # Existing headings get a small preference over plain text lines.
        if is_heading:
            score += 0.1

        # Strict comparison: on ties the earliest line wins.
        if score > best_score:
            best_score = score
            best_idx = i
            best_title = candidate

    # Threshold: without a reasonable match, fall back to the file name.
    if best_title is None or best_score < 0.3:
        new_title = f"# {filename_title}\n"
        insert_idx = next(
            (i for i, line in enumerate(lines) if line.strip()), 0
        )
        # A filename title that looks_like_title() rejects (e.g. a mostly
        # numeric name such as a date) never scores above the threshold, so
        # without this guard every run would prepend another copy of it.
        if lines[insert_idx].strip() == new_title.strip():
            return
        # Heading plus one blank line, inserted as a single list element.
        lines.insert(insert_idx, new_title + "\n")
    else:
        raw_norm = best_title.replace("_", " ").replace("-", " ")
        if raw_norm.isupper():
            raw_norm = raw_norm.title()
        new_title = f"# {raw_norm}\n"
        # Already normalized: avoid rewriting an unchanged file.
        if lines[best_idx] == new_title:
            return
        lines[best_idx] = new_title

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)
def main():
    """Normalize the title of every ``*.md`` file directly inside DOCS_DIR.

    Progress is reported on standard output. A failure while processing one
    file is printed and does not stop the remaining files from being
    processed.
    """
    md_files = glob.glob(os.path.join(DOCS_DIR, "*.md"))
    total = len(md_files)
    print(f"Encontrados {total} arquivos .md em {DOCS_DIR}")

    for position, md_path in enumerate(md_files, start=1):
        file_name = os.path.basename(md_path)
        print(f"[{position}/{total}] Normalizando título de: {file_name}")
        try:
            normalize_md_file(md_path)
        except Exception as exc:
            # Best-effort batch run: report the problem and keep going.
            print(f" -> Erro ao processar {md_path}: {exc}")

    print("Normalização de títulos concluída.")


# Run the batch normalization only when executed as a script.
if __name__ == "__main__":
    main()