|
|
import os |
|
|
import glob |
|
|
|
|
|
# Directory that is globbed for ``*.md`` files; as a relative path it is
# resolved against the current working directory.
DOCS_DIR: str = "Docs"

# Upper bound on how many lines of each file are scanned for a title line.
MAX_SCAN_LINES: int = 30
|
|
|
|
|
def guess_title_from_filename(filename: str) -> str:
    """Build a human-readable title from a file name.

    The directory part and the extension are dropped, ``_`` and ``-`` are
    treated as word separators, and an all-uppercase result is converted to
    title case.

    Args:
        filename: File name or path, e.g. ``"Docs/INSTALL_GUIDE.md"``.

    Returns:
        The derived title with single spaces between words and no leading or
        trailing whitespace, e.g. ``"Install Guide"``.
    """
    base = os.path.splitext(os.path.basename(filename))[0]

    # split()/join collapses the runs of spaces produced by adjacent
    # separators ("a__b", "Project - Notes") and trims both ends, so the
    # title never contains doubled spaces.
    title = " ".join(base.replace("_", " ").replace("-", " ").split())

    # Only shouting names (README, INSTALL_GUIDE) are re-cased; mixed-case
    # names are assumed to be intentional and are left untouched.
    if title.isupper():
        title = title.title()

    return title
|
|
|
|
|
|
|
|
def normalize_text_for_match(text: str) -> str:
    """Normalize text for approximate, word-based comparison.

    Every ASCII punctuation character is replaced by a space, runs of
    whitespace are collapsed to a single space, and the result is lowercased.
    Non-ASCII characters (accented letters, ``¿``, ...) are left unchanged.

    Args:
        text: Arbitrary text such as a heading line or a file-name title.

    Returns:
        The normalized text; an empty string when *text* contains no
        characters other than whitespace and ASCII punctuation.
    """
    import string

    # One translate() pass instead of a chain of replace() calls. Blanking
    # all ASCII punctuation (not just "_-(),.") keeps symbols such as ":",
    # "*" or "?" from staying glued to words and defeating word matching.
    blank_punctuation = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    cleaned = text.translate(blank_punctuation)

    return " ".join(cleaned.split()).lower()
|
|
|
|
|
|
|
|
def looks_like_title(line: str) -> bool:
    """Return whether *line* plausibly is a title.

    A line qualifies when, after trimming surrounding whitespace, it is
    non-empty, contains no ``http://`` or ``https://`` URL, is at most 200
    characters long, and no more than 40% of its characters are digits.
    """
    text = line.strip()

    has_url = any(scheme in text for scheme in ("http://", "https://"))
    if not text or has_url or len(text) > 200:
        return False

    # Lines dominated by digits (dates, version tables, ids) are not titles.
    digit_count = len([ch for ch in text if ch.isdigit()])
    return digit_count <= len(text) * 0.4
|
|
|
|
|
def _front_matter_end(lines: list) -> int:
    """Return the index of the first line after a leading YAML front-matter block.

    Front matter is recognized only when the very first line is ``---`` and a
    later line is ``---`` or ``...``; in every other case 0 is returned.
    """
    if not lines or lines[0].rstrip() != "---":
        return 0
    for idx in range(1, len(lines)):
        if lines[idx].rstrip() in ("---", "..."):
            return idx + 1
    return 0


def _is_setext_underline(line: str) -> bool:
    """Return True when *line* consists solely of ``=`` or solely of ``-`` characters."""
    marks = set(line.strip())
    return marks == {"="} or marks == {"-"}


def normalize_md_file(path: str) -> None:
    """Make sure the Markdown file at *path* carries a level-1 title heading.

    Up to ``MAX_SCAN_LINES`` lines (after any leading YAML front matter) are
    scored by the fraction of file-name words they contain, with a 0.1 bonus
    for lines that start with ``#``. If the best line scores at least 0.3 it
    is rewritten as ``# <title>``; otherwise a heading derived from the file
    name is inserted before the first non-blank content line. The file is
    rewritten in place, and only when its content actually changes. An empty
    file is left untouched.

    Args:
        path: Path of the UTF-8 encoded Markdown file to normalize.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if not lines:
        return

    filename_title = guess_title_from_filename(path)
    # Loop-invariant, so it is built once instead of once per scanned line.
    fname_words = set(normalize_text_for_match(filename_title).split())

    # Front matter must stay the first thing in the file and its keys
    # (e.g. "title: ...") must not be mistaken for title lines.
    body_start = _front_matter_end(lines)

    best_idx = None
    best_title = None
    best_is_heading = False
    best_score = 0.0

    scan_window = lines[body_start:body_start + MAX_SCAN_LINES]
    for i, line in enumerate(scan_window, start=body_start):
        stripped = line.strip()
        if not stripped:
            continue

        is_heading = stripped.startswith("#")
        candidate = stripped.lstrip("#").strip() if is_heading else stripped

        if not looks_like_title(candidate):
            continue

        cand_words = set(normalize_text_for_match(candidate).split())
        if not cand_words:
            continue

        # Share of the file-name words that also occur in the candidate.
        if fname_words:
            overlap = len(fname_words & cand_words) / len(fname_words)
        else:
            overlap = 0.0

        # Existing headings get a small bonus over plain text lines.
        score = overlap + (0.1 if is_heading else 0.0)

        # Strict ">" keeps the earliest line on ties and never selects a
        # candidate that scores 0.
        if score > best_score:
            best_score = score
            best_idx = i
            best_title = candidate
            best_is_heading = is_heading

    updated = list(lines)

    if best_title is None or best_score < 0.3:
        # No convincing title in the file: insert one built from the file
        # name before the first non-blank line of the body.
        insert_idx = next(
            (i for i in range(body_start, len(lines)) if lines[i].strip()),
            body_start,
        )
        # Only reachable when inserting at end of file: keep the new heading
        # from being glued to a last line that lacks a newline.
        if insert_idx > 0 and not updated[insert_idx - 1].endswith("\n"):
            updated[insert_idx - 1] += "\n"
        updated.insert(insert_idx, f"# {filename_title}\n\n")
    else:
        # Same clean-up as for file-name titles: separators become spaces,
        # doubled spaces are collapsed, shouting text is title-cased.
        title_text = " ".join(
            best_title.replace("_", " ").replace("-", " ").split()
        )
        if title_text.isupper():
            title_text = title_text.title()
        updated[best_idx] = f"# {title_text}\n"

        # A plain line followed by "====" / "----" was a setext heading; the
        # underline would otherwise be left behind as stray text or a rule.
        next_idx = best_idx + 1
        if (
            not best_is_heading
            and next_idx < len(updated)
            and _is_setext_underline(updated[next_idx])
        ):
            del updated[next_idx]

    # Skip the write when the file already has the normalized title.
    if updated != lines:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(updated)
|
|
|
|
|
def main() -> None:
    """Normalize the title of every ``*.md`` file directly inside ``DOCS_DIR``.

    Files are processed in sorted path order. A failure on one file is
    reported on stdout and does not stop the remaining files from being
    processed.
    """
    pattern = os.path.join(DOCS_DIR, "*.md")

    # glob returns matches in file-system order; sorting makes the
    # processing order and the progress log reproducible between runs.
    files = sorted(glob.glob(pattern))
    total = len(files)
    print(f"Encontrados {total} arquivos .md em {DOCS_DIR}")

    for i, path in enumerate(files, start=1):
        print(f"[{i}/{total}] Normalizando título de: {os.path.basename(path)}")
        try:
            normalize_md_file(path)
        except Exception as e:
            # Deliberately broad: this is the top-level batch boundary and
            # one unreadable file must not abort the whole run.
            print(f" -> Erro ao processar {path}: {e}")

    print("Normalização de títulos concluída.")


if __name__ == "__main__":
    main()