Spaces:

ICA-PUC
/

beta-NORM

Sleeping

File size: 3,674 Bytes

6f54a86

import os
import glob

# Directory searched for "*.md" files; a relative path, so it is resolved
# against the current working directory when the script runs.
DOCS_DIR = "Docs"  
# Only this many leading lines of each file are scanned for a title candidate.
MAX_SCAN_LINES = 30  

def guess_title_from_filename(filename: str) -> str:
    """Build a human-readable title from a file name.

    The directory part and the final extension are dropped, underscores and
    hyphens become spaces, and a result written entirely in uppercase is
    converted to title case. Surrounding whitespace is removed.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    spaced = stem.replace("_", " ").replace("-", " ")
    # Only shouting-case names are re-cased; mixed-case names are kept as-is.
    readable = spaced.title() if spaced.isupper() else spaced
    return readable.strip()


def normalize_text_for_match(text: str) -> str:
    """Normalize text for approximate comparison.

    Underscores, hyphens, parentheses, commas and periods are turned into
    spaces, runs of whitespace collapse to a single space, and the result is
    lowercased.
    """
    # One translation pass maps every separator character to a space.
    separators_to_space = dict.fromkeys(map(ord, "_-(),."), " ")
    tokens = text.translate(separators_to_space).split()
    return " ".join(tokens).lower()


def looks_like_title(line: str) -> bool:
    """Return True when *line* plausibly reads like a title.

    A simple heuristic: the stripped line must be non-empty, contain no
    "http://" or "https://" URL, be at most 200 characters long, and have no
    more than 40% of its characters be digits.
    """
    text = line.strip()
    contains_url = "http://" in text or "https://" in text
    if not text or contains_url or len(text) > 200:
        return False

    # Lines dominated by digits (dates, ids, tables) are not treated as titles.
    digit_count = sum(1 for ch in text if ch.isdigit())
    return digit_count <= len(text) * 0.4

def normalize_md_file(path: str) -> None:
    """Rewrite the Markdown file at *path* so it carries a normalized ``# `` title.

    The first ``MAX_SCAN_LINES`` lines are scanned for the line that best
    matches the title derived from the file name. Each candidate is scored by
    the fraction of file-name words it contains, plus a 0.1 bonus when the
    line already starts with ``#``. If the best score reaches 0.3, that line is
    replaced by a level-1 heading (underscores/hyphens turned into spaces, and
    all-uppercase text title-cased). Otherwise a heading built from the file
    name is inserted before the first non-blank line. The file is rewritten in
    place.

    An empty file is left untouched. The fallback heading is not inserted when
    the file name yields no title text, or when the first non-blank line is
    already that heading, so repeated runs do not stack duplicate headings.

    Args:
        path: Path of the UTF-8 encoded Markdown file to normalize.

    Errors raised while opening, decoding or writing the file are not caught
    here and propagate to the caller.
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if not lines:
        return

    filename_title = guess_title_from_filename(path)
    # The file-name word set is identical for every candidate line: build it once.
    fname_words = set(normalize_text_for_match(filename_title).split())

    best_idx = None
    best_title = None
    best_score = 0.0

    for i, line in enumerate(lines[:MAX_SCAN_LINES]):
        stripped = line.strip()
        if not stripped:
            continue

        is_heading = stripped.startswith("#")
        candidate = stripped.lstrip("#").strip() if is_heading else stripped

        if not looks_like_title(candidate):
            continue

        cand_words = set(normalize_text_for_match(candidate).split())
        if not cand_words:
            continue

        # Fraction of the file-name words that also appear in the candidate.
        if fname_words:
            score = len(fname_words & cand_words) / len(fname_words)
        else:
            score = 0.0

        # Existing headings get a small preference over plain text lines.
        if is_heading:
            score += 0.1

        # Strict comparison keeps the earliest line when scores tie.
        if score > best_score:
            best_score = score
            best_idx = i
            best_title = candidate

    # Threshold: if nothing reasonable was found, fall back to the file name.
    if best_title is None or best_score < 0.3:
        if not filename_title:
            # Nothing meaningful to insert; avoid writing an empty "# " heading.
            return

        new_title = f"# {filename_title}\n"
        insert_idx = next(
            (i for i, line in enumerate(lines) if line.strip()),
            0,
        )
        # A previous run may already have inserted this heading. File-name
        # titles that fail looks_like_title (e.g. mostly digits) are never
        # picked up by the scan above, so without this check every run would
        # add another copy of the heading.
        if lines[insert_idx].strip() == new_title.strip():
            return
        # Heading followed by a blank line, inserted as a single list entry.
        lines.insert(insert_idx, new_title + "\n")
    else:
        raw_norm = best_title.replace("_", " ").replace("-", " ")
        if raw_norm.isupper():
            raw_norm = raw_norm.title()
        lines[best_idx] = f"# {raw_norm}\n"

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)

def main():
    """Normalize the title of every "*.md" file found directly in DOCS_DIR.

    Progress is printed for each file. A failure on one file is reported and
    does not stop the remaining files from being processed.
    """
    md_files = glob.glob(os.path.join(DOCS_DIR, "*.md"))
    total = len(md_files)
    print(f"Encontrados {total} arquivos .md em {DOCS_DIR}")

    for position, md_path in enumerate(md_files, start=1):
        print(f"[{position}/{total}] Normalizando título de: {os.path.basename(md_path)}")
        try:
            normalize_md_file(md_path)
        except Exception as exc:
            # Best-effort batch run: report the error and continue with the next file.
            print(f"  -> Erro ao processar {md_path}: {exc}")

    print("Normalização de títulos concluída.")


if __name__ == "__main__":
    main()