| | import os |
| | import re |
| | from pathlib import Path |
| | import yaml |
| |
|
| | |
# Repository root: the parent of the directory that contains this script.
# Source Markdown files are searched for recursively below this path.
REPO_ROOT = Path(__file__).resolve().parent.parent
| |
|
| | |
# Keywords that extract_tags() looks for in a document's text. Matching is a
# case-insensitive substring test; every keyword found becomes a tag.
KEYWORD_TAGS = [
    "CCore", "CShell", "REPL", "Mesh", "Agent", "HMP",
    "MeshConsensus", "CogSync", "GMP", "EGP",
    "Ethics", "Scenarios", "JSON"
]
| |
|
# Output locations for the mirrored files and their index.
# NOTE(review): these paths are relative to the current working directory,
# whereas sources are scanned from REPO_ROOT. The script appears to assume it
# is launched from the repository root -- confirm.
ROOT_DIR = Path(".")
STRUCTURED_DIR = ROOT_DIR / "structured_md"
INDEX_FILE = STRUCTURED_DIR / "index.md"

# File suffix (lower-case, including the dot) that identifies Markdown files.
MD_EXT = ".md"
| |
|
| | |
# str.format() templates for fenced JSON-LD blocks, keyed by document type.
# Doubled braces ({{ and }}) are format escapes that render as single braces.
# Placeholders: FAQ -> {main_entity}; HowTo -> {title}, {description}, {steps};
# Article -> {title}, {description}.
# NOTE(review): {title} and {description} are substituted verbatim, so values
# containing quotes or backslashes must be JSON-escaped by the caller.
JSON_LD_TEMPLATES = {
    "FAQ": """\n```json
{{
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": {main_entity}
}}
```\n""",
    "HowTo": """\n```json
{{
"@context": "https://schema.org",
"@type": "HowTo",
"name": "{title}",
"description": "{description}",
"step": {steps}
}}
```\n""",
    "Article": """\n```json
{{
"@context": "https://schema.org",
"@type": "Article",
"name": "{title}",
"description": "{description}"
}}
```\n"""
}
| |
|
# Matches a YAML front-matter block at the very start of a document: an
# opening "---" line, the YAML body (captured lazily; DOTALL lets it span
# several lines) and a closing "---" line followed by a newline.
FRONT_MATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
| |
|
def is_md_file(path):
    """Return True if *path* is a Markdown file located outside ``STRUCTURED_DIR``.

    Args:
        path: File path to test (``pathlib.Path`` or string).

    Returns:
        bool: True when the suffix equals ``MD_EXT`` (case-insensitively) and
        the file does not live anywhere below ``STRUCTURED_DIR``.
    """
    path = Path(path)
    if path.suffix.lower() != MD_EXT:
        return False
    # STRUCTURED_DIR is relative to the working directory. Comparing it
    # directly with the parents of an absolute path would never match, so
    # both sides are resolved to absolute paths before the ancestry test.
    return STRUCTURED_DIR.resolve() not in path.resolve().parents
| |
|
def extract_front_matter(content: str):
    """Split a leading YAML front-matter block off *content*.

    Args:
        content: Full text of a Markdown document.

    Returns:
        tuple: ``(front_matter, body)``. ``front_matter`` is always a dict; it
        is empty when there is no front-matter block, when the YAML cannot be
        parsed, or when it parses to something other than a mapping.
        ``body`` is *content* without the front-matter block (or *content*
        unchanged when no block is present).
    """
    match = FRONT_MATTER_RE.match(content)
    if not match:
        return {}, content

    try:
        data = yaml.safe_load(match.group(1))
    except Exception:
        # Deliberately best-effort: a malformed header must not abort the
        # run, the document is simply treated as having no metadata.
        data = None

    # YAML may legally parse to a list or a scalar; callers rely on dict
    # methods such as .get(), so anything that is not a mapping is discarded.
    if not isinstance(data, dict):
        data = {}
    return data, content[match.end():]
| |
|
| | def detect_file_type(content: str, front_matter: dict | None = None) -> str: |
| | """Определяет тип: FAQ / HowTo / Article (по front-matter или заголовкам).""" |
| | front_matter = front_matter or {} |
| | if "type" in front_matter: |
| | return front_matter["type"] |
| |
|
| | |
| | if re.search(r"^#\s*FAQ\b", content, re.MULTILINE) or re.search(r"^##\s*Q&A\b", content, re.MULTILINE): |
| | return "FAQ" |
| | if re.search(r"^#\s*HowTo\b", content, re.MULTILINE) or re.search(r"^#\s*Как\s+сделать\b", content, re.IGNORECASE | re.MULTILINE): |
| | return "HowTo" |
| | return "Article" |
| |
|
def parse_front_matter(content):
    """Return the YAML front matter of *content* as a dict.

    Args:
        content: Full text of a Markdown document.

    Returns:
        dict: The parsed front matter. Empty when there is no front-matter
        block, when the YAML cannot be parsed, or when it parses to something
        other than a mapping (including an empty block, which parses to None).
    """
    match = FRONT_MATTER_RE.match(content)
    if not match:
        return {}
    try:
        data = yaml.safe_load(match.group(1))
    except Exception:
        # Best-effort by design: an unparsable header counts as "no metadata".
        return {}
    # Guarantee a dict so callers can use `in` / .get() without type checks.
    return data if isinstance(data, dict) else {}
| |
|
def determine_type(content, front_matter):
    """Determine the document type from front matter or from any heading.

    Args:
        content: Markdown text to inspect.
        front_matter: Parsed front matter; ``None`` or a non-dict value is
            treated as empty.

    Returns:
        The value stored under ``type`` in *front_matter* when that key is
        present; otherwise ``"FAQ"`` or ``"HowTo"`` if some heading line
        (any line starting with ``#``) contains that word, else ``"Article"``.
    """
    # Tolerate a missing/invalid front matter instead of raising TypeError
    # on the membership test below.
    if not isinstance(front_matter, dict):
        front_matter = {}
    if "type" in front_matter:
        return front_matter["type"]

    for marker in ("FAQ", "HowTo"):
        if re.search(rf"^#.*{marker}", content, re.MULTILINE):
            return marker
    return "Article"
| |
|
def generate_json_ld(content, front_matter, ftype, title, rel_path):
    """Build a fenced JSON-LD (schema.org) block describing a document.

    The payload is assembled as a dict and serialised with ``json.dumps`` so
    that titles, descriptions and answers containing quotes, backslashes or
    braces always yield valid JSON, and so that the ``url`` field is reliably
    present as the last key of the top-level object.

    Args:
        content: Markdown body of the document.
        front_matter: Parsed front matter; its ``description`` is used when
            present, otherwise the first 100 characters of *content* (newlines
            replaced by spaces) followed by ``"..."``. ``None`` is treated as
            empty.
        ftype: ``"FAQ"`` or ``"HowTo"``; any other value produces an Article.
        title: Document title (used for HowTo and Article).
        rel_path: Path of the document relative to the repository root.

    Returns:
        str: ``"\\n```json\\n<json>\\n```\\n"``.
    """
    import json  # kept local: the module header does not import json

    front_matter = front_matter or {}
    description = front_matter.get(
        "description", content[:100].replace("\n", " ") + "..."
    )
    url = f"structured_md/{Path(rel_path).as_posix()}"

    if ftype == "FAQ":
        # Every heading of level 2 or deeper is a question. Its answer is the
        # text up to the next such heading (or the end of the document).
        # Slicing by match position handles repeated question texts and
        # empty answers correctly.
        headings = list(
            re.finditer(r"^#{2,}(?!#)[ \t]*(\S.*?)[ \t]*$", content, re.MULTILINE)
        )
        questions = []
        for idx, heading in enumerate(headings):
            if idx + 1 < len(headings):
                answer_end = headings[idx + 1].start()
            else:
                answer_end = len(content)
            questions.append({
                "@type": "Question",
                "name": heading.group(1),
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": content[heading.end():answer_end].strip(),
                },
            })
        document = {
            "@context": "https://schema.org",
            "@type": "FAQPage",
            "mainEntity": questions,
        }
    elif ftype == "HowTo":
        # Each "- item" bullet line is one step.
        steps = [
            {"@type": "HowToStep", "name": step.strip()}
            for step in re.findall(r"^- (.+)$", content, re.MULTILINE)
        ]
        document = {
            "@context": "https://schema.org",
            "@type": "HowTo",
            "name": title,
            "description": description,
            "step": steps,
        }
    else:
        document = {
            "@context": "https://schema.org",
            "@type": "Article",
            "name": title,
            "description": description,
        }

    document["url"] = url
    payload = json.dumps(document, ensure_ascii=False, indent=2)
    return "\n```json\n" + payload + "\n```\n"
| |
|
def add_index_link(content, file_path):
    """Append a footer linking to the structured index, unless already present.

    Args:
        content: Markdown text of the mirrored document.
        file_path: Location of the mirrored document; the link target is
            computed relative to its parent directory.

    Returns:
        str: *content*, with the footer appended if it was not already there.
    """
    relative = os.path.relpath(STRUCTURED_DIR / "index.md", Path(file_path).parent)
    # os.path.relpath uses the platform separator; Markdown link targets need
    # forward slashes, so normalise (a no-op on POSIX systems).
    rel_path = Path(relative).as_posix()
    link_line = f"\n\n---\n> ⚡ [AI friendly version docs (structured_md)]({rel_path})\n"
    # Compare without the surrounding blank lines so an existing footer is
    # recognised regardless of the whitespace around it.
    if link_line.strip() in content:
        return content
    return content + link_line
| |
|
def extract_tags(content, existing_tags):
    """Combine existing tags with the ``KEYWORD_TAGS`` found in *content*.

    Args:
        content: Document text, searched case-insensitively for each keyword
            as a substring.
        existing_tags: Tags already assigned to the document: an iterable of
            tags, a single tag given as a string, or None.

    Returns:
        list: Existing tags in their original order, followed by the matched
        keywords in ``KEYWORD_TAGS`` order, without duplicates. The order is
        deterministic, so regenerated files do not change between runs.
    """
    if isinstance(existing_tags, str):
        # A scalar "tags: foo" entry is one tag, not a sequence of characters.
        existing_tags = [existing_tags]

    haystack = content.lower()  # lowered once instead of once per keyword
    matched = [kw for kw in KEYWORD_TAGS if kw.lower() in haystack]

    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys([*(existing_tags or []), *matched]))
| |
|
def mirror_md_files():
    """Mirror every source Markdown file into ``STRUCTURED_DIR`` in enriched form.

    Walks ``REPO_ROOT`` recursively for ``*.md`` files, skipping anything
    below a ``structured_md`` directory and every file named ``index.md``.
    Each mirrored copy consists of:

    1. regenerated YAML front matter (title, description, type, tags);
    2. the original body without its own front matter, followed by a link to
       the structured index;
    3. a fenced JSON-LD block describing the document.

    Returns:
        list: Paths (relative to ``REPO_ROOT``) of the files processed.
    """
    processed = []
    for path in REPO_ROOT.rglob("*.md"):
        if "structured_md" in path.parts or path.name.lower() == "index.md":
            continue

        rel_path = path.relative_to(REPO_ROOT)
        target_path = STRUCTURED_DIR / rel_path
        target_path.parent.mkdir(parents=True, exist_ok=True)

        content = path.read_text(encoding="utf-8")
        front_matter, clean_content = extract_front_matter(content)
        ftype = detect_file_type(clean_content, front_matter)

        # Title: the first level-1 heading. "(?!#)" keeps "##"/"###" headings
        # from being taken as the title with stray "#" characters in it.
        h1_match = re.search(r"^#(?!#)[ \t]*(\S.*)$", clean_content, re.MULTILINE)
        if h1_match:
            title = h1_match.group(1).strip()
            summary_source = clean_content[h1_match.end():].strip()
        else:
            title = front_matter.get("title", path.stem)
            summary_source = clean_content
        description = front_matter.get(
            "description", summary_source[:200].replace("\n", " ") + "..."
        )

        tags = extract_tags(clean_content, front_matter.get("tags", []))

        fm_dict = {
            "title": title,
            "description": description,
            "type": ftype,
            "tags": tags,
        }
        yaml_fm = (
            "---\n"
            + yaml.safe_dump(fm_dict, sort_keys=False, allow_unicode=True)
            + "---\n\n"
        )

        # Build the JSON-LD from the pristine body, before the index footer is
        # appended, so the footer cannot end up inside an extracted answer.
        # Passing the computed description keeps JSON-LD and front matter in
        # agreement.
        json_ld = generate_json_ld(
            clean_content,
            {**front_matter, "description": description},
            ftype,
            title,
            rel_path,
        )

        body = add_index_link(clean_content, target_path)

        with target_path.open("w", encoding="utf-8") as out:
            out.write(yaml_fm)
            out.write(body.rstrip())
            out.write("\n\n")
            out.write(json_ld)

        processed.append(rel_path)

    return processed
| | |
def generate_index(files):
    """Write ``INDEX_FILE``: a nested Markdown list linking to every mirrored file.

    Args:
        files: Iterable of paths relative to the structured directory (as
            returned by ``mirror_md_files``). Directories become plain list
            items, files become links; entries are sorted by name per level.

    Side effects:
        Creates the parent directory of ``INDEX_FILE`` if necessary and
        overwrites ``INDEX_FILE``.
    """
    index_lines = ["# ИИ-дружелюбные версии файлов\n"]

    # Nested dict mirroring the directory layout; a file is a key mapped to None.
    tree = {}
    for entry in files:
        parts = Path(entry).parts
        if not parts:
            continue
        node = tree
        for directory in parts[:-1]:
            node = node.setdefault(directory, {})
        node[parts[-1]] = None

    def render_tree(node, parent_path="", level=0):
        """Return the Markdown list lines for *node* and everything below it."""
        lines = []
        # Two spaces per level: a Markdown sub-list must be indented at least
        # to the content column of its parent item ("- " is two columns wide),
        # otherwise the items are rendered as siblings instead of children.
        indent = "  " * level
        for name in sorted(node):
            child = node[name]
            full_path = Path(parent_path) / name
            if child is None:
                lines.append(f"{indent}- [{name}]({full_path.as_posix()})")
            else:
                lines.append(f"{indent}- {name}")
                lines.extend(render_tree(child, full_path, level + 1))
        return lines

    index_lines.extend(render_tree(tree))

    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(INDEX_FILE, "w", encoding="utf-8") as f:
        # End the file with a newline, as text files conventionally do.
        f.write("\n".join(index_lines) + "\n")
| |
|
if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, mirror all
    # Markdown sources into it, then build the index of the mirrored files.
    STRUCTURED_DIR.mkdir(exist_ok=True)
    mirrored = mirror_md_files()
    generate_index(mirrored)
    total = len(mirrored)
    print(f"Обработано {total} файлов. Индекс создан: {INDEX_FILE}")
| |
|