iris-ir-platform / backend /scripts /update_slide_metadata.py
rajvivan's picture
sync: push iris-ir-platform to HuggingFace Space
2a5d15a
Raw
History Blame Contribute Delete
2.7 kB
import os
import json
import re
# Data directory, resolved relative to this script's location ("../data").
DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
# Markdown file containing the slide directory table that main() parses.
MD_FILE = os.path.join(DATA_DIR, "slide_directory.md")
# JSON list of document records whose "page_section_map" gets refreshed.
DOCS_FILE = os.path.join(DATA_DIR, "documents.json")
# Output path for the detailed per-slide index written by main().
OUT_JSON = os.path.join(DATA_DIR, "slide_directory_index.json")
def parse_markdown_table(file_path):
    """Parse the slide directory markdown table into a list of row dicts.

    Only lines that start with ``|`` are considered. The separator line
    (``| --- | ...``), the header row (first cell contains ``Slide``), rows
    with fewer than seven cells, and rows whose first cell is not a plain
    decimal number (after removing ``**`` bold markers) are skipped.

    Args:
        file_path: Path to the UTF-8 encoded markdown file.

    Returns:
        A list of dicts, one per slide row, with the keys ``slide`` (int),
        ``file_name``, ``period``, ``topics``, ``kpis``, ``synonyms``,
        ``description`` and ``visual_layout`` (empty string when the row has
        no eighth cell).
    """
    table_data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line.startswith("|") or line.startswith("| ---"):
                continue

            # Drop the leading pipe and, only if present, the trailing one.
            # Blindly slicing off the last split element would discard the
            # final cell of a row written without a closing pipe.
            inner = line[1:]
            if inner.endswith("|"):
                inner = inner[:-1]
            cells = [cell.strip() for cell in inner.split("|")]

            # Skip short rows and the header row.
            if len(cells) < 7 or "Slide" in cells[0]:
                continue

            slide_num_raw = cells[0].replace("**", "")
            # isdecimal() (not isdigit()) so that characters such as "²",
            # which int() cannot convert, are skipped instead of raising.
            if not slide_num_raw.isdecimal():
                continue

            table_data.append({
                "slide": int(slide_num_raw),
                "file_name": cells[1],
                "period": cells[2],
                "topics": cells[3],
                "kpis": cells[4],
                "synonyms": cells[5],
                "description": cells[6],
                "visual_layout": cells[7] if len(cells) > 7 else "",
            })
    return table_data
def update_documents_json(
    table_data,
    docs_file=None,
    target_filename="emiratesnbd_investor_presentation_2026_q1.pdf",
):
    """Refresh ``page_section_map`` on the matching document record(s).

    Loads the JSON list of document records, and for every record whose
    ``filename`` equals ``target_filename`` replaces ``page_section_map``
    with a mapping of slide number (as a string) to
    ``"<topics> | <description> | Visuals: <visual_layout>"``.

    Args:
        table_data: Iterable of row dicts providing ``slide``, ``topics``
            and ``description`` keys, plus an optional ``visual_layout``.
        docs_file: Path of the documents JSON file. Defaults to the
            module-level ``DOCS_FILE`` when omitted.
        target_filename: ``filename`` value of the record(s) to update.

    Returns:
        None. Does nothing if ``docs_file`` does not exist, and leaves the
        file untouched when no record matches ``target_filename``.
    """
    if docs_file is None:
        docs_file = DOCS_FILE
    if not os.path.exists(docs_file):
        return

    with open(docs_file, "r", encoding="utf-8") as f:
        docs = json.load(f)

    matched = False
    for doc in docs:
        if doc.get("filename") != target_filename:
            continue
        # Combine topics and description for a richer section map
        doc["page_section_map"] = {
            str(row["slide"]): (
                f"{row['topics']} | {row['description']} | "
                f"Visuals: {row.get('visual_layout', '')}"
            )
            for row in table_data
        }
        matched = True

    if not matched:
        # Nothing changed, so avoid rewriting the file.
        return

    # Serialize before opening for writing: a serialization error must not
    # leave a truncated documents file behind.
    serialized = json.dumps(docs, indent=2)
    with open(docs_file, "w", encoding="utf-8") as f:
        f.write(serialized)
def main():
    """Rebuild the slide index JSON and refresh the documents section map.

    Parses ``MD_FILE`` with ``parse_markdown_table``, writes the parsed rows
    to ``OUT_JSON``, then calls ``update_documents_json`` with the same rows.
    Prints an error and returns early when ``MD_FILE`` does not exist.
    """
    if not os.path.exists(MD_FILE):
        print(f"Error: {MD_FILE} not found.")
        return

    print(f"Parsing {MD_FILE}...")
    table_data = parse_markdown_table(MD_FILE)
    print(f"Parsed {len(table_data)} slides.")

    # Save the detailed index
    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(table_data, f, indent=2)
    print(f"Saved detailed index to {OUT_JSON}")

    # update_documents_json() returns without doing anything when the
    # documents file is absent, so check here to avoid reporting a
    # successful update that never happened.
    if not os.path.exists(DOCS_FILE):
        print(f"Skipped page_section_map update: {DOCS_FILE} not found.")
        return

    # Update documents.json
    update_documents_json(table_data)
    print(f"Updated page_section_map in {DOCS_FILE}")


if __name__ == "__main__":
    main()