topic-modelling / replace_title.py
vvinayakkk's picture
added
55deb86
import json
from pathlib import Path
from openpyxl import load_workbook
# =========================================================
# FILE PATHS
# =========================================================
# Fixed JSON
FIXED_JSON = r"C:\Users\vinay\OneDrive\Desktop\minors\topic-modelling\methodology_output\computational_techniques_master_fixed.json"
# Existing Excel
INPUT_XLSX = r"C:\Users\vinay\OneDrive\Desktop\minors\topic-modelling\methodology_output\computational_techniques_master.xlsx"
# Output Excel
OUTPUT_XLSX = r"C:\Users\vinay\OneDrive\Desktop\minors\topic-modelling\methodology_output\computational_techniques_master_fixed.xlsx"
# =========================================================
# LOAD FIXED JSON
# =========================================================
with open(FIXED_JSON, "r", encoding="utf-8") as f:
json_data = json.load(f)
# =========================================================
# LOAD EXCEL
# =========================================================
wb = load_workbook(INPUT_XLSX)
ws = wb.active
# =========================================================
# FIND COLUMN INDEXES
# =========================================================
headers = {}
for col in range(1, ws.max_column + 1):
header = ws.cell(row=1, column=col).value
if header:
headers[str(header).strip()] = col
paper_title_col = headers.get("Paper_Title")
source_file_col = headers.get("Source_File")
if not paper_title_col or not source_file_col:
raise Exception("Paper_Title or Source_File column not found!")
# =========================================================
# BUILD SOURCE_FILE -> PAPER_TITLE MAP
# =========================================================
title_map = {}
for item in json_data:
source_file = str(item.get("Source_File", "")).strip().lower()
paper_title = item.get("Paper_Title", "")
if source_file:
title_map[source_file] = paper_title
# =========================================================
# UPDATE EXCEL
# =========================================================
updated = 0
for row in range(2, ws.max_row + 1):
source_file = ws.cell(row=row, column=source_file_col).value
if not source_file:
continue
normalized_source = str(source_file).strip().lower()
if normalized_source in title_map:
ws.cell(row=row, column=paper_title_col).value = title_map[normalized_source]
updated += 1
# =========================================================
# SAVE
# =========================================================
wb.save(OUTPUT_XLSX)
print(f"[DONE] Updated rows: {updated}")
print(f"[SAVED] {OUTPUT_XLSX}")