topic-modelling / methodology_save.py
vvinayakkk's picture
Initial clean commit with LFS
a1d17f8
#!/usr/bin/env python3
"""
===============================================================================
CACHE RECOVERY + OUTPUT REGENERATOR
-------------------------------------------------------------------------------
USE THIS FILE AFTER EXTRACTION CRASHES.
This script:
- DOES NOT call APIs
- DOES NOT rerun PDFs
- ONLY loads cached JSON files
- rebuilds all final outputs
OUTPUTS:
- computational_techniques_master.xlsx
- computational_techniques_master.json
- technique_frequency_summary.xlsx
- raw_extraction_audit.json
===============================================================================
"""
# =============================================================================
# IMPORTS
# =============================================================================
import json
import logging
from pathlib import Path
import pandas as pd
# =============================================================================
# LOGGING
# =============================================================================
# Root logger: INFO and above, short wall-clock timestamp, padded level name.
logging.basicConfig(
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Module-scoped logger used by the functions in this file.
log = logging.getLogger(__name__)
# =============================================================================
# PATHS
# =============================================================================
# All locations are derived from the folder that contains this script.
SCRIPT_DIR = Path(__file__).parent
# Destination folder for the regenerated outputs.
OUTPUT_DIR = SCRIPT_DIR.joinpath("methodology_output")
# Folder holding the cached JSON files that main() reads.
CACHE_DIR = OUTPUT_DIR.joinpath("cache_json")
# =============================================================================
# SAVE OUTPUTS
# =============================================================================
# Column order shared by the master workbook and the master JSON.
_MASTER_COLUMNS = (
    "Paper_Title",
    "Technique",
    "Detailed_Technique",
    "Type",
    "Software",
    "Evidence",
    "Source_File",
)

# Control characters that openpyxl refuses to write into a cell
# (code points 0-8, 11-12 and 14-31); tab, LF and CR remain allowed.
_XLSX_ILLEGAL_CHARS = dict.fromkeys([*range(9), 11, 12, *range(14, 32)])


def _as_text(value):
    """Flatten one extracted field into plain text suitable for a table cell.

    ``None`` becomes ``""``, lists are joined with ``"; "``, dicts are dumped
    as JSON, and anything else goes through ``str()``. Control characters that
    cannot be stored in an .xlsx cell are removed.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value.translate(_XLSX_ILLEGAL_CHARS)
    if isinstance(value, (list, tuple)):
        return "; ".join(_as_text(item) for item in value)
    if isinstance(value, dict):
        return _as_text(json.dumps(value, ensure_ascii=False, sort_keys=True))
    return str(value)


def _build_clean_rows(results):
    """Turn cached extraction results into one flat row per technique.

    Entries that do not have the expected shape (a dict with a dict
    ``parsed_output`` holding a list ``computational_techniques`` of dicts)
    are skipped instead of raising, so one malformed cache file cannot abort
    the whole regeneration.
    """
    rows = []
    for result in results:
        if not isinstance(result, dict):
            continue
        parsed = result.get("parsed_output")
        if not isinstance(parsed, dict):
            continue
        techniques = parsed.get("computational_techniques")
        if not isinstance(techniques, list):
            continue

        # Prefer the parsed title; fall back to the top-level one when the
        # parsed title is missing, null or empty.
        title = _as_text(
            parsed.get("paper_title") or result.get("paper_title")
        )
        source_file = _as_text(result.get("source_file"))

        for technique in techniques:
            if not isinstance(technique, dict):
                continue
            rows.append({
                "Paper_Title": title,
                "Technique": _as_text(technique.get("technique")),
                "Detailed_Technique": _as_text(
                    technique.get("detailed_technique")
                ),
                "Type": _as_text(technique.get("type")),
                "Software": _as_text(technique.get("software")),
                "Evidence": _as_text(technique.get("evidence")),
                "Source_File": source_file,
            })
    return rows


def save_outputs(results):
    """Write every final output file from the already-loaded cache results.

    Files written into ``OUTPUT_DIR``:

    - ``raw_extraction_audit.json``: ``results`` dumped unchanged.
    - ``computational_techniques_master.xlsx``: one row per technique,
      de-duplicated and sorted by paper title, technique, detailed technique.
    - ``computational_techniques_master.json``: the same rows as the
      workbook, in the same order.
    - ``technique_frequency_summary.xlsx``: row counts per
      (Technique, Detailed_Technique); only written when at least one
      technique row exists.

    Args:
        results: List of objects loaded from the cache JSON files. Each is
            expected to be a dict with a ``parsed_output`` dict containing a
            ``computational_techniques`` list; other shapes are ignored for
            the tables but still appear in the raw audit file.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---------------------------------------------------------------------
    # RAW AUDIT JSON
    # ---------------------------------------------------------------------
    raw_path = OUTPUT_DIR / "raw_extraction_audit.json"
    with open(raw_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    log.info("Saved → %s", raw_path)

    # ---------------------------------------------------------------------
    # CLEAN TABLE
    # ---------------------------------------------------------------------
    clean_rows = _build_clean_rows(results)

    # Passing the columns explicitly keeps the header row even when no
    # technique was found.
    df = pd.DataFrame(clean_rows, columns=list(_MASTER_COLUMNS))
    if not df.empty:
        df = df.drop_duplicates().sort_values(
            by=["Paper_Title", "Technique", "Detailed_Technique"]
        )

    # ---------------------------------------------------------------------
    # MASTER EXCEL
    # ---------------------------------------------------------------------
    excel_path = OUTPUT_DIR / "computational_techniques_master.xlsx"
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="Techniques")
    log.info("Saved → %s", excel_path)

    # ---------------------------------------------------------------------
    # MASTER JSON
    # ---------------------------------------------------------------------
    # Written from the de-duplicated, sorted frame so that the JSON and the
    # workbook always describe the same set of rows.
    json_path = OUTPUT_DIR / "computational_techniques_master.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(
            df.to_dict(orient="records"),
            f,
            indent=2,
            ensure_ascii=False,
        )
    log.info("Saved → %s", json_path)

    # ---------------------------------------------------------------------
    # FREQUENCY SUMMARY
    # ---------------------------------------------------------------------
    if df.empty:
        log.warning(
            "No techniques found in %d cached results; "
            "frequency summary not written",
            len(results),
        )
        return

    summary = (
        df.groupby(["Technique", "Detailed_Technique"])
        .size()
        .reset_index(name="Frequency")
        # Secondary keys make the order of equal frequencies deterministic.
        .sort_values(
            by=["Frequency", "Technique", "Detailed_Technique"],
            ascending=[False, True, True],
        )
    )
    summary_path = OUTPUT_DIR / "technique_frequency_summary.xlsx"
    with pd.ExcelWriter(summary_path, engine="openpyxl") as writer:
        summary.to_excel(writer, index=False, sheet_name="Summary")
    log.info("Saved → %s", summary_path)
# =============================================================================
# MAIN
# =============================================================================
def main():
    """Reload the cached JSON results and regenerate all output files.

    Reads every ``*.json`` file in ``CACHE_DIR`` (in sorted order), collects
    the ones that parse, and hands them to ``save_outputs``. Files that cannot
    be read or parsed are logged, listed in the summary and skipped.

    Nothing is written when the cache folder is missing or when no cache file
    could be loaded, so outputs from an earlier run are not replaced by empty
    files.
    """
    banner = "=" * 70

    if not CACHE_DIR.exists():
        print()
        print("Cache folder not found:")
        print(CACHE_DIR)
        return

    cache_files = sorted(CACHE_DIR.glob("*.json"))
    print()
    print(banner)
    print(f"Found {len(cache_files)} cached files")
    print(banner)
    print()

    results = []
    failed = []
    for cf in cache_files:
        try:
            with open(cf, encoding="utf-8") as f:
                results.append(json.load(f))
        except Exception as e:
            # Deliberately broad: this is a best-effort recovery pass, so one
            # unreadable or corrupt cache file must not stop the others.
            failed.append((cf.name, str(e)))
            log.warning("Failed → %s | %s", cf.name, e)

    print()
    print(banner)
    print(f"Loaded {len(results)} cache files")
    print(f"Failed {len(failed)} files")
    print(banner)

    # Repeat the failures in one place; the warnings above may have scrolled
    # away when many files were processed.
    for name, error in failed:
        print(f"  - {name}: {error}")

    if not results:
        print()
        print("Nothing to regenerate: no cache file could be loaded.")
        print("Existing outputs were left untouched.")
        return

    save_outputs(results)

    print()
    print(banner)
    print("OUTPUT RECOVERY COMPLETE")
    print(f"Outputs saved to: {OUTPUT_DIR}")
    print(banner)
# =============================================================================
if __name__ == "__main__":
    # Run the recovery only when executed as a script, never on import.
    main()