#!/usr/bin/env python3
"""
===============================================================================
CACHE RECOVERY + OUTPUT REGENERATOR
-------------------------------------------------------------------------------

USE THIS FILE AFTER EXTRACTION CRASHES.

This script:
- DOES NOT call APIs
- DOES NOT rerun PDFs
- ONLY loads cached JSON files
- rebuilds all final outputs

OUTPUTS:
- computational_techniques_master.xlsx
- computational_techniques_master.json
- technique_frequency_summary.xlsx
- raw_extraction_audit.json
===============================================================================
"""

# =============================================================================
# IMPORTS
# =============================================================================

import json
import logging
from pathlib import Path

import pandas as pd

# =============================================================================
# LOGGING
# =============================================================================

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)

log = logging.getLogger(__name__)

# =============================================================================
# PATHS
# =============================================================================

SCRIPT_DIR = Path(__file__).parent
OUTPUT_DIR = SCRIPT_DIR / "methodology_output"
CACHE_DIR = OUTPUT_DIR / "cache_json"

# Column order of the clean techniques table (Excel sheet and master JSON).
TABLE_COLUMNS = [
    "Paper_Title",
    "Technique",
    "Detailed_Technique",
    "Type",
    "Software",
    "Evidence",
    "Source_File",
]

# =============================================================================
# HELPERS
# =============================================================================


def _cell_text(value):
    """Flatten an arbitrary JSON value into text that fits one table cell.

    Lists and dicts are unhashable, so they break ``DataFrame.drop_duplicates``
    and cannot be written by openpyxl; ``None`` mixed with strings breaks
    sorting. Everything is therefore reduced to a plain string.

    Args:
        value: Any value produced by ``json.load``.

    Returns:
        ``""`` for ``None``, the string itself for ``str``, items joined with
        ``"; "`` for lists/tuples, compact JSON for dicts, ``str(value)``
        otherwise.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return "; ".join(_cell_text(item) for item in value)
    if isinstance(value, dict):
        return json.dumps(value, ensure_ascii=False, sort_keys=True)
    return str(value)


def build_clean_rows(results):
    """Turn cached extraction results into flat table rows.

    Entries that do not have the expected shape (a non-dict result, a
    missing or non-dict ``parsed_output``, a non-list
    ``computational_techniques``, a non-dict technique item) are skipped
    instead of raising, so one malformed cache file cannot abort the recovery.

    Args:
        results: Iterable of objects loaded from the cache JSON files.

    Returns:
        A list of dicts keyed by ``TABLE_COLUMNS``, one per technique, with
        every value already flattened to a string.
    """
    rows = []

    for result in results:
        if not isinstance(result, dict):
            log.warning(
                "Skipping cache entry of unexpected type %s",
                type(result).__name__,
            )
            continue

        parsed = result.get("parsed_output")
        if not isinstance(parsed, dict):
            continue

        techniques = parsed.get("computational_techniques")
        if not isinstance(techniques, list):
            continue

        # Prefer the parsed title; fall back to the result-level one when the
        # parsed title is missing, null, or empty.
        title = _cell_text(
            parsed.get("paper_title") or result.get("paper_title", "")
        )
        source_file = _cell_text(result.get("source_file", ""))

        for technique in techniques:
            if not isinstance(technique, dict):
                continue

            rows.append({
                "Paper_Title": title,
                "Technique": _cell_text(technique.get("technique", "")),
                "Detailed_Technique": _cell_text(
                    technique.get("detailed_technique", "")
                ),
                "Type": _cell_text(technique.get("type", "")),
                "Software": _cell_text(technique.get("software", "")),
                "Evidence": _cell_text(technique.get("evidence", "")),
                "Source_File": source_file,
            })

    return rows


def build_techniques_table(clean_rows):
    """Build the deduplicated, sorted techniques DataFrame.

    Args:
        clean_rows: Rows as returned by ``build_clean_rows``.

    Returns:
        A DataFrame with exactly ``TABLE_COLUMNS`` (also when empty, so the
        Excel sheet keeps its header row), duplicates removed and sorted by
        paper title, technique and detailed technique.
    """
    df = pd.DataFrame(clean_rows, columns=TABLE_COLUMNS)

    if df.empty:
        return df

    df = df.drop_duplicates()
    return df.sort_values(
        by=["Paper_Title", "Technique", "Detailed_Technique"],
        kind="mergesort",  # stable: equal keys keep their input order
    )


def _write_excel(frame, path, sheet_name):
    """Write *frame* to *path* as a single-sheet workbook via openpyxl."""
    with pd.ExcelWriter(path, engine="openpyxl") as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
    log.info("Saved → %s", path)


def _write_json(payload, path):
    """Dump *payload* to *path* as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    log.info("Saved → %s", path)


def load_cache_files(cache_files):
    """Load every cache file, collecting failures instead of aborting.

    Args:
        cache_files: Iterable of ``Path`` objects pointing at JSON files.

    Returns:
        A ``(results, failed)`` tuple: the decoded payloads in input order and
        a list of ``(file_name, error_message)`` pairs for unreadable files.
    """
    results = []
    failed = []

    for cache_file in cache_files:
        try:
            with open(cache_file, encoding="utf-8") as f:
                data = json.load(f)
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        except (OSError, ValueError) as e:
            failed.append((cache_file.name, str(e)))
            log.warning("Failed → %s | %s", cache_file.name, e)
        else:
            results.append(data)

    return results, failed


# =============================================================================
# SAVE OUTPUTS
# =============================================================================


def save_outputs(results):
    """Rebuild every final output file from the cached extraction results.

    Writes into ``OUTPUT_DIR``:
    - ``raw_extraction_audit.json``: the results exactly as loaded
    - ``computational_techniques_master.xlsx``: deduplicated, sorted table
    - ``computational_techniques_master.json``: the same table as records
    - ``technique_frequency_summary.xlsx``: counts per technique pair
      (only written when the table is not empty)

    Args:
        results: List of objects loaded from the cache JSON files.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # -------------------------------------------------------------------------
    # RAW AUDIT JSON
    # -------------------------------------------------------------------------

    _write_json(results, OUTPUT_DIR / "raw_extraction_audit.json")

    # -------------------------------------------------------------------------
    # CLEAN TABLE
    # -------------------------------------------------------------------------

    df = build_techniques_table(build_clean_rows(results))

    # -------------------------------------------------------------------------
    # MASTER EXCEL
    # -------------------------------------------------------------------------

    _write_excel(
        df,
        OUTPUT_DIR / "computational_techniques_master.xlsx",
        "Techniques",
    )

    # -------------------------------------------------------------------------
    # MASTER JSON
    # -------------------------------------------------------------------------

    # Written from the same deduplicated table as the Excel master so the two
    # "master" files always agree.
    _write_json(
        df.to_dict(orient="records"),
        OUTPUT_DIR / "computational_techniques_master.json",
    )

    # -------------------------------------------------------------------------
    # FREQUENCY SUMMARY
    # -------------------------------------------------------------------------

    if df.empty:
        return

    summary = (
        df.groupby(["Technique", "Detailed_Technique"])
        .size()
        .reset_index(name="Frequency")
        # Stable sort keeps ties in alphabetical group order.
        .sort_values(by="Frequency", ascending=False, kind="mergesort")
    )

    _write_excel(
        summary,
        OUTPUT_DIR / "technique_frequency_summary.xlsx",
        "Summary",
    )


# =============================================================================
# MAIN
# =============================================================================


def main():
    """Load all cached JSON files and regenerate the final outputs."""
    banner = "=" * 70

    if not CACHE_DIR.is_dir():
        print()
        print("Cache folder not found:")
        print(CACHE_DIR)
        return

    cache_files = sorted(CACHE_DIR.glob("*.json"))

    print()
    print(banner)
    print(f"Found {len(cache_files)} cached files")
    print(banner)
    print()

    results, failed = load_cache_files(cache_files)

    print()
    print(banner)
    print(f"Loaded {len(results)} cache files")
    print(f"Failed {len(failed)} files")
    print(banner)

    if not results:
        # Never overwrite previously generated outputs with empty files.
        print()
        print("Nothing was loaded from the cache; existing outputs left untouched.")
        return

    save_outputs(results)

    print()
    print(banner)
    print("OUTPUT RECOVERY COMPLETE")
    print(f"Outputs saved to: {OUTPUT_DIR}")
    print(banner)


# =============================================================================

if __name__ == "__main__":
    main()