Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| =============================================================================== | |
| CACHE RECOVERY + OUTPUT REGENERATOR | |
| ------------------------------------------------------------------------------- | |
| USE THIS FILE AFTER EXTRACTION CRASHES. | |
| This script: | |
| - DOES NOT call APIs | |
| - DOES NOT rerun PDFs | |
| - ONLY loads cached JSON files | |
| - rebuilds all final outputs | |
| OUTPUTS: | |
| - computational_techniques_master.xlsx | |
| - computational_techniques_master.json | |
| - technique_frequency_summary.xlsx | |
| - raw_extraction_audit.json | |
| =============================================================================== | |
| """ | |
| # ============================================================================= | |
| # IMPORTS | |
| # ============================================================================= | |
| import json | |
| import logging | |
| from pathlib import Path | |
| import pandas as pd | |
| # ============================================================================= | |
| # LOGGING | |
| # ============================================================================= | |
# Console logging for the whole script: clock-time stamp, padded level name,
# then the message. `log` is the module-level logger used by the functions below.
logging.basicConfig(format="%(asctime)s │ %(levelname)-7s │ %(message)s", datefmt="%H:%M:%S", level=logging.INFO)
log = logging.getLogger(__name__)
| # ============================================================================= | |
| # PATHS | |
| # ============================================================================= | |
# All locations are derived from the directory that contains this script:
#   <script dir>/methodology_output             -> regenerated outputs
#   <script dir>/methodology_output/cache_json  -> cached per-file JSON inputs
SCRIPT_DIR = Path(__file__).parent
OUTPUT_DIR = SCRIPT_DIR.joinpath("methodology_output")
CACHE_DIR = OUTPUT_DIR.joinpath("cache_json")
| # ============================================================================= | |
| # SAVE OUTPUTS | |
| # ============================================================================= | |
def _as_cell(value):
    """Return *value* as a flat string that can be hashed, sorted and written to Excel.

    ``None`` becomes an empty string, lists/tuples are joined with ``"; "``,
    dicts are serialised as JSON, and any other non-string value goes
    through ``str()``.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return "; ".join(_as_cell(item) for item in value)
    if isinstance(value, dict):
        return json.dumps(value, ensure_ascii=False, sort_keys=True)
    return str(value)


def _collect_technique_rows(results):
    """Flatten cached results into one row dict per extracted technique.

    Entries that are not JSON objects (at the result, ``parsed_output`` or
    technique level) are skipped instead of raising ``AttributeError``.
    Field values are kept exactly as they appear in the cache.
    """
    rows = []
    for result in results:
        if not isinstance(result, dict):
            log.warning(
                "Skipping cache entry of type %s (expected a JSON object)",
                type(result).__name__,
            )
            continue

        parsed = result.get("parsed_output")
        if not isinstance(parsed, dict):
            parsed = {}

        techniques = parsed.get("computational_techniques")
        if not isinstance(techniques, list):
            continue

        # Prefer the title inside the parsed output; fall back to the
        # top-level one when it is missing, null or empty.
        paper_title = (
            parsed.get("paper_title")
            or result.get("paper_title")
            or ""
        )
        source_file = result.get("source_file", "")

        for technique in techniques:
            if not isinstance(technique, dict):
                log.warning(
                    "Skipping malformed technique entry in %s",
                    source_file or "<unknown source>",
                )
                continue
            rows.append({
                "Paper_Title": paper_title,
                "Technique": technique.get("technique", ""),
                "Detailed_Technique": technique.get("detailed_technique", ""),
                "Type": technique.get("type", ""),
                "Software": technique.get("software", ""),
                "Evidence": technique.get("evidence", ""),
                "Source_File": source_file,
            })
    return rows


def save_outputs(results):
    """Rebuild every final output file from already-loaded cache results.

    Writes into ``OUTPUT_DIR`` (created if needed):

    - ``raw_extraction_audit.json``: ``results`` dumped unchanged.
    - ``computational_techniques_master.xlsx``: de-duplicated, sorted table
      with one row per technique.
    - ``computational_techniques_master.json``: the technique rows with
      their raw cached values (not de-duplicated).
    - ``technique_frequency_summary.xlsx``: row counts per
      (Technique, Detailed_Technique); only written when there are rows.

    Args:
        results: list of decoded cache JSON documents.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # -------------------------------------------------------------------------
    # RAW AUDIT JSON
    # -------------------------------------------------------------------------
    raw_path = OUTPUT_DIR / "raw_extraction_audit.json"
    with open(raw_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    log.info("Saved → %s", raw_path)

    # -------------------------------------------------------------------------
    # CLEAN TABLE
    # -------------------------------------------------------------------------
    clean_rows = _collect_technique_rows(results)

    # -------------------------------------------------------------------------
    # DATAFRAME
    # -------------------------------------------------------------------------
    # Flatten every cell to a string first: list/dict values cannot be hashed
    # by drop_duplicates() or written to an Excel cell, and null values would
    # otherwise be dropped as missing group keys in the frequency summary.
    table_rows = [
        {column: _as_cell(value) for column, value in row.items()}
        for row in clean_rows
    ]
    df = pd.DataFrame(table_rows)
    if not df.empty:
        df = df.drop_duplicates()
        df = df.sort_values(
            by=["Paper_Title", "Technique", "Detailed_Technique"]
        )

    # -------------------------------------------------------------------------
    # MASTER EXCEL
    # -------------------------------------------------------------------------
    excel_path = OUTPUT_DIR / "computational_techniques_master.xlsx"
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="Techniques")
    log.info("Saved → %s", excel_path)

    # -------------------------------------------------------------------------
    # MASTER JSON
    # -------------------------------------------------------------------------
    json_path = OUTPUT_DIR / "computational_techniques_master.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(clean_rows, f, indent=2, ensure_ascii=False)
    log.info("Saved → %s", json_path)

    # -------------------------------------------------------------------------
    # FREQUENCY SUMMARY
    # -------------------------------------------------------------------------
    if not df.empty:
        summary = (
            df.groupby(["Technique", "Detailed_Technique"])
            .size()
            .reset_index(name="Frequency")
            .sort_values(by="Frequency", ascending=False)
        )
        summary_path = OUTPUT_DIR / "technique_frequency_summary.xlsx"
        with pd.ExcelWriter(summary_path, engine="openpyxl") as writer:
            summary.to_excel(writer, index=False, sheet_name="Summary")
        log.info("Saved → %s", summary_path)
| # ============================================================================= | |
| # MAIN | |
| # ============================================================================= | |
def main():
    """Load every cached JSON file and regenerate the final outputs.

    Reads ``*.json`` from ``CACHE_DIR`` in sorted order. Files that cannot
    be read or decoded are logged and counted but do not stop the run.
    Returns early, writing nothing, when the cache folder is missing or
    when no cache file could be loaded, so outputs already on disk are not
    replaced by empty ones.
    """
    if not CACHE_DIR.exists():
        print()
        print("Cache folder not found:")
        print(CACHE_DIR)
        return

    banner = "=" * 70
    cache_files = sorted(CACHE_DIR.glob("*.json"))

    print()
    print(banner)
    print(f"Found {len(cache_files)} cached files")
    print(banner)
    print()

    results = []
    failed = []
    for cache_file in cache_files:
        # Deliberately broad: one unreadable or corrupt cache file must not
        # abort the recovery of all the others.
        try:
            with open(cache_file, encoding="utf-8") as f:
                data = json.load(f)
        except Exception as exc:
            failed.append((cache_file.name, str(exc)))
            log.warning("Failed → %s | %s", cache_file.name, exc)
        else:
            results.append(data)

    print()
    print(banner)
    print(f"Loaded {len(results)} cache files")
    print(f"Failed {len(failed)} files")
    print(banner)

    if not results:
        # Same policy as a missing cache folder: nothing to rebuild, so do
        # not overwrite whatever outputs are already on disk.
        print()
        print("No cache files could be loaded; nothing was written.")
        return

    save_outputs(results)

    print()
    print(banner)
    print("OUTPUT RECOVERY COMPLETE")
    print(f"Outputs saved to: {OUTPUT_DIR}")
    print(banner)
| # ============================================================================= | |
# Run the recovery only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()