#!/usr/bin/env python3

"""
===============================================================================
CACHE RECOVERY + OUTPUT REGENERATOR
-------------------------------------------------------------------------------

USE THIS FILE AFTER EXTRACTION CRASHES.

This script:
- DOES NOT call APIs
- DOES NOT rerun PDFs
- ONLY loads cached JSON files
- rebuilds all final outputs

OUTPUTS:
- computational_techniques_master.xlsx
- computational_techniques_master.json
- technique_frequency_summary.xlsx
- raw_extraction_audit.json

===============================================================================
"""

# =============================================================================
# IMPORTS
# =============================================================================

import json
import logging
from pathlib import Path

import pandas as pd

# =============================================================================
# LOGGING
# =============================================================================

# Root logger configuration: INFO and above, clock-time stamp, and a level
# name padded to seven characters so the message column lines up.
logging.basicConfig(
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used by the functions below.
log = logging.getLogger(__name__)

# =============================================================================
# PATHS
# =============================================================================

# Directory that contains this script; the other locations are derived from it.
SCRIPT_DIR = Path(__file__).parent

# Destination folder for the regenerated output files.
OUTPUT_DIR = SCRIPT_DIR.joinpath("methodology_output")

# Folder holding the cached JSON files that main() reads back.
CACHE_DIR = OUTPUT_DIR.joinpath("cache_json")

# =============================================================================
# SAVE OUTPUTS
# =============================================================================


# Column order of the master table; also the keys of every clean row.
_TABLE_COLUMNS = (
    "Paper_Title",
    "Technique",
    "Detailed_Technique",
    "Type",
    "Software",
    "Evidence",
    "Source_File",
)

# ASCII control characters that openpyxl rejects with IllegalCharacterError
# (everything below 0x20 except tab, line feed and carriage return).
# Mapping them to None makes str.translate() delete them.
_XLSX_ILLEGAL_CHARS = dict.fromkeys([*range(0, 9), 11, 12, *range(14, 32)])


def _cell_text(value):
    """Return *value* flattened to a plain string usable as a spreadsheet cell.

    ``None`` becomes ``""``, lists/tuples are joined with ``"; "``, dicts are
    rendered as compact JSON and anything else goes through ``str()``.  The
    result is always hashable and mutually comparable, which is what
    ``drop_duplicates``, ``sort_values`` and ``groupby`` require.
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return "; ".join(_cell_text(item) for item in value)
    if isinstance(value, dict):
        return json.dumps(value, ensure_ascii=False, sort_keys=True)
    return str(value).translate(_XLSX_ILLEGAL_CHARS)


def _collect_rows(results):
    """Build one row dict per extracted technique.

    Returns:
        A ``(rows, skipped)`` tuple.  ``rows`` keeps the cached values as they
        were loaded (lists stay lists); ``skipped`` counts records or
        technique entries whose shape made them unusable.
    """
    rows = []
    skipped = 0

    for result in results:

        # A cache file may hold any JSON value; only objects carry fields.
        if not isinstance(result, dict):
            skipped += 1
            continue

        parsed = result.get("parsed_output")

        # Missing, null or non-object (e.g. raw text) parsed output
        # contributes no techniques.
        if not isinstance(parsed, dict):
            parsed = {}

        techniques = parsed.get("computational_techniques") or []

        if not isinstance(techniques, list):
            skipped += 1
            continue

        # Fall back to the record-level title when the parsed one is
        # missing *or* empty, not only when the key is absent.
        title = parsed.get("paper_title") or result.get("paper_title") or ""
        source_file = result.get("source_file", "")

        for tech in techniques:

            if not isinstance(tech, dict):
                skipped += 1
                continue

            rows.append({
                "Paper_Title": title,
                "Technique": tech.get("technique", ""),
                "Detailed_Technique": tech.get("detailed_technique", ""),
                "Type": tech.get("type", ""),
                "Software": tech.get("software", ""),
                "Evidence": tech.get("evidence", ""),
                "Source_File": source_file,
            })

    return rows, skipped


def save_outputs(results):
    """Write every final output file from the loaded cache records.

    Files written into ``OUTPUT_DIR`` (created if missing):

    - ``raw_extraction_audit.json``: ``results`` dumped unchanged.
    - ``computational_techniques_master.xlsx``: one row per technique,
      de-duplicated and sorted by paper title, technique and detailed
      technique.
    - ``computational_techniques_master.json``: the same rows before
      de-duplication/sorting, with their original value types.
    - ``technique_frequency_summary.xlsx``: row count per
      (technique, detailed technique) pair, most frequent first.  Only
      written when at least one technique row exists.

    Malformed records (non-dict records, non-dict ``parsed_output``,
    non-list technique collections, non-dict technique entries) are skipped
    for the tables instead of aborting the run; they remain visible in the
    raw audit file.

    Args:
        results: List of JSON-decoded cache records.
    """

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # -------------------------------------------------------------------------
    # RAW AUDIT JSON
    # -------------------------------------------------------------------------

    raw_path = OUTPUT_DIR / "raw_extraction_audit.json"

    with open(raw_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    log.info("Saved → %s", raw_path)

    # -------------------------------------------------------------------------
    # CLEAN TABLE
    # -------------------------------------------------------------------------

    clean_rows, skipped = _collect_rows(results)

    if skipped:
        log.warning("Skipped %d malformed cache entries", skipped)

    # -------------------------------------------------------------------------
    # DATAFRAME
    # -------------------------------------------------------------------------

    # Cells are normalised to strings so that list-valued fields do not make
    # drop_duplicates()/groupby() fail with "unhashable type", and so that
    # null techniques are counted instead of being dropped by groupby().
    # Passing explicit columns keeps the header row even with zero rows.
    df = pd.DataFrame(
        [
            {col: _cell_text(row[col]) for col in _TABLE_COLUMNS}
            for row in clean_rows
        ],
        columns=list(_TABLE_COLUMNS),
    )

    if not df.empty:

        df = df.drop_duplicates().sort_values(
            by=["Paper_Title", "Technique", "Detailed_Technique"]
        )

    # -------------------------------------------------------------------------
    # MASTER EXCEL
    # -------------------------------------------------------------------------

    excel_path = OUTPUT_DIR / "computational_techniques_master.xlsx"

    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="Techniques")

    log.info("Saved → %s", excel_path)

    # -------------------------------------------------------------------------
    # MASTER JSON
    # -------------------------------------------------------------------------

    json_path = OUTPUT_DIR / "computational_techniques_master.json"

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(clean_rows, f, indent=2, ensure_ascii=False)

    log.info("Saved → %s", json_path)

    # -------------------------------------------------------------------------
    # FREQUENCY SUMMARY
    # -------------------------------------------------------------------------

    if df.empty:
        return

    summary = (
        df.groupby(["Technique", "Detailed_Technique"])
        .size()
        .reset_index(name="Frequency")
        .sort_values(by="Frequency", ascending=False)
    )

    summary_path = OUTPUT_DIR / "technique_frequency_summary.xlsx"

    with pd.ExcelWriter(summary_path, engine="openpyxl") as writer:
        summary.to_excel(writer, index=False, sheet_name="Summary")

    log.info("Saved → %s", summary_path)

# =============================================================================
# MAIN
# =============================================================================


def main():
    """Load every cached JSON file and regenerate the outputs from them.

    Reads ``*.json`` from ``CACHE_DIR`` in sorted order.  A file that cannot
    be read or decoded is logged and counted as failed; the remaining files
    are still processed.  If the cache folder is missing, or no file could
    be loaded at all, nothing is written so that outputs from an earlier
    run are not replaced by empty ones.
    """

    # is_dir() rather than exists(): a stray regular file with the same
    # name is not a usable cache folder either.
    if not CACHE_DIR.is_dir():

        print()
        print("Cache folder not found:")
        print(CACHE_DIR)
        return

    cache_files = sorted(CACHE_DIR.glob("*.json"))

    print()
    print("=" * 70)
    print(f"Found {len(cache_files)} cached files")
    print("=" * 70)
    print()

    results = []

    failed = []

    for cf in cache_files:

        try:

            with open(cf, encoding="utf-8") as f:
                data = json.load(f)

        # OSError: unreadable file.  ValueError: covers JSONDecodeError and
        # UnicodeDecodeError.  RecursionError: pathologically nested JSON.
        except (OSError, ValueError, RecursionError) as e:

            failed.append((cf.name, str(e)))

            log.warning("Failed → %s | %s", cf.name, e)

        else:

            results.append(data)

    print()
    print("=" * 70)
    print(f"Loaded {len(results)} cache files")
    print(f"Failed {len(failed)} files")
    print("=" * 70)

    if not results:

        # Regenerating from nothing would only overwrite existing outputs
        # with empty files and then claim success.
        print()
        print("No cache files could be loaded - existing outputs left untouched.")
        return

    save_outputs(results)

    print()
    print("=" * 70)
    print("OUTPUT RECOVERY COMPLETE")
    print(f"Outputs saved to: {OUTPUT_DIR}")
    print("=" * 70)

# =============================================================================

# Run the recovery only when this file is executed as a script, so that
# importing the module has no effect beyond the definitions above.
if __name__ == "__main__":
    main()