Spaces:

vvinayakkkkk
/

topic-modelling

Running

App Files Files Community

topic-modelling / methodology_save.py

vvinayakkk

Initial clean commit with LFS

a1d17f8 20 days ago

raw

history blame contribute delete

7.2 kB

	#!/usr/bin/env python3

	"""
	===============================================================================
	CACHE RECOVERY + OUTPUT REGENERATOR
	-------------------------------------------------------------------------------

	USE THIS FILE AFTER EXTRACTION CRASHES.

	This script:
	- DOES NOT call APIs
	- DOES NOT rerun PDFs
	- ONLY loads cached JSON files
	- rebuilds all final outputs

	OUTPUTS:
	- computational_techniques_master.xlsx
	- computational_techniques_master.json
	- technique_frequency_summary.xlsx
	- raw_extraction_audit.json

	===============================================================================
	"""

	# =============================================================================
	# IMPORTS
	# =============================================================================

	import json
	import logging
	from pathlib import Path

	import pandas as pd

	# =============================================================================
	# LOGGING
	# =============================================================================

	# Console logging for the whole script: clock time, padded level name,
	# then the message, separated by box-drawing bars.
	_LOG_FORMAT = "%(asctime)s │ %(levelname)-7s │ %(message)s"
	_LOG_DATE_FORMAT = "%H:%M:%S"

	logging.basicConfig(
	    datefmt=_LOG_DATE_FORMAT,
	    format=_LOG_FORMAT,
	    level=logging.INFO,
	)

	# Module-level logger shared by every function in this file.
	log = logging.getLogger(__name__)

	# =============================================================================
	# PATHS
	# =============================================================================

	# All locations are derived from the directory that contains this script.
	SCRIPT_DIR = Path(__file__).parent

	# Folder that receives every regenerated output file.
	OUTPUT_DIR = SCRIPT_DIR.joinpath("methodology_output")

	# Folder holding the cached JSON files that main() reads.
	CACHE_DIR = OUTPUT_DIR.joinpath("cache_json")

	# =============================================================================
	# SAVE OUTPUTS
	# =============================================================================


	# Column order of the master table (matches the keys of every clean row).
	_MASTER_COLUMNS = [
	    "Paper_Title",
	    "Technique",
	    "Detailed_Technique",
	    "Type",
	    "Software",
	    "Evidence",
	    "Source_File",
	]


	def _cell_text(value):
	    """Return *value* as a plain string suitable for one table cell.

	    Cached JSON may hold null, lists or nested objects where text is
	    expected. Unhashable cells make ``DataFrame.drop_duplicates`` raise and
	    NA group keys are dropped by ``groupby``, so everything is flattened to
	    a string up front.
	    """
	    if value is None:
	        return ""
	    if isinstance(value, str):
	        return value
	    if isinstance(value, (list, tuple)):
	        return "; ".join(_cell_text(item) for item in value)
	    if isinstance(value, dict):
	        return json.dumps(value, ensure_ascii=False, sort_keys=True)
	    return str(value)


	def _technique_rows(results):
	    """Flatten cached records into one row dict per technique entry."""
	    rows = []

	    for result in results:

	        # A cache file may contain any JSON value; only objects are usable.
	        if not isinstance(result, dict):
	            log.warning(
	                "Skipped cache record of type %s (expected an object)",
	                type(result).__name__,
	            )
	            continue

	        parsed = result.get("parsed_output")
	        if not isinstance(parsed, dict):
	            parsed = {}

	        techniques = parsed.get("computational_techniques")
	        if not isinstance(techniques, list):
	            continue

	        # Prefer the parsed title; fall back to the record-level title when
	        # the parsed one is missing, null or empty.
	        title = _cell_text(
	            parsed.get("paper_title") or result.get("paper_title")
	        )
	        source = _cell_text(result.get("source_file"))

	        skipped = 0

	        for entry in techniques:

	            if not isinstance(entry, dict):
	                skipped += 1
	                continue

	            rows.append({
	                "Paper_Title": title,
	                "Technique": _cell_text(entry.get("technique")),
	                "Detailed_Technique": _cell_text(
	                    entry.get("detailed_technique")
	                ),
	                "Type": _cell_text(entry.get("type")),
	                "Software": _cell_text(entry.get("software")),
	                "Evidence": _cell_text(entry.get("evidence")),
	                "Source_File": source,
	            })

	        if skipped:
	            log.warning(
	                "Skipped %d malformed technique entries in %s",
	                skipped,
	                source or "<unknown source>",
	            )

	    return rows


	def save_outputs(results):
	    """Write all final output files for the given cached extraction records.

	    Files written into ``OUTPUT_DIR`` (created if missing):

	    - ``raw_extraction_audit.json``: ``results`` dumped unchanged.
	    - ``computational_techniques_master.xlsx``: one row per technique,
	      deduplicated and sorted by paper title, technique and detailed
	      technique (sheet ``Techniques``).
	    - ``computational_techniques_master.json``: the same rows in cache
	      order, not deduplicated.
	    - ``technique_frequency_summary.xlsx``: row counts per
	      (technique, detailed technique) pair, most frequent first (sheet
	      ``Summary``). Only written when at least one technique row exists.

	    Args:
	        results: list of records loaded from the cache JSON files. Each
	            record is read as a dict with optional keys ``parsed_output``,
	            ``paper_title`` and ``source_file``; ``parsed_output`` is read
	            as a dict with optional keys ``paper_title`` and
	            ``computational_techniques`` (a list of dicts). Records or
	            entries of any other shape are skipped instead of raising.
	    """
	    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

	    # ---------------------------------------------------------------------
	    # RAW AUDIT JSON
	    # ---------------------------------------------------------------------

	    raw_path = OUTPUT_DIR / "raw_extraction_audit.json"

	    with open(raw_path, "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=2, ensure_ascii=False)

	    log.info("Saved → %s", raw_path)

	    # ---------------------------------------------------------------------
	    # CLEAN TABLE
	    # ---------------------------------------------------------------------

	    clean_rows = _technique_rows(results)

	    # Explicit columns keep the header row even when there are no rows.
	    df = pd.DataFrame(clean_rows, columns=_MASTER_COLUMNS)

	    if not df.empty:
	        df = df.drop_duplicates().sort_values(
	            by=["Paper_Title", "Technique", "Detailed_Technique"]
	        )

	    # ---------------------------------------------------------------------
	    # MASTER EXCEL
	    # ---------------------------------------------------------------------

	    excel_path = OUTPUT_DIR / "computational_techniques_master.xlsx"

	    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
	        df.to_excel(writer, index=False, sheet_name="Techniques")

	    log.info("Saved → %s", excel_path)

	    # ---------------------------------------------------------------------
	    # MASTER JSON
	    # ---------------------------------------------------------------------

	    json_path = OUTPUT_DIR / "computational_techniques_master.json"

	    # The JSON keeps every row in cache order (no dedup / sort).
	    with open(json_path, "w", encoding="utf-8") as f:
	        json.dump(clean_rows, f, indent=2, ensure_ascii=False)

	    log.info("Saved → %s", json_path)

	    # ---------------------------------------------------------------------
	    # FREQUENCY SUMMARY
	    # ---------------------------------------------------------------------

	    if df.empty:
	        log.warning("No technique rows found; frequency summary not written")
	        return

	    summary = (
	        df.groupby(["Technique", "Detailed_Technique"])
	        .size()
	        .reset_index(name="Frequency")
	        # Stable sort: ties keep the alphabetical order produced by groupby.
	        .sort_values(by="Frequency", ascending=False, kind="mergesort")
	    )

	    summary_path = OUTPUT_DIR / "technique_frequency_summary.xlsx"

	    with pd.ExcelWriter(summary_path, engine="openpyxl") as writer:
	        summary.to_excel(writer, index=False, sheet_name="Summary")

	    log.info("Saved → %s", summary_path)

	# =============================================================================
	# MAIN
	# =============================================================================


	def main():
	    """Load every cached JSON file and regenerate the final outputs.

	    Reads all ``*.json`` files in ``CACHE_DIR`` in sorted order, collects
	    the ones that parse, and passes them to ``save_outputs``. Files that
	    cannot be read or parsed are logged and counted, never fatal.

	    Returns early, writing nothing, when the cache folder is missing or
	    when no cache file could be loaded, so existing outputs are not
	    replaced by empty ones.
	    """
	    banner = "=" * 70

	    # is_dir() also rejects a plain file that happens to carry this name.
	    if not CACHE_DIR.is_dir():
	        print()
	        print("Cache folder not found:")
	        print(CACHE_DIR)
	        return

	    cache_files = sorted(CACHE_DIR.glob("*.json"))

	    print()
	    print(banner)
	    print(f"Found {len(cache_files)} cached files")
	    print(banner)
	    print()

	    results = []
	    failed = []

	    for cf in cache_files:
	        try:
	            with open(cf, encoding="utf-8") as f:
	                data = json.load(f)
	        except Exception as e:
	            # Deliberately broad: one unreadable or corrupt cache file must
	            # not abort the recovery of all the others.
	            failed.append((cf.name, str(e)))
	            log.warning("Failed → %s | %s", cf.name, e)
	        else:
	            results.append(data)

	    print()
	    print(banner)
	    print(f"Loaded {len(results)} cache files")
	    print(f"Failed {len(failed)} files")
	    print(banner)

	    if not results:
	        # Nothing usable was loaded; keep whatever outputs already exist.
	        print()
	        print("No cache files could be loaded; no outputs were written.")
	        return

	    save_outputs(results)

	    print()
	    print(banner)
	    print("OUTPUT RECOVERY COMPLETE")
	    print(f"Outputs saved to: {OUTPUT_DIR}")
	    print(banner)

	# =============================================================================

	# Run the recovery only when executed as a script, not when imported.
	if __name__ == "__main__":
	    raise SystemExit(main())