docx-json-sync / app.py
internationalscholarsprogram's picture
Make overview parsing robust to missing colons
e6a016f
import os
import json
import re
from typing import Dict, Any, List
import gradio as gr
from docx import Document
from deepdiff import DeepDiff
import mysql.connector
# -----------------------------
# CONFIG: UNIVERSITY MAPPING
# -----------------------------
# Maps the exact university heading text as it appears in the handbook DOCX
# to its primary-key id in the `university_handbook_sections` table.
# NOTE: matching in split_doc_by_university is prefix-based on these strings,
# so the names here must match the document headings verbatim.
# Gaps in the ids (5, 8, 11-13, 15) presumably correspond to universities not
# covered by this sync — TODO confirm against the DB.
UNIVERSITY_ID_MAP = {
    "Indiana University of Pennsylvania (IUP)": 1,
    "Missouri State University": 2,
    "University Of Kentucky (UK)": 3,
    "University of Louisville (UofL)": 4,
    "University of Delaware (UD)": 6,
    "Grand Valley State University": 7,
    "Quinnipiac University": 9,
    "William Jessup University": 10,
    "Wilkes University": 14,
    "University of South Dakota (USD)": 16,
}
# -----------------------------
# DB CONNECTION HELPERS
# -----------------------------
def get_db_connection():
    """Open and return a new MySQL connection configured from DB_* env vars.

    Falls back to localhost:3306 with user ``root`` and empty password /
    database name when the variables are unset. The caller owns the
    connection and is responsible for closing it.
    """
    config = {
        "host": os.getenv("DB_HOST", "localhost"),
        "port": int(os.getenv("DB_PORT", "3306")),
        "user": os.getenv("DB_USER", "root"),
        "password": os.getenv("DB_PASSWORD", ""),
        "database": os.getenv("DB_NAME", ""),
    }
    return mysql.connector.connect(**config)
def fetch_section_json(university_id: int, section_key: str):
    """Fetch the stored JSON for one handbook section.

    Args:
        university_id: Primary key from UNIVERSITY_ID_MAP.
        section_key: One of "overview" / "benefits" / "programs".

    Returns:
        The decoded JSON object, or None when the row is missing, the
        column is empty/NULL, or the stored text is not valid JSON.
    """
    conn = get_db_connection()
    cursor = None  # so finally can tell whether conn.cursor() succeeded
    try:
        cursor = conn.cursor()
        cursor.execute(
            """
            SELECT section_json
            FROM university_handbook_sections
            WHERE university_id=%s AND section_key=%s
            LIMIT 1
            """,
            (university_id, section_key),
        )
        row = cursor.fetchone()
        if not row or not row[0]:
            return None
        try:
            return json.loads(row[0])
        except (TypeError, ValueError):
            # Corrupt / non-JSON payloads are treated the same as "no data".
            return None
    finally:
        # Original code unconditionally called cursor.close(); if
        # conn.cursor() itself raised, that produced a NameError that
        # masked the real exception. Guard against the unset case.
        if cursor is not None:
            cursor.close()
        conn.close()
def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]):
    """Overwrite one section's JSON payload for a university and commit.

    Args:
        university_id: Primary key from UNIVERSITY_ID_MAP.
        section_key: One of "overview" / "benefits" / "programs".
        new_data: JSON-serializable replacement content for the section.

    Note: performs an UPDATE only — if no row exists for the pair, nothing
    is written (no upsert).
    """
    conn = get_db_connection()
    cursor = None  # so finally can tell whether conn.cursor() succeeded
    try:
        cursor = conn.cursor()
        # ensure_ascii=False keeps non-ASCII handbook text readable in the DB.
        new_json = json.dumps(new_data, ensure_ascii=False)
        cursor.execute(
            """
            UPDATE university_handbook_sections
            SET section_json=%s
            WHERE university_id=%s AND section_key=%s
            """,
            (new_json, university_id, section_key),
        )
        conn.commit()
    finally:
        # Original code unconditionally called cursor.close(); if
        # conn.cursor() itself raised, that produced a NameError that
        # masked the real exception. Guard against the unset case.
        if cursor is not None:
            cursor.close()
        conn.close()
# -----------------------------
# DOCX PARSING HELPERS
# -----------------------------
def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to a single space and trim.

    ``str.split()`` with no argument already drops leading/trailing
    whitespace, so the join result needs no further stripping.
    """
    return " ".join(t.split())
def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
    """Split the handbook into per-university paragraph blocks.

    A block begins at a paragraph that starts with a known university name
    (see UNIVERSITY_ID_MAP) and runs up to — but not including — the next
    such heading, or the end of the document.

    Returns:
        Mapping of university name -> its normalized, non-empty paragraphs
        (heading included). If a heading appears twice, the later block wins.
    """
    paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]

    # Locate every heading paragraph, in document order.
    headings: List[tuple[int, str]] = []
    for pos, text in enumerate(paragraphs):
        for uni in UNIVERSITY_ID_MAP:
            # startswith also covers exact equality.
            if text.startswith(uni):
                headings.append((pos, uni))
    headings.sort(key=lambda item: item[0])

    blocks: Dict[str, List[str]] = {}
    for n, (start, name) in enumerate(headings):
        stop = headings[n + 1][0] if n + 1 < len(headings) else len(paragraphs)
        blocks[name] = paragraphs[start:stop]
    return blocks
def parse_overview_block(block: List[str]) -> Dict[str, Any]:
    """Parse the top 'overview' section of a university block.

    Recognizes Founded, Total Students, Postgraduate, Acceptance rate,
    Location and Tuition lines. Tolerates lines with no colon by falling
    back to scanning the whole line.

    Returns:
        Dict with any of: founded, total_students, postgraduate_students,
        acceptance_rate, location, tuition_out_of_state_yearly. Numeric
        fields are ints extracted from all digits on the line; the
        postgraduate/tuition keys may be explicitly None when no digits
        are found.
    """
    result: Dict[str, Any] = {}

    def value_of(line: str) -> str:
        """Text after the first ':' if one exists, else empty string."""
        _, sep, rest = line.partition(":")
        return rest.strip() if sep else ""

    def digits_in(text: str) -> str:
        """All digit characters of *text*, concatenated."""
        return re.sub(r"[^\d]", "", text)

    for raw in block:
        line = raw.strip()
        if not line:
            continue
        if line.startswith("Founded"):
            # Fall back to the whole line when there is no colon / value.
            num = digits_in(value_of(line) or line)
            if num:
                result["founded"] = int(num)
        elif line.startswith("Total Students"):
            num = digits_in(value_of(line) or line)
            if num:
                result["total_students"] = int(num)
        elif "Postgraduate" in line:
            num = digits_in(value_of(line) or line)
            result["postgraduate_students"] = int(num) if num else None
        elif line.startswith("Acceptance rate"):
            result["acceptance_rate"] = value_of(line) or line
        elif line.startswith("Location"):
            result["location"] = value_of(line) or line
        elif "Tuition" in line:
            num = digits_in(value_of(line) or line)
            result["tuition_out_of_state_yearly"] = int(num) if num else None
    return result
def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]:
    """Collect lines strictly between a start marker and the first stop marker.

    Scanning begins after the first line containing *start* (that line is
    excluded) and ends before the first subsequent line containing any of
    the *stops* substrings. Blank lines inside the span are dropped.
    """
    collected: List[str] = []
    in_section = False
    for line in block:
        if not in_section:
            # Still searching for the start marker; the marker line itself
            # is not part of the output.
            if start in line:
                in_section = True
            continue
        if any(stop in line for stop in stops):
            break
        if line.strip():
            collected.append(line)
    return collected
def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
    """Extract the ISP benefits bullet lines for one university.

    The benefits run from the "Benefits for ISP students at this school"
    heading up to the qualification heading that starts the programs table.

    Returns:
        {"benefits": [...]} — the list may be empty if the markers are absent.
    """
    section = extract_between(
        block,
        "Benefits for ISP students at this school",
        ["To qualify for The International Scholars Program"],
    )
    return {"benefits": [normalize_text(line) for line in section]}
def parse_programs_block(block: List[str]) -> Dict[str, Any]:
    """Parse the flattened programs table for one university.

    The table lives between the "To qualify for ..." heading and the next
    university heading. After dropping the column-header lines, each row is
    a run of lines: program name, designation, entrance exam, one or more
    career-pathway lines, then a funding line beginning with "TIER".

    Returns:
        {"programs": [...]} where each entry has program_name, designation,
        entrance_exam, career_pathways (list) and funding_category ("" when
        no TIER line terminates the row).
    """
    section = extract_between(
        block,
        "To qualify for The International Scholars Program",
        list(UNIVERSITY_ID_MAP.keys()),
    )
    # Column headers repeated in the doc; not data.
    table_headers = {
        "Program",
        "Designation",
        "Entrance Exam Required",
        "Examples of Career Pathways",
        "Funding Category",
    }
    rows = [line for line in section if line not in table_headers]

    programs: List[Dict[str, Any]] = []
    total = len(rows)
    pos = 0
    # A row needs at least: name, designation, exam, and one more line.
    while total - pos >= 4:
        name, designation, exam = rows[pos], rows[pos + 1], rows[pos + 2]
        cursor = pos + 3
        pathways: List[str] = []
        # Everything up to the TIER line is a career pathway.
        while cursor < total and not rows[cursor].startswith("TIER"):
            pathways.append(rows[cursor])
            cursor += 1
        funding = rows[cursor] if cursor < total else ""
        programs.append(
            {
                "program_name": name,
                "designation": designation,
                "entrance_exam": exam,
                "career_pathways": pathways,
                "funding_category": funding,
            }
        )
        pos = cursor + 1
    return {"programs": programs}
# -----------------------------
# HIGH-LEVEL UNIVERSITY PARSER
# -----------------------------
def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Any]:
    """Parse one university's lines into overview/benefits/programs sections.

    Args:
        uni_name: University display name (kept for interface compatibility;
            not used in parsing).
        block: Normalized paragraph lines for this university.

    Returns:
        Dict with any of the keys "overview", "benefits", "programs";
        sections that parsed empty are omitted so the sync step does not
        overwrite existing DB content with blanks.
    """
    result: Dict[str, Any] = {}

    overview = parse_overview_block(block)
    if overview:  # plain field dict; empty means nothing was recognized
        result["overview"] = overview

    # BUG FIX: the previous guards tested the wrapper dicts (`if benefits:`),
    # which are ALWAYS truthy because both parsers return a one-key dict —
    # empty sections were therefore always included. Test the content lists.
    benefits = parse_benefits_block(block)
    if benefits.get("benefits"):
        result["benefits"] = benefits

    programs = parse_programs_block(block)
    if programs.get("programs"):
        result["programs"] = programs

    return result
# -----------------------------
# MAIN SYNC LOGIC
# -----------------------------
def run_full_sync(docx_file):
    """Parse the uploaded handbook and sync changed sections to the DB.

    Args:
        docx_file: The value produced by the gr.File input — depending on
            Gradio version/config this is either a filepath string or a
            temp-file object exposing ``.name``.

    Returns:
        A human-readable log string for the UI textbox.
    """
    if docx_file is None:
        return "No handbook file uploaded."
    # BUG FIX: newer Gradio versions deliver a plain filepath string, on
    # which the old `docx_file.name` raised AttributeError. Accept both.
    path = docx_file if isinstance(docx_file, str) else getattr(docx_file, "name", docx_file)
    try:
        doc = Document(path)
    except Exception as e:
        return f"Error reading DOCX: {e}"
    uni_blocks = split_doc_by_university(doc)
    logs: List[str] = []
    updated = 0
    for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
        block = uni_blocks.get(uni_name)
        if not block:
            logs.append(f"[WARN] Missing block: {uni_name}")
            continue
        parsed = parse_university_block(uni_name, block)
        if not parsed:
            logs.append(f"[WARN] Cannot parse: {uni_name}")
            continue
        for key, new_json in parsed.items():
            # Only the three known section keys are synced.
            if key not in ("overview", "benefits", "programs"):
                continue
            old_json = fetch_section_json(uni_id, key)
            # ignore_order: list ordering differences are not real changes.
            diff = DeepDiff(old_json or {}, new_json, ignore_order=True)
            if not diff:
                logs.append(f"[OK] {uni_name} [{key}] unchanged.")
                continue
            try:
                update_section_json(uni_id, key, new_json)
                logs.append(f"[UPDATED] {uni_name} [{key}] updated.")
                updated += 1
            except Exception as e:
                # Keep syncing the remaining sections; surface the error in the log.
                logs.append(f"[ERROR] {uni_name} [{key}]: {e}")
    logs.append(f"\nTotal sections updated: {updated}")
    return "\n".join(logs)
# -----------------------------
# ISP BRANDING - BASE64 LOGO (ALWAYS VISIBLE)
# -----------------------------
# TODO: Replace this with your real SVG base64
LOGO_SRC = "data:image/svg+xml;base64,..."
ISP_PRIMARY = "#062A4D"
ISP_GOLD = "#D6A229"
ISP_BG = "#F5F7FA"
CUSTOM_CSS = f"""
<style>
#isp-header {{
background: {ISP_PRIMARY};
padding: 20px;
border-radius: 10px;
display: flex;
align-items: center;
gap: 20px;
}}
#isp-header h1 {{
color: white;
margin: 0;
font-size: 28px;
}}
#isp-logo {{
height: 60px;
}}
button {{
background-color: {ISP_GOLD} !important;
color: black !important;
border-radius: 8px !important;
font-weight: bold !important;
}}
.gradio-container {{
background: {ISP_BG} !important;
}}
</style>
"""
# -----------------------------
# GRADIO UI
# -----------------------------
# Build the Gradio UI: branded header, upload control, log output, and a
# single button wired to run_full_sync.
with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
    # Inject brand CSS first so it applies to the rest of the page.
    gr.HTML(CUSTOM_CSS)
    # Header
    gr.HTML(
        f"""
        <div id='isp-header'>
            <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
            <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
        </div>
        """
    )
    gr.Markdown(
        """
        Upload the official ISP Handbook (.docx).
        This tool will compare, detect differences, and update changed sections.
        ---
        """
    )
    file_input = gr.File(label="Upload Handbook DOCX", file_types=[".docx"])
    log_output = gr.Textbox(label="Sync Log", lines=30)
    run_btn = gr.Button("Run Full Sync")
    # Click handler: file in, log text out.
    run_btn.click(run_full_sync, inputs=file_input, outputs=log_output)
if __name__ == "__main__":
    demo.launch()