import os
import json
import re
from typing import Dict, Any, List

import gradio as gr
from docx import Document
from deepdiff import DeepDiff
import mysql.connector

# -----------------------------
# CONFIG: UNIVERSITY MAPPING
# -----------------------------
# Maps the exact heading text used in the handbook .docx to the DB university_id.
UNIVERSITY_ID_MAP = {
    "Indiana University of Pennsylvania (IUP)": 1,
    "Missouri State University": 2,
    "University Of Kentucky (UK)": 3,
    "University of Louisville (UofL)": 4,
    "University of Delaware (UD)": 6,
    "Grand Valley State University": 7,
    "Quinnipiac University": 9,
    "William Jessup University": 10,
    "Wilkes University": 14,
    "University of South Dakota (USD)": 16,
}

# -----------------------------
# DB CONNECTION HELPERS
# -----------------------------

def get_db_connection():
    """Open a new MySQL connection configured via DB_* environment variables."""
    return mysql.connector.connect(
        host=os.getenv("DB_HOST", "localhost"),
        port=int(os.getenv("DB_PORT", "3306")),
        user=os.getenv("DB_USER", "root"),
        password=os.getenv("DB_PASSWORD", ""),
        database=os.getenv("DB_NAME", ""),
    )


def fetch_section_json(university_id: int, section_key: str):
    """Return the stored section JSON as a dict, or None if missing/unparseable.

    The cursor is closed in its own ``finally`` so a failure while creating or
    using it can no longer leave ``cursor`` unbound in cleanup.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                """
                SELECT section_json
                FROM university_handbook_sections
                WHERE university_id=%s AND section_key=%s
                LIMIT 1
                """,
                (university_id, section_key),
            )
            row = cursor.fetchone()
        finally:
            cursor.close()
        if not row or not row[0]:
            return None
        try:
            return json.loads(row[0])
        except (TypeError, ValueError):
            # Corrupt/legacy rows are treated as "no stored data" rather than
            # crashing the sync run.
            return None
    finally:
        conn.close()


def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]):
    """Overwrite the JSON stored for (university_id, section_key) and commit."""
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        try:
            new_json = json.dumps(new_data, ensure_ascii=False)
            cursor.execute(
                """
                UPDATE university_handbook_sections
                SET section_json=%s
                WHERE university_id=%s AND section_key=%s
                """,
                (new_json, university_id, section_key),
            )
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

# -----------------------------
# DOCX PARSING HELPERS
# -----------------------------

def normalize_text(t: str) -> str:
    """Collapse whitespace runs to single spaces and strip the ends."""
    return " ".join(t.split()).strip()
def split_doc_by_university(doc: "Document") -> Dict[str, List[str]]:
    """Split the handbook paragraphs into per-university blocks.

    A block starts at a paragraph that equals (or starts with) one of the
    names in UNIVERSITY_ID_MAP and runs up to the next such heading.  If the
    same heading appears more than once, the FIRST block is kept — previously
    a repeated heading (e.g. in a footer) silently overwrote the real block
    with the later, shorter segment.
    """
    paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]

    indices: List[tuple] = []
    for i, p in enumerate(paragraphs):
        for uni in UNIVERSITY_ID_MAP.keys():
            if p == uni or p.startswith(uni):
                indices.append((i, uni))
                break  # a paragraph can start at most one block
    indices.sort(key=lambda x: x[0])

    uni_blocks: Dict[str, List[str]] = {}
    for idx, (start, uni_name) in enumerate(indices):
        end = indices[idx + 1][0] if idx + 1 < len(indices) else len(paragraphs)
        # setdefault: keep the first occurrence, never overwrite the real block
        uni_blocks.setdefault(uni_name, paragraphs[start:end])
    return uni_blocks


def parse_overview_block(block: List[str]) -> Dict[str, Any]:
    """
    Parse the top 'overview' section (Founded, Total Students, etc.)
    in a robust way that doesn't assume colons are always present.
    """
    data: Dict[str, Any] = {}

    def after_colon(line: str) -> str:
        """Safely return the part after ':' if present, else empty string."""
        parts = line.split(":", 1)
        return parts[1].strip() if len(parts) > 1 else ""

    for raw_line in block:
        line = raw_line.strip()
        if not line:
            continue

        # Founded
        if line.startswith("Founded"):
            value = after_colon(line) or line  # fallback to entire line
            digits = re.sub(r"[^\d]", "", value)
            if digits:
                data["founded"] = int(digits)

        # Total Students
        elif line.startswith("Total Students"):
            value = after_colon(line) or line
            digits = re.sub(r"[^\d]", "", value)
            if digits:
                data["total_students"] = int(digits)

        # Postgraduate students
        elif "Postgraduate" in line:
            value = after_colon(line) or line
            digits = re.sub(r"[^\d]", "", value)
            data["postgraduate_students"] = int(digits) if digits else None

        # Acceptance rate (kept as the raw string, e.g. "92%")
        elif line.startswith("Acceptance rate"):
            value = after_colon(line) or line
            data["acceptance_rate"] = value

        # Location
        elif line.startswith("Location"):
            value = after_colon(line) or line
            data["location"] = value

        # Tuition (out-of-state yearly) — digits only, so "$13,194" -> 13194
        elif "Tuition" in line:
            value = after_colon(line) or line
            digits = re.sub(r"[^\d]", "", value)
            data["tuition_out_of_state_yearly"] = int(digits) if digits else None

    return data
def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]:
    """Return the non-empty lines strictly after the first line containing
    ``start``, up to (excluding) the first line containing any of ``stops``.

    Returns [] when the start marker never appears.
    """
    out: List[str] = []
    started = False
    for line in block:
        if not started and start in line:
            started = True
            continue
        if started:
            if any(s in line for s in stops):
                break
            if line.strip():
                out.append(line)
    return out


def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
    """Extract the ISP benefits list.

    Returns {} when the section is absent/empty so parse_university_block's
    ``if benefits:`` guard actually skips it — previously an always-truthy
    ``{"benefits": []}`` was returned, which could overwrite stored DB data
    with an empty list.
    """
    lines = extract_between(
        block,
        "Benefits for ISP students at this school",
        ["To qualify for The International Scholars Program"],
    )
    if not lines:
        return {}
    return {"benefits": [normalize_text(l) for l in lines]}


def parse_programs_block(block: List[str]) -> Dict[str, Any]:
    """Parse the qualification/programs table into structured records.

    Each program is expected as: name, designation, entrance exam, then any
    number of career-pathway lines terminated by a "TIER..." funding line.
    Returns {} when no programs were found (see parse_benefits_block note).
    """
    lines = extract_between(
        block,
        "To qualify for The International Scholars Program",
        list(UNIVERSITY_ID_MAP.keys()),
    )

    # Column headers repeated in the doc — drop them before grouping.
    headers = {
        "Program",
        "Designation",
        "Entrance Exam Required",
        "Examples of Career Pathways",
        "Funding Category",
    }
    cleaned = [l for l in lines if l not in headers]

    programs: List[Dict[str, Any]] = []
    i = 0
    while i < len(cleaned):
        # Need at least 4 lines: name, designation, exam, at least one career/tier
        if len(cleaned) - i < 4:
            break
        name = cleaned[i]
        designation = cleaned[i + 1]
        exam = cleaned[i + 2]

        careers: List[str] = []
        j = i + 3
        while j < len(cleaned) and not cleaned[j].startswith("TIER"):
            careers.append(cleaned[j])
            j += 1
        tier = cleaned[j] if j < len(cleaned) else ""

        programs.append(
            {
                "program_name": name,
                "designation": designation,
                "entrance_exam": exam,
                "career_pathways": careers,
                "funding_category": tier,
            }
        )
        i = j + 1

    return {"programs": programs} if programs else {}

# -----------------------------
# HIGH-LEVEL UNIVERSITY PARSER
# -----------------------------

def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Any]:
    """
    Given all lines for a single university block, return a dict with:
    - overview
    - benefits
    - programs

    Empty sub-sections are omitted so the sync step never diffs (or writes)
    empty data over existing DB rows.
    """
    overview = parse_overview_block(block)
    benefits = parse_benefits_block(block)
    programs = parse_programs_block(block)

    result: Dict[str, Any] = {}
    if overview:
        result["overview"] = overview
    if benefits:
        result["benefits"] = benefits
    if programs:
        result["programs"] = programs
    return result
benefits: result["benefits"] = benefits if programs: result["programs"] = programs return result # ----------------------------- # MAIN SYNC LOGIC # ----------------------------- def run_full_sync(docx_file): if docx_file is None: return "No handbook file uploaded." try: # Gradio File object usually has a .name (temp file path) doc = Document(docx_file.name) except Exception as e: return f"Error reading DOCX: {e}" uni_blocks = split_doc_by_university(doc) logs: List[str] = [] updated = 0 for uni_name, uni_id in UNIVERSITY_ID_MAP.items(): block = uni_blocks.get(uni_name) if not block: logs.append(f"[WARN] Missing block: {uni_name}") continue parsed = parse_university_block(uni_name, block) if not parsed: logs.append(f"[WARN] Cannot parse: {uni_name}") continue for key, new_json in parsed.items(): if key not in ("overview", "benefits", "programs"): continue old_json = fetch_section_json(uni_id, key) diff = DeepDiff(old_json or {}, new_json, ignore_order=True) if not diff: logs.append(f"[OK] {uni_name} [{key}] unchanged.") continue try: update_section_json(uni_id, key, new_json) logs.append(f"[UPDATED] {uni_name} [{key}] updated.") updated += 1 except Exception as e: logs.append(f"[ERROR] {uni_name} [{key}]: {e}") logs.append(f"\nTotal sections updated: {updated}") return "\n".join(logs) # ----------------------------- # ISP BRANDING - BASE64 LOGO (ALWAYS VISIBLE) # ----------------------------- # TODO: Replace this with your real SVG base64 LOGO_SRC = "data:image/svg+xml;base64,..." ISP_PRIMARY = "#062A4D" ISP_GOLD = "#D6A229" ISP_BG = "#F5F7FA" CUSTOM_CSS = f""" """ # ----------------------------- # GRADIO UI # ----------------------------- with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo: gr.HTML(CUSTOM_CSS) # Header gr.HTML( f"""