Commit 92080b9
Parent(s): a81d369

Fix Gradio CSS and apply ISP branding
app.py CHANGED

@@ -23,7 +23,6 @@ UNIVERSITY_ID_MAP = {
     "William Jessup University": 10,
     "Wilkes University": 14,
     "University of South Dakota (USD)": 16,
-    # Extend as you add more rows to university_handbook_sections
 }
 
 
@@ -31,11 +30,6 @@ UNIVERSITY_ID_MAP = {
 # DB CONNECTION HELPERS
 # -----------------------------
 def get_db_connection():
-    """
-    Create and return a MySQL connection using environment variables.
-    Set these in HF Space secrets:
-    DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME
-    """
     return mysql.connector.connect(
         host=os.getenv("DB_HOST", "localhost"),
         port=int(os.getenv("DB_PORT", "3306")),
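Note: the docstring deleted above was the only place the required environment variables were listed, so for reference, get_db_connection() reads DB_HOST, DB_PORT, DB_USER, DB_PASSWORD and DB_NAME (set as HF Space secrets). A minimal sketch of the pattern follows; the user, password and database keyword arguments mirror the host/port lines shown in the hunk but are assumptions here, since the diff truncates the helper after the port line, and the values are illustrative only.

    import os
    import mysql.connector

    # Illustrative values only; in the Space these come from HF secrets.
    os.environ.setdefault("DB_HOST", "db.example.internal")
    os.environ.setdefault("DB_NAME", "isp_handbook")

    conn = mysql.connector.connect(
        host=os.getenv("DB_HOST", "localhost"),
        port=int(os.getenv("DB_PORT", "3306")),
        user=os.getenv("DB_USER", "root"),
        password=os.getenv("DB_PASSWORD", ""),
        database=os.getenv("DB_NAME", ""),
    )
    conn.close()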
@@ -46,29 +40,21 @@ def get_db_connection():
 
 
 def fetch_section_json(university_id: int, section_key: str):
-    """
-    Fetch existing JSON for given university_id + section_key from DB.
-    Returns parsed dict or None if not found.
-    """
     conn = get_db_connection()
     try:
         cursor = conn.cursor()
-        query = """
+        cursor.execute("""
             SELECT section_json
             FROM university_handbook_sections
-            WHERE university_id = %s AND section_key = %s
+            WHERE university_id=%s AND section_key=%s
             LIMIT 1
-        """
-        cursor.execute(query, (university_id, section_key))
+        """, (university_id, section_key))
         row = cursor.fetchone()
-        if not row:
-            return None
-        if not row[0]:
+        if not row or not row[0]:
             return None
         try:
             return json.loads(row[0])
-        except Exception:
-            # JSON malformed in DB – treat as None to force overwrite
+        except:
             return None
     finally:
         cursor.close()
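Note: the rewrite drops the intermediate query variable and passes the SQL straight to cursor.execute() with %s placeholders, which mysql-connector binds from the parameter tuple instead of interpolating strings. A standalone sketch of the same shape; the helper name and the caller-supplied connection are hypothetical:

    def fetch_one_json(conn, university_id, section_key):
        # Placeholders are bound by the driver; no string formatting of SQL.
        cursor = conn.cursor()
        try:
            cursor.execute(
                "SELECT section_json FROM university_handbook_sections "
                "WHERE university_id=%s AND section_key=%s LIMIT 1",
                (university_id, section_key),
            )
            row = cursor.fetchone()
            return row[0] if row and row[0] else None
        finally:
            cursor.close()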
@@ -76,19 +62,15 @@ def fetch_section_json(university_id: int, section_key: str):
 
 
 def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]):
-    """
-    Update section_json in DB for given university_id + section_key.
-    """
     conn = get_db_connection()
     try:
         cursor = conn.cursor()
-        new_json_str = json.dumps(new_data)
-        query = """
+        new_json = json.dumps(new_data, ensure_ascii=False)
+        cursor.execute("""
             UPDATE university_handbook_sections
-            SET section_json = %s
-            WHERE university_id = %s AND section_key = %s
-        """
-        cursor.execute(query, (new_json_str, university_id, section_key))
+            SET section_json=%s
+            WHERE university_id=%s AND section_key=%s
+        """, (new_json, university_id, section_key))
         conn.commit()
     finally:
         cursor.close()
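Note: serializing with ensure_ascii=False stores non-ASCII characters verbatim rather than as \u escapes, and the stored text round-trips through the json.loads() call on the fetch side. A quick check of that behavior; the row values are made up:

    import json

    section = {"university_name": "Université Example", "founded": 1908}  # made-up row
    text = json.dumps(section, ensure_ascii=False)
    assert "Université" in text          # stored verbatim, not as \u00e9
    assert json.loads(text) == section   # matches what fetch_section_json returns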
@@ -98,362 +80,243 @@ def update_section_json(university_id: int, section_key: str, new_data: Dict[str
 # -----------------------------
 # DOCX PARSING HELPERS
 # -----------------------------
-def normalize_text(text):
-    return " ".join(text.split()).strip()
-
+def normalize_text(t): return " ".join(t.split()).strip()
 
-def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
-    """
-    Split the docx into blocks per university name using headings that match
-    the keys in UNIVERSITY_ID_MAP.
-    Returns dict: { "University Name": [list_of_paragraph_texts_in_block] }
-    """
-    paragraphs = [normalize_text(p.text) for p in doc.paragraphs]
-    # Remove empties
-    paragraphs = [p for p in paragraphs if p]
-
+def split_doc_by_university(doc: Document):
+    paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
     indices = []
     for i, p in enumerate(paragraphs):
-        for uni_name in UNIVERSITY_ID_MAP.keys():
-            if p == uni_name or p.startswith(uni_name):
-                indices.append((i, uni_name))
+        for uni in UNIVERSITY_ID_MAP.keys():
+            if p == uni or p.startswith(uni):
+                indices.append((i, uni))
 
-    # Sort by index
     indices.sort(key=lambda x: x[0])
 
-    uni_blocks: Dict[str, List[str]] = {}
-    for idx, (start_i, uni_name) in enumerate(indices):
-        end_i = indices[idx + 1][0] if idx + 1 < len(indices) else len(paragraphs)
-        block = paragraphs[start_i:end_i]
-        uni_blocks[uni_name] = block
-
+    uni_blocks = {}
+    for idx, (start, uni_name) in enumerate(indices):
+        end = indices[idx+1][0] if idx + 1 < len(indices) else len(paragraphs)
+        uni_blocks[uni_name] = paragraphs[start:end]
     return uni_blocks
 
 
-def parse_overview_block(block: List[str]):
-    """
-    Given the full block for a university, extract the overview section as JSON.
-    We look for lines containing 'Founded:', 'Total Students:', etc.
-    """
-    overview = {}
+def parse_overview_block(block: List[str]):
+    data = {}
     for line in block:
         if line.startswith("Founded:"):
-            overview["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
-        elif line.startswith("Total Students"):
-            overview["total_students"] = int(
-                re.sub(r"[^\d]", "", line.split(":", 1)[1])
-            )
-        elif "Postgraduate" in line and "Students" in line:
+            data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
+        elif line.startswith("Total Students"):
+            data["total_students"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
+        elif "Postgraduate" in line:
             digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
-            overview["postgraduate_students"] = int(digits) if digits else None
+            data["postgraduate_students"] = int(digits) if digits else None
         elif line.startswith("Acceptance rate"):
-            overview["acceptance_rate"] = line.split(":", 1)[1].strip()
+            data["acceptance_rate"] = line.split(":", 1)[1].strip()
         elif line.startswith("Location:"):
-            overview["location"] = line.split(":", 1)[1].strip()
-        elif "Tuition" in line:
+            data["location"] = line.split(":", 1)[1].strip()
+        elif "Tuition" in line:
             digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
-            overview["tuition_out_of_state_yearly"] = int(digits) if digits else None
-
-    return overview
+            data["tuition_out_of_state_yearly"] = int(digits) if digits else None
+    return data
 
 
-def extract_between(block: List[str], start_marker: str, stop_markers: List[str]):
-    """
-    Extract lines between a line containing `start_marker` and the first line
-    containing any of `stop_markers`.
-    """
-    started = False
-    buf: List[str] = []
+def extract_between(block, start, stops):
+    out, started = [], False
     for line in block:
-        if not started and start_marker in line:
+        if not started and start in line:
             started = True
             continue
         if started:
-            if any(m in line for m in stop_markers):
+            if any(s in line for s in stops):
                 break
             if line.strip():
-                buf.append(line)
-    return buf
+                out.append(line)
+    return out
 
 
-def parse_benefits_block(block: List[str]):
-    """
-    Benefits are the lines following 'Benefits for ISP students at this school'
-    until 'To qualify for The International Scholars Program' or university change.
-    """
-    benefits_lines = extract_between(
+def parse_benefits_block(block):
+    lines = extract_between(
         block,
-        "Benefits for ISP students at this school",
-        [
-            "To qualify for The International Scholars Program at",
-            "To qualify for The International Scholars Program",
-        ],
+        "Benefits for ISP students at this school",
+        ["To qualify for The International Scholars Program"]
     )
-    benefits = [normalize_text(line) for line in benefits_lines]
-    return {"benefits": benefits}
+    return {"benefits": [normalize_text(l) for l in lines]}
 
 
-def parse_programs_block(block: List[str]):
-    """
-    Parse the 'Program table' portion.
-    """
-    program_lines = extract_between(
+def parse_programs_block(block):
+    lines = extract_between(
         block,
-        "Program table",
-        [
-            "Montclair State University",
-            "Missouri State University",
-            "Indiana University of Pennsylvania",
-            "University of Louisville",
-            "University of Delaware",
-            "Grand Valley State University",
-            "Quinnipiac University",
-            "William Jessup University",
-            "Wilkes University",
-            "University of South Dakota",
-        ],
+        "To qualify for The International Scholars Program",
+        list(UNIVERSITY_ID_MAP.keys())
    )
-
-    header_keywords = {
-        "Program",
-        "Designation",
-        "Entrance Exam Required",
-        "Entrance Examination",
-        "Examples of Career Pathways",
-        "Funding Category",
-    }
-    cleaned: List[str] = []
-    for line in program_lines:
-        if line in header_keywords:
-            continue
-        cleaned.append(line)
-
-    programs: List[Dict[str, Any]] = []
-    i = 0
+    headers = {"Program", "Designation", "Entrance Exam Required", "Examples of Career Pathways", "Funding Category"}
+    cleaned = [l for l in lines if l not in headers]
+
+    programs, i = [], 0
     while i < len(cleaned):
         remaining = len(cleaned) - i
-        if remaining < 4:
-            break
-        program_name = cleaned[i]
-        designation = cleaned[i + 1]
-        entrance_exam = cleaned[i + 2]
-        career_paths: List[str] = []
-        j = i + 3
+        if remaining < 4: break
+        name = cleaned[i]
+        designation = cleaned[i+1]
+        exam = cleaned[i+2]
+        careers = []
+        j = i+3
         while j < len(cleaned) and not cleaned[j].startswith("TIER"):
-            career_paths.append(cleaned[j])
+            careers.append(cleaned[j])
             j += 1
-        funding_category = cleaned[j] if j < len(cleaned) else ""
-        programs.append(
-            {
-                "program_name": program_name,
-                "designation": designation,
-                "entrance_exam": entrance_exam,
-                "career_pathways": career_paths,
-                "funding_category": funding_category,
-            }
-        )
-
+        tier = cleaned[j] if j < len(cleaned) else ""
+        programs.append({
+            "program_name": name,
+            "designation": designation,
+            "entrance_exam": exam,
+            "career_pathways": careers,
+            "funding_category": tier
+        })
         i = j + 1
-
     return {"programs": programs}
 
 
-def parse_university_block(uni_name: str, block: List[str]):
-    """
-    …
-    """
-    sections: Dict[str, Dict[str, Any]] = {}
-
-    overview = parse_overview_block(block)
-    if overview:
-        overview.setdefault("university_name", uni_name)
-        sections["overview"] = overview
+def parse_university_block(name: str, block: List[str]):
+    sections = {}
+    ov = parse_overview_block(block)
+    if ov:
+        ov["university_name"] = name
+        sections["overview"] = ov
 
-    benefits = parse_benefits_block(block)
-    if benefits.get("benefits"):
-        sections["benefits"] = benefits
+    ben = parse_benefits_block(block)
+    if ben.get("benefits"):
+        sections["benefits"] = ben
 
-    programs = parse_programs_block(block)
-    if programs.get("programs"):
-        sections["programs"] = programs
+    prog = parse_programs_block(block)
+    if prog.get("programs"):
+        sections["programs"] = prog
 
     return sections
 
 
 # -----------------------------
-# MAIN SYNC
+# MAIN SYNC LOGIC
 # -----------------------------
-def run_full_sync(docx_file):
-    """
-    1. Parse DOCX into university blocks
-    2. For each known university_id:
-       a. Parse overview/benefits/programs from the handbook
-       b. Fetch existing section_json from DB
-       c. Compare (DeepDiff)
-       d. If different, update DB
-    3. Return human-readable log
-    """
+def run_full_sync(docx_file):
     if docx_file is None:
         return "No handbook file uploaded."
 
     try:
-        doc = Document(…)
+        doc = Document(docx_file.name)
     except Exception as e:
-        return f"…"
+        return f"Error reading DOCX: {e}"
 
-    uni_blocks = split_doc_by_university(doc)
-
-    logs: List[str] = []
-    total_updates = 0
+    blocks = split_doc_by_university(doc)
+    logs, updated = [], 0
 
     for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
-        block = uni_blocks.get(uni_name)
+        block = blocks.get(uni_name)
         if not block:
-            logs.append(f"[WARN] …")
+            logs.append(f"[WARN] Missing block for: {uni_name}")
             continue
 
-        sections = parse_university_block(uni_name, block)
-        if not sections:
-            logs.append(f"[WARN] No …")
+        data = parse_university_block(uni_name, block)
+        if not data:
+            logs.append(f"[WARN] No valid sections found for: {uni_name}")
             continue
 
-        for section_key, new_data in sections.items():
-            if section_key not in ("overview", "benefits", "programs"):
+        for key, new_json in data.items():
+            if key not in ("overview", "benefits", "programs"):
                 continue
 
-            current_data = fetch_section_json(uni_id, section_key)
-            if current_data is None:
-                logs.append(
-                    f"[INFO] No existing JSON for uni_id={uni_id}, section_key='{section_key}'. "
-                    f"Will only update if row exists."
-                )
+            old_json = fetch_section_json(uni_id, key)
+            diff = DeepDiff(old_json or {}, new_json, ignore_order=True)
 
-            diff = DeepDiff(current_data or {}, new_data, ignore_order=True)
             if not diff:
-                logs.append(f"[OK] …")
+                logs.append(f"[OK] {uni_name} [{key}] unchanged.")
                 continue
 
             try:
-                update_section_json(uni_id, section_key, new_data)
-                total_updates += 1
-                logs.append(
-                    f"[UPDATED] '{uni_name}' [{section_key}] – DB updated (differences detected)."
-                )
+                update_section_json(uni_id, key, new_json)
+                logs.append(f"[UPDATED] {uni_name} [{key}] updated.")
+                updated += 1
             except Exception as e:
-                logs.append(
-                    f"[ERROR] Failed to update '{uni_name}' [{section_key}]: {e}"
-                )
+                logs.append(f"[ERROR] Updating {uni_name} [{key}]: {e}")
 
-    return "\n".join(logs)
+    logs.append(f"\nTotal sections updated: {updated}")
+    return "\n".join(logs)
 
 
 # -----------------------------
-# ISP BRANDING
+# ISP BRANDING (NO css= ARGUMENT)
 # -----------------------------
 ISP_PRIMARY = "#062A4D"
 ISP_GOLD = "#D6A229"
 ISP_BG = "#F5F7FA"
-ISP_TEXT = "#333333"
 
-LOCAL_LOGO_PATH = "assets/logo-DRvZB3HV.svg"
-if os.path.exists(LOCAL_LOGO_PATH):
-    ISP_LOGO_SRC = LOCAL_LOGO_PATH
-else:
-    # Fallback to remote logo if local file missing
-    ISP_LOGO_SRC = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
+LOCAL_LOGO = "assets/logo-DRvZB3HV.svg"
+LOGO_SRC = LOCAL_LOGO if os.path.exists(LOCAL_LOGO) else "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
 
-css = f"""
+CUSTOM_CSS = f"""
+<style>
 #isp-header {{
     background: {ISP_PRIMARY};
     padding: 20px;
-    border-radius: …;
+    border-radius: 8px;
     display: flex;
     align-items: center;
     gap: 20px;
 }}
 #isp-header h1 {{
-    color: white !important;
-    font-size: 28px !important;
+    color: white;
     margin: 0;
+    font-size: 26px;
 }}
 #isp-logo {{
     height: 60px;
 }}
-.gradio-container {{
-    background: {ISP_BG} !important;
-}}
 button {{
     background-color: {ISP_GOLD} !important;
     color: black !important;
-    font-weight: bold !important;
     border-radius: 8px !important;
+    font-weight: bold !important;
 }}
+.gradio-container {{
+    background: {ISP_BG} !important;
+}}
+</style>
 """
 
 
-with gr.Blocks(css=css, title="Automated Handbook Sync Data Pipeline") as demo:
-
-    # Header with Logo + Title
-    with gr.Row(elem_id="isp-header"):
-        gr.HTML(
-            f"""
-            <img id='isp-logo' src='{ISP_LOGO_SRC}' alt='ISP Logo'/>
-            <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
-            """
-        )
-
-    gr.Markdown(
-        """
-        …
-        This internal ISP tool automates:
-
-        - Parsing university sections from the official ISP Handbook
-        - Comparing extracted content with the **university_handbook_sections** table
-        - Updating only fields that have changed
-        - Maintaining data uniformity and reducing manual effort
-        …
-        3. Review the logs to see which university sections were updated
-        …
-        """
-    )
+# -----------------------------
+# GRADIO UI
+# -----------------------------
+with gr.Blocks(title="ISP Automated Handbook Data Pipeline") as demo:
+
+    gr.HTML(CUSTOM_CSS)
+
+    # Header with logo + title
+    gr.HTML(f"""
+        <div id='isp-header'>
+            <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
+            <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
+        </div>
+    """)
+
+    gr.Markdown("""
+    ### Automated Handbook Sync Data Pipeline
+
+    Upload the official ISP Handbook (.docx), and this tool will:
+
+    - Extract university sections
+    - Compare them with the **university_handbook_sections** table
+    - Update only changed JSON fields
+    - Ensure consistent, synchronized data
+
+    ---
+    """)
 
     file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
-
-    log_output = gr.Textbox(
-        label="Sync Log",
-        lines=30,
-        interactive=False,
-    )
+    log_output = gr.Textbox(label="Sync Log", lines=30)
 
-    sync_btn = gr.Button(…)
-    sync_btn.click(
-        fn=run_full_sync,
-        inputs=file_input,
-        outputs=log_output,
-    )
+    sync_btn = gr.Button("Run Full Sync")
+
+    sync_btn.click(fn=run_full_sync, inputs=file_input, outputs=log_output)
 
 
 if __name__ == "__main__":
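Note: the sync loop above writes to the database only when DeepDiff reports a difference, and ignore_order=True means that a reordered list alone does not count as a change. A minimal illustration of that gate, with made-up section data:

    from deepdiff import DeepDiff

    old = {"benefits": ["Scholarship", "Housing support"]}   # made-up values
    new = {"benefits": ["Housing support", "Scholarship"]}

    # Reordered lists compare equal, so DeepDiff is empty (falsy) here.
    if not DeepDiff(old, new, ignore_order=True):
        print("unchanged - skip DB write")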
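Note: the headline fix, flagged by the "ISP BRANDING (NO css= ARGUMENT)" comment, is that the stylesheet is no longer passed to gr.Blocks(css=...) but injected as a <style> block through gr.HTML(). A self-contained sketch of that pattern, assuming a current Gradio install; the element id and colors here are placeholders, not the app's:

    import gradio as gr

    PAGE_CSS = """
    <style>
    #hdr { background: #062A4D; color: white; padding: 20px; border-radius: 8px; }
    </style>
    """

    with gr.Blocks(title="CSS-injection sketch") as demo:
        gr.HTML(PAGE_CSS)  # styles reach the page without the css= argument
        gr.HTML("<div id='hdr'><h1>Styled header</h1></div>")

    if __name__ == "__main__":
        demo.launch()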