Spaces:

internationalscholarsprogram
/

docx-json-sync

Sleeping

App Files Files Community

internationalscholarsprogram committed on Dec 5, 2025

Commit

a45863a

1 Parent(s): c9990de

Update handbook sync engine

Browse files

Files changed (2) hide show

app.py +352 -123
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,19 +1,39 @@
 import os
 import json
 import gradio as gr
-from docx import Document  # from python-docx
 from deepdiff import DeepDiff
 import mysql.connector
 # -----------------------------
 # DB CONNECTION HELPERS
 # -----------------------------
 def get_db_connection():
     """
     Create and return a MySQL connection using environment variables.
-    Set these in your HF Space settings:
         DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME
     """
     return mysql.connector.connect(
@@ -21,40 +41,54 @@ def get_db_connection():
         port=int(os.getenv("DB_PORT", "3306")),
         user=os.getenv("DB_USER", "root"),
         password=os.getenv("DB_PASSWORD", ""),
-        database=os.getenv("DB_NAME", "test"),
     )
def fetch_db_json(doc_id: str):
    """
    Fetch the stored JSON for one document.

    Reads `json_data` from the `documents` table for the row whose `id`
    equals `doc_id`.

    Returns the parsed JSON value, or None when no row matches or the
    column is NULL.
    Raises whatever the DB driver raises on connection/query failure, and
    ValueError (json.JSONDecodeError) if the stored text is not valid JSON.
    """
    conn = get_db_connection()
    try:
        # The cursor gets its own try/finally: if conn.cursor() itself fails
        # there is no cursor to close, and the connection must still be closed.
        cursor = conn.cursor()
        try:
            cursor.execute(
                "SELECT json_data FROM documents WHERE id = %s", (doc_id,)
            )
            row = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        conn.close()

    if not row or row[0] is None:
        return None
    # If stored as TEXT, parse it as JSON.
    return json.loads(row[0])
def update_db_json(doc_id: str, new_data: dict):
    """
    Overwrite the stored JSON for one document.

    Serializes `new_data` (non-ASCII characters kept as-is) and writes it to
    `documents.json_data` for the row whose `id` equals `doc_id`, then
    commits. Raises whatever the DB driver raises on failure.
    """
    # Serialize first so a non-serializable payload fails before a
    # connection is opened.
    new_json_str = json.dumps(new_data, ensure_ascii=False)

    conn = get_db_connection()
    try:
        # Separate try/finally so a failing conn.cursor() cannot trigger an
        # unbound-variable error in cleanup and the connection always closes.
        cursor = conn.cursor()
        try:
            cursor.execute(
                "UPDATE documents SET json_data = %s WHERE id = %s",
                (new_json_str, doc_id),
            )
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
@@ -62,152 +96,347 @@ def update_db_json(doc_id: str, new_data: dict):
 # -----------------------------
-# DOCX → JSON
 # -----------------------------
def docx_to_python_dict(file_obj):
    """
    Convert an uploaded DOCX into a small dict.

    Opens the document at `file_obj.name` and returns
    {"paragraphs": [...], "paragraph_count": n} containing the text of every
    paragraph that is not blank. Raises ValueError when `file_obj` is None.
    """
    if file_obj is None:
        raise ValueError("No file uploaded")

    kept = []
    for para in Document(file_obj.name).paragraphs:
        # Whitespace-only paragraphs are skipped; kept text is not trimmed.
        if para.text.strip() != "":
            kept.append(para.text)

    return {
        "paragraphs": kept,
        "paragraph_count": len(kept),
    }
-# -----------------------------
-# GRADIO CALLBACKS
-# -----------------------------
def convert_and_compare(file_obj, doc_id):
    """
    Gradio callback: build JSON from the DOCX and diff it against the DB.

    Returns a 3-tuple of strings:
        (new JSON from the DOCX, existing JSON from the DB or a message,
         DeepDiff report or a status/error message)
    Failures are reported through the returned strings rather than raised.
    """

    def pretty(value):
        return json.dumps(value, indent=2, ensure_ascii=False)

    # Guard clauses for missing inputs.
    if file_obj is None:
        return "{}", "No existing record (or doc_id missing)", "No file uploaded."
    if not doc_id:
        return "{}", "{}", "Please provide a doc_id to look up in the database."

    # Step 1: DOCX -> dict
    try:
        parsed = docx_to_python_dict(file_obj)
    except Exception as err:
        return "{}", "{}", f"Error parsing DOCX: {err}"
    fresh_json = pretty(parsed)

    # Step 2: load what the DB currently holds
    try:
        stored = fetch_db_json(doc_id)
    except Exception as err:
        return fresh_json, "{}", f"Error fetching from DB: {err}"

    if stored is None:
        return (
            fresh_json,
            "No existing JSON found for this doc_id.",
            "No existing data to compare. You can choose to update DB with this new JSON.",
        )
    stored_json = pretty(stored)

    # Step 3: structural diff (list order ignored)
    try:
        changes = DeepDiff(stored, parsed, ignore_order=True)
        if changes:
            report = json.dumps(changes, indent=2, ensure_ascii=False, default=str)
        else:
            report = "No differences detected between DOCX JSON and DB JSON."
    except Exception as err:
        report = f"Error computing diff: {err}"

    return fresh_json, stored_json, report
def apply_update(doc_id, new_json_str):
    """
    Gradio callback: write the (possibly user-edited) new JSON to the DB.

    `doc_id` is the target document id; `new_json_str` is the JSON text from
    the editor and may be None or blank when the editor is empty.

    Returns a human-readable status string; errors are reported in that
    string rather than raised.
    """
    if not doc_id:
        return "Please provide a doc_id."
    # An empty editor can deliver None instead of "", so test for that before
    # calling .strip().
    if not new_json_str or not new_json_str.strip():
        return "No new JSON provided to update."

    try:
        new_data = json.loads(new_json_str)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        return f"Error parsing new JSON: {e}"

    # Boundary with the DB layer: any failure becomes a status message.
    try:
        update_db_json(doc_id, new_data)
    except Exception as e:
        return f"Error updating DB: {e}"

    return "Database updated successfully with new JSON."
 # -----------------------------
 # GRADIO UI
 # -----------------------------
# Builds the Gradio page at import time and binds it to `demo`.
with gr.Blocks() as demo:
    gr.Markdown("# DOCX → JSON → DB Sync")
    gr.Markdown(
        "Upload a Word (.docx) file, enter the document ID from your database, "
        "and compare the generated JSON with what is stored in the DB. "
        "If there are changes, you can update the DB."
    )
    # Inputs: the uploaded DOCX and the id of the DB row to compare against.
    with gr.Row():
        file_input = gr.File(label="Upload .docx file")
        doc_id_input = gr.Textbox(label="Document ID (as stored in DB)", placeholder="e.g. 123")
    # Side-by-side view; the new JSON is editable because its current
    # contents are what the update button sends to apply_update.
    with gr.Row():
        new_json_output = gr.Code(
            label="New JSON (from DOCX)",
            language="json",
            interactive=True,
        )
        old_json_output = gr.Code(
            label="Existing JSON (from DB)",
            language="json",
            interactive=False,
        )
    diff_output = gr.Code(
        label="Diff (DeepDiff result)",
        language="json",
        interactive=False,
    )
    compare_button = gr.Button("Convert & Compare")
    # convert_and_compare returns three strings, mapped to the three
    # outputs in this order.
    compare_button.click(
        fn=convert_and_compare,
        inputs=[file_input, doc_id_input],
        outputs=[new_json_output, old_json_output, diff_output],
    )
    gr.Markdown("## Apply Update")
    gr.Markdown(
        "If you're happy with the changes, click below to write the **New JSON** "
        "back into the database for this `doc_id`."
    )
    update_status = gr.Textbox(label="Update Status", interactive=False)
    update_button = gr.Button("Update DB with New JSON")
    # apply_update returns a single status string shown in update_status.
    update_button.click(
        fn=apply_update,
        inputs=[doc_id_input, new_json_output],
        outputs=[update_status],
    )
 if __name__ == "__main__":

 import os
 import json
+import re
+from typing import Dict, Any, List, Tuple
 import gradio as gr
+from docx import Document
 from deepdiff import DeepDiff
 import mysql.connector
+# -----------------------------
+# CONFIG: UNIVERSITY MAPPING
+# -----------------------------
# Maps the heading text of each university block in the handbook DOCX to the
# university_id used when reading/writing university_handbook_sections.
# split_doc_by_university compares these keys against normalized paragraph
# text (exact match or prefix), so spelling, capitalisation and the trailing
# "(ABBR)" part must match the handbook heading.
UNIVERSITY_ID_MAP = {
    "Indiana University of Pennsylvania (IUP)": 1,
    "Missouri State University": 2,
    "University Of Kentucky (UK)": 3,
    "University of Louisville (UofL)": 4,
    "University of Delaware (UD)": 6,
    "Grand Valley State University": 7,
    "Quinnipiac University": 9,
    "William Jessup University": 10,
    "Wilkes University": 14,
    "University of South Dakota (USD)": 16,
    # Extend as you add more rows to university_handbook_sections
}
 # -----------------------------
 # DB CONNECTION HELPERS
 # -----------------------------
def get_db_connection():
    """
    Create and return a MySQL connection using environment variables.
    Set these in HF Space secrets:
        DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME

    Fallbacks when a variable is unset: port 3306, user "root", empty
    password, empty database name. The caller is responsible for closing
    the returned connection.

    NOTE(review): DB_HOST is listed above but no host= argument is visible
    in this view of the call; the line may have been dropped by the diff
    rendering. Confirm against the real file.
    """
    return mysql.connector.connect(
        port=int(os.getenv("DB_PORT", "3306")),
        user=os.getenv("DB_USER", "root"),
        password=os.getenv("DB_PASSWORD", ""),
        database=os.getenv("DB_NAME", ""),
    )
def fetch_section_json(university_id: int, section_key: str):
    """
    Fetch existing JSON for given university_id + section_key from DB.

    Reads `section_json` from `university_handbook_sections` for the first
    row matching both keys.

    Returns the parsed JSON value, or None when no row matches, the column
    is NULL/empty, or the stored text is not valid JSON. Missing rows and
    malformed JSON are indistinguishable to the caller.
    Raises whatever the DB driver raises on connection/query failure.
    """
    query = """
        SELECT section_json
        FROM university_handbook_sections
        WHERE university_id = %s AND section_key = %s
        LIMIT 1
    """
    conn = get_db_connection()
    try:
        # The cursor gets its own try/finally: if conn.cursor() fails there
        # is nothing to close, and the connection must still be released.
        cursor = conn.cursor()
        try:
            cursor.execute(query, (university_id, section_key))
            row = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        conn.close()

    if not row or not row[0]:
        return None
    try:
        return json.loads(row[0])
    except (ValueError, TypeError):
        # JSON malformed in DB – treat as None to force overwrite
        return None
def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]) -> int:
    """
    Update section_json in DB for given university_id + section_key.

    Serializes `new_data` (non-ASCII characters kept as-is), runs an UPDATE
    on `university_handbook_sections` and commits. This never inserts: if no
    row matches both keys, nothing is written.

    Returns the cursor's rowcount for the UPDATE, so callers can tell when
    no row was changed. Earlier callers that ignore the return value are
    unaffected.
    Raises whatever the DB driver raises on connection/query failure.
    """
    # Serialize first so a non-serializable payload fails before a
    # connection is opened.
    new_json_str = json.dumps(new_data, ensure_ascii=False)
    query = """
        UPDATE university_handbook_sections
        SET section_json = %s
        WHERE university_id = %s AND section_key = %s
    """
    conn = get_db_connection()
    try:
        # Separate try/finally so a failing conn.cursor() cannot cause an
        # unbound-variable error during cleanup and the connection is
        # always closed.
        cursor = conn.cursor()
        try:
            cursor.execute(query, (new_json_str, university_id, section_key))
            affected = cursor.rowcount
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return affected
 # -----------------------------
+# DOCX PARSING HELPERS
 # -----------------------------
def normalize_text(text: str) -> str:
    """
    Collapse every run of whitespace in `text` to a single space and trim
    both ends.
    """
    words = text.split()
    collapsed = " ".join(words)
    return collapsed.strip()
def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
    """
    Split the docx into blocks per university name using headings that match
    the keys in UNIVERSITY_ID_MAP.

    A paragraph counts as a heading when its normalized text starts with one
    of the known names. Every following non-empty paragraph belongs to that
    university until a paragraph starting with a *different* known name is
    reached. Paragraphs before the first heading are dropped.

    A later paragraph that starts with the name of the university already
    being collected (e.g. a body sentence beginning with the name) does not
    open a new block, so it can no longer replace the block and discard the
    lines above it. If a university heading re-appears after another
    university, its paragraphs are appended to its existing block.

    Returns dict: { "University Name": [list_of_paragraph_texts_in_block] }
    where each list starts with the heading paragraph itself.
    """
    # Longest names first, so if one known name is a prefix of another the
    # more specific one wins.
    known_names = sorted(UNIVERSITY_ID_MAP, key=len, reverse=True)

    uni_blocks: Dict[str, List[str]] = {}
    current_name = None
    for para in doc.paragraphs:
        text = normalize_text(para.text)
        if not text:
            continue  # Remove empties
        heading = next((name for name in known_names if text.startswith(name)), None)
        if heading is not None:
            current_name = heading
        if current_name is not None:
            uni_blocks.setdefault(current_name, []).append(text)
    return uni_blocks
def _first_int(value: str):
    """Return the first number in `value` (thousands commas allowed) as int, or None."""
    match = re.search(r"\d[\d,]*", value)
    if match is None:
        return None
    return int(match.group().replace(",", ""))


def parse_overview_block(block: List[str]) -> Dict[str, Any]:
    """
    Given the full block for a university, extract the overview section as JSON.

    Scans for "Label: value" lines and fills these keys when the label is
    present:
        founded, total_students, postgraduate_students,
        tuition_out_of_state_yearly  -> int, or None if the value has no number
        acceptance_rate, location    -> the value text, stripped

    Numeric values use the first number on the line, so
    "12,000 (Fall 2023)" yields 12000 and a trailing year or second figure
    is not glued onto it. Lines without a colon carry no value and are
    skipped instead of raising. Returns {} when nothing matches.
    """
    overview: Dict[str, Any] = {}
    for line in block:
        label, colon, value = line.partition(":")
        if not colon:
            continue
        value = value.strip()

        if line.startswith("Founded:"):
            overview["founded"] = _first_int(value)
        elif line.startswith("Total Students"):
            overview["total_students"] = _first_int(value)
        elif "Postgraduate" in label and "students" in label.lower():
            # Some pages have 'Postgraduate students' or 'Postgraduate Students'.
            # Only the label is inspected so that free text after the colon
            # cannot trigger this branch.
            overview["postgraduate_students"] = _first_int(value)
        elif line.startswith("Acceptance rate"):
            overview["acceptance_rate"] = value
        elif line.startswith("Location:"):
            overview["location"] = value
        elif "Yearly Out of State Tuition Fees" in label or "Yearly Tuition Fees" in label:
            overview["tuition_out_of_state_yearly"] = _first_int(value)
    return overview
def extract_between(block: List[str], start_marker: str, stop_markers: List[str]) -> List[str]:
    """
    Return the stripped, non-blank lines that follow the first line containing
    `start_marker`, up to (not including) the first later line containing any
    of `stop_markers`.

    The start line itself is excluded. If `start_marker` never appears the
    result is empty; if no stop marker appears, collection runs to the end
    of `block`.
    """
    remaining = iter(block)

    # Phase 1: consume lines up to and including the start line.
    for candidate in remaining:
        if start_marker in candidate:
            break
    else:
        return []

    # Phase 2: collect until a stop marker shows up.
    collected: List[str] = []
    for candidate in remaining:
        if any(marker in candidate for marker in stop_markers):
            break
        trimmed = candidate.strip()
        if trimmed:
            collected.append(trimmed)
    return collected
def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
    """
    Collect the benefit lines of one university block.

    Benefits are the lines after 'Benefits for ISP students at this school'
    up to the first line containing 'To qualify for The International
    Scholars Program' (or the end of the block). Returns
    {"benefits": [...]} with whitespace-normalized lines; the list is empty
    when the start line is absent.
    """
    raw_lines = extract_between(
        block,
        start_marker="Benefits for ISP students at this school",
        stop_markers=[
            "To qualify for The International Scholars Program at",
            "To qualify for The International Scholars Program",
        ],
    )

    # Clean bullet style / stray punctuation
    tidy: List[str] = []
    for entry in raw_lines:
        if entry:
            tidy.append(normalize_text(entry))
    return {"benefits": tidy}
def parse_programs_block(block: List[str]) -> Dict[str, Any]:
    """
    Parse the 'Program table' portion of one university block.

    Everything after the line containing
      'To qualify for The International Scholars Program'
    is treated as the flattened program table. Header cells (Program,
    Designation, Entrance Exam Required, ...) are dropped. The remaining
    lines are expected to repeat as:
        MS Computer Science      -> program_name
        STEM                     -> designation
        Optional                 -> entrance_exam
        Software Developer       -> career_pathways (zero or more lines)
        Database Administrator
        TIER 1                   -> funding_category

    A line starting with 'TIER' closes a record, so a record missing a cell
    cannot swallow its own funding line and drift into the next program.
    Missing cells become "" (or [] for career_pathways). Trailing lines with
    no closing 'TIER' line form one last record (funding_category "") only
    when there are at least four of them; shorter leftovers are ignored.

    Returns {"programs": [ {program_name, designation, entrance_exam,
    career_pathways, funding_category}, ... ]}.
    """
    # Grab everything after 'To qualify for ... you must be willing to study'
    program_lines = extract_between(
        block,
        start_marker="To qualify for The International Scholars Program",
        # NOTE(review): this list is maintained by hand and does not mirror
        # UNIVERSITY_ID_MAP (it names Montclair, which is not in the map, and
        # has no Kentucky entry). Confirm which headings can really follow a
        # program table.
        stop_markers=[
            "Montclair State University",
            "Missouri State University",
            "Indiana University of Pennsylvania",
            "University of Louisville",
            "University of Delaware",
            "Grand Valley State University",
            "Quinnipiac University",
            "William Jessup University",
            "Wilkes University",
            "University of South Dakota",
            # any other possible headings we might hit
        ],
    )

    # Remove the header row if present
    header_keywords = {
        "Program",
        "Designation",
        "Entrance Exam Required",
        "Entrance Examination",
        "Examples of Career Pathways",
        "Funding Category",
    }
    cleaned = [line.strip() for line in program_lines if line not in header_keywords]

    programs: List[Dict[str, Any]] = []
    total = len(cleaned)
    start = 0
    while start < total:
        # Find the funding line that closes the record beginning at `start`.
        end = start
        while end < total and not cleaned[end].startswith("TIER"):
            end += 1
        record = cleaned[start:end]

        if end == total and len(record) < 4:
            # Unterminated leftovers too short to be a program row.
            break
        if record:
            programs.append(
                {
                    "program_name": record[0],
                    "designation": record[1] if len(record) > 1 else "",
                    "entrance_exam": record[2] if len(record) > 2 else "",
                    "career_pathways": record[3:],
                    "funding_category": cleaned[end] if end < total else "",
                }
            )
        # An empty record means a stray 'TIER' line with nothing before it;
        # it is skipped. Continue with the element after the funding line.
        start = end + 1
    return {"programs": programs}
def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Dict[str, Any]]:
    """
    Run the three section parsers over one university block.

    Returns a dict holding only the sections that produced data:
    {
        "overview": {...},   # gets "university_name" added if not already set
        "benefits": {...},   # present only when the benefits list is non-empty
        "programs": {...}    # present only when the programs list is non-empty
    }
    An empty dict means nothing could be parsed.
    """
    result: Dict[str, Dict[str, Any]] = {}

    overview_data = parse_overview_block(block)
    if overview_data:
        # Always include explicit name for safety
        if "university_name" not in overview_data:
            overview_data["university_name"] = uni_name
        result["overview"] = overview_data

    benefits_data = parse_benefits_block(block)
    if benefits_data.get("benefits"):
        result["benefits"] = benefits_data

    programs_data = parse_programs_block(block)
    if programs_data.get("programs"):
        result["programs"] = programs_data

    return result
+# -----------------------------
+# MAIN SYNC FUNCTION
+# -----------------------------
def run_full_sync(docx_file) -> str:
    """
    Gradio callback: sync the whole handbook DOCX into the database.

    1. Parse DOCX into university blocks
    2. For each known university in UNIVERSITY_ID_MAP:
       a. Parse overview/benefits/programs from the handbook
       b. Fetch existing section_json from DB
       c. Compare (DeepDiff, list order ignored)
       d. If different, update DB
    3. Return human-readable log

    `docx_file` is the uploaded file: either an object exposing the path as
    `.name` or the path itself. A failure for one university or section is
    logged as [ERROR] and the sync continues with the next one; nothing is
    raised to the UI.
    """
    if docx_file is None:
        return "No handbook file uploaded."

    # Accept both upload objects with a .name path and plain path strings.
    docx_path = getattr(docx_file, "name", docx_file)
    try:
        document = Document(docx_path)
    except Exception as e:
        return f"Failed to read DOCX: {e}"

    uni_blocks = split_doc_by_university(document)
    logs: List[str] = []
    total_updates = 0

    for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
        block = uni_blocks.get(uni_name)
        if not block:
            logs.append(f"[WARN] No block found in handbook for '{uni_name}'. Skipping.")
            continue

        # One badly formatted university must not abort the whole run.
        try:
            parsed_sections = parse_university_block(uni_name, block)
        except Exception as e:
            logs.append(f"[ERROR] Failed to parse '{uni_name}': {e}")
            continue
        if not parsed_sections:
            logs.append(f"[WARN] No parsable sections for '{uni_name}'. Skipping.")
            continue

        for section_key, new_data in parsed_sections.items():
            # We ONLY touch sections sourced from handbook: overview, benefits, programs
            if section_key not in ("overview", "benefits", "programs"):
                continue

            try:
                current_data = fetch_section_json(uni_id, section_key)
            except Exception as e:
                logs.append(
                    f"[ERROR] Failed to fetch '{uni_name}' [{section_key}]: {e}"
                )
                continue

            if current_data is None:
                # No existing record or invalid JSON – we still require that the row exists;
                # if not, the UPDATE below matches nothing.
                # If you want to INSERT missing rows, you can add that logic here.
                logs.append(
                    f"[INFO] No existing JSON for uni_id={uni_id}, section_key='{section_key}'. "
                    f"Will only update if row exists."
                )

            # Compare with DeepDiff
            diff = DeepDiff(current_data or {}, new_data, ignore_order=True)
            if not diff:
                logs.append(f"[OK] '{uni_name}' [{section_key}] – no change.")
                continue

            # Update DB
            try:
                affected = update_section_json(uni_id, section_key, new_data)
            except Exception as e:
                logs.append(
                    f"[ERROR] Failed to update '{uni_name}' [{section_key}]: {e}"
                )
                continue

            # When the update helper reports a row count, 0 means the UPDATE
            # changed nothing (typically: the row does not exist), so it must
            # not be counted as an update. A helper that returns None is
            # treated as success.
            if affected == 0:
                logs.append(
                    f"[SKIPPED] '{uni_name}' [{section_key}] – no row was changed in DB (row missing?)."
                )
                continue

            total_updates += 1
            logs.append(
                f"[UPDATED] '{uni_name}' [{section_key}] – DB updated (differences detected)."
            )

    summary = f"\n\nTotal sections updated: {total_updates}\n"
    return "\n".join(logs) + summary
 # -----------------------------
 # GRADIO UI
 # -----------------------------
# Builds the Gradio page at import time and binds it to `demo`.
with gr.Blocks() as demo:
    gr.Markdown("# ISP Handbook → Database Sync (Full Auto)")
    # User-facing description of what the sync button does; this text is
    # rendered as Markdown, so its lines start at column 0 on purpose.
    gr.Markdown(
        """
Upload the **full ISP Handbook DOCX**.
On **Run full sync**, the app will:
1. Parse each university block from the handbook
2. Extract **Overview**, **Benefits**, and **Programs** sections
3. Compare them with `university_handbook_sections.section_json`
4. Update only rows that have changed
Only sections that are sourced from the handbook are touched:
- `overview`
- `benefits`
- `programs`
Sections like `campus_image` / `image` are **never updated** here.
        """
    )
    # The upload widget only offers .docx files.
    file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
    sync_button = gr.Button("Run full sync")
    # Read-only box that shows the log string returned by run_full_sync.
    log_output = gr.Textbox(
        label="Sync Log",
        lines=30,
        interactive=False,
    )
    sync_button.click(
        fn=run_full_sync,
        inputs=file_input,
        outputs=log_output,
    )
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
 gradio
 python-docx
 deepdiff
-mysql-connector-python

 gradio
 python-docx
 deepdiff
+mysql-connector-python