internationalscholarsprogram committed on
Commit
fca9a55
·
1 Parent(s): 8c0596f

Update ISP handbook data pipeline UI and logo

Browse files
Files changed (1) hide show
  1. app.py +63 -120
app.py CHANGED
@@ -43,15 +43,12 @@ def fetch_section_json(university_id: int, section_key: str):
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
- cursor.execute(
47
- """
48
  SELECT section_json
49
  FROM university_handbook_sections
50
  WHERE university_id=%s AND section_key=%s
51
  LIMIT 1
52
- """,
53
- (university_id, section_key),
54
- )
55
  row = cursor.fetchone()
56
  if not row or not row[0]:
57
  return None
@@ -64,21 +61,16 @@ def fetch_section_json(university_id: int, section_key: str):
64
  conn.close()
65
 
66
 
67
- def update_section_json(
68
- university_id: int, section_key: str, new_data: Dict[str, Any]
69
- ):
70
  conn = get_db_connection()
71
  try:
72
  cursor = conn.cursor()
73
  new_json = json.dumps(new_data, ensure_ascii=False)
74
- cursor.execute(
75
- """
76
  UPDATE university_handbook_sections
77
  SET section_json=%s
78
  WHERE university_id=%s AND section_key=%s
79
- """,
80
- (new_json, university_id, section_key),
81
- )
82
  conn.commit()
83
  finally:
84
  cursor.close()
@@ -88,14 +80,12 @@ def update_section_json(
88
  # -----------------------------
89
  # DOCX PARSING HELPERS
90
  # -----------------------------
91
- def normalize_text(t: str) -> str:
92
- return " ".join(t.split()).strip()
93
 
94
 
95
- def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
96
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
97
- indices: List[tuple[int, str]] = []
98
-
99
  for i, p in enumerate(paragraphs):
100
  for uni in UNIVERSITY_ID_MAP.keys():
101
  if p == uni or p.startswith(uni):
@@ -103,22 +93,20 @@ def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
103
 
104
  indices.sort(key=lambda x: x[0])
105
 
106
- uni_blocks: Dict[str, List[str]] = {}
107
  for idx, (start, uni_name) in enumerate(indices):
108
- end = indices[idx + 1][0] if idx + 1 < len(indices) else len(paragraphs)
109
  uni_blocks[uni_name] = paragraphs[start:end]
110
  return uni_blocks
111
 
112
 
113
- def parse_overview_block(block: List[str]) -> Dict[str, Any]:
114
- data: Dict[str, Any] = {}
115
  for line in block:
116
  if line.startswith("Founded:"):
117
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
118
  elif line.startswith("Total Students"):
119
- data["total_students"] = int(
120
- re.sub(r"[^\d]", "", line.split(":", 1)[1])
121
- )
122
  elif "Postgraduate" in line:
123
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
124
  data["postgraduate_students"] = int(digits) if digits else None
@@ -126,15 +114,14 @@ def parse_overview_block(block: List[str]) -> Dict[str, Any]:
126
  data["acceptance_rate"] = line.split(":", 1)[1].strip()
127
  elif line.startswith("Location:"):
128
  data["location"] = line.split(":", 1)[1].strip()
129
- elif "Tuition" in line or "Yearly Out of State Tuition" in line:
130
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
131
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
132
  return data
133
 
134
 
135
- def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]:
136
- out: List[str] = []
137
- started = False
138
  for line in block:
139
  if not started and start in line:
140
  started = True
@@ -147,75 +134,50 @@ def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]
147
  return out
148
 
149
 
150
- def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
151
  lines = extract_between(
152
  block,
153
  "Benefits for ISP students at this school",
154
- ["To qualify for The International Scholars Program"],
155
  )
156
  return {"benefits": [normalize_text(l) for l in lines]}
157
 
158
 
159
- def parse_programs_block(block: List[str]) -> Dict[str, Any]:
160
  lines = extract_between(
161
  block,
162
  "To qualify for The International Scholars Program",
163
- list(UNIVERSITY_ID_MAP.keys()),
164
  )
165
- headers = {
166
- "Program",
167
- "Designation",
168
- "Entrance Exam Required",
169
- "Examples of Career Pathways",
170
- "Funding Category",
171
- }
172
  cleaned = [l for l in lines if l not in headers]
173
 
174
- programs: List[Dict[str, Any]] = []
175
- i = 0
176
  while i < len(cleaned):
177
- remaining = len(cleaned) - i
178
- if remaining < 4:
179
  break
180
  name = cleaned[i]
181
- designation = cleaned[i + 1]
182
- exam = cleaned[i + 2]
183
- careers: List[str] = []
184
- j = i + 3
185
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
186
  careers.append(cleaned[j])
187
  j += 1
188
  tier = cleaned[j] if j < len(cleaned) else ""
189
- programs.append(
190
- {
191
- "program_name": name,
192
- "designation": designation,
193
- "entrance_exam": exam,
194
- "career_pathways": careers,
195
- "funding_category": tier,
196
- }
197
- )
198
- i = j + 1
199
- return {"programs": programs}
200
-
201
-
202
- def parse_university_block(name: str, block: List[str]) -> Dict[str, Dict[str, Any]]:
203
- sections: Dict[str, Dict[str, Any]] = {}
204
 
205
- ov = parse_overview_block(block)
206
- if ov:
207
- ov["university_name"] = name
208
- sections["overview"] = ov
209
-
210
- ben = parse_benefits_block(block)
211
- if ben.get("benefits"):
212
- sections["benefits"] = ben
213
-
214
- prog = parse_programs_block(block)
215
- if prog.get("programs"):
216
- sections["programs"] = prog
217
 
218
- return sections
219
 
220
 
221
  # -----------------------------
@@ -230,22 +192,21 @@ def run_full_sync(docx_file):
230
  except Exception as e:
231
  return f"Error reading DOCX: {e}"
232
 
233
- blocks = split_doc_by_university(doc)
234
- logs: List[str] = []
235
- updated = 0
236
 
237
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
238
- block = blocks.get(uni_name)
239
  if not block:
240
- logs.append(f"[WARN] Missing block for: {uni_name}")
241
  continue
242
 
243
- data = parse_university_block(uni_name, block)
244
- if not data:
245
- logs.append(f"[WARN] No valid sections found for: {uni_name}")
246
  continue
247
 
248
- for key, new_json in data.items():
249
  if key not in ("overview", "benefits", "programs"):
250
  continue
251
 
@@ -261,27 +222,28 @@ def run_full_sync(docx_file):
261
  logs.append(f"[UPDATED] {uni_name} [{key}] updated.")
262
  updated += 1
263
  except Exception as e:
264
- logs.append(f"[ERROR] Updating {uni_name} [{key}]: {e}")
265
 
266
  logs.append(f"\nTotal sections updated: {updated}")
267
  return "\n".join(logs)
268
 
269
 
270
  # -----------------------------
271
- # ISP BRANDING (REMOTE LOGO ONLY)
272
  # -----------------------------
 
 
 
273
  ISP_PRIMARY = "#062A4D"
274
  ISP_GOLD = "#D6A229"
275
  ISP_BG = "#F5F7FA"
276
 
277
- LOGO_SRC = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
278
-
279
  CUSTOM_CSS = f"""
280
  <style>
281
  #isp-header {{
282
  background: {ISP_PRIMARY};
283
  padding: 20px;
284
- border-radius: 8px;
285
  display: flex;
286
  align-items: center;
287
  gap: 20px;
@@ -289,7 +251,7 @@ CUSTOM_CSS = f"""
289
  #isp-header h1 {{
290
  color: white;
291
  margin: 0;
292
- font-size: 26px;
293
  }}
294
  #isp-logo {{
295
  height: 60px;
@@ -312,46 +274,27 @@ button {{
312
  # -----------------------------
313
  with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
314
 
315
- # Inject custom CSS
316
  gr.HTML(CUSTOM_CSS)
317
 
318
- # Header with logo + title
319
- gr.HTML(
320
- f"""
321
  <div id='isp-header'>
322
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
323
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
324
  </div>
325
- """
326
- )
327
-
328
- gr.Markdown(
329
- """
330
- ### Automated Handbook Sync Data Pipeline
331
-
332
- Upload the official ISP Handbook (.docx), and this tool will:
333
-
334
- - Extract university sections
335
- - Compare them with the **university_handbook_sections** table
336
- - Update only changed JSON fields
337
- - Ensure consistent, synchronized data
338
 
 
 
 
339
  ---
 
340
 
341
- **How to use**
342
-
343
- 1. Upload the latest ISP Handbook DOCX
344
- 2. Click **Run Full Sync**
345
- 3. Check the log to see which universities and sections were updated
346
- """
347
- )
348
-
349
- file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
350
  log_output = gr.Textbox(label="Sync Log", lines=30)
 
351
 
352
- sync_btn = gr.Button("Run Full Sync")
353
- sync_btn.click(fn=run_full_sync, inputs=file_input, outputs=log_output)
354
-
355
 
356
  if __name__ == "__main__":
357
  demo.launch()
 
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
+ cursor.execute("""
 
47
  SELECT section_json
48
  FROM university_handbook_sections
49
  WHERE university_id=%s AND section_key=%s
50
  LIMIT 1
51
+ """, (university_id, section_key))
 
 
52
  row = cursor.fetchone()
53
  if not row or not row[0]:
54
  return None
 
61
  conn.close()
62
 
63
 
64
+ def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]):
 
 
65
  conn = get_db_connection()
66
  try:
67
  cursor = conn.cursor()
68
  new_json = json.dumps(new_data, ensure_ascii=False)
69
+ cursor.execute("""
 
70
  UPDATE university_handbook_sections
71
  SET section_json=%s
72
  WHERE university_id=%s AND section_key=%s
73
+ """, (new_json, university_id, section_key))
 
 
74
  conn.commit()
75
  finally:
76
  cursor.close()
 
80
  # -----------------------------
81
  # DOCX PARSING HELPERS
82
  # -----------------------------
83
def normalize_text(t: str) -> str:
    """Collapse all runs of whitespace in *t* into single spaces.

    ``" ".join(t.split())`` already discards leading/trailing whitespace,
    so the trailing ``.strip()`` from the original was redundant and has
    been dropped (behavior is identical).
    """
    return " ".join(t.split())
 
84
 
85
 
86
+ def split_doc_by_university(doc: Document):
87
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
88
+ indices = []
 
89
  for i, p in enumerate(paragraphs):
90
  for uni in UNIVERSITY_ID_MAP.keys():
91
  if p == uni or p.startswith(uni):
 
93
 
94
  indices.sort(key=lambda x: x[0])
95
 
96
+ uni_blocks = {}
97
  for idx, (start, uni_name) in enumerate(indices):
98
+ end = indices[idx+1][0] if idx+1 < len(indices) else len(paragraphs)
99
  uni_blocks[uni_name] = paragraphs[start:end]
100
  return uni_blocks
101
 
102
 
103
+ def parse_overview_block(block: List[str]):
104
+ data = {}
105
  for line in block:
106
  if line.startswith("Founded:"):
107
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
108
  elif line.startswith("Total Students"):
109
+ data["total_students"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
 
 
110
  elif "Postgraduate" in line:
111
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
112
  data["postgraduate_students"] = int(digits) if digits else None
 
114
  data["acceptance_rate"] = line.split(":", 1)[1].strip()
115
  elif line.startswith("Location:"):
116
  data["location"] = line.split(":", 1)[1].strip()
117
+ elif "Tuition" in line:
118
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
119
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
120
  return data
121
 
122
 
123
+ def extract_between(block, start, stops):
124
+ out, started = [], False
 
125
  for line in block:
126
  if not started and start in line:
127
  started = True
 
134
  return out
135
 
136
 
137
+ def parse_benefits_block(block):
138
  lines = extract_between(
139
  block,
140
  "Benefits for ISP students at this school",
141
+ ["To qualify for The International Scholars Program"]
142
  )
143
  return {"benefits": [normalize_text(l) for l in lines]}
144
 
145
 
146
+ def parse_programs_block(block):
147
  lines = extract_between(
148
  block,
149
  "To qualify for The International Scholars Program",
150
+ list(UNIVERSITY_ID_MAP.keys())
151
  )
152
+ headers = {"Program", "Designation", "Entrance Exam Required",
153
+ "Examples of Career Pathways", "Funding Category"}
154
+
 
 
 
 
155
  cleaned = [l for l in lines if l not in headers]
156
 
157
+ programs, i = [], 0
 
158
  while i < len(cleaned):
159
+ if len(cleaned) - i < 4:
 
160
  break
161
  name = cleaned[i]
162
+ designation = cleaned[i+1]
163
+ exam = cleaned[i+2]
164
+ careers = []
165
+ j = i+3
166
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
167
  careers.append(cleaned[j])
168
  j += 1
169
  tier = cleaned[j] if j < len(cleaned) else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
+ programs.append({
172
+ "program_name": name,
173
+ "designation": designation,
174
+ "entrance_exam": exam,
175
+ "career_pathways": careers,
176
+ "funding_category": tier
177
+ })
178
+ i = j + 1
 
 
 
 
179
 
180
+ return {"programs": programs}
181
 
182
 
183
  # -----------------------------
 
192
  except Exception as e:
193
  return f"Error reading DOCX: {e}"
194
 
195
+ uni_blocks = split_doc_by_university(doc)
196
+ logs, updated = [], 0
 
197
 
198
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
199
+ block = uni_blocks.get(uni_name)
200
  if not block:
201
+ logs.append(f"[WARN] Missing block: {uni_name}")
202
  continue
203
 
204
+ parsed = parse_university_block(uni_name, block)
205
+ if not parsed:
206
+ logs.append(f"[WARN] Cannot parse: {uni_name}")
207
  continue
208
 
209
+ for key, new_json in parsed.items():
210
  if key not in ("overview", "benefits", "programs"):
211
  continue
212
 
 
222
  logs.append(f"[UPDATED] {uni_name} [{key}] updated.")
223
  updated += 1
224
  except Exception as e:
225
+ logs.append(f"[ERROR] {uni_name} [{key}]: {e}")
226
 
227
  logs.append(f"\nTotal sections updated: {updated}")
228
  return "\n".join(logs)
229
 
230
 
231
  # -----------------------------
232
+ # ISP BRANDING - BASE64 LOGO (ALWAYS VISIBLE)
233
  # -----------------------------
234
+ # PLACEHOLDER — I WILL REPLACE THIS WITH YOUR REAL LOGO BASE64 AFTER YOU UPLOAD THE SVG
235
+ LOGO_SRC = "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTcwIiBoZWlnaHQ9IjE3MCIgdmlld0JveD0iMCAwIDE3MCAxNzAiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxjaXJjbGUgY3g9Ijg1IiBjeT0iODUiIHI9Ijg1IiBmaWxsPSIjMDYyQTREIi8+CjxwYXRoIGQ9Ik0xMDYuMTQyIDkzLjU2ODdMOTUuODgxMiAxMDMuODMxTDEwNi4xNDIgMTE0LjA5MUwxMDYuOTU1IDExMy4yNzdMMTA4Ljc1NyAxMTEuNDc1TDExMi4zNTkgMTE1LjA3N0wxMTEuNTQ2IDExNS44OUwxMDYuOTU1IDExMC4zMkwxMDYuOTU1IDExMC4zMkwxMDcuNzY5IDEwOS41MDZMMTEwLjM3MyAxMTIuMTA5TDExMS4xODcgMTExLjI5NkwxMDcuNTg0IDEwNy42OTRMMTA2Ljc3MSAxMDguNTA3TDEwNi45NTUgMTA4LjMyMkwxMDYuOTU1IDExMC4zMkwxMDEuMzggMTE1Ljg5TDEwMC41NjcgMTE1LjA3N0wxMDQuMTY5IDExMS40NzVMMTA1Ljk3MSAxMTMuMjdMMTA2Ljc4NCAxMTQuMDg0TDEwNy41OTggMTEzLjI3TDEwMy45OTYgMTE2Ljg3MkwxMDMuMTgyIDExNy42ODZMMTA2Ljc4NCAxMjEuMjg4TDExMC4zNzMgMTE3LjY5NkwxMDkuNTYgMTE2Ljg4M0wxMDYuOTU1IDExOS40ODdMMTA2LjE0MiAxMjAuMyIgZmlsbD0id2hpdGUiLz48cGF0aCBkPSJNNzguODUzOSAxMjEuODM... (continues full"
236
+
237
  ISP_PRIMARY = "#062A4D"
238
  ISP_GOLD = "#D6A229"
239
  ISP_BG = "#F5F7FA"
240
 
 
 
241
  CUSTOM_CSS = f"""
242
  <style>
243
  #isp-header {{
244
  background: {ISP_PRIMARY};
245
  padding: 20px;
246
+ border-radius: 10px;
247
  display: flex;
248
  align-items: center;
249
  gap: 20px;
 
251
  #isp-header h1 {{
252
  color: white;
253
  margin: 0;
254
+ font-size: 28px;
255
  }}
256
  #isp-logo {{
257
  height: 60px;
 
274
  # -----------------------------
275
  with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
276
 
 
277
  gr.HTML(CUSTOM_CSS)
278
 
279
+ # Header
280
+ gr.HTML(f"""
 
281
  <div id='isp-header'>
282
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
283
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
284
  </div>
285
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
+ gr.Markdown("""
288
+ Upload the official ISP Handbook (.docx).
289
+ This tool will compare, detect differences, and update changed sections.
290
  ---
291
+ """)
292
 
293
+ file_input = gr.File(label="Upload Handbook DOCX", file_types=[".docx"])
 
 
 
 
 
 
 
 
294
  log_output = gr.Textbox(label="Sync Log", lines=30)
295
+ run_btn = gr.Button("Run Full Sync")
296
 
297
+ run_btn.click(run_full_sync, inputs=file_input, outputs=log_output)
 
 
298
 
299
  if __name__ == "__main__":
300
  demo.launch()