Spaces:

internationalscholarsprogram
/

docx-json-sync

Sleeping

App Files Community

internationalscholarsprogram committed on Dec 5, 2025

Commit

a81d369

1 Parent(s): 40200e1

Brand data pipeline UI with ISP logo

Browse files

Files changed (2) hide show

app.py +21 -61
assets/logo-DRvZB3HV.svg +0 -0

app.py CHANGED Viewed

@@ -146,7 +146,6 @@ def parse_overview_block(block: List[str]) -> Dict[str, Any]:
                 re.sub(r"[^\d]", "", line.split(":", 1)[1])
             )
         elif "Postgraduate" in line and "Students" in line:
-            # Some pages have 'Postgraduate students' or 'Postgraduate Students'
             digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
             overview["postgraduate_students"] = int(digits) if digits else None
         elif line.startswith("Acceptance rate"):
@@ -192,7 +191,6 @@ def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
             "To qualify for The International Scholars Program",
         ],
     )
-    # Clean bullet style / stray punctuation
     benefits = [normalize_text(l) for l in benefits_lines if l]
     return {"benefits": benefits}
@@ -200,30 +198,7 @@ def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
 def parse_programs_block(block: List[str]) -> Dict[str, Any]:
     """
     Parse the 'Program table' portion.
-    We assume that after:
-      'To qualify for The International Scholars Program at <Uni>, you must be willing to study...'
-    we get repeated groups like:
-        Program
-        Designation
-        Entrance Exam Required
-        Examples of Career Pathways
-        Funding Category
-    But in the raw text, it often appears as:
-        MS Computer Science
-        STEM
-        Optional
-        Software Developer
-        Database Administrator
-        TIER 1
-    So we scan for the first occurrence of 'Program' header and then slice in chunks of 5-6 lines.
     """
-    # Grab everything after 'To qualify for ... you must be willing to study'
     program_lines = extract_between(
         block,
         start_marker="To qualify for The International Scholars Program",
@@ -238,11 +213,9 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
             "William Jessup University",
             "Wilkes University",
             "University of South Dakota",
-            # any other possible headings we might hit
         ],
     )
-    # Remove the header row if present
     header_keywords = {
         "Program",
         "Designation",
@@ -257,17 +230,9 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
             continue
         cleaned.append(line)
-    # Now group by 5-6 lines per program:
-    # 0: program_name
-    # 1: designation
-    # 2: entrance_exam
-    # 3: career_path_1
-    # 4: career_path_2 (optional, may be missing)
-    # 5: funding_category
     programs: List[Dict[str, Any]] = []
     i = 0
     while i < len(cleaned):
-        # Heuristic: we expect at least 4 lines ahead for a valid program
         remaining = len(cleaned) - i
         if remaining < 4:
             break
@@ -275,7 +240,6 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
         program_name = cleaned[i].strip()
         designation = cleaned[i + 1].strip() if remaining > 1 else ""
         entrance_exam = cleaned[i + 2].strip() if remaining > 2 else ""
-        # Next 1–2 lines are examples of career pathways until we hit something that looks like 'TIER'
         career_paths: List[str] = []
         j = i + 3
         while j < len(cleaned) and not cleaned[j].startswith("TIER"):
@@ -294,7 +258,6 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
             }
         )
-        # Move index to element after funding_category
         i = j + 1
     return {"programs": programs}
@@ -306,19 +269,11 @@ def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Dict[st
       - overview
       - benefits
       - programs
-    Return dict:
-    {
-        "overview": {...},
-        "benefits": {...},
-        "programs": {...}
-    }
     """
     sections: Dict[str, Dict[str, Any]] = {}
     overview = parse_overview_block(block)
     if overview:
-        # Always include explicit name for safety
         overview.setdefault("university_name", uni_name)
         sections["overview"] = overview
@@ -371,7 +326,6 @@ def run_full_sync(docx_file) -> str:
             continue
         for section_key, new_data in parsed_sections.items():
-            # We ONLY touch sections sourced from handbook: overview, benefits, programs
             if section_key not in ("overview", "benefits", "programs"):
                 continue
@@ -382,13 +336,11 @@ def run_full_sync(docx_file) -> str:
                     f"Will only update if row exists."
                 )
-            # Compare with DeepDiff
             diff = DeepDiff(current_data or {}, new_data, ignore_order=True)
             if not diff:
                 logs.append(f"[OK] '{uni_name}' [{section_key}] – no change.")
                 continue
-            # Update DB
             try:
                 update_section_json(uni_id, section_key, new_data)
                 total_updates += 1
@@ -411,7 +363,14 @@ ISP_PRIMARY = "#062A4D"
 ISP_GOLD = "#D6A229"
 ISP_BG = "#F5F7FA"
 ISP_TEXT = "#333333"
-ISP_LOGO = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
 css = f"""
 #isp-header {{
@@ -445,16 +404,18 @@ with gr.Blocks(css=css, title="Automated Handbook Sync Data Pipeline") as demo:
     # Header with Logo + Title
     with gr.Row(elem_id="isp-header"):
-        gr.HTML(f"""
-            <img id='isp-logo' src='{ISP_LOGO}'/>
-            <h1>Automated Handbook Sync Data Pipeline</h1>
-        """)
     gr.Markdown(
-        f"""
-### Welcome to the ISP Handbook Sync System
-This internal tool fully automates:
 - Parsing university sections from the official ISP Handbook
 - Comparing extracted content with the **university_handbook_sections** table
@@ -463,21 +424,20 @@ This internal tool fully automates:
 ---
-#### **Instructions**
 1. Upload the complete **ISP Handbook (.docx)**
 2. Click **Run Full Sync**
 3. Review the logs to see which university sections were updated
-Only official handbook-sourced fields are updated:
 - `overview`
 - `benefits`
 - `programs`
 Other database sections (e.g., images) remain untouched.
----
-        """
     )
     file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])

                 re.sub(r"[^\d]", "", line.split(":", 1)[1])
             )
         elif "Postgraduate" in line and "Students" in line:
             digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
             overview["postgraduate_students"] = int(digits) if digits else None
         elif line.startswith("Acceptance rate"):
             "To qualify for The International Scholars Program",
         ],
     )
     benefits = [normalize_text(l) for l in benefits_lines if l]
     return {"benefits": benefits}
 def parse_programs_block(block: List[str]) -> Dict[str, Any]:
     """
     Parse the 'Program table' portion.
     """
     program_lines = extract_between(
         block,
         start_marker="To qualify for The International Scholars Program",
             "William Jessup University",
             "Wilkes University",
             "University of South Dakota",
         ],
     )
     header_keywords = {
         "Program",
         "Designation",
             continue
         cleaned.append(line)
     programs: List[Dict[str, Any]] = []
     i = 0
     while i < len(cleaned):
         remaining = len(cleaned) - i
         if remaining < 4:
             break
         program_name = cleaned[i].strip()
         designation = cleaned[i + 1].strip() if remaining > 1 else ""
         entrance_exam = cleaned[i + 2].strip() if remaining > 2 else ""
         career_paths: List[str] = []
         j = i + 3
         while j < len(cleaned) and not cleaned[j].startswith("TIER"):
             }
         )
         i = j + 1
     return {"programs": programs}
       - overview
       - benefits
       - programs
     """
     sections: Dict[str, Dict[str, Any]] = {}
     overview = parse_overview_block(block)
     if overview:
         overview.setdefault("university_name", uni_name)
         sections["overview"] = overview
             continue
         for section_key, new_data in parsed_sections.items():
             if section_key not in ("overview", "benefits", "programs"):
                 continue
                     f"Will only update if row exists."
                 )
             diff = DeepDiff(current_data or {}, new_data, ignore_order=True)
             if not diff:
                 logs.append(f"[OK] '{uni_name}' [{section_key}] – no change.")
                 continue
             try:
                 update_section_json(uni_id, section_key, new_data)
                 total_updates += 1
 ISP_GOLD = "#D6A229"
 ISP_BG = "#F5F7FA"
 ISP_TEXT = "#333333"
+# Prefer local logo file (you must add this file in your repo: assets/logo-DRvZB3HV.svg)
+LOCAL_LOGO_PATH = "assets/logo-DRvZB3HV.svg"
+if os.path.exists(LOCAL_LOGO_PATH):
+    ISP_LOGO_SRC = LOCAL_LOGO_PATH
+else:
+    # Fallback to remote logo if local file missing
+    ISP_LOGO_SRC = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
 css = f"""
 #isp-header {{
     # Header with Logo + Title
     with gr.Row(elem_id="isp-header"):
+        gr.HTML(
+            f"""
+            <img id='isp-logo' src='{ISP_LOGO_SRC}' alt='ISP Logo'/>
+            <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
+            """
+        )
     gr.Markdown(
+        """
+### Automated Handbook Sync Data Pipeline
+This internal ISP tool automates:
 - Parsing university sections from the official ISP Handbook
 - Comparing extracted content with the **university_handbook_sections** table
 ---
+#### Instructions
 1. Upload the complete **ISP Handbook (.docx)**
 2. Click **Run Full Sync**
 3. Review the logs to see which university sections were updated
+Only handbook-sourced fields are updated:
 - `overview`
 - `benefits`
 - `programs`
 Other database sections (e.g., images) remain untouched.
+"""
     )
     file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])

assets/logo-DRvZB3HV.svg ADDED Viewed