Spaces:

internationalscholarsprogram
/

docx-json-sync

Sleeping

App Files Files Community

internationalscholarsprogram committed on Dec 8, 2025

Commit

e6a016f

1 Parent(s): fbbe199

Make overview parsing robust to missing colons

Browse files

Files changed (1) hide show

app.py +51 -15

app.py CHANGED Viewed

@@ -86,13 +86,13 @@ def update_section_json(university_id: int, section_key: str, new_data: Dict[str
 # -----------------------------
 # DOCX PARSING HELPERS
 # -----------------------------
-def normalize_text(t):
     return " ".join(t.split()).strip()
-def split_doc_by_university(doc: Document):
     paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
-    indices = []
     for i, p in enumerate(paragraphs):
         for uni in UNIVERSITY_ID_MAP.keys():
             if p == uni or p.startswith(uni):
@@ -108,22 +108,58 @@ def split_doc_by_university(doc: Document):
 def parse_overview_block(block: List[str]) -> Dict[str, Any]:
     data: Dict[str, Any] = {}
-    for line in block:
-        if line.startswith("Founded:"):
-            data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
         elif line.startswith("Total Students"):
-            data["total_students"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
         elif "Postgraduate" in line:
-            digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
             data["postgraduate_students"] = int(digits) if digits else None
         elif line.startswith("Acceptance rate"):
-            data["acceptance_rate"] = line.split(":", 1)[1].strip()
-        elif line.startswith("Location:"):
-            data["location"] = line.split(":", 1)[1].strip()
         elif "Tuition" in line:
-            digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
             data["tuition_out_of_state_yearly"] = int(digits) if digits else None
     return data
@@ -167,16 +203,17 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
     cleaned = [l for l in lines if l not in headers]
-    programs = []
     i = 0
     while i < len(cleaned):
         if len(cleaned) - i < 4:
             break
         name = cleaned[i]
         designation = cleaned[i + 1]
         exam = cleaned[i + 2]
-        careers = []
         j = i + 3
         while j < len(cleaned) and not cleaned[j].startswith("TIER"):
             careers.append(cleaned[j])
@@ -211,7 +248,6 @@ def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Any]:
     benefits = parse_benefits_block(block)
     programs = parse_programs_block(block)
-    # If everything failed, return empty dict so caller can handle
     result: Dict[str, Any] = {}
     if overview:
         result["overview"] = overview

 # -----------------------------
 # DOCX PARSING HELPERS
 # -----------------------------
def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in ``t`` to one space and trim both ends.

    Args:
        t: Raw text, e.g. the text of a DOCX paragraph.

    Returns:
        The words of ``t`` joined by single spaces; an empty string when
        ``t`` is empty or contains only whitespace.
    """
    # str.split() with no separator already drops leading/trailing whitespace
    # and empty fields, so the joined result needs no extra strip().
    return " ".join(t.split())
+def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
     paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
+    indices: List[tuple[int, str]] = []
     for i, p in enumerate(paragraphs):
         for uni in UNIVERSITY_ID_MAP.keys():
             if p == uni or p.startswith(uni):
 def parse_overview_block(block: List[str]) -> Dict[str, Any]:
+    """
+    Parse the top 'overview' section (Founded, Total Students, etc.)
+    in a robust way that doesn't assume colons are always present.
+    """
     data: Dict[str, Any] = {}
+    def after_colon(line: str) -> str:
+        """Safely return the part after ':' if present, else empty string."""
+        parts = line.split(":", 1)
+        return parts[1].strip() if len(parts) > 1 else ""
+    for raw_line in block:
+        line = raw_line.strip()
+        if not line:
+            continue
+        # Founded
+        if line.startswith("Founded"):
+            value = after_colon(line) or line  # fallback to entire line
+            digits = re.sub(r"[^\d]", "", value)
+            if digits:
+                data["founded"] = int(digits)
+        # Total Students
         elif line.startswith("Total Students"):
+            value = after_colon(line) or line
+            digits = re.sub(r"[^\d]", "", value)
+            if digits:
+                data["total_students"] = int(digits)
+        # Postgraduate students
         elif "Postgraduate" in line:
+            value = after_colon(line) or line
+            digits = re.sub(r"[^\d]", "", value)
             data["postgraduate_students"] = int(digits) if digits else None
+        # Acceptance rate
         elif line.startswith("Acceptance rate"):
+            value = after_colon(line) or line
+            data["acceptance_rate"] = value
+        # Location
+        elif line.startswith("Location"):
+            value = after_colon(line) or line
+            data["location"] = value
+        # Tuition (out-of-state yearly)
         elif "Tuition" in line:
+            value = after_colon(line) or line
+            digits = re.sub(r"[^\d]", "", value)
             data["tuition_out_of_state_yearly"] = int(digits) if digits else None
     return data
     cleaned = [l for l in lines if l not in headers]
+    programs: List[Dict[str, Any]] = []
     i = 0
     while i < len(cleaned):
+        # Need at least 4 lines: name, designation, exam, at least one career/tier
         if len(cleaned) - i < 4:
             break
         name = cleaned[i]
         designation = cleaned[i + 1]
         exam = cleaned[i + 2]
+        careers: List[str] = []
         j = i + 3
         while j < len(cleaned) and not cleaned[j].startswith("TIER"):
             careers.append(cleaned[j])
     benefits = parse_benefits_block(block)
     programs = parse_programs_block(block)
     result: Dict[str, Any] = {}
     if overview:
         result["overview"] = overview