Commit
·
e6a016f
1
Parent(s):
fbbe199
Make overview parsing robust to missing colons
Browse files
app.py
CHANGED
|
@@ -86,13 +86,13 @@ def update_section_json(university_id: int, section_key: str, new_data: Dict[str
|
|
| 86 |
# -----------------------------
|
| 87 |
# DOCX PARSING HELPERS
|
| 88 |
# -----------------------------
|
| 89 |
-
def normalize_text(t):
|
| 90 |
return " ".join(t.split()).strip()
|
| 91 |
|
| 92 |
|
| 93 |
-
def split_doc_by_university(doc: Document):
|
| 94 |
paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
|
| 95 |
-
indices = []
|
| 96 |
for i, p in enumerate(paragraphs):
|
| 97 |
for uni in UNIVERSITY_ID_MAP.keys():
|
| 98 |
if p == uni or p.startswith(uni):
|
|
@@ -108,22 +108,58 @@ def split_doc_by_university(doc: Document):
|
|
| 108 |
|
| 109 |
|
| 110 |
def parse_overview_block(block: List[str]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
data: Dict[str, Any] = {}
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
elif line.startswith("Total Students"):
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
elif "Postgraduate" in line:
|
| 118 |
-
|
|
|
|
| 119 |
data["postgraduate_students"] = int(digits) if digits else None
|
|
|
|
|
|
|
| 120 |
elif line.startswith("Acceptance rate"):
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
elif "Tuition" in line:
|
| 125 |
-
|
|
|
|
| 126 |
data["tuition_out_of_state_yearly"] = int(digits) if digits else None
|
|
|
|
| 127 |
return data
|
| 128 |
|
| 129 |
|
|
@@ -167,16 +203,17 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
|
|
| 167 |
|
| 168 |
cleaned = [l for l in lines if l not in headers]
|
| 169 |
|
| 170 |
-
programs = []
|
| 171 |
i = 0
|
| 172 |
while i < len(cleaned):
|
|
|
|
| 173 |
if len(cleaned) - i < 4:
|
| 174 |
break
|
| 175 |
|
| 176 |
name = cleaned[i]
|
| 177 |
designation = cleaned[i + 1]
|
| 178 |
exam = cleaned[i + 2]
|
| 179 |
-
careers = []
|
| 180 |
j = i + 3
|
| 181 |
while j < len(cleaned) and not cleaned[j].startswith("TIER"):
|
| 182 |
careers.append(cleaned[j])
|
|
@@ -211,7 +248,6 @@ def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Any]:
|
|
| 211 |
benefits = parse_benefits_block(block)
|
| 212 |
programs = parse_programs_block(block)
|
| 213 |
|
| 214 |
-
# If everything failed, return empty dict so caller can handle
|
| 215 |
result: Dict[str, Any] = {}
|
| 216 |
if overview:
|
| 217 |
result["overview"] = overview
|
|
|
|
| 86 |
# -----------------------------
|
| 87 |
# DOCX PARSING HELPERS
|
| 88 |
# -----------------------------
|
| 89 |
+
def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to one space and trim both ends."""
    # str.split() with no argument discards leading/trailing whitespace and
    # never yields empty tokens, so re-joining needs no extra strip.
    words = t.split()
    return " ".join(words)
|
| 91 |
|
| 92 |
|
| 93 |
+
def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
|
| 94 |
paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
|
| 95 |
+
indices: List[tuple[int, str]] = []
|
| 96 |
for i, p in enumerate(paragraphs):
|
| 97 |
for uni in UNIVERSITY_ID_MAP.keys():
|
| 98 |
if p == uni or p.startswith(uni):
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
def parse_overview_block(block: List[str]) -> Dict[str, Any]:
    """
    Parse the top 'overview' section (Founded, Total Students, etc.)
    in a robust way that doesn't assume colons are always present.

    Every non-blank line is checked against the labels below; the first
    matching rule wins and later lines overwrite earlier ones:

    * starts with "Founded"         -> ``founded`` (int)
    * starts with "Total Students"  -> ``total_students`` (int)
    * contains "Postgraduate"       -> ``postgraduate_students`` (int or None)
    * starts with "Acceptance rate" -> ``acceptance_rate`` (str)
    * starts with "Location"        -> ``location`` (str)
    * contains "Tuition"            -> ``tuition_out_of_state_yearly`` (int or None)

    Args:
        block: Lines of text to scan; blank lines are ignored.

    Returns:
        A dict holding only the keys that were recognised. ``founded`` and
        ``total_students`` are present only when a number was found.
        ``postgraduate_students`` and ``tuition_out_of_state_yearly`` are
        present (possibly ``None``) whenever a matching line was seen.
        ``acceptance_rate`` and ``location`` are present only when the line
        carries a non-empty value besides the label.
    """
    data: Dict[str, Any] = {}

    # One number: either thousands-grouped ("23,731", "23 731") or a plain
    # digit run. Taking only the first number keeps trailing annotations such
    # as "(2023)" from being glued onto the value.
    number_re = re.compile(r"\d{1,3}(?:[,\s]\d{3})+(?!\d)|\d+")
    # Characters that may sit between a label and its value when no colon is used.
    separators = " \t:-\u2013\u2014="

    def after_colon(line: str) -> str:
        """Safely return the part after the first ':' if present, else empty string."""
        _, _, tail = line.partition(":")
        return tail.strip()

    def first_number(line: str):
        """Return the first number after the colon (or in the whole line), else None."""
        match = number_re.search(after_colon(line) or line)
        if match is None:
            return None
        return int(re.sub(r"[,\s]", "", match.group()))

    def text_value(line: str, label: str) -> str:
        """Return the text after the colon, or the line with its leading label removed."""
        return after_colon(line) or line[len(label):].lstrip(separators).strip()

    for raw_line in block:
        line = raw_line.strip()
        if not line:
            continue

        # Founded
        if line.startswith("Founded"):
            number = first_number(line)
            if number is not None:
                data["founded"] = number

        # Total Students
        elif line.startswith("Total Students"):
            number = first_number(line)
            if number is not None:
                data["total_students"] = number

        # Postgraduate students
        elif "Postgraduate" in line:
            number = first_number(line)
            if number is not None:
                data["postgraduate_students"] = number
            else:
                # Keep the key present, but never clobber a value parsed earlier.
                data.setdefault("postgraduate_students", None)

        # Acceptance rate
        elif line.startswith("Acceptance rate"):
            value = text_value(line, "Acceptance rate")
            if value:
                data["acceptance_rate"] = value

        # Location
        elif line.startswith("Location"):
            value = text_value(line, "Location")
            if value:
                data["location"] = value

        # Tuition (out-of-state yearly)
        elif "Tuition" in line:
            number = first_number(line)
            if number is not None:
                data["tuition_out_of_state_yearly"] = number
            else:
                # Keep the key present, but never clobber a value parsed earlier.
                data.setdefault("tuition_out_of_state_yearly", None)

    return data
|
| 164 |
|
| 165 |
|
|
|
|
| 203 |
|
| 204 |
cleaned = [l for l in lines if l not in headers]
|
| 205 |
|
| 206 |
+
programs: List[Dict[str, Any]] = []
|
| 207 |
i = 0
|
| 208 |
while i < len(cleaned):
|
| 209 |
+
# Need at least 4 lines: name, designation, exam, at least one career/tier
|
| 210 |
if len(cleaned) - i < 4:
|
| 211 |
break
|
| 212 |
|
| 213 |
name = cleaned[i]
|
| 214 |
designation = cleaned[i + 1]
|
| 215 |
exam = cleaned[i + 2]
|
| 216 |
+
careers: List[str] = []
|
| 217 |
j = i + 3
|
| 218 |
while j < len(cleaned) and not cleaned[j].startswith("TIER"):
|
| 219 |
careers.append(cleaned[j])
|
|
|
|
| 248 |
benefits = parse_benefits_block(block)
|
| 249 |
programs = parse_programs_block(block)
|
| 250 |
|
|
|
|
| 251 |
result: Dict[str, Any] = {}
|
| 252 |
if overview:
|
| 253 |
result["overview"] = overview
|