internationalscholarsprogram committed on
Commit
e6a016f
·
1 Parent(s): fbbe199

Make overview parsing robust to missing colons

Browse files
Files changed (1) hide show
  1. app.py +51 -15
app.py CHANGED
@@ -86,13 +86,13 @@ def update_section_json(university_id: int, section_key: str, new_data: Dict[str
86
  # -----------------------------
87
  # DOCX PARSING HELPERS
88
  # -----------------------------
89
- def normalize_text(t):
90
  return " ".join(t.split()).strip()
91
 
92
 
93
- def split_doc_by_university(doc: Document):
94
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
95
- indices = []
96
  for i, p in enumerate(paragraphs):
97
  for uni in UNIVERSITY_ID_MAP.keys():
98
  if p == uni or p.startswith(uni):
@@ -108,22 +108,58 @@ def split_doc_by_university(doc: Document):
108
 
109
 
110
  def parse_overview_block(block: List[str]) -> Dict[str, Any]:
 
 
 
 
111
  data: Dict[str, Any] = {}
112
- for line in block:
113
- if line.startswith("Founded:"):
114
- data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  elif line.startswith("Total Students"):
116
- data["total_students"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
 
 
 
 
 
117
  elif "Postgraduate" in line:
118
- digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
 
119
  data["postgraduate_students"] = int(digits) if digits else None
 
 
120
  elif line.startswith("Acceptance rate"):
121
- data["acceptance_rate"] = line.split(":", 1)[1].strip()
122
- elif line.startswith("Location:"):
123
- data["location"] = line.split(":", 1)[1].strip()
 
 
 
 
 
 
124
  elif "Tuition" in line:
125
- digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
 
126
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
 
127
  return data
128
 
129
 
@@ -167,16 +203,17 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
167
 
168
  cleaned = [l for l in lines if l not in headers]
169
 
170
- programs = []
171
  i = 0
172
  while i < len(cleaned):
 
173
  if len(cleaned) - i < 4:
174
  break
175
 
176
  name = cleaned[i]
177
  designation = cleaned[i + 1]
178
  exam = cleaned[i + 2]
179
- careers = []
180
  j = i + 3
181
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
182
  careers.append(cleaned[j])
@@ -211,7 +248,6 @@ def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Any]:
211
  benefits = parse_benefits_block(block)
212
  programs = parse_programs_block(block)
213
 
214
- # If everything failed, return empty dict so caller can handle
215
  result: Dict[str, Any] = {}
216
  if overview:
217
  result["overview"] = overview
 
86
  # -----------------------------
87
  # DOCX PARSING HELPERS
88
  # -----------------------------
89
def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to one space and trim both ends.

    Args:
        t: Arbitrary text (e.g. the text of one paragraph).

    Returns:
        The text with tabs, newlines and repeated spaces replaced by single
        spaces; an empty string if *t* is empty or whitespace-only.
    """
    # str.split() with no arguments already drops leading/trailing whitespace,
    # so the joined result needs no additional strip().
    return " ".join(t.split())
91
 
92
 
93
+ def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
94
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
95
+ indices: List[tuple[int, str]] = []
96
  for i, p in enumerate(paragraphs):
97
  for uni in UNIVERSITY_ID_MAP.keys():
98
  if p == uni or p.startswith(uni):
 
108
 
109
 
110
  def parse_overview_block(block: List[str]) -> Dict[str, Any]:
111
+ """
112
+ Parse the top 'overview' section (Founded, Total Students, etc.)
113
+ in a robust way that doesn't assume colons are always present.
114
+ """
115
  data: Dict[str, Any] = {}
116
+
117
+ def after_colon(line: str) -> str:
118
+ """Safely return the part after ':' if present, else empty string."""
119
+ parts = line.split(":", 1)
120
+ return parts[1].strip() if len(parts) > 1 else ""
121
+
122
+ for raw_line in block:
123
+ line = raw_line.strip()
124
+ if not line:
125
+ continue
126
+
127
+ # Founded
128
+ if line.startswith("Founded"):
129
+ value = after_colon(line) or line # fallback to entire line
130
+ digits = re.sub(r"[^\d]", "", value)
131
+ if digits:
132
+ data["founded"] = int(digits)
133
+
134
+ # Total Students
135
  elif line.startswith("Total Students"):
136
+ value = after_colon(line) or line
137
+ digits = re.sub(r"[^\d]", "", value)
138
+ if digits:
139
+ data["total_students"] = int(digits)
140
+
141
+ # Postgraduate students
142
  elif "Postgraduate" in line:
143
+ value = after_colon(line) or line
144
+ digits = re.sub(r"[^\d]", "", value)
145
  data["postgraduate_students"] = int(digits) if digits else None
146
+
147
+ # Acceptance rate
148
  elif line.startswith("Acceptance rate"):
149
+ value = after_colon(line) or line
150
+ data["acceptance_rate"] = value
151
+
152
+ # Location
153
+ elif line.startswith("Location"):
154
+ value = after_colon(line) or line
155
+ data["location"] = value
156
+
157
+ # Tuition (out-of-state yearly)
158
  elif "Tuition" in line:
159
+ value = after_colon(line) or line
160
+ digits = re.sub(r"[^\d]", "", value)
161
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
162
+
163
  return data
164
 
165
 
 
203
 
204
  cleaned = [l for l in lines if l not in headers]
205
 
206
+ programs: List[Dict[str, Any]] = []
207
  i = 0
208
  while i < len(cleaned):
209
+ # Need at least 4 lines: name, designation, exam, at least one career/tier
210
  if len(cleaned) - i < 4:
211
  break
212
 
213
  name = cleaned[i]
214
  designation = cleaned[i + 1]
215
  exam = cleaned[i + 2]
216
+ careers: List[str] = []
217
  j = i + 3
218
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
219
  careers.append(cleaned[j])
 
248
  benefits = parse_benefits_block(block)
249
  programs = parse_programs_block(block)
250
 
 
251
  result: Dict[str, Any] = {}
252
  if overview:
253
  result["overview"] = overview