internationalscholarsprogram committed on
Commit
a81d369
·
1 Parent(s): 40200e1

Brand data pipeline UI with ISP logo

Browse files
Files changed (2) hide show
  1. app.py +21 -61
  2. assets/logo-DRvZB3HV.svg +0 -0
app.py CHANGED
@@ -146,7 +146,6 @@ def parse_overview_block(block: List[str]) -> Dict[str, Any]:
146
  re.sub(r"[^\d]", "", line.split(":", 1)[1])
147
  )
148
  elif "Postgraduate" in line and "Students" in line:
149
- # Some pages have 'Postgraduate students' or 'Postgraduate Students'
150
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
151
  overview["postgraduate_students"] = int(digits) if digits else None
152
  elif line.startswith("Acceptance rate"):
@@ -192,7 +191,6 @@ def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
192
  "To qualify for The International Scholars Program",
193
  ],
194
  )
195
- # Clean bullet style / stray punctuation
196
  benefits = [normalize_text(l) for l in benefits_lines if l]
197
  return {"benefits": benefits}
198
 
@@ -200,30 +198,7 @@ def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
200
  def parse_programs_block(block: List[str]) -> Dict[str, Any]:
201
  """
202
  Parse the 'Program table' portion.
203
-
204
- We assume that after:
205
- 'To qualify for The International Scholars Program at <Uni>, you must be willing to study...'
206
-
207
- we get repeated groups like:
208
-
209
- Program
210
- Designation
211
- Entrance Exam Required
212
- Examples of Career Pathways
213
- Funding Category
214
-
215
- But in the raw text, it often appears as:
216
-
217
- MS Computer Science
218
- STEM
219
- Optional
220
- Software Developer
221
- Database Administrator
222
- TIER 1
223
-
224
- So we scan for the first occurrence of 'Program' header and then slice in chunks of 5-6 lines.
225
  """
226
- # Grab everything after 'To qualify for ... you must be willing to study'
227
  program_lines = extract_between(
228
  block,
229
  start_marker="To qualify for The International Scholars Program",
@@ -238,11 +213,9 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
238
  "William Jessup University",
239
  "Wilkes University",
240
  "University of South Dakota",
241
- # any other possible headings we might hit
242
  ],
243
  )
244
 
245
- # Remove the header row if present
246
  header_keywords = {
247
  "Program",
248
  "Designation",
@@ -257,17 +230,9 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
257
  continue
258
  cleaned.append(line)
259
 
260
- # Now group by 5-6 lines per program:
261
- # 0: program_name
262
- # 1: designation
263
- # 2: entrance_exam
264
- # 3: career_path_1
265
- # 4: career_path_2 (optional, may be missing)
266
- # 5: funding_category
267
  programs: List[Dict[str, Any]] = []
268
  i = 0
269
  while i < len(cleaned):
270
- # Heuristic: we expect at least 4 lines ahead for a valid program
271
  remaining = len(cleaned) - i
272
  if remaining < 4:
273
  break
@@ -275,7 +240,6 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
275
  program_name = cleaned[i].strip()
276
  designation = cleaned[i + 1].strip() if remaining > 1 else ""
277
  entrance_exam = cleaned[i + 2].strip() if remaining > 2 else ""
278
- # Next 1–2 lines are examples of career pathways until we hit something that looks like 'TIER'
279
  career_paths: List[str] = []
280
  j = i + 3
281
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
@@ -294,7 +258,6 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
294
  }
295
  )
296
 
297
- # Move index to element after funding_category
298
  i = j + 1
299
 
300
  return {"programs": programs}
@@ -306,19 +269,11 @@ def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Dict[st
306
  - overview
307
  - benefits
308
  - programs
309
-
310
- Return dict:
311
- {
312
- "overview": {...},
313
- "benefits": {...},
314
- "programs": {...}
315
- }
316
  """
317
  sections: Dict[str, Dict[str, Any]] = {}
318
 
319
  overview = parse_overview_block(block)
320
  if overview:
321
- # Always include explicit name for safety
322
  overview.setdefault("university_name", uni_name)
323
  sections["overview"] = overview
324
 
@@ -371,7 +326,6 @@ def run_full_sync(docx_file) -> str:
371
  continue
372
 
373
  for section_key, new_data in parsed_sections.items():
374
- # We ONLY touch sections sourced from handbook: overview, benefits, programs
375
  if section_key not in ("overview", "benefits", "programs"):
376
  continue
377
 
@@ -382,13 +336,11 @@ def run_full_sync(docx_file) -> str:
382
  f"Will only update if row exists."
383
  )
384
 
385
- # Compare with DeepDiff
386
  diff = DeepDiff(current_data or {}, new_data, ignore_order=True)
387
  if not diff:
388
  logs.append(f"[OK] '{uni_name}' [{section_key}] – no change.")
389
  continue
390
 
391
- # Update DB
392
  try:
393
  update_section_json(uni_id, section_key, new_data)
394
  total_updates += 1
@@ -411,7 +363,14 @@ ISP_PRIMARY = "#062A4D"
411
  ISP_GOLD = "#D6A229"
412
  ISP_BG = "#F5F7FA"
413
  ISP_TEXT = "#333333"
414
- ISP_LOGO = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
 
 
 
 
 
 
 
415
 
416
  css = f"""
417
  #isp-header {{
@@ -445,16 +404,18 @@ with gr.Blocks(css=css, title="Automated Handbook Sync Data Pipeline") as demo:
445
 
446
  # Header with Logo + Title
447
  with gr.Row(elem_id="isp-header"):
448
- gr.HTML(f"""
449
- <img id='isp-logo' src='{ISP_LOGO}'/>
450
- <h1>Automated Handbook Sync Data Pipeline</h1>
451
- """)
 
 
452
 
453
  gr.Markdown(
454
- f"""
455
- ### Welcome to the ISP Handbook Sync System
456
 
457
- This internal tool fully automates:
458
 
459
  - Parsing university sections from the official ISP Handbook
460
  - Comparing extracted content with the **university_handbook_sections** table
@@ -463,21 +424,20 @@ This internal tool fully automates:
463
 
464
  ---
465
 
466
- #### **Instructions**
467
 
468
  1. Upload the complete **ISP Handbook (.docx)**
469
  2. Click **Run Full Sync**
470
  3. Review the logs to see which university sections were updated
471
 
472
- Only official handbook-sourced fields are updated:
 
473
  - `overview`
474
  - `benefits`
475
  - `programs`
476
 
477
  Other database sections (e.g., images) remain untouched.
478
-
479
- ---
480
- """
481
  )
482
 
483
  file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
 
146
  re.sub(r"[^\d]", "", line.split(":", 1)[1])
147
  )
148
  elif "Postgraduate" in line and "Students" in line:
 
149
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
150
  overview["postgraduate_students"] = int(digits) if digits else None
151
  elif line.startswith("Acceptance rate"):
 
191
  "To qualify for The International Scholars Program",
192
  ],
193
  )
 
194
  benefits = [normalize_text(l) for l in benefits_lines if l]
195
  return {"benefits": benefits}
196
 
 
198
  def parse_programs_block(block: List[str]) -> Dict[str, Any]:
199
  """
200
  Parse the 'Program table' portion.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  """
 
202
  program_lines = extract_between(
203
  block,
204
  start_marker="To qualify for The International Scholars Program",
 
213
  "William Jessup University",
214
  "Wilkes University",
215
  "University of South Dakota",
 
216
  ],
217
  )
218
 
 
219
  header_keywords = {
220
  "Program",
221
  "Designation",
 
230
  continue
231
  cleaned.append(line)
232
 
 
 
 
 
 
 
 
233
  programs: List[Dict[str, Any]] = []
234
  i = 0
235
  while i < len(cleaned):
 
236
  remaining = len(cleaned) - i
237
  if remaining < 4:
238
  break
 
240
  program_name = cleaned[i].strip()
241
  designation = cleaned[i + 1].strip() if remaining > 1 else ""
242
  entrance_exam = cleaned[i + 2].strip() if remaining > 2 else ""
 
243
  career_paths: List[str] = []
244
  j = i + 3
245
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
 
258
  }
259
  )
260
 
 
261
  i = j + 1
262
 
263
  return {"programs": programs}
 
269
  - overview
270
  - benefits
271
  - programs
 
 
 
 
 
 
 
272
  """
273
  sections: Dict[str, Dict[str, Any]] = {}
274
 
275
  overview = parse_overview_block(block)
276
  if overview:
 
277
  overview.setdefault("university_name", uni_name)
278
  sections["overview"] = overview
279
 
 
326
  continue
327
 
328
  for section_key, new_data in parsed_sections.items():
 
329
  if section_key not in ("overview", "benefits", "programs"):
330
  continue
331
 
 
336
  f"Will only update if row exists."
337
  )
338
 
 
339
  diff = DeepDiff(current_data or {}, new_data, ignore_order=True)
340
  if not diff:
341
  logs.append(f"[OK] '{uni_name}' [{section_key}] – no change.")
342
  continue
343
 
 
344
  try:
345
  update_section_json(uni_id, section_key, new_data)
346
  total_updates += 1
 
363
  ISP_GOLD = "#D6A229"
364
  ISP_BG = "#F5F7FA"
365
  ISP_TEXT = "#333333"
366
+
367
+ # Prefer local logo file (you must add this file in your repo: assets/logo-DRvZB3HV.svg)
368
+ LOCAL_LOGO_PATH = "assets/logo-DRvZB3HV.svg"
369
+ if os.path.exists(LOCAL_LOGO_PATH):
370
+ ISP_LOGO_SRC = LOCAL_LOGO_PATH
371
+ else:
372
+ # Fallback to remote logo if local file missing
373
+ ISP_LOGO_SRC = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
374
 
375
  css = f"""
376
  #isp-header {{
 
404
 
405
  # Header with Logo + Title
406
  with gr.Row(elem_id="isp-header"):
407
+ gr.HTML(
408
+ f"""
409
+ <img id='isp-logo' src='{ISP_LOGO_SRC}' alt='ISP Logo'/>
410
+ <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
411
+ """
412
+ )
413
 
414
  gr.Markdown(
415
+ """
416
+ ### Automated Handbook Sync Data Pipeline
417
 
418
+ This internal ISP tool automates:
419
 
420
  - Parsing university sections from the official ISP Handbook
421
  - Comparing extracted content with the **university_handbook_sections** table
 
424
 
425
  ---
426
 
427
+ #### Instructions
428
 
429
  1. Upload the complete **ISP Handbook (.docx)**
430
  2. Click **Run Full Sync**
431
  3. Review the logs to see which university sections were updated
432
 
433
+ Only handbook-sourced fields are updated:
434
+
435
  - `overview`
436
  - `benefits`
437
  - `programs`
438
 
439
  Other database sections (e.g., images) remain untouched.
440
+ """
 
 
441
  )
442
 
443
  file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
assets/logo-DRvZB3HV.svg ADDED