internationalscholarsprogram committed on
Commit
40200e1
·
1 Parent(s): a45863a

Brand ISP Automated Handbook Sync Data Pipeline

Browse files
Files changed (1) hide show
  1. app.py +79 -22
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import json
3
  import re
4
- from typing import Dict, Any, List, Tuple
5
 
6
  import gradio as gr
7
  from docx import Document
@@ -243,14 +243,21 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
243
  )
244
 
245
  # Remove the header row if present
246
- header_keywords = {"Program", "Designation", "Entrance Exam Required", "Entrance Examination", "Examples of Career Pathways", "Funding Category"}
 
 
 
 
 
 
 
247
  cleaned: List[str] = []
248
  for line in program_lines:
249
  if line in header_keywords:
250
  continue
251
  cleaned.append(line)
252
 
253
- # Now group by 5-6 lines per program:
254
  # 0: program_name
255
  # 1: designation
256
  # 2: entrance_exam
@@ -370,9 +377,6 @@ def run_full_sync(docx_file) -> str:
370
 
371
  current_data = fetch_section_json(uni_id, section_key)
372
  if current_data is None:
373
- # No existing record or invalid JSON – we still require that the row exists;
374
- # if not, we just log and skip.
375
- # If you want to INSERT missing rows, you can add that logic here.
376
  logs.append(
377
  f"[INFO] No existing JSON for uni_id={uni_id}, section_key='{section_key}'. "
378
  f"Will only update if row exists."
@@ -401,32 +405,84 @@ def run_full_sync(docx_file) -> str:
401
 
402
 
403
  # -----------------------------
404
- # GRADIO UI
405
  # -----------------------------
406
- with gr.Blocks() as demo:
407
- gr.Markdown("# ISP Handbook → Database Sync (Full Auto)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  gr.Markdown(
409
- """
410
- Upload the **full ISP Handbook DOCX**.
411
- On **Run full sync**, the app will:
 
 
 
 
 
 
412
 
413
- 1. Parse each university block from the handbook
414
- 2. Extract **Overview**, **Benefits**, and **Programs** sections
415
- 3. Compare them with `university_handbook_sections.section_json`
416
- 4. Update only rows that have changed
417
 
418
- Only sections that are sourced from the handbook are touched:
419
- - `overview`
420
- - `benefits`
421
- - `programs`
422
 
423
- Sections like `campus_image` / `image` are **never updated** here.
 
 
 
 
 
 
 
 
 
 
 
424
  """
425
  )
426
 
427
  file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
428
 
429
- sync_button = gr.Button("Run full sync")
430
  log_output = gr.Textbox(
431
  label="Sync Log",
432
  lines=30,
@@ -439,5 +495,6 @@ Sections like `campus_image` / `image` are **never updated** here.
439
  outputs=log_output,
440
  )
441
 
 
442
  if __name__ == "__main__":
443
  demo.launch()
 
1
  import os
2
  import json
3
  import re
4
+ from typing import Dict, Any, List
5
 
6
  import gradio as gr
7
  from docx import Document
 
243
  )
244
 
245
  # Remove the header row if present
246
+ header_keywords = {
247
+ "Program",
248
+ "Designation",
249
+ "Entrance Exam Required",
250
+ "Entrance Examination",
251
+ "Examples of Career Pathways",
252
+ "Funding Category",
253
+ }
254
  cleaned: List[str] = []
255
  for line in program_lines:
256
  if line in header_keywords:
257
  continue
258
  cleaned.append(line)
259
 
260
+ # Now group by 5-6 lines per program:
261
  # 0: program_name
262
  # 1: designation
263
  # 2: entrance_exam
 
377
 
378
  current_data = fetch_section_json(uni_id, section_key)
379
  if current_data is None:
 
 
 
380
  logs.append(
381
  f"[INFO] No existing JSON for uni_id={uni_id}, section_key='{section_key}'. "
382
  f"Will only update if row exists."
 
405
 
406
 
407
  # -----------------------------
408
+ # ISP BRANDING & GRADIO UI
409
  # -----------------------------
410
+ ISP_PRIMARY = "#062A4D"
411
+ ISP_GOLD = "#D6A229"
412
+ ISP_BG = "#F5F7FA"
413
+ ISP_TEXT = "#333333"
414
+ ISP_LOGO = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
415
+
416
+ css = f"""
417
+ #isp-header {{
418
+ background: {ISP_PRIMARY};
419
+ padding: 20px;
420
+ border-radius: 6px;
421
+ display: flex;
422
+ align-items: center;
423
+ gap: 20px;
424
+ }}
425
+ #isp-header h1 {{
426
+ color: white !important;
427
+ font-size: 28px !important;
428
+ margin: 0;
429
+ }}
430
+ #isp-logo {{
431
+ height: 60px;
432
+ }}
433
+ .gradio-container {{
434
+ background: {ISP_BG} !important;
435
+ }}
436
+ button {{
437
+ background-color: {ISP_GOLD} !important;
438
+ color: black !important;
439
+ font-weight: bold !important;
440
+ border-radius: 8px !important;
441
+ }}
442
+ """
443
+
444
+ with gr.Blocks(css=css, title="Automated Handbook Sync Data Pipeline") as demo:
445
+
446
+ # Header with Logo + Title
447
+ with gr.Row(elem_id="isp-header"):
448
+ gr.HTML(f"""
449
+ <img id='isp-logo' src='{ISP_LOGO}'/>
450
+ <h1>Automated Handbook Sync Data Pipeline</h1>
451
+ """)
452
+
453
  gr.Markdown(
454
+ f"""
455
+ ### Welcome to the ISP Handbook Sync System
456
+
457
+ This internal tool fully automates:
458
+
459
+ - Parsing university sections from the official ISP Handbook
460
+ - Comparing extracted content with the **university_handbook_sections** table
461
+ - Updating only fields that have changed
462
+ - Maintaining data uniformity and reducing manual effort
463
 
464
+ ---
 
 
 
465
 
466
+ #### **Instructions**
 
 
 
467
 
468
+ 1. Upload the complete **ISP Handbook (.docx)**
469
+ 2. Click **Run Full Sync**
470
+ 3. Review the logs to see which university sections were updated
471
+
472
+ Only official handbook-sourced fields are updated:
473
+ - `overview`
474
+ - `benefits`
475
+ - `programs`
476
+
477
+ Other database sections (e.g., images) remain untouched.
478
+
479
+ ---
480
  """
481
  )
482
 
483
  file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
484
 
485
+ sync_button = gr.Button("Run Full Sync")
486
  log_output = gr.Textbox(
487
  label="Sync Log",
488
  lines=30,
 
495
  outputs=log_output,
496
  )
497
 
498
+
499
  if __name__ == "__main__":
500
  demo.launch()