Commit
·
40200e1
1
Parent(s):
a45863a
Brand ISP Automated Handbook Sync Data Pipeline
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import re
|
| 4 |
-
from typing import Dict, Any, List
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
from docx import Document
|
|
@@ -243,14 +243,21 @@ def parse_programs_block(block: List[str]) -> Dict[str, Any]:
|
|
| 243 |
)
|
| 244 |
|
| 245 |
# Remove the header row if present
|
| 246 |
-
header_keywords = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
cleaned: List[str] = []
|
| 248 |
for line in program_lines:
|
| 249 |
if line in header_keywords:
|
| 250 |
continue
|
| 251 |
cleaned.append(line)
|
| 252 |
|
| 253 |
-
# Now group by 5-6 lines per program:
|
| 254 |
# 0: program_name
|
| 255 |
# 1: designation
|
| 256 |
# 2: entrance_exam
|
|
@@ -370,9 +377,6 @@ def run_full_sync(docx_file) -> str:
|
|
| 370 |
|
| 371 |
current_data = fetch_section_json(uni_id, section_key)
|
| 372 |
if current_data is None:
|
| 373 |
-
# No existing record or invalid JSON – we still require that the row exists;
|
| 374 |
-
# if not, we just log and skip.
|
| 375 |
-
# If you want to INSERT missing rows, you can add that logic here.
|
| 376 |
logs.append(
|
| 377 |
f"[INFO] No existing JSON for uni_id={uni_id}, section_key='{section_key}'. "
|
| 378 |
f"Will only update if row exists."
|
|
@@ -401,32 +405,84 @@ def run_full_sync(docx_file) -> str:
|
|
| 401 |
|
| 402 |
|
| 403 |
# -----------------------------
|
| 404 |
-
# GRADIO UI
|
| 405 |
# -----------------------------
|
| 406 |
-
|
| 407 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
gr.Markdown(
|
| 409 |
-
"""
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
-
|
| 414 |
-
2. Extract **Overview**, **Benefits**, and **Programs** sections
|
| 415 |
-
3. Compare them with `university_handbook_sections.section_json`
|
| 416 |
-
4. Update only rows that have changed
|
| 417 |
|
| 418 |
-
|
| 419 |
-
- `overview`
|
| 420 |
-
- `benefits`
|
| 421 |
-
- `programs`
|
| 422 |
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
"""
|
| 425 |
)
|
| 426 |
|
| 427 |
file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
|
| 428 |
|
| 429 |
-
sync_button = gr.Button("Run
|
| 430 |
log_output = gr.Textbox(
|
| 431 |
label="Sync Log",
|
| 432 |
lines=30,
|
|
@@ -439,5 +495,6 @@ Sections like `campus_image` / `image` are **never updated** here.
|
|
| 439 |
outputs=log_output,
|
| 440 |
)
|
| 441 |
|
|
|
|
| 442 |
if __name__ == "__main__":
|
| 443 |
demo.launch()
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import re
|
| 4 |
+
from typing import Dict, Any, List
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
from docx import Document
|
|
|
|
| 243 |
)
|
| 244 |
|
| 245 |
# Remove the header row if present
|
| 246 |
+
header_keywords = {
|
| 247 |
+
"Program",
|
| 248 |
+
"Designation",
|
| 249 |
+
"Entrance Exam Required",
|
| 250 |
+
"Entrance Examination",
|
| 251 |
+
"Examples of Career Pathways",
|
| 252 |
+
"Funding Category",
|
| 253 |
+
}
|
| 254 |
cleaned: List[str] = []
|
| 255 |
for line in program_lines:
|
| 256 |
if line in header_keywords:
|
| 257 |
continue
|
| 258 |
cleaned.append(line)
|
| 259 |
|
| 260 |
+
# Now group by 5-6 lines per program:
|
| 261 |
# 0: program_name
|
| 262 |
# 1: designation
|
| 263 |
# 2: entrance_exam
|
|
|
|
| 377 |
|
| 378 |
current_data = fetch_section_json(uni_id, section_key)
|
| 379 |
if current_data is None:
|
|
|
|
|
|
|
|
|
|
| 380 |
logs.append(
|
| 381 |
f"[INFO] No existing JSON for uni_id={uni_id}, section_key='{section_key}'. "
|
| 382 |
f"Will only update if row exists."
|
|
|
|
| 405 |
|
| 406 |
|
| 407 |
# -----------------------------
|
| 408 |
+
# ISP BRANDING & GRADIO UI
|
| 409 |
# -----------------------------
|
| 410 |
+
ISP_PRIMARY = "#062A4D"
|
| 411 |
+
ISP_GOLD = "#D6A229"
|
| 412 |
+
ISP_BG = "#F5F7FA"
|
| 413 |
+
ISP_TEXT = "#333333"
|
| 414 |
+
ISP_LOGO = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
|
| 415 |
+
|
| 416 |
+
css = f"""
|
| 417 |
+
#isp-header {{
|
| 418 |
+
background: {ISP_PRIMARY};
|
| 419 |
+
padding: 20px;
|
| 420 |
+
border-radius: 6px;
|
| 421 |
+
display: flex;
|
| 422 |
+
align-items: center;
|
| 423 |
+
gap: 20px;
|
| 424 |
+
}}
|
| 425 |
+
#isp-header h1 {{
|
| 426 |
+
color: white !important;
|
| 427 |
+
font-size: 28px !important;
|
| 428 |
+
margin: 0;
|
| 429 |
+
}}
|
| 430 |
+
#isp-logo {{
|
| 431 |
+
height: 60px;
|
| 432 |
+
}}
|
| 433 |
+
.gradio-container {{
|
| 434 |
+
background: {ISP_BG} !important;
|
| 435 |
+
}}
|
| 436 |
+
button {{
|
| 437 |
+
background-color: {ISP_GOLD} !important;
|
| 438 |
+
color: black !important;
|
| 439 |
+
font-weight: bold !important;
|
| 440 |
+
border-radius: 8px !important;
|
| 441 |
+
}}
|
| 442 |
+
"""
|
| 443 |
+
|
| 444 |
+
with gr.Blocks(css=css, title="Automated Handbook Sync Data Pipeline") as demo:
|
| 445 |
+
|
| 446 |
+
# Header with Logo + Title
|
| 447 |
+
with gr.Row(elem_id="isp-header"):
|
| 448 |
+
gr.HTML(f"""
|
| 449 |
+
<img id='isp-logo' src='{ISP_LOGO}'/>
|
| 450 |
+
<h1>Automated Handbook Sync Data Pipeline</h1>
|
| 451 |
+
""")
|
| 452 |
+
|
| 453 |
gr.Markdown(
|
| 454 |
+
f"""
|
| 455 |
+
### Welcome to the ISP Handbook Sync System
|
| 456 |
+
|
| 457 |
+
This internal tool fully automates:
|
| 458 |
+
|
| 459 |
+
- Parsing university sections from the official ISP Handbook
|
| 460 |
+
- Comparing extracted content with the **university_handbook_sections** table
|
| 461 |
+
- Updating only fields that have changed
|
| 462 |
+
- Maintaining data uniformity and reducing manual effort
|
| 463 |
|
| 464 |
+
---
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
+
#### **Instructions**
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
+
1. Upload the complete **ISP Handbook (.docx)**
|
| 469 |
+
2. Click **Run Full Sync**
|
| 470 |
+
3. Review the logs to see which university sections were updated
|
| 471 |
+
|
| 472 |
+
Only official handbook-sourced fields are updated:
|
| 473 |
+
- `overview`
|
| 474 |
+
- `benefits`
|
| 475 |
+
- `programs`
|
| 476 |
+
|
| 477 |
+
Other database sections (e.g., images) remain untouched.
|
| 478 |
+
|
| 479 |
+
---
|
| 480 |
"""
|
| 481 |
)
|
| 482 |
|
| 483 |
file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
|
| 484 |
|
| 485 |
+
sync_button = gr.Button("Run Full Sync")
|
| 486 |
log_output = gr.Textbox(
|
| 487 |
label="Sync Log",
|
| 488 |
lines=30,
|
|
|
|
| 495 |
outputs=log_output,
|
| 496 |
)
|
| 497 |
|
| 498 |
+
|
| 499 |
if __name__ == "__main__":
|
| 500 |
demo.launch()
|