File size: 10,777 Bytes
c9990de 928c304 a45863a 40200e1 c9990de a45863a c9990de a45863a c9990de a45863a c9990de a45863a c9990de fbbe199 a45863a 92080b9 a45863a fbbe199 c9990de 92080b9 a45863a 8c0596f c9990de fca9a55 c9990de 92080b9 fbbe199 a45863a 92080b9 fbbe199 c9990de 928c304 c9990de a45863a c9990de e6a016f fbbe199 c9990de e6a016f 92080b9 e6a016f a45863a 92080b9 c9990de a45863a c9990de fbbe199 92080b9 fbbe199 92080b9 a45863a fbbe199 e6a016f fbbe199 e6a016f 92080b9 e6a016f 92080b9 e6a016f 92080b9 e6a016f a45863a e6a016f fca9a55 e6a016f 92080b9 e6a016f 92080b9 a45863a fbbe199 a45863a 92080b9 a45863a 92080b9 a45863a 92080b9 a45863a fbbe199 92080b9 a45863a 92080b9 fbbe199 a45863a 92080b9 c9990de fbbe199 92080b9 a45863a 92080b9 fbbe199 a45863a fbbe199 fca9a55 92080b9 a45863a e6a016f fbbe199 a45863a e6a016f fca9a55 8c0596f fbbe199 92080b9 fbbe199 e6a016f fbbe199 a45863a 92080b9 a45863a 92080b9 8c0596f fbbe199 fca9a55 a45863a fca9a55 a45863a fbbe199 a45863a 92080b9 a45863a 92080b9 a45863a c9990de fbbe199 92080b9 c9990de 92080b9 a45863a fca9a55 fbbe199 a45863a fca9a55 a45863a fca9a55 a45863a fca9a55 a45863a fca9a55 92080b9 a45863a 92080b9 a45863a 92080b9 a45863a 92080b9 a45863a fca9a55 a45863a 92080b9 c9990de fca9a55 c9990de fbbe199 fca9a55 40200e1 a81d369 92080b9 40200e1 fca9a55 40200e1 92080b9 40200e1 fca9a55 40200e1 92080b9 40200e1 92080b9 40200e1 92080b9 8c0596f 92080b9 a45863a fca9a55 fbbe199 92080b9 fbbe199 40200e1 fbbe199 fca9a55 92080b9 fbbe199 8c0596f fca9a55 92080b9 fca9a55 c9990de fca9a55 40200e1 928c304 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 |
import os
import json
import re
from typing import Dict, Any, List
import gradio as gr
from docx import Document
from deepdiff import DeepDiff
import mysql.connector
# -----------------------------
# CONFIG: UNIVERSITY MAPPING
# -----------------------------
UNIVERSITY_ID_MAP = {
"Indiana University of Pennsylvania (IUP)": 1,
"Missouri State University": 2,
"University Of Kentucky (UK)": 3,
"University of Louisville (UofL)": 4,
"University of Delaware (UD)": 6,
"Grand Valley State University": 7,
"Quinnipiac University": 9,
"William Jessup University": 10,
"Wilkes University": 14,
"University of South Dakota (USD)": 16,
}
# -----------------------------
# DB CONNECTION HELPERS
# -----------------------------
def get_db_connection():
return mysql.connector.connect(
host=os.getenv("DB_HOST", "localhost"),
port=int(os.getenv("DB_PORT", "3306")),
user=os.getenv("DB_USER", "root"),
password=os.getenv("DB_PASSWORD", ""),
database=os.getenv("DB_NAME", ""),
)
def fetch_section_json(university_id: int, section_key: str):
conn = get_db_connection()
try:
cursor = conn.cursor()
cursor.execute(
"""
SELECT section_json
FROM university_handbook_sections
WHERE university_id=%s AND section_key=%s
LIMIT 1
""",
(university_id, section_key),
)
row = cursor.fetchone()
if not row or not row[0]:
return None
try:
return json.loads(row[0])
except Exception:
return None
finally:
cursor.close()
conn.close()
def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]):
conn = get_db_connection()
try:
cursor = conn.cursor()
new_json = json.dumps(new_data, ensure_ascii=False)
cursor.execute(
"""
UPDATE university_handbook_sections
SET section_json=%s
WHERE university_id=%s AND section_key=%s
""",
(new_json, university_id, section_key),
)
conn.commit()
finally:
cursor.close()
conn.close()
# -----------------------------
# DOCX PARSING HELPERS
# -----------------------------
def normalize_text(t: str) -> str:
return " ".join(t.split()).strip()
def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
indices: List[tuple[int, str]] = []
for i, p in enumerate(paragraphs):
for uni in UNIVERSITY_ID_MAP.keys():
if p == uni or p.startswith(uni):
indices.append((i, uni))
indices.sort(key=lambda x: x[0])
uni_blocks: Dict[str, List[str]] = {}
for idx, (start, uni_name) in enumerate(indices):
end = indices[idx + 1][0] if idx + 1 < len(indices) else len(paragraphs)
uni_blocks[uni_name] = paragraphs[start:end]
return uni_blocks
def parse_overview_block(block: List[str]) -> Dict[str, Any]:
"""
Parse the top 'overview' section (Founded, Total Students, etc.)
in a robust way that doesn't assume colons are always present.
"""
data: Dict[str, Any] = {}
def after_colon(line: str) -> str:
"""Safely return the part after ':' if present, else empty string."""
parts = line.split(":", 1)
return parts[1].strip() if len(parts) > 1 else ""
for raw_line in block:
line = raw_line.strip()
if not line:
continue
# Founded
if line.startswith("Founded"):
value = after_colon(line) or line # fallback to entire line
digits = re.sub(r"[^\d]", "", value)
if digits:
data["founded"] = int(digits)
# Total Students
elif line.startswith("Total Students"):
value = after_colon(line) or line
digits = re.sub(r"[^\d]", "", value)
if digits:
data["total_students"] = int(digits)
# Postgraduate students
elif "Postgraduate" in line:
value = after_colon(line) or line
digits = re.sub(r"[^\d]", "", value)
data["postgraduate_students"] = int(digits) if digits else None
# Acceptance rate
elif line.startswith("Acceptance rate"):
value = after_colon(line) or line
data["acceptance_rate"] = value
# Location
elif line.startswith("Location"):
value = after_colon(line) or line
data["location"] = value
# Tuition (out-of-state yearly)
elif "Tuition" in line:
value = after_colon(line) or line
digits = re.sub(r"[^\d]", "", value)
data["tuition_out_of_state_yearly"] = int(digits) if digits else None
return data
def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]:
out: List[str] = []
started = False
for line in block:
if not started and start in line:
started = True
continue
if started:
if any(s in line for s in stops):
break
if line.strip():
out.append(line)
return out
def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
lines = extract_between(
block,
"Benefits for ISP students at this school",
["To qualify for The International Scholars Program"],
)
return {"benefits": [normalize_text(l) for l in lines]}
def parse_programs_block(block: List[str]) -> Dict[str, Any]:
lines = extract_between(
block,
"To qualify for The International Scholars Program",
list(UNIVERSITY_ID_MAP.keys()),
)
headers = {
"Program",
"Designation",
"Entrance Exam Required",
"Examples of Career Pathways",
"Funding Category",
}
cleaned = [l for l in lines if l not in headers]
programs: List[Dict[str, Any]] = []
i = 0
while i < len(cleaned):
# Need at least 4 lines: name, designation, exam, at least one career/tier
if len(cleaned) - i < 4:
break
name = cleaned[i]
designation = cleaned[i + 1]
exam = cleaned[i + 2]
careers: List[str] = []
j = i + 3
while j < len(cleaned) and not cleaned[j].startswith("TIER"):
careers.append(cleaned[j])
j += 1
tier = cleaned[j] if j < len(cleaned) else ""
programs.append(
{
"program_name": name,
"designation": designation,
"entrance_exam": exam,
"career_pathways": careers,
"funding_category": tier,
}
)
i = j + 1
return {"programs": programs}
# -----------------------------
# HIGH-LEVEL UNIVERSITY PARSER
# -----------------------------
def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Any]:
"""
Given all lines for a single university block, return a dict with:
- overview
- benefits
- programs
"""
overview = parse_overview_block(block)
benefits = parse_benefits_block(block)
programs = parse_programs_block(block)
result: Dict[str, Any] = {}
if overview:
result["overview"] = overview
if benefits:
result["benefits"] = benefits
if programs:
result["programs"] = programs
return result
# -----------------------------
# MAIN SYNC LOGIC
# -----------------------------
def run_full_sync(docx_file):
if docx_file is None:
return "No handbook file uploaded."
try:
# Gradio File object usually has a .name (temp file path)
doc = Document(docx_file.name)
except Exception as e:
return f"Error reading DOCX: {e}"
uni_blocks = split_doc_by_university(doc)
logs: List[str] = []
updated = 0
for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
block = uni_blocks.get(uni_name)
if not block:
logs.append(f"[WARN] Missing block: {uni_name}")
continue
parsed = parse_university_block(uni_name, block)
if not parsed:
logs.append(f"[WARN] Cannot parse: {uni_name}")
continue
for key, new_json in parsed.items():
if key not in ("overview", "benefits", "programs"):
continue
old_json = fetch_section_json(uni_id, key)
diff = DeepDiff(old_json or {}, new_json, ignore_order=True)
if not diff:
logs.append(f"[OK] {uni_name} [{key}] unchanged.")
continue
try:
update_section_json(uni_id, key, new_json)
logs.append(f"[UPDATED] {uni_name} [{key}] updated.")
updated += 1
except Exception as e:
logs.append(f"[ERROR] {uni_name} [{key}]: {e}")
logs.append(f"\nTotal sections updated: {updated}")
return "\n".join(logs)
# -----------------------------
# ISP BRANDING - BASE64 LOGO (ALWAYS VISIBLE)
# -----------------------------
# TODO: Replace this with your real SVG base64
LOGO_SRC = "data:image/svg+xml;base64,..."
ISP_PRIMARY = "#062A4D"
ISP_GOLD = "#D6A229"
ISP_BG = "#F5F7FA"
CUSTOM_CSS = f"""
<style>
#isp-header {{
background: {ISP_PRIMARY};
padding: 20px;
border-radius: 10px;
display: flex;
align-items: center;
gap: 20px;
}}
#isp-header h1 {{
color: white;
margin: 0;
font-size: 28px;
}}
#isp-logo {{
height: 60px;
}}
button {{
background-color: {ISP_GOLD} !important;
color: black !important;
border-radius: 8px !important;
font-weight: bold !important;
}}
.gradio-container {{
background: {ISP_BG} !important;
}}
</style>
"""
# -----------------------------
# GRADIO UI
# -----------------------------
with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
gr.HTML(CUSTOM_CSS)
# Header
gr.HTML(
f"""
<div id='isp-header'>
<img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
<h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
</div>
"""
)
gr.Markdown(
"""
Upload the official ISP Handbook (.docx).
This tool will compare, detect differences, and update changed sections.
---
"""
)
file_input = gr.File(label="Upload Handbook DOCX", file_types=[".docx"])
log_output = gr.Textbox(label="Sync Log", lines=30)
run_btn = gr.Button("Run Full Sync")
run_btn.click(run_full_sync, inputs=file_input, outputs=log_output)
if __name__ == "__main__":
demo.launch()
|