internationalscholarsprogram committed on
Commit
fca9a55
·
1 Parent(s): 8c0596f

Update ISP handbook data pipeline UI and logo

Browse files
Files changed (1) hide show
  1. app.py +63 -120
app.py CHANGED
@@ -43,15 +43,12 @@ def fetch_section_json(university_id: int, section_key: str):
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
- cursor.execute(
47
- """
48
  SELECT section_json
49
  FROM university_handbook_sections
50
  WHERE university_id=%s AND section_key=%s
51
  LIMIT 1
52
- """,
53
- (university_id, section_key),
54
- )
55
  row = cursor.fetchone()
56
  if not row or not row[0]:
57
  return None
@@ -64,21 +61,16 @@ def fetch_section_json(university_id: int, section_key: str):
64
  conn.close()
65
 
66
 
67
- def update_section_json(
68
- university_id: int, section_key: str, new_data: Dict[str, Any]
69
- ):
70
  conn = get_db_connection()
71
  try:
72
  cursor = conn.cursor()
73
  new_json = json.dumps(new_data, ensure_ascii=False)
74
- cursor.execute(
75
- """
76
  UPDATE university_handbook_sections
77
  SET section_json=%s
78
  WHERE university_id=%s AND section_key=%s
79
- """,
80
- (new_json, university_id, section_key),
81
- )
82
  conn.commit()
83
  finally:
84
  cursor.close()
@@ -88,14 +80,12 @@ def update_section_json(
88
  # -----------------------------
89
  # DOCX PARSING HELPERS
90
  # -----------------------------
91
- def normalize_text(t: str) -> str:
92
- return " ".join(t.split()).strip()
93
 
94
 
95
- def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
96
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
97
- indices: List[tuple[int, str]] = []
98
-
99
  for i, p in enumerate(paragraphs):
100
  for uni in UNIVERSITY_ID_MAP.keys():
101
  if p == uni or p.startswith(uni):
@@ -103,22 +93,20 @@ def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
103
 
104
  indices.sort(key=lambda x: x[0])
105
 
106
- uni_blocks: Dict[str, List[str]] = {}
107
  for idx, (start, uni_name) in enumerate(indices):
108
- end = indices[idx + 1][0] if idx + 1 < len(indices) else len(paragraphs)
109
  uni_blocks[uni_name] = paragraphs[start:end]
110
  return uni_blocks
111
 
112
 
113
- def parse_overview_block(block: List[str]) -> Dict[str, Any]:
114
- data: Dict[str, Any] = {}
115
  for line in block:
116
  if line.startswith("Founded:"):
117
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
118
  elif line.startswith("Total Students"):
119
- data["total_students"] = int(
120
- re.sub(r"[^\d]", "", line.split(":", 1)[1])
121
- )
122
  elif "Postgraduate" in line:
123
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
124
  data["postgraduate_students"] = int(digits) if digits else None
@@ -126,15 +114,14 @@ def parse_overview_block(block: List[str]) -> Dict[str, Any]:
126
  data["acceptance_rate"] = line.split(":", 1)[1].strip()
127
  elif line.startswith("Location:"):
128
  data["location"] = line.split(":", 1)[1].strip()
129
- elif "Tuition" in line or "Yearly Out of State Tuition" in line:
130
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
131
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
132
  return data
133
 
134
 
135
- def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]:
136
- out: List[str] = []
137
- started = False
138
  for line in block:
139
  if not started and start in line:
140
  started = True
@@ -147,75 +134,50 @@ def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]
147
  return out
148
 
149
 
150
- def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
151
  lines = extract_between(
152
  block,
153
  "Benefits for ISP students at this school",
154
- ["To qualify for The International Scholars Program"],
155
  )
156
  return {"benefits": [normalize_text(l) for l in lines]}
157
 
158
 
159
- def parse_programs_block(block: List[str]) -> Dict[str, Any]:
160
  lines = extract_between(
161
  block,
162
  "To qualify for The International Scholars Program",
163
- list(UNIVERSITY_ID_MAP.keys()),
164
  )
165
- headers = {
166
- "Program",
167
- "Designation",
168
- "Entrance Exam Required",
169
- "Examples of Career Pathways",
170
- "Funding Category",
171
- }
172
  cleaned = [l for l in lines if l not in headers]
173
 
174
- programs: List[Dict[str, Any]] = []
175
- i = 0
176
  while i < len(cleaned):
177
- remaining = len(cleaned) - i
178
- if remaining < 4:
179
  break
180
  name = cleaned[i]
181
- designation = cleaned[i + 1]
182
- exam = cleaned[i + 2]
183
- careers: List[str] = []
184
- j = i + 3
185
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
186
  careers.append(cleaned[j])
187
  j += 1
188
  tier = cleaned[j] if j < len(cleaned) else ""
189
- programs.append(
190
- {
191
- "program_name": name,
192
- "designation": designation,
193
- "entrance_exam": exam,
194
- "career_pathways": careers,
195
- "funding_category": tier,
196
- }
197
- )
198
- i = j + 1
199
- return {"programs": programs}
200
-
201
-
202
- def parse_university_block(name: str, block: List[str]) -> Dict[str, Dict[str, Any]]:
203
- sections: Dict[str, Dict[str, Any]] = {}
204
 
205
- ov = parse_overview_block(block)
206
- if ov:
207
- ov["university_name"] = name
208
- sections["overview"] = ov
209
-
210
- ben = parse_benefits_block(block)
211
- if ben.get("benefits"):
212
- sections["benefits"] = ben
213
-
214
- prog = parse_programs_block(block)
215
- if prog.get("programs"):
216
- sections["programs"] = prog
217
 
218
- return sections
219
 
220
 
221
  # -----------------------------
@@ -230,22 +192,21 @@ def run_full_sync(docx_file):
230
  except Exception as e:
231
  return f"Error reading DOCX: {e}"
232
 
233
- blocks = split_doc_by_university(doc)
234
- logs: List[str] = []
235
- updated = 0
236
 
237
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
238
- block = blocks.get(uni_name)
239
  if not block:
240
- logs.append(f"[WARN] Missing block for: {uni_name}")
241
  continue
242
 
243
- data = parse_university_block(uni_name, block)
244
- if not data:
245
- logs.append(f"[WARN] No valid sections found for: {uni_name}")
246
  continue
247
 
248
- for key, new_json in data.items():
249
  if key not in ("overview", "benefits", "programs"):
250
  continue
251
 
@@ -261,27 +222,28 @@ def run_full_sync(docx_file):
261
  logs.append(f"[UPDATED] {uni_name} [{key}] updated.")
262
  updated += 1
263
  except Exception as e:
264
- logs.append(f"[ERROR] Updating {uni_name} [{key}]: {e}")
265
 
266
  logs.append(f"\nTotal sections updated: {updated}")
267
  return "\n".join(logs)
268
 
269
 
270
  # -----------------------------
271
- # ISP BRANDING (REMOTE LOGO ONLY)
272
  # -----------------------------
 
 
 
273
  ISP_PRIMARY = "#062A4D"
274
  ISP_GOLD = "#D6A229"
275
  ISP_BG = "#F5F7FA"
276
 
277
- LOGO_SRC = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
278
-
279
  CUSTOM_CSS = f"""
280
  <style>
281
  #isp-header {{
282
  background: {ISP_PRIMARY};
283
  padding: 20px;
284
- border-radius: 8px;
285
  display: flex;
286
  align-items: center;
287
  gap: 20px;
@@ -289,7 +251,7 @@ CUSTOM_CSS = f"""
289
  #isp-header h1 {{
290
  color: white;
291
  margin: 0;
292
- font-size: 26px;
293
  }}
294
  #isp-logo {{
295
  height: 60px;
@@ -312,46 +274,27 @@ button {{
312
  # -----------------------------
313
  with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
314
 
315
- # Inject custom CSS
316
  gr.HTML(CUSTOM_CSS)
317
 
318
- # Header with logo + title
319
- gr.HTML(
320
- f"""
321
  <div id='isp-header'>
322
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
323
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
324
  </div>
325
- """
326
- )
327
-
328
- gr.Markdown(
329
- """
330
- ### Automated Handbook Sync Data Pipeline
331
-
332
- Upload the official ISP Handbook (.docx), and this tool will:
333
-
334
- - Extract university sections
335
- - Compare them with the **university_handbook_sections** table
336
- - Update only changed JSON fields
337
- - Ensure consistent, synchronized data
338
 
 
 
 
339
  ---
 
340
 
341
- **How to use**
342
-
343
- 1. Upload the latest ISP Handbook DOCX
344
- 2. Click **Run Full Sync**
345
- 3. Check the log to see which universities and sections were updated
346
- """
347
- )
348
-
349
- file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
350
  log_output = gr.Textbox(label="Sync Log", lines=30)
 
351
 
352
- sync_btn = gr.Button("Run Full Sync")
353
- sync_btn.click(fn=run_full_sync, inputs=file_input, outputs=log_output)
354
-
355
 
356
  if __name__ == "__main__":
357
  demo.launch()
 
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
+ cursor.execute("""
 
47
  SELECT section_json
48
  FROM university_handbook_sections
49
  WHERE university_id=%s AND section_key=%s
50
  LIMIT 1
51
+ """, (university_id, section_key))
 
 
52
  row = cursor.fetchone()
53
  if not row or not row[0]:
54
  return None
 
61
  conn.close()
62
 
63
 
64
+ def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]):
 
 
65
  conn = get_db_connection()
66
  try:
67
  cursor = conn.cursor()
68
  new_json = json.dumps(new_data, ensure_ascii=False)
69
+ cursor.execute("""
 
70
  UPDATE university_handbook_sections
71
  SET section_json=%s
72
  WHERE university_id=%s AND section_key=%s
73
+ """, (new_json, university_id, section_key))
 
 
74
  conn.commit()
75
  finally:
76
  cursor.close()
 
80
  # -----------------------------
81
  # DOCX PARSING HELPERS
82
  # -----------------------------
83
def normalize_text(t: str) -> str:
    """Collapse all runs of whitespace in *t* into single spaces.

    ``" ".join(t.split())`` already discards leading/trailing whitespace,
    so the trailing ``.strip()`` from the original was redundant and has
    been dropped (behavior is identical).
    """
    return " ".join(t.split())
 
84
 
85
 
86
+ def split_doc_by_university(doc: Document):
87
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
88
+ indices = []
 
89
  for i, p in enumerate(paragraphs):
90
  for uni in UNIVERSITY_ID_MAP.keys():
91
  if p == uni or p.startswith(uni):
 
93
 
94
  indices.sort(key=lambda x: x[0])
95
 
96
+ uni_blocks = {}
97
  for idx, (start, uni_name) in enumerate(indices):
98
+ end = indices[idx+1][0] if idx+1 < len(indices) else len(paragraphs)
99
  uni_blocks[uni_name] = paragraphs[start:end]
100
  return uni_blocks
101
 
102
 
103
+ def parse_overview_block(block: List[str]):
104
+ data = {}
105
  for line in block:
106
  if line.startswith("Founded:"):
107
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
108
  elif line.startswith("Total Students"):
109
+ data["total_students"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
 
 
110
  elif "Postgraduate" in line:
111
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
112
  data["postgraduate_students"] = int(digits) if digits else None
 
114
  data["acceptance_rate"] = line.split(":", 1)[1].strip()
115
  elif line.startswith("Location:"):
116
  data["location"] = line.split(":", 1)[1].strip()
117
+ elif "Tuition" in line:
118
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
119
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
120
  return data
121
 
122
 
123
+ def extract_between(block, start, stops):
124
+ out, started = [], False
 
125
  for line in block:
126
  if not started and start in line:
127
  started = True
 
134
  return out
135
 
136
 
137
+ def parse_benefits_block(block):
138
  lines = extract_between(
139
  block,
140
  "Benefits for ISP students at this school",
141
+ ["To qualify for The International Scholars Program"]
142
  )
143
  return {"benefits": [normalize_text(l) for l in lines]}
144
 
145
 
146
+ def parse_programs_block(block):
147
  lines = extract_between(
148
  block,
149
  "To qualify for The International Scholars Program",
150
+ list(UNIVERSITY_ID_MAP.keys())
151
  )
152
+ headers = {"Program", "Designation", "Entrance Exam Required",
153
+ "Examples of Career Pathways", "Funding Category"}
154
+
 
 
 
 
155
  cleaned = [l for l in lines if l not in headers]
156
 
157
+ programs, i = [], 0
 
158
  while i < len(cleaned):
159
+ if len(cleaned) - i < 4:
 
160
  break
161
  name = cleaned[i]
162
+ designation = cleaned[i+1]
163
+ exam = cleaned[i+2]
164
+ careers = []
165
+ j = i+3
166
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
167
  careers.append(cleaned[j])
168
  j += 1
169
  tier = cleaned[j] if j < len(cleaned) else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
+ programs.append({
172
+ "program_name": name,
173
+ "designation": designation,
174
+ "entrance_exam": exam,
175
+ "career_pathways": careers,
176
+ "funding_category": tier
177
+ })
178
+ i = j + 1
 
 
 
 
179
 
180
+ return {"programs": programs}
181
 
182
 
183
  # -----------------------------
 
192
  except Exception as e:
193
  return f"Error reading DOCX: {e}"
194
 
195
+ uni_blocks = split_doc_by_university(doc)
196
+ logs, updated = [], 0
 
197
 
198
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
199
+ block = uni_blocks.get(uni_name)
200
  if not block:
201
+ logs.append(f"[WARN] Missing block: {uni_name}")
202
  continue
203
 
204
+ parsed = parse_university_block(uni_name, block)
205
+ if not parsed:
206
+ logs.append(f"[WARN] Cannot parse: {uni_name}")
207
  continue
208
 
209
+ for key, new_json in parsed.items():
210
  if key not in ("overview", "benefits", "programs"):
211
  continue
212
 
 
222
  logs.append(f"[UPDATED] {uni_name} [{key}] updated.")
223
  updated += 1
224
  except Exception as e:
225
+ logs.append(f"[ERROR] {uni_name} [{key}]: {e}")
226
 
227
  logs.append(f"\nTotal sections updated: {updated}")
228
  return "\n".join(logs)
229
 
230
 
231
  # -----------------------------
232
+ # ISP BRANDING - BASE64 LOGO (ALWAYS VISIBLE)
233
  # -----------------------------
234
+ # PLACEHOLDER — I WILL REPLACE THIS WITH YOUR REAL LOGO BASE64 AFTER YOU UPLOAD THE SVG
235
+ LOGO_SRC = "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTcwIiBoZWlnaHQ9IjE3MCIgdmlld0JveD0iMCAwIDE3MCAxNzAiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxjaXJjbGUgY3g9Ijg1IiBjeT0iODUiIHI9Ijg1IiBmaWxsPSIjMDYyQTREIi8+CjxwYXRoIGQ9Ik0xMDYuMTQyIDkzLjU2ODdMOTUuODgxMiAxMDMuODMxTDEwNi4xNDIgMTE0LjA5MUwxMDYuOTU1IDExMy4yNzdMMTA4Ljc1NyAxMTEuNDc1TDExMi4zNTkgMTE1LjA3N0wxMTEuNTQ2IDExNS44OUwxMDYuOTU1IDExMC4zMkwxMDYuOTU1IDExMC4zMkwxMDcuNzY5IDEwOS41MDZMMTEwLjM3MyAxMTIuMTA5TDExMS4xODcgMTExLjI5NkwxMDcuNTg0IDEwNy42OTRMMTA2Ljc3MSAxMDguNTA3TDEwNi45NTUgMTA4LjMyMkwxMDYuOTU1IDExMC4zMkwxMDEuMzggMTE1Ljg5TDEwMC41NjcgMTE1LjA3N0wxMDQuMTY5IDExMS40NzVMMTA1Ljk3MSAxMTMuMjdMMTA2Ljc4NCAxMTQuMDg0TDEwNy41OTggMTEzLjI3TDEwMy45OTYgMTE2Ljg3MkwxMDMuMTgyIDExNy42ODZMMTA2Ljc4NCAxMjEuMjg4TDExMC4zNzMgMTE3LjY5NkwxMDkuNTYgMTE2Ljg4M0wxMDYuOTU1IDExOS40ODdMMTA2LjE0MiAxMjAuMyIgZmlsbD0id2hpdGUiLz48cGF0aCBkPSJNNzguODUzOSAxMjEuODM... (continues full"
236
+
237
  ISP_PRIMARY = "#062A4D"
238
  ISP_GOLD = "#D6A229"
239
  ISP_BG = "#F5F7FA"
240
 
 
 
241
  CUSTOM_CSS = f"""
242
  <style>
243
  #isp-header {{
244
  background: {ISP_PRIMARY};
245
  padding: 20px;
246
+ border-radius: 10px;
247
  display: flex;
248
  align-items: center;
249
  gap: 20px;
 
251
  #isp-header h1 {{
252
  color: white;
253
  margin: 0;
254
+ font-size: 28px;
255
  }}
256
  #isp-logo {{
257
  height: 60px;
 
274
  # -----------------------------
275
  with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
276
 
 
277
  gr.HTML(CUSTOM_CSS)
278
 
279
+ # Header
280
+ gr.HTML(f"""
 
281
  <div id='isp-header'>
282
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
283
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
284
  </div>
285
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
+ gr.Markdown("""
288
+ Upload the official ISP Handbook (.docx).
289
+ This tool will compare, detect differences, and update changed sections.
290
  ---
291
+ """)
292
 
293
+ file_input = gr.File(label="Upload Handbook DOCX", file_types=[".docx"])
 
 
 
 
 
 
 
 
294
  log_output = gr.Textbox(label="Sync Log", lines=30)
295
+ run_btn = gr.Button("Run Full Sync")
296
 
297
+ run_btn.click(run_full_sync, inputs=file_input, outputs=log_output)
 
 
298
 
299
  if __name__ == "__main__":
300
  demo.launch()