internationalscholarsprogram committed on
Commit
8c0596f
·
1 Parent(s): 92080b9

Use remote ISP logo and data pipeline branding

Browse files
Files changed (1) hide show
  1. app.py +81 -47
app.py CHANGED
@@ -43,34 +43,42 @@ def fetch_section_json(university_id: int, section_key: str):
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
- cursor.execute("""
 
47
  SELECT section_json
48
  FROM university_handbook_sections
49
  WHERE university_id=%s AND section_key=%s
50
  LIMIT 1
51
- """, (university_id, section_key))
 
 
52
  row = cursor.fetchone()
53
  if not row or not row[0]:
54
  return None
55
  try:
56
  return json.loads(row[0])
57
- except:
58
  return None
59
  finally:
60
  cursor.close()
61
  conn.close()
62
 
63
 
64
- def update_section_json(university_id: int, section_key: str, new_data: Dict[str, Any]):
 
 
65
  conn = get_db_connection()
66
  try:
67
  cursor = conn.cursor()
68
  new_json = json.dumps(new_data, ensure_ascii=False)
69
- cursor.execute("""
 
70
  UPDATE university_handbook_sections
71
  SET section_json=%s
72
  WHERE university_id=%s AND section_key=%s
73
- """, (new_json, university_id, section_key))
 
 
74
  conn.commit()
75
  finally:
76
  cursor.close()
@@ -80,12 +88,14 @@ def update_section_json(university_id: int, section_key: str, new_data: Dict[str
80
  # -----------------------------
81
  # DOCX PARSING HELPERS
82
  # -----------------------------
83
- def normalize_text(t): return " ".join(t.split()).strip()
 
84
 
85
 
86
- def split_doc_by_university(doc: Document):
87
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
88
- indices = []
 
89
  for i, p in enumerate(paragraphs):
90
  for uni in UNIVERSITY_ID_MAP.keys():
91
  if p == uni or p.startswith(uni):
@@ -93,20 +103,22 @@ def split_doc_by_university(doc: Document):
93
 
94
  indices.sort(key=lambda x: x[0])
95
 
96
- uni_blocks = {}
97
  for idx, (start, uni_name) in enumerate(indices):
98
- end = indices[idx+1][0] if idx + 1 < len(indices) else len(paragraphs)
99
  uni_blocks[uni_name] = paragraphs[start:end]
100
  return uni_blocks
101
 
102
 
103
- def parse_overview_block(block: List[str]):
104
- data = {}
105
  for line in block:
106
  if line.startswith("Founded:"):
107
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
108
  elif line.startswith("Total Students"):
109
- data["total_students"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
 
 
110
  elif "Postgraduate" in line:
111
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
112
  data["postgraduate_students"] = int(digits) if digits else None
@@ -114,14 +126,15 @@ def parse_overview_block(block: List[str]):
114
  data["acceptance_rate"] = line.split(":", 1)[1].strip()
115
  elif line.startswith("Location:"):
116
  data["location"] = line.split(":", 1)[1].strip()
117
- elif "Tuition" in line:
118
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
119
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
120
  return data
121
 
122
 
123
- def extract_between(block, start, stops):
124
- out, started = [], False
 
125
  for line in block:
126
  if not started and start in line:
127
  started = True
@@ -134,50 +147,61 @@ def extract_between(block, start, stops):
134
  return out
135
 
136
 
137
- def parse_benefits_block(block):
138
  lines = extract_between(
139
  block,
140
  "Benefits for ISP students at this school",
141
- ["To qualify for The International Scholars Program"]
142
  )
143
  return {"benefits": [normalize_text(l) for l in lines]}
144
 
145
 
146
- def parse_programs_block(block):
147
  lines = extract_between(
148
  block,
149
  "To qualify for The International Scholars Program",
150
- list(UNIVERSITY_ID_MAP.keys())
151
  )
152
- headers = {"Program", "Designation", "Entrance Exam Required", "Examples of Career Pathways", "Funding Category"}
 
 
 
 
 
 
153
  cleaned = [l for l in lines if l not in headers]
154
 
155
- programs, i = [], 0
 
156
  while i < len(cleaned):
157
  remaining = len(cleaned) - i
158
- if remaining < 4: break
 
159
  name = cleaned[i]
160
- designation = cleaned[i+1]
161
- exam = cleaned[i+2]
162
- careers = []
163
- j = i+3
164
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
165
  careers.append(cleaned[j])
166
  j += 1
167
  tier = cleaned[j] if j < len(cleaned) else ""
168
- programs.append({
169
- "program_name": name,
170
- "designation": designation,
171
- "entrance_exam": exam,
172
- "career_pathways": careers,
173
- "funding_category": tier
174
- })
 
 
175
  i = j + 1
176
  return {"programs": programs}
177
 
178
 
179
- def parse_university_block(name: str, block: List[str]):
180
- sections = {}
 
181
  ov = parse_overview_block(block)
182
  if ov:
183
  ov["university_name"] = name
@@ -207,7 +231,8 @@ def run_full_sync(docx_file):
207
  return f"Error reading DOCX: {e}"
208
 
209
  blocks = split_doc_by_university(doc)
210
- logs, updated = [], 0
 
211
 
212
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
213
  block = blocks.get(uni_name)
@@ -243,14 +268,13 @@ def run_full_sync(docx_file):
243
 
244
 
245
  # -----------------------------
246
- # ISP BRANDING (NO css= ARGUMENT)
247
  # -----------------------------
248
  ISP_PRIMARY = "#062A4D"
249
  ISP_GOLD = "#D6A229"
250
  ISP_BG = "#F5F7FA"
251
 
252
- LOCAL_LOGO = "assets/logo-DRvZB3HV.svg"
253
- LOGO_SRC = LOCAL_LOGO if os.path.exists(LOCAL_LOGO) else "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
254
 
255
  CUSTOM_CSS = f"""
256
  <style>
@@ -286,19 +310,23 @@ button {{
286
  # -----------------------------
287
  # GRADIO UI
288
  # -----------------------------
289
- with gr.Blocks(title="ISP Automated Handbook Data Pipeline") as demo:
290
 
 
291
  gr.HTML(CUSTOM_CSS)
292
 
293
  # Header with logo + title
294
- gr.HTML(f"""
 
295
  <div id='isp-header'>
296
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
297
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
298
  </div>
299
- """)
 
300
 
301
- gr.Markdown("""
 
302
  ### Automated Handbook Sync Data Pipeline
303
 
304
  Upload the official ISP Handbook (.docx), and this tool will:
@@ -309,13 +337,19 @@ Upload the official ISP Handbook (.docx), and this tool will:
309
  - Ensure consistent, synchronized data
310
 
311
  ---
312
- """)
 
 
 
 
 
 
 
313
 
314
  file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
315
  log_output = gr.Textbox(label="Sync Log", lines=30)
316
 
317
  sync_btn = gr.Button("Run Full Sync")
318
-
319
  sync_btn.click(fn=run_full_sync, inputs=file_input, outputs=log_output)
320
 
321
 
 
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
+ cursor.execute(
47
+ """
48
  SELECT section_json
49
  FROM university_handbook_sections
50
  WHERE university_id=%s AND section_key=%s
51
  LIMIT 1
52
+ """,
53
+ (university_id, section_key),
54
+ )
55
  row = cursor.fetchone()
56
  if not row or not row[0]:
57
  return None
58
  try:
59
  return json.loads(row[0])
60
+ except Exception:
61
  return None
62
  finally:
63
  cursor.close()
64
  conn.close()
65
 
66
 
67
+ def update_section_json(
68
+ university_id: int, section_key: str, new_data: Dict[str, Any]
69
+ ):
70
  conn = get_db_connection()
71
  try:
72
  cursor = conn.cursor()
73
  new_json = json.dumps(new_data, ensure_ascii=False)
74
+ cursor.execute(
75
+ """
76
  UPDATE university_handbook_sections
77
  SET section_json=%s
78
  WHERE university_id=%s AND section_key=%s
79
+ """,
80
+ (new_json, university_id, section_key),
81
+ )
82
  conn.commit()
83
  finally:
84
  cursor.close()
 
88
  # -----------------------------
89
  # DOCX PARSING HELPERS
90
  # -----------------------------
91
def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in *t* into a single space.

    str.split() with no arguments splits on arbitrary whitespace and
    already discards leading/trailing whitespace, so the joined result
    never needs an extra .strip() — the original trailing strip was a
    no-op and has been removed.
    """
    return " ".join(t.split())
93
 
94
 
95
+ def split_doc_by_university(doc: Document) -> Dict[str, List[str]]:
96
  paragraphs = [normalize_text(p.text) for p in doc.paragraphs if p.text.strip()]
97
+ indices: List[tuple[int, str]] = []
98
+
99
  for i, p in enumerate(paragraphs):
100
  for uni in UNIVERSITY_ID_MAP.keys():
101
  if p == uni or p.startswith(uni):
 
103
 
104
  indices.sort(key=lambda x: x[0])
105
 
106
+ uni_blocks: Dict[str, List[str]] = {}
107
  for idx, (start, uni_name) in enumerate(indices):
108
+ end = indices[idx + 1][0] if idx + 1 < len(indices) else len(paragraphs)
109
  uni_blocks[uni_name] = paragraphs[start:end]
110
  return uni_blocks
111
 
112
 
113
+ def parse_overview_block(block: List[str]) -> Dict[str, Any]:
114
+ data: Dict[str, Any] = {}
115
  for line in block:
116
  if line.startswith("Founded:"):
117
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
118
  elif line.startswith("Total Students"):
119
+ data["total_students"] = int(
120
+ re.sub(r"[^\d]", "", line.split(":", 1)[1])
121
+ )
122
  elif "Postgraduate" in line:
123
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
124
  data["postgraduate_students"] = int(digits) if digits else None
 
126
  data["acceptance_rate"] = line.split(":", 1)[1].strip()
127
  elif line.startswith("Location:"):
128
  data["location"] = line.split(":", 1)[1].strip()
129
+ elif "Tuition" in line or "Yearly Out of State Tuition" in line:
130
  digits = re.sub(r"[^\d]", "", line.split(":", 1)[1])
131
  data["tuition_out_of_state_yearly"] = int(digits) if digits else None
132
  return data
133
 
134
 
135
+ def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]:
136
+ out: List[str] = []
137
+ started = False
138
  for line in block:
139
  if not started and start in line:
140
  started = True
 
147
  return out
148
 
149
 
150
def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
    """Extract the list of ISP benefit lines from one university block.

    The benefits section is delimited by a fixed heading and ends where
    the qualification section begins; each captured line is whitespace-
    normalized before being returned.
    """
    start_marker = "Benefits for ISP students at this school"
    stop_markers = ["To qualify for The International Scholars Program"]
    raw_lines = extract_between(block, start_marker, stop_markers)

    benefits: List[str] = []
    for raw in raw_lines:
        benefits.append(normalize_text(raw))
    return {"benefits": benefits}
157
 
158
 
159
def parse_programs_block(block: List[str]) -> Dict[str, Any]:
    """Parse the qualifying-programs table from one university block.

    The table text is extracted between the qualification heading and
    the next university name, column-header lines are filtered out, and
    the remaining lines are consumed in groups of:
    program name, designation, entrance exam, zero-or-more career
    pathway lines, then a funding-category line starting with "TIER".
    """
    table_lines = extract_between(
        block,
        "To qualify for The International Scholars Program",
        list(UNIVERSITY_ID_MAP.keys()),
    )
    column_headers = {
        "Program",
        "Designation",
        "Entrance Exam Required",
        "Examples of Career Pathways",
        "Funding Category",
    }
    rows = [line for line in table_lines if line not in column_headers]

    parsed: List[Dict[str, Any]] = []
    total = len(rows)
    pos = 0
    # A record needs at least name, designation, exam and one more line;
    # stop once fewer than four lines remain.
    while total - pos >= 4:
        pathways: List[str] = []
        cursor = pos + 3
        # Career pathways run until the "TIER..." funding line (or EOF).
        while cursor < total and not rows[cursor].startswith("TIER"):
            pathways.append(rows[cursor])
            cursor += 1
        parsed.append(
            {
                "program_name": rows[pos],
                "designation": rows[pos + 1],
                "entrance_exam": rows[pos + 2],
                "career_pathways": pathways,
                "funding_category": rows[cursor] if cursor < total else "",
            }
        )
        pos = cursor + 1
    return {"programs": parsed}
200
 
201
 
202
+ def parse_university_block(name: str, block: List[str]) -> Dict[str, Dict[str, Any]]:
203
+ sections: Dict[str, Dict[str, Any]] = {}
204
+
205
  ov = parse_overview_block(block)
206
  if ov:
207
  ov["university_name"] = name
 
231
  return f"Error reading DOCX: {e}"
232
 
233
  blocks = split_doc_by_university(doc)
234
+ logs: List[str] = []
235
+ updated = 0
236
 
237
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
238
  block = blocks.get(uni_name)
 
268
 
269
 
270
  # -----------------------------
271
+ # ISP BRANDING (REMOTE LOGO ONLY)
272
  # -----------------------------
273
  ISP_PRIMARY = "#062A4D"
274
  ISP_GOLD = "#D6A229"
275
  ISP_BG = "#F5F7FA"
276
 
277
+ LOGO_SRC = "https://qhtestingserver.com/assets/logo-DRvZB3HV.svg"
 
278
 
279
  CUSTOM_CSS = f"""
280
  <style>
 
310
  # -----------------------------
311
  # GRADIO UI
312
  # -----------------------------
313
+ with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
314
 
315
+ # Inject custom CSS
316
  gr.HTML(CUSTOM_CSS)
317
 
318
  # Header with logo + title
319
+ gr.HTML(
320
+ f"""
321
  <div id='isp-header'>
322
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
323
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
324
  </div>
325
+ """
326
+ )
327
 
328
+ gr.Markdown(
329
+ """
330
  ### Automated Handbook Sync Data Pipeline
331
 
332
  Upload the official ISP Handbook (.docx), and this tool will:
 
337
  - Ensure consistent, synchronized data
338
 
339
  ---
340
+
341
+ **How to use**
342
+
343
+ 1. Upload the latest ISP Handbook DOCX
344
+ 2. Click **Run Full Sync**
345
+ 3. Check the log to see which universities and sections were updated
346
+ """
347
+ )
348
 
349
  file_input = gr.File(label="Upload ISP Handbook DOCX", file_types=[".docx"])
350
  log_output = gr.Textbox(label="Sync Log", lines=30)
351
 
352
  sync_btn = gr.Button("Run Full Sync")
 
353
  sync_btn.click(fn=run_full_sync, inputs=file_input, outputs=log_output)
354
 
355