internationalscholarsprogram committed on
Commit
fbbe199
·
1 Parent(s): fca9a55

Fix: Added parse_university_block and updated handbook sync logic

Browse files
Files changed (1) hide show
  1. app.py +84 -36
app.py CHANGED
@@ -43,12 +43,15 @@ def fetch_section_json(university_id: int, section_key: str):
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
- cursor.execute("""
 
47
  SELECT section_json
48
  FROM university_handbook_sections
49
  WHERE university_id=%s AND section_key=%s
50
  LIMIT 1
51
- """, (university_id, section_key))
 
 
52
  row = cursor.fetchone()
53
  if not row or not row[0]:
54
  return None
@@ -66,11 +69,14 @@ def update_section_json(university_id: int, section_key: str, new_data: Dict[str
66
  try:
67
  cursor = conn.cursor()
68
  new_json = json.dumps(new_data, ensure_ascii=False)
69
- cursor.execute("""
 
70
  UPDATE university_handbook_sections
71
  SET section_json=%s
72
  WHERE university_id=%s AND section_key=%s
73
- """, (new_json, university_id, section_key))
 
 
74
  conn.commit()
75
  finally:
76
  cursor.close()
@@ -80,7 +86,8 @@ def update_section_json(university_id: int, section_key: str, new_data: Dict[str
80
  # -----------------------------
81
  # DOCX PARSING HELPERS
82
  # -----------------------------
83
- def normalize_text(t): return " ".join(t.split()).strip()
 
84
 
85
 
86
  def split_doc_by_university(doc: Document):
@@ -93,15 +100,15 @@ def split_doc_by_university(doc: Document):
93
 
94
  indices.sort(key=lambda x: x[0])
95
 
96
- uni_blocks = {}
97
  for idx, (start, uni_name) in enumerate(indices):
98
- end = indices[idx+1][0] if idx+1 < len(indices) else len(paragraphs)
99
  uni_blocks[uni_name] = paragraphs[start:end]
100
  return uni_blocks
101
 
102
 
103
- def parse_overview_block(block: List[str]):
104
- data = {}
105
  for line in block:
106
  if line.startswith("Founded:"):
107
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
@@ -120,8 +127,9 @@ def parse_overview_block(block: List[str]):
120
  return data
121
 
122
 
123
- def extract_between(block, start, stops):
124
- out, started = [], False
 
125
  for line in block:
126
  if not started and start in line:
127
  started = True
@@ -134,52 +142,86 @@ def extract_between(block, start, stops):
134
  return out
135
 
136
 
137
- def parse_benefits_block(block):
138
  lines = extract_between(
139
  block,
140
  "Benefits for ISP students at this school",
141
- ["To qualify for The International Scholars Program"]
142
  )
143
  return {"benefits": [normalize_text(l) for l in lines]}
144
 
145
 
146
- def parse_programs_block(block):
147
  lines = extract_between(
148
  block,
149
  "To qualify for The International Scholars Program",
150
- list(UNIVERSITY_ID_MAP.keys())
151
  )
152
- headers = {"Program", "Designation", "Entrance Exam Required",
153
- "Examples of Career Pathways", "Funding Category"}
 
 
 
 
 
154
 
155
  cleaned = [l for l in lines if l not in headers]
156
 
157
- programs, i = [], 0
 
158
  while i < len(cleaned):
159
  if len(cleaned) - i < 4:
160
  break
 
161
  name = cleaned[i]
162
- designation = cleaned[i+1]
163
- exam = cleaned[i+2]
164
  careers = []
165
- j = i+3
166
  while j < len(cleaned) and not cleaned[j].startswith("TIER"):
167
  careers.append(cleaned[j])
168
  j += 1
169
  tier = cleaned[j] if j < len(cleaned) else ""
170
 
171
- programs.append({
172
- "program_name": name,
173
- "designation": designation,
174
- "entrance_exam": exam,
175
- "career_pathways": careers,
176
- "funding_category": tier
177
- })
 
 
178
  i = j + 1
179
 
180
  return {"programs": programs}
181
 
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  # -----------------------------
184
  # MAIN SYNC LOGIC
185
  # -----------------------------
@@ -188,12 +230,14 @@ def run_full_sync(docx_file):
188
  return "No handbook file uploaded."
189
 
190
  try:
 
191
  doc = Document(docx_file.name)
192
  except Exception as e:
193
  return f"Error reading DOCX: {e}"
194
 
195
  uni_blocks = split_doc_by_university(doc)
196
- logs, updated = [], 0
 
197
 
198
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
199
  block = uni_blocks.get(uni_name)
@@ -231,8 +275,9 @@ def run_full_sync(docx_file):
231
  # -----------------------------
232
  # ISP BRANDING - BASE64 LOGO (ALWAYS VISIBLE)
233
  # -----------------------------
234
- # PLACEHOLDER I WILL REPLACE THIS WITH YOUR REAL LOGO BASE64 AFTER YOU UPLOAD THE SVG
235
- LOGO_SRC = "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTcwIiBoZWlnaHQ9IjE3MCIgdmlld0JveD0iMCAwIDE3MCAxNzAiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxjaXJjbGUgY3g9Ijg1IiBjeT0iODUiIHI9Ijg1IiBmaWxsPSIjMDYyQTREIi8+CjxwYXRoIGQ9Ik0xMDYuMTQyIDkzLjU2ODdMOTUuODgxMiAxMDMuODMxTDEwNi4xNDIgMTE0LjA5MUwxMDYuOTU1IDExMy4yNzdMMTA4Ljc1NyAxMTEuNDc1TDExMi4zNTkgMTE1LjA3N0wxMTEuNTQ2IDExNS44OUwxMDYuOTU1IDExMC4zMkwxMDYuOTU1IDExMC4zMkwxMDcuNzY5IDEwOS41MDZMMTEwLjM3MyAxMTIuMTA5TDExMS4xODcgMTExLjI5NkwxMDcuNTg0IDEwNy42OTRMMTA2Ljc3MSAxMDguNTA3TDEwNi45NTUgMTA4LjMyMkwxMDYuOTU1IDExMC4zMkwxMDEuMzggMTE1Ljg5TDEwMC41NjcgMTE1LjA3N0wxMDQuMTY5IDExMS40NzVMMTA1Ljk3MSAxMTMuMjdMMTA2Ljc4NCAxMTQuMDg0TDEwNy41OTggMTEzLjI3TDEwMy45OTYgMTE2Ljg3MkwxMDMuMTgyIDExNy42ODZMMTA2Ljc4NCAxMjEuMjg4TDExMC4zNzMgMTE3LjY5NkwxMDkuNTYgMTE2Ljg4M0wxMDYuOTU1IDExOS40ODdMMTA2LjE0MiAxMjAuMyIgZmlsbD0id2hpdGUiLz48cGF0aCBkPSJNNzguODUzOSAxMjEuODM... (continues full"
 
236
 
237
  ISP_PRIMARY = "#062A4D"
238
  ISP_GOLD = "#D6A229"
@@ -273,22 +318,25 @@ button {{
273
  # GRADIO UI
274
  # -----------------------------
275
  with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
276
-
277
  gr.HTML(CUSTOM_CSS)
278
 
279
  # Header
280
- gr.HTML(f"""
 
281
  <div id='isp-header'>
282
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
283
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
284
  </div>
285
- """)
 
286
 
287
- gr.Markdown("""
 
288
  Upload the official ISP Handbook (.docx).
289
  This tool will compare, detect differences, and update changed sections.
290
  ---
291
- """)
 
292
 
293
  file_input = gr.File(label="Upload Handbook DOCX", file_types=[".docx"])
294
  log_output = gr.Textbox(label="Sync Log", lines=30)
 
43
  conn = get_db_connection()
44
  try:
45
  cursor = conn.cursor()
46
+ cursor.execute(
47
+ """
48
  SELECT section_json
49
  FROM university_handbook_sections
50
  WHERE university_id=%s AND section_key=%s
51
  LIMIT 1
52
+ """,
53
+ (university_id, section_key),
54
+ )
55
  row = cursor.fetchone()
56
  if not row or not row[0]:
57
  return None
 
69
  try:
70
  cursor = conn.cursor()
71
  new_json = json.dumps(new_data, ensure_ascii=False)
72
+ cursor.execute(
73
+ """
74
  UPDATE university_handbook_sections
75
  SET section_json=%s
76
  WHERE university_id=%s AND section_key=%s
77
+ """,
78
+ (new_json, university_id, section_key),
79
+ )
80
  conn.commit()
81
  finally:
82
  cursor.close()
 
86
  # -----------------------------
87
  # DOCX PARSING HELPERS
88
  # -----------------------------
89
def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to a single space.

    Argument-less ``str.split()`` already drops leading and trailing
    whitespace, so the original trailing ``.strip()`` was redundant and
    has been removed; behavior is unchanged.
    """
    return " ".join(t.split())
91
 
92
 
93
  def split_doc_by_university(doc: Document):
 
100
 
101
  indices.sort(key=lambda x: x[0])
102
 
103
+ uni_blocks: Dict[str, List[str]] = {}
104
  for idx, (start, uni_name) in enumerate(indices):
105
+ end = indices[idx + 1][0] if idx + 1 < len(indices) else len(paragraphs)
106
  uni_blocks[uni_name] = paragraphs[start:end]
107
  return uni_blocks
108
 
109
 
110
+ def parse_overview_block(block: List[str]) -> Dict[str, Any]:
111
+ data: Dict[str, Any] = {}
112
  for line in block:
113
  if line.startswith("Founded:"):
114
  data["founded"] = int(re.sub(r"[^\d]", "", line.split(":", 1)[1]))
 
127
  return data
128
 
129
 
130
+ def extract_between(block: List[str], start: str, stops: List[str]) -> List[str]:
131
+ out: List[str] = []
132
+ started = False
133
  for line in block:
134
  if not started and start in line:
135
  started = True
 
142
  return out
143
 
144
 
145
def parse_benefits_block(block: List[str]) -> Dict[str, Any]:
    """Extract the 'Benefits' section for one university block.

    Collects every line between the benefits heading and the start of
    the programs/qualification section, whitespace-normalized, and
    returns them under the ``"benefits"`` key.
    """
    start_marker = "Benefits for ISP students at this school"
    stop_markers = ["To qualify for The International Scholars Program"]
    raw_lines = extract_between(block, start_marker, stop_markers)
    return {"benefits": [normalize_text(line) for line in raw_lines]}
152
 
153
 
154
def parse_programs_block(block: List[str]) -> Dict[str, Any]:
    """Parse the qualification/programs table for one university block.

    The table arrives flattened to one field per line, in the order:
    program name, designation, entrance exam, zero or more career
    pathway lines, then a funding tier line starting with ``"TIER"``.
    Returns ``{"programs": [...]}`` with one dict per parsed record.
    """
    section = extract_between(
        block,
        "To qualify for The International Scholars Program",
        list(UNIVERSITY_ID_MAP.keys()),
    )

    # Column headings repeat inside the flattened table; drop them first.
    column_headers = {
        "Program",
        "Designation",
        "Entrance Exam Required",
        "Examples of Career Pathways",
        "Funding Category",
    }
    rows = [line for line in section if line not in column_headers]

    programs: List[Dict[str, Any]] = []
    total = len(rows)
    pos = 0
    while pos < total:
        # A record needs at least name + designation + exam + one more line;
        # anything shorter at the tail is an incomplete record and is dropped.
        if total - pos < 4:
            break

        record: Dict[str, Any] = {
            "program_name": rows[pos],
            "designation": rows[pos + 1],
            "entrance_exam": rows[pos + 2],
        }

        # Everything up to the "TIER..." line is a career pathway entry.
        cursor = pos + 3
        pathways: List[str] = []
        while cursor < total and not rows[cursor].startswith("TIER"):
            pathways.append(rows[cursor])
            cursor += 1

        record["career_pathways"] = pathways
        # No closing TIER line means the funding category is unknown.
        record["funding_category"] = rows[cursor] if cursor < total else ""
        programs.append(record)

        # Resume scanning just past the tier line.
        pos = cursor + 1

    return {"programs": programs}
198
 
199
 
200
+ # -----------------------------
201
+ # HIGH-LEVEL UNIVERSITY PARSER
202
+ # -----------------------------
203
def parse_university_block(uni_name: str, block: List[str]) -> Dict[str, Any]:
    """Parse one university's handbook lines into structured sections.

    Runs the overview, benefits and programs parsers over *block* and
    keeps only the sections that produced truthy data, so an empty dict
    signals a total parse miss that the caller can handle.
    (*uni_name* is accepted for interface symmetry; it is not used here.)
    """
    sections: Dict[str, Any] = {
        "overview": parse_overview_block(block),
        "benefits": parse_benefits_block(block),
        "programs": parse_programs_block(block),
    }
    # Drop falsy sections; insertion order (overview, benefits, programs)
    # is preserved for any caller that serializes the result.
    return {key: value for key, value in sections.items() if value}
223
+
224
+
225
  # -----------------------------
226
  # MAIN SYNC LOGIC
227
  # -----------------------------
 
230
  return "No handbook file uploaded."
231
 
232
  try:
233
+ # Gradio File object usually has a .name (temp file path)
234
  doc = Document(docx_file.name)
235
  except Exception as e:
236
  return f"Error reading DOCX: {e}"
237
 
238
  uni_blocks = split_doc_by_university(doc)
239
+ logs: List[str] = []
240
+ updated = 0
241
 
242
  for uni_name, uni_id in UNIVERSITY_ID_MAP.items():
243
  block = uni_blocks.get(uni_name)
 
275
  # -----------------------------
276
  # ISP BRANDING - BASE64 LOGO (ALWAYS VISIBLE)
277
  # -----------------------------
278
+ # TODO: Replace this with your real SVG base64
279
+ LOGO_SRC = "data:image/svg+xml;base64,..."
280
+
281
 
282
  ISP_PRIMARY = "#062A4D"
283
  ISP_GOLD = "#D6A229"
 
318
  # GRADIO UI
319
  # -----------------------------
320
  with gr.Blocks(title="Automated Handbook Sync Data Pipeline") as demo:
 
321
  gr.HTML(CUSTOM_CSS)
322
 
323
  # Header
324
+ gr.HTML(
325
+ f"""
326
  <div id='isp-header'>
327
  <img id='isp-logo' src='{LOGO_SRC}' alt='ISP Logo'/>
328
  <h1>ISP Handbook → Data Pipeline Sync (Full Auto)</h1>
329
  </div>
330
+ """
331
+ )
332
 
333
+ gr.Markdown(
334
+ """
335
  Upload the official ISP Handbook (.docx).
336
  This tool will compare, detect differences, and update changed sections.
337
  ---
338
+ """
339
+ )
340
 
341
  file_input = gr.File(label="Upload Handbook DOCX", file_types=[".docx"])
342
  log_output = gr.Textbox(label="Sync Log", lines=30)