TiH0 committed on
Commit
ad9b466
·
verified ·
1 Parent(s): a935d6c

Create stat.py

Browse files
Files changed (1) hide show
  1. stat.py +601 -0
stat.py ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from docx import Document
3
+ from docx.shared import Pt, RGBColor
4
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
5
+ from docx.oxml.shared import OxmlElement, qn
6
+ from docx.enum.section import WD_SECTION
7
+
8
# Theme color configuration - change these to customize the document colors.
# THEME_COLOR_HEX is also the fallback that process_excel_to_word() uses when
# it is called without a theme_hex argument.
THEME_COLOR_HEX = "5FFFDF"  # Hex version for XML elements
THEME_COLOR = RGBColor.from_string(THEME_COLOR_HEX)  # RGBColor version for direct use
11
+
12
+
13
def set_zero_spacing(paragraph):
    """Remove the vertical gap above and below *paragraph*.

    Both ``space_before`` and ``space_after`` of the paragraph's format are
    set to zero points. The paragraph is modified in place; nothing is
    returned.
    """
    fmt = paragraph.paragraph_format
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)
17
+
18
+
19
def is_valid_cours_number(cours_value):
    """Return True when *cours_value* denotes a positive whole course number.

    Missing values, the marker ``S2`` (any letter case), the text ``nan``,
    blank text, non-numeric text, zero, negatives and fractional numbers are
    all rejected. Surrounding whitespace is ignored.
    """
    if pd.isna(cours_value):
        return False

    text = str(cours_value).strip().upper()

    # Explicit markers that must never be treated as a course number.
    if text in ('S2', 'NAN', ''):
        return False

    try:
        number = float(text)
    except (ValueError, TypeError, OverflowError):
        return False

    # Course numbers are positive integers; is_integer() is False for
    # infinities and NaN, so those are rejected as well.
    return number > 0 and number.is_integer()
37
+
38
+
39
def check_if_course_has_e_choices(course_questions):
    """Tell whether any question of a course offers a choice lettered ``E``.

    ``course_questions`` is an iterable of question dicts, each holding a
    ``'choices'`` list of dicts with a ``'letter'`` string. The letter is
    compared case-insensitively. An empty course yields ``False``.
    """
    return any(
        option['letter'].upper() == 'E'
        for question in course_questions
        for option in question['choices']
    )
46
+
47
+
48
def read_course_titles_from_module_sheet(excel_file_path, module_name):
    """Read the course list of one module from its dedicated worksheet.

    The worksheet is found by comparing each sheet name, stripped and
    lower-cased, with the stripped, lower-cased ``module_name``; the first
    match wins. The sheet must expose a ``cours`` column and a ``titre``
    column. Rows with a missing ``cours`` or ``titre``, or whose ``cours`` is
    rejected by ``is_valid_cours_number``, are ignored.

    Args:
        excel_file_path: Workbook location, passed to ``pd.ExcelFile``.
        module_name: Module whose sheet should be read.

    Returns:
        A ``(cours_titles, cours_order)`` tuple. ``cours_titles`` maps the
        integer course number to its title (the last title wins when a number
        is listed more than once). ``cours_order`` lists each course number
        once, in order of first appearance in the sheet. Both are empty when
        no sheet matches or the sheet lacks the expected columns.
    """
    cours_titles = {}
    cours_order = []

    print(f" DEBUG: Looking for sheet matching module '{module_name}'")

    module_name_lower = str(module_name).strip().lower()

    # Open the workbook a single time and release the handle deterministically;
    # the matching sheet is parsed from the same open workbook.
    with pd.ExcelFile(excel_file_path) as xls:
        sheet_names = xls.sheet_names

        print(f" DEBUG: Module name (lowercase): '{module_name_lower}'")
        print(f" DEBUG: Available sheets: {sheet_names}")

        # Find matching sheet (case-insensitive)
        target_sheet = None
        for sheet in sheet_names:
            sheet_lower = sheet.strip().lower()
            print(f" DEBUG: Comparing '{module_name_lower}' with '{sheet_lower}'")
            if sheet_lower == module_name_lower:
                target_sheet = sheet
                print(f" DEBUG: MATCH FOUND! Using sheet '{target_sheet}'")
                break

        if target_sheet is None:
            print(f" DEBUG: No sheet found matching module '{module_name}'")
            return cours_titles, cours_order

        cours_df = pd.read_excel(xls, sheet_name=target_sheet)

    print(f" DEBUG: Sheet '{target_sheet}' has {len(cours_df)} rows")
    print(f" DEBUG: Sheet columns: {list(cours_df.columns)}")

    has_cours = 'cours' in cours_df.columns
    has_titre = 'titre' in cours_df.columns

    if cours_df.empty or not (has_cours and has_titre):
        print(f" DEBUG: Sheet '{target_sheet}' doesn't have expected structure")
        print(f" DEBUG: Has 'cours' column: {has_cours}")
        print(f" DEBUG: Has 'titre' column: {has_titre}")
        return cours_titles, cours_order

    for idx, row in cours_df.iterrows():
        print(f" DEBUG: Row {idx}: cours={row['cours']}, titre={row.get('titre', 'N/A')}")
        if pd.isna(row['cours']) or pd.isna(row['titre']):
            continue

        # Only store valid numeric courses
        if not is_valid_cours_number(row['cours']):
            print(f" DEBUG: Skipped invalid cours: {row['cours']}")
            continue

        cours_num = int(float(str(row['cours']).strip()))
        # Record the position only once: a duplicated course number in the
        # order list would make callers visit (and count) that course twice.
        if cours_num not in cours_titles:
            cours_order.append(cours_num)
        cours_titles[cours_num] = row['titre']
        print(f" DEBUG: Added cours {cours_num}: {row['titre']}")

    print(f" DEBUG: Final count: {len(cours_titles)} course titles from sheet '{target_sheet}'")
    print(f" DEBUG: Course order: {cours_order}")

    return cours_titles, cours_order
104
+
105
+
106
def process_excel_to_word(excel_file_path, output_word_path, theme_hex=None):
    """Build a Word document with question statistics from an Excel workbook.

    The first worksheet is read as the question list. The code looks up the
    columns ``Numero``, ``Question``, ``Source``, ``Comment``, ``Cours``,
    ``Order``, ``ChoiceText`` and ``CT`` on every row, plus an optional
    ``module`` column that is matched case-insensitively. A row with a
    ``Numero`` starts a new question; that row and the following rows without
    a ``Numero`` each contribute one answer choice (a ``CT`` of ``X`` marks
    it correct). A missing ``Cours`` defaults to course 1; questions whose
    course is rejected by ``is_valid_cours_number`` are skipped and counted.

    Every worksheet after the first is treated as a module sheet. Modules are
    reported in worksheet order, and courses in the order returned by
    ``read_course_titles_from_module_sheet`` (courses that have questions but
    are not listed in the module sheet follow in ascending numeric order).
    Questions whose module has no matching worksheet are grouped but not
    reported.

    The saved document contains a two-column section with a "Questions per
    Course" table and a "Repeated Questions" table (or a note when nothing is
    repeated). Progress is printed to stdout.

    Args:
        excel_file_path: Workbook to read.
        output_word_path: Destination passed to ``Document.save``.
        theme_hex: Six-digit hex colour for headings; ``THEME_COLOR_HEX`` is
            used when omitted.

    Returns:
        None.

    Raises:
        KeyError: When a data row is read and one of the expected columns is
            absent from the first worksheet.
    """
    # Fall back to the module-wide theme when the caller does not pick one.
    if theme_hex is None:
        theme_hex = THEME_COLOR_HEX
    theme_color = RGBColor.from_string(theme_hex)

    # Open the workbook once, read what is needed, and release the handle.
    with pd.ExcelFile(excel_file_path) as xls:
        all_sheets = list(xls.sheet_names)
        questions_df = pd.read_excel(xls, sheet_name=all_sheets[0])

    print("DEBUG: Excel file loaded successfully")
    print(f"DEBUG: Total rows in Questions sheet: {len(questions_df)}")
    print("DEBUG: Column names:", list(questions_df.columns))

    # Clean column names BEFORE any column is looked up, so the name captured
    # for the module column stays valid afterwards. Non-string labels are left
    # untouched instead of being turned into NaN.
    questions_df.columns = [
        col.strip() if isinstance(col, str) else col
        for col in questions_df.columns
    ]

    # Locate the module column (case-insensitive)
    module_col = None
    for col in questions_df.columns:
        if str(col).strip().lower() == 'module':
            module_col = col
            break

    if module_col is not None:
        print(f"DEBUG: All sheets in Excel (in order): {all_sheets}")

        # Skip the first sheet (Questions sheet) and use remaining sheets as module order
        module_sheets = all_sheets[1:]
        print(f"DEBUG: Module sheets (in order): {module_sheets}")

        # Lower-cased sheet name -> actual sheet name, for comparison
        sheet_lower_map = {sheet.strip().lower(): sheet for sheet in module_sheets}

        modules_in_questions = questions_df[module_col].dropna().unique()
        print(f"DEBUG: Unique modules from Questions sheet: {list(modules_in_questions)}")

        # Map each module value found in the questions to its sheet name
        module_to_sheet = {}
        for module in modules_in_questions:
            module_lower = str(module).strip().lower()
            if module_lower in sheet_lower_map:
                module_to_sheet[module] = sheet_lower_map[module_lower]
                print(f"DEBUG: Mapped '{module}' -> '{sheet_lower_map[module_lower]}'")

        print(f"DEBUG: Module to sheet mapping: {module_to_sheet}")

        # Normalize all module names in the dataframe to use sheet names
        questions_df[module_col] = questions_df[module_col].apply(
            lambda x: module_to_sheet.get(x, x) if pd.notna(x) else x
        )

        # Ordered list of modules, following sheet order
        used_sheets = set(module_to_sheet.values())
        modules = [sheet for sheet in module_sheets if sheet in used_sheets]

        print(f"DEBUG: Final modules list in sheet order: {modules}")
    else:
        print("DEBUG: No 'module' column found in Questions sheet!")
        print(f"DEBUG: Available columns: {list(questions_df.columns)}")
        modules = []

    # Read course titles from module-specific sheets and organize by module
    modules_data = {}  # {module_name: {cours_num: cours_title}}
    modules_course_order = {}  # {module_name: [ordered list of course numbers]}
    print(f"DEBUG: Available sheets in Excel file: {all_sheets}")

    for module in modules:
        print(f"\nDEBUG: Processing module '{module}'...")
        try:
            cours_titles_for_module, cours_order = read_course_titles_from_module_sheet(excel_file_path, module)
            print(f"DEBUG: Got {len(cours_titles_for_module)} course titles from module '{module}'")
            print(f"DEBUG: Course titles: {cours_titles_for_module}")
            print(f"DEBUG: Course order: {cours_order}")
            modules_data[module] = cours_titles_for_module
            modules_course_order[module] = cours_order
        except Exception as e:
            # Best effort: a broken module sheet must not abort the whole run.
            print(f"DEBUG: Error reading module '{module}': {e}")
            import traceback
            traceback.print_exc()

    print(f"\nDEBUG: Modules data: {modules_data}")
    print(f"DEBUG: Modules course order: {modules_course_order}")

    # --- Group each question row with its following choice rows ---
    processed_questions = []
    skipped_s2_questions = 0

    def finish_question(question, label):
        """Store a fully-read question, or count it as skipped (invalid/S2)."""
        nonlocal skipped_s2_questions
        if question is None:
            return
        if not is_valid_cours_number(question['cours']):
            skipped_s2_questions += 1
            print(f"DEBUG: Skipped {label} {question['numero']} from cours '{question['cours']}' (invalid/S2)")
            return
        if not question['choices']:
            # Valid course but no usable choice rows: dropped without counting.
            return
        question['cours'] = int(float(str(question['cours']).strip()))
        processed_questions.append(question)
        print(f"DEBUG: Saved question {question['numero']} with {len(question['choices'])} choices")

    # Question currently being assembled. It starts as None so that choice
    # rows appearing before the first numbered row are ignored instead of
    # touching unbound variables.
    current = None

    print("DEBUG: Processing rows sequentially to group choices...")

    for _, row in questions_df.iterrows():
        numero = row['Numero']

        # A row carrying a question number starts a new question
        if pd.notna(numero):
            finish_question(current, "question")

            raw_comment = row['Comment']
            has_comment = pd.notna(raw_comment) and str(raw_comment).lower() != 'nan'
            has_module = module_col is not None and pd.notna(row[module_col])

            current = {
                'numero': numero,
                'question_text': str(row['Question']).strip(),
                'source': str(row['Source']).strip() if pd.notna(row['Source']) else "",
                'comment': str(raw_comment).strip() if has_comment else None,
                'cours': row['Cours'] if pd.notna(row['Cours']) else 1,  # Default to course 1
                'module': row[module_col] if has_module else None,
                'choices': [],
            }

            print(f"\nDEBUG: Starting new question {numero}, Course: {current['cours']}")

        # Only collect choices for a started question with a valid course
        if current is None or not is_valid_cours_number(current['cours']):
            continue

        # This row is a choice, whether it is the question row or a follow-up row
        choice_letter = str(row['Order']).strip().upper()
        choice_text = str(row['ChoiceText']).strip()
        ct_value = str(row['CT']).strip().upper() if pd.notna(row['CT']) else ""

        if choice_text and choice_text.lower() != 'nan':
            current['choices'].append({
                'letter': choice_letter,
                'text': choice_text,
                'is_correct': ct_value == 'X',
            })

    # Don't forget the last question
    finish_question(current, "final question")

    print(f"\nDEBUG: Total processed questions: {len(processed_questions)}")
    print(f"DEBUG: Total skipped S2/invalid questions: {skipped_s2_questions}")

    # --- Group questions by module and course, preserving module sheet order ---
    questions_by_module = {module: {} for module in modules}
    for q_data in processed_questions:
        # Modules without a matching sheet are still grouped (appended after
        # the sheet-ordered ones) so they are reflected in the module total.
        by_course = questions_by_module.setdefault(q_data['module'], {})
        by_course.setdefault(q_data['cours'], []).append(q_data)

    # Order courses inside each module: sheet order first, then any course
    # that has questions but is not listed in the sheet, in ascending order.
    for module_name in list(questions_by_module):
        courses = questions_by_module[module_name]
        ordered_courses = {}
        for cours_num in modules_course_order.get(module_name, []):
            if cours_num in courses:
                ordered_courses[cours_num] = courses[cours_num]
        for cours_num in sorted(courses):
            if cours_num not in ordered_courses:
                ordered_courses[cours_num] = courses[cours_num]
        questions_by_module[module_name] = ordered_courses
        print(f"DEBUG: Reordered courses for module '{module_name}': {list(ordered_courses.keys())}")

    print(f"DEBUG: Questions grouped by modules (sheet order preserved): {list(questions_by_module.keys())}")

    # --- Count E choices and collect statistics in one consistent order ---
    total_e_choices = 0
    # Keyed by (module, course number) so that two courses sharing a title
    # (e.g. the "Course N" fallback in different modules) are not merged.
    stats_course_counts = {}  # {(module_name, cours_num): count}
    stats_question_repeats = {}  # {question_text: count}

    for module_name in modules:  # Sheet order
        for cours_num, course_questions in questions_by_module[module_name].items():
            course_e_count = sum(
                1
                for q_data in course_questions
                for choice in q_data['choices']
                if choice['letter'].upper() == 'E'
            )
            if course_e_count > 0:
                print(f"DEBUG: Module '{module_name}' Course {cours_num} has {course_e_count} E choices")
                total_e_choices += course_e_count

            stats_course_counts[(module_name, cours_num)] = len(course_questions)

            for q_data in course_questions:
                q_text = str(q_data['question_text']).strip()
                stats_question_repeats[q_text] = stats_question_repeats.get(q_text, 0) + 1

    print(f"DEBUG: Total E choices found across all modules: {total_e_choices}")

    print(f"\nDEBUG: Statistics collected:")
    print(f" - Courses tracked: {len(stats_course_counts)}")
    print(f" - Unique questions: {len(stats_question_repeats)}")
    print(f" - Repeated questions: {sum(1 for count in stats_question_repeats.values() if count > 1)}")

    # --- Build the Word document ---
    doc = Document()

    def style_heading(run, size):
        """Apply the themed bold Montserrat look used by all headings."""
        run.font.name = 'Montserrat'
        run.font.size = Pt(size)
        run.font.bold = True
        run.font.color.rgb = theme_color

    def keep_rows_together(tbl):
        """Mark every row of *tbl* with w:cantSplit."""
        for table_row in tbl.rows:
            table_row._tr.get_or_add_trPr().append(OxmlElement('w:cantSplit'))

    # Statistics live in their own continuous section with a 2-column layout
    stats_section = doc.add_section(WD_SECTION.CONTINUOUS)
    cols = stats_section._sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'), '2')

    # STATISTICS title
    stats_para = doc.add_paragraph()
    stats_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    style_heading(stats_para.add_run("STATISTICS"), 14)

    # --- Questions per Course ---
    style_heading(doc.add_paragraph().add_run("Questions per Course:"), 11)

    table = doc.add_table(rows=1, cols=2)
    table.style = 'Table Grid'
    hdr = table.rows[0].cells
    hdr[0].text = "Course"
    hdr[1].text = "Number of Questions"

    for cell in hdr:
        for paragraph in cell.paragraphs:
            paragraph.paragraph_format.keep_together = True

    # One row per course: module order, then course order within the module
    for module_name in modules:
        cours_titles = modules_data.get(module_name, {})

        for cours_num in questions_by_module[module_name]:
            course_title = cours_titles.get(cours_num, f"Course {cours_num}")
            count = stats_course_counts.get((module_name, cours_num), 0)

            cells = table.add_row().cells
            cells[0].text = str(course_title)
            cells[1].text = str(count)

            for cell in cells:
                for paragraph in cell.paragraphs:
                    paragraph.paragraph_format.keep_together = True

    keep_rows_together(table)

    # --- Repeated Questions ---
    doc.add_paragraph()
    style_heading(doc.add_paragraph().add_run("Repeated Questions:"), 11)

    repeated = {q: c for q, c in stats_question_repeats.items() if c > 1}
    if repeated:
        rep_table = doc.add_table(rows=1, cols=2)
        rep_table.style = 'Table Grid'
        hdr2 = rep_table.rows[0].cells
        hdr2[0].text = "Question"
        hdr2[1].text = "Times Repeated"

        # Most repeated first; ties keep their first-seen order (stable sort)
        for q, c in sorted(repeated.items(), key=lambda item: item[1], reverse=True):
            cells = rep_table.add_row().cells
            cells[0].text = q
            cells[1].text = str(c)

        keep_rows_together(rep_table)
    else:
        doc.add_paragraph("No repeated questions found.")

    # Save document
    doc.save(output_word_path)
    print(f"\n🎉 SUCCESS: Document saved as: {output_word_path}")
    print(f"📚 Total modules processed: {len(questions_by_module)}")
    print(f"🚫 Total S2/invalid questions skipped: {skipped_s2_questions}")
    print(f"📄 Questions sorted by module sheet order and course number")
    if total_e_choices > 0:
        print(f"✨ Dynamic E columns added for courses with 5-choice questions")
497
+
498
+
499
def debug_excel_structure(excel_file_path):
    """Print a diagnostic summary of the workbook layout to stdout.

    Reports, for the first worksheet: row count, column names and the unique
    values of the ``Numero``, ``Order`` and ``Cours`` columns when present
    (``Cours`` values are split into valid / invalid with
    ``is_valid_cours_number``). If a ``module`` column exists (matched
    case-insensitively), each module is checked against the worksheet names
    (stripped, case-insensitive) and the ``cours``/``titre`` rows of the
    matching sheet are listed. Finally a worksheet literally named ``Cours``
    is dumped if it can be read.

    Args:
        excel_file_path: Workbook to inspect.

    Returns:
        None. All output goes to stdout; sheet-reading errors are printed
        rather than raised.
    """
    print("=== DEBUGGING EXCEL STRUCTURE ===")

    # Open the workbook once and release the handle as soon as the sheet
    # names and the first sheet have been read.
    with pd.ExcelFile(excel_file_path) as xls:
        sheet_names = list(xls.sheet_names)
        questions_df = pd.read_excel(xls, sheet_name=sheet_names[0])

    print(f"Total rows: {len(questions_df)}")
    print(f"Columns: {list(questions_df.columns)}")

    # Check unique values in key columns
    if 'Numero' in questions_df.columns:
        numeros = questions_df['Numero'].dropna().unique()
        try:
            print(f"Unique Numero values: {sorted(numeros)}")
        except TypeError as e:
            # Mixed value types cannot be ordered; show them unsorted.
            print(f"Unique Numero values: {list(numeros)} (couldn't sort: {e})")

    if 'Order' in questions_df.columns:
        orders = questions_df['Order'].dropna()
        unique_orders = orders.unique()
        try:
            print(f"Unique Order values: {sorted(unique_orders)}")
        except TypeError as e:
            print(f"Unique Order values: {list(unique_orders)} (couldn't sort: {e})")

        # Counted outside the try block so a sorting failure does not hide it
        e_count = sum(1 for order in orders if str(order).strip().upper() == 'E')
        print(f"Total E choices found: {e_count}")

    if 'Cours' in questions_df.columns:
        unique_cours = questions_df['Cours'].dropna().unique()
        print(f"Unique Cours values: {[str(c) for c in unique_cours]}")

        # Separate the values by validity
        valid_cours = [c for c in unique_cours if is_valid_cours_number(c)]
        invalid_cours = [str(c) for c in unique_cours if not is_valid_cours_number(c)]

        # Valid values are compared numerically, invalid ones as text
        print(f"Valid cours values: {sorted(float(str(c).strip()) for c in valid_cours)}")
        print(f"Invalid/S2 cours values: {sorted(invalid_cours)}")

    # Check module column and corresponding sheets. The column is matched
    # case-insensitively, like process_excel_to_word does.
    module_col = next(
        (col for col in questions_df.columns if str(col).strip().lower() == 'module'),
        None,
    )
    if module_col is not None:
        unique_modules = questions_df[module_col].dropna().unique()
        print(f"\nUnique Module values: {list(unique_modules)}")

        # Normalized sheet name -> actual sheet name (first occurrence wins)
        sheet_by_lower = {}
        for name in sheet_names:
            sheet_by_lower.setdefault(name.strip().lower(), name)

        print("\nModule sheet availability:")
        for module in unique_modules:
            actual_sheet = sheet_by_lower.get(str(module).strip().lower())
            if actual_sheet is None:
                print(f" ✗ Module '{module}' -> No matching sheet found")
                continue

            print(f" ✓ Module '{module}' -> Sheet '{actual_sheet}' found")

            # Try to read and show course info from this sheet
            try:
                module_df = pd.read_excel(excel_file_path, sheet_name=actual_sheet)
                if 'cours' in module_df.columns and 'titre' in module_df.columns:
                    print(f" Courses in this module:")
                    for _, row in module_df.iterrows():
                        if pd.notna(row['cours']):
                            print(f" - {row['cours']}: {row.get('titre', 'N/A')}")
            except Exception as e:
                # Diagnostic tool: report and keep going with the next module
                print(f" Error reading sheet: {e}")

    # Check Cours sheet
    try:
        cours_df = pd.read_excel(excel_file_path, sheet_name='Cours')
        print(f"\nCours sheet - Total rows: {len(cours_df)}")
        print(f"Cours sheet columns: {list(cours_df.columns)}")
        if not cours_df.empty:
            print("Course titles:")
            for _, row in cours_df.iterrows():
                cours_val = row.get('cours', 'N/A')
                status = "✓" if is_valid_cours_number(cours_val) else "✗ (SKIPPED)"
                print(f" Course {cours_val}: {row.get('titre', 'N/A')} {status}")
    except Exception as e:
        # A workbook without a 'Cours' sheet is reported, not fatal
        print(f"Error reading Cours sheet: {e}")