File size: 25,974 Bytes
ad9b466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
import pandas as pd
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import OxmlElement, qn
from docx.enum.section import WD_SECTION

# Theme color configuration - change these to customize the document colors
THEME_COLOR_HEX = "5FFFDF"  # Hex version for XML elements
THEME_COLOR = RGBColor.from_string(THEME_COLOR_HEX)  # RGBColor version for direct use


def set_zero_spacing(paragraph):
    """Force paragraph spacing to 0 before and after."""
    paragraph.paragraph_format.space_before = Pt(0)
    paragraph.paragraph_format.space_after = Pt(0)


def is_valid_cours_number(cours_value):
    """Check if cours value is valid (numeric and not 'S2')"""
    if pd.isna(cours_value):
        return False

    cours_str = str(cours_value).strip().upper()

    # Skip S2 courses and other specific invalid values
    if cours_str in ['S2', 'NAN', '']:
        return False

    # Try to convert to numeric - if it works and is positive, it's valid
    try:
        numeric_value = float(cours_str)
        # Check if it's a positive number (courses should be positive integers)
        return numeric_value > 0 and numeric_value == int(numeric_value)
    except (ValueError, TypeError, OverflowError):
        return False


def check_if_course_has_e_choices(course_questions):
    """Check if any question in the course has an E choice"""
    for q_data in course_questions:
        for choice in q_data['choices']:
            if choice['letter'].upper() == 'E':
                return True
    return False


def read_course_titles_from_module_sheet(excel_file_path, module_name):
    """Read course titles from a module-specific sheet (case-insensitive)
    Returns both titles dict and ordered list of course numbers"""
    cours_titles = {}
    cours_order = []  # NEW: Keep track of order courses appear in sheet

    print(f"  DEBUG: Looking for sheet matching module '{module_name}'")

    # Get all sheet names from the Excel file
    xls = pd.ExcelFile(excel_file_path)
    sheet_names = xls.sheet_names

    # Find matching sheet (case-insensitive)
    target_sheet = None
    module_name_lower = str(module_name).strip().lower()

    print(f"  DEBUG: Module name (lowercase): '{module_name_lower}'")
    print(f"  DEBUG: Available sheets: {sheet_names}")

    for sheet in sheet_names:
        sheet_lower = sheet.strip().lower()
        print(f"  DEBUG: Comparing '{module_name_lower}' with '{sheet_lower}'")
        if sheet_lower == module_name_lower:
            target_sheet = sheet
            print(f"  DEBUG: MATCH FOUND! Using sheet '{target_sheet}'")
            break

    if target_sheet is None:
        print(f"  DEBUG: No sheet found matching module '{module_name}'")
        return cours_titles, cours_order

    # Read the matching sheet
    cours_df = pd.read_excel(excel_file_path, sheet_name=target_sheet)
    print(f"  DEBUG: Sheet '{target_sheet}' has {len(cours_df)} rows")
    print(f"  DEBUG: Sheet columns: {list(cours_df.columns)}")

    if not cours_df.empty and 'cours' in cours_df.columns and 'titre' in cours_df.columns:
        for idx, row in cours_df.iterrows():
            print(f"  DEBUG: Row {idx}: cours={row['cours']}, titre={row.get('titre', 'N/A')}")
            if pd.notna(row['cours']) and pd.notna(row['titre']):
                # Only store valid numeric courses
                if is_valid_cours_number(row['cours']):
                    cours_num = int(float(str(row['cours']).strip()))
                    cours_titles[cours_num] = row['titre']
                    cours_order.append(cours_num)  # NEW: Preserve order
                    print(f"  DEBUG: Added cours {cours_num}: {row['titre']}")
                else:
                    print(f"  DEBUG: Skipped invalid cours: {row['cours']}")
        print(f"  DEBUG: Final count: {len(cours_titles)} course titles from sheet '{target_sheet}'")
        print(f"  DEBUG: Course order: {cours_order}")
    else:
        print(f"  DEBUG: Sheet '{target_sheet}' doesn't have expected structure")
        print(f"  DEBUG: Has 'cours' column: {'cours' in cours_df.columns}")
        print(f"  DEBUG: Has 'titre' column: {'titre' in cours_df.columns}")

    return cours_titles, cours_order


def process_excel_to_word(excel_file_path, output_word_path, theme_hex=None):
    """Main function to process Excel and create Word document with improved column balancing and answer tables"""

    # Set default theme colors if not provided
    if theme_hex is None:
        theme_hex = THEME_COLOR_HEX
    theme_color = RGBColor.from_string(theme_hex)

    # Read the Excel file
    xls = pd.ExcelFile(excel_file_path)
    first_sheet_name = xls.sheet_names[0]  # Get the first sheet name
    questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)

    # Debug: Print the data structure
    print("DEBUG: Excel file loaded successfully")

    # Get unique modules from Questions sheet (case-insensitive)
    module_col = None
    for col in questions_df.columns:
        if col.lower().strip() == 'module':
            module_col = col
            break

    if module_col:
        # Get all sheet names from Excel (in order)
        xls_temp = pd.ExcelFile(excel_file_path)
        all_sheets = xls_temp.sheet_names

        print(f"DEBUG: All sheets in Excel (in order): {all_sheets}")

        # Skip the first sheet (Questions sheet) and use remaining sheets as module order
        module_sheets = all_sheets[1:]  # Exclude Questions sheet

        print(f"DEBUG: Module sheets (in order): {module_sheets}")

        # Create lowercase mapping for comparison
        sheet_lower_map = {sheet.strip().lower(): sheet for sheet in module_sheets}

        # Get unique modules from Questions column
        modules_in_questions = questions_df[module_col].dropna().unique()
        print(f"DEBUG: Unique modules from Questions sheet: {list(modules_in_questions)}")

        # Map each module in Questions to its corresponding sheet name
        module_to_sheet = {}
        for module in modules_in_questions:
            module_lower = str(module).strip().lower()
            if module_lower in sheet_lower_map:
                module_to_sheet[module] = sheet_lower_map[module_lower]
                print(f"DEBUG: Mapped '{module}' -> '{sheet_lower_map[module_lower]}'")

        print(f"DEBUG: Module to sheet mapping: {module_to_sheet}")

        # Normalize all module names in the dataframe to use sheet names
        questions_df[module_col] = questions_df[module_col].apply(
            lambda x: module_to_sheet.get(x, x) if pd.notna(x) else x
        )

        # Now create ordered list of modules based on sheet order
        modules = []
        for sheet in module_sheets:
            if sheet in module_to_sheet.values():
                modules.append(sheet)

        print(f"DEBUG: Final modules list in sheet order: {modules}")
    else:
        print("DEBUG: No 'module' column found in Questions sheet!")
        print(f"DEBUG: Available columns: {list(questions_df.columns)}")
        modules = []

    # Read course titles from module-specific sheets and organize by module
    modules_data = {}  # {module_name: {cours_num: cours_title}}
    modules_course_order = {}  # NEW: {module_name: [ordered list of course numbers]}
    xls = pd.ExcelFile(excel_file_path)
    print(f"DEBUG: Available sheets in Excel file: {xls.sheet_names}")

    for module in modules:
        print(f"\nDEBUG: Processing module '{module}'...")
        try:
            cours_titles_for_module, cours_order = read_course_titles_from_module_sheet(excel_file_path, module)
            print(f"DEBUG: Got {len(cours_titles_for_module)} course titles from module '{module}'")
            print(f"DEBUG: Course titles: {cours_titles_for_module}")
            print(f"DEBUG: Course order: {cours_order}")
            modules_data[module] = cours_titles_for_module
            modules_course_order[module] = cours_order  # NEW: Store order
        except Exception as e:
            print(f"DEBUG: Error reading module '{module}': {e}")
            import traceback
            traceback.print_exc()

    print(f"\nDEBUG: Modules data: {modules_data}")
    print(f"DEBUG: Modules course order: {modules_course_order}")

    # Debug: Print the data structure
    print("DEBUG: Excel file loaded successfully")
    print(f"DEBUG: Total rows in Questions sheet: {len(questions_df)}")
    print("DEBUG: Column names:", list(questions_df.columns))

    # Clean column names (remove any extra spaces)
    questions_df.columns = questions_df.columns.str.strip()

    # Create Word document
    doc = Document()

    # --- Statistics collectors (questions per course and repeats) ---
    stats_course_counts = {}  # { course_title: count }
    stats_question_repeats = {}  # { question_text: count }

    # Process questions with their following choice rows, grouped by course
    processed_questions = []
    current_question = None
    current_choices = []
    skipped_s2_questions = 0

    print("DEBUG: Processing rows sequentially to group choices...")

    for idx, row in questions_df.iterrows():
        numero = row['Numero']

        # If this row has a question number, it's a new question
        if pd.notna(numero):
            # If we were processing a previous question, save it (only if valid cours)
            if current_question is not None and current_choices and is_valid_cours_number(current_cours):
                processed_questions.append({
                    'numero': current_question,
                    'question_text': current_question_text,
                    'source': current_source,
                    'comment': current_comment,
                    'cours': int(float(str(current_cours).strip())),  # Convert to int
                    'module': current_module,
                    'choices': current_choices.copy()
                })
                print(f"DEBUG: Saved question {current_question} with {len(current_choices)} choices")
            elif current_question is not None and not is_valid_cours_number(current_cours):
                skipped_s2_questions += 1
                print(f"DEBUG: Skipped question {current_question} from cours '{current_cours}' (invalid/S2)")

            # Start new question
            current_question = numero
            current_question_text = str(row['Question']).strip()
            current_source = str(row['Source']).strip() if pd.notna(row['Source']) else ""
            current_comment = str(row['Comment']).strip() if pd.notna(row['Comment']) and str(
                row['Comment']).lower() != 'nan' else None
            current_cours = row['Cours'] if pd.notna(row['Cours']) else 1  # Default to course 1
            current_module = row[module_col] if module_col and pd.notna(row[module_col]) else None
            current_choices = []

            print(f"\nDEBUG: Starting new question {numero}, Course: {current_cours}")

        # Only add choices if the current cours is valid
        if is_valid_cours_number(current_cours):
            # Add this row as a choice (whether it's the question row or a choice row)
            choice_letter = str(row['Order']).strip().upper()
            choice_text = str(row['ChoiceText']).strip()
            ct_value = str(row['CT']).strip().upper() if pd.notna(row['CT']) else ""
            is_correct = ct_value == 'X'

            if choice_text and choice_text.lower() != 'nan' and choice_text != '':
                current_choices.append({
                    'letter': choice_letter,
                    'text': choice_text,
                    'is_correct': is_correct
                })

    # Don't forget the last question (only if valid cours)
    if current_question is not None and current_choices and is_valid_cours_number(current_cours):
        processed_questions.append({
            'numero': current_question,
            'question_text': current_question_text,
            'source': current_source,
            'comment': current_comment,
            'cours': int(float(str(current_cours).strip())),  # Convert to int
            'module': current_module,
            'choices': current_choices.copy()
        })
    elif current_question is not None and not is_valid_cours_number(current_cours):
        skipped_s2_questions += 1
        print(f"DEBUG: Skipped final question {current_question} from cours '{current_cours}' (invalid/S2)")

    print(f"\nDEBUG: Total processed questions: {len(processed_questions)}")
    print(f"DEBUG: Total skipped S2/invalid questions: {skipped_s2_questions}")

    # Group questions by module and course, preserving module order
    # Use a regular dict (Python 3.7+ preserves insertion order)
    questions_by_module = {}

    # Initialize with ordered modules to preserve sheet order
    for module in modules:
        questions_by_module[module] = {}

    # Fill in the questions
    for q_data in processed_questions:
        module_name = q_data['module']
        cours_num = q_data['cours']

        # Only add if module is in our ordered list
        if module_name in questions_by_module:
            if cours_num not in questions_by_module[module_name]:
                questions_by_module[module_name][cours_num] = []

            questions_by_module[module_name][cours_num].append(q_data)
        else:
            # Handle modules not in sheet list (shouldn't happen but just in case)
            if module_name not in questions_by_module:
                questions_by_module[module_name] = {}
            if cours_num not in questions_by_module[module_name]:
                questions_by_module[module_name][cours_num] = []
            questions_by_module[module_name][cours_num].append(q_data)

    # NEW: Reorder courses within each module based on sheet order
    for module_name in list(questions_by_module.keys()):
        if module_name in modules_course_order:
            course_order = modules_course_order[module_name]
            # Create new ordered dict with courses in sheet order
            ordered_courses = {}
            for cours_num in course_order:
                if cours_num in questions_by_module[module_name]:
                    ordered_courses[cours_num] = questions_by_module[module_name][cours_num]

            # Add any courses that weren't in the sheet (shouldn't happen, but just in case)
            for cours_num in questions_by_module[module_name]:
                if cours_num not in ordered_courses:
                    ordered_courses[cours_num] = questions_by_module[module_name][cours_num]

            questions_by_module[module_name] = ordered_courses
            print(f"DEBUG: Reordered courses for module '{module_name}': {list(ordered_courses.keys())}")

    print(f"DEBUG: Questions grouped by modules (sheet order preserved): {list(questions_by_module.keys())}")

    # Check for E choices across all modules - use TOC order
    total_e_choices = 0
    for module_name in modules:  # Sheet order
        if module_name not in questions_by_module:
            continue

        course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))

        for cours_num in course_order:  # Sheet order within module
            if cours_num not in questions_by_module[module_name]:
                continue

            course_questions = questions_by_module[module_name][cours_num]
            course_e_count = sum(1 for q_data in course_questions
                                 for choice in q_data['choices']
                                 if choice['letter'].upper() == 'E')
            if course_e_count > 0:
                print(f"DEBUG: Module '{module_name}' Course {cours_num} has {course_e_count} E choices")
                total_e_choices += course_e_count

    print(f"DEBUG: Total E choices found across all modules: {total_e_choices}")

    # Collect statistics from processed questions
    # Use TOC order (modules in sheet order, courses in sheet order within module)
    for module_name in modules:  # Already in sheet order
        if module_name not in questions_by_module:
            continue

        # Get course order for this module
        course_order = modules_course_order.get(module_name, [])

        # Iterate courses in sheet order
        for cours_num in course_order:
            if cours_num not in questions_by_module[module_name]:
                continue

            course_questions = questions_by_module[module_name][cours_num]

            # Get course title
            cours_titles = modules_data.get(module_name, {})
            course_title = cours_titles.get(cours_num, f"Course {cours_num}")

            # Count questions per course
            stats_course_counts[course_title] = stats_course_counts.get(course_title, 0) + len(course_questions)

            # Count repeated questions
            for q_data in course_questions:
                q_text = str(q_data['question_text']).strip()
                stats_question_repeats[q_text] = stats_question_repeats.get(q_text, 0) + 1

    print(f"\nDEBUG: Statistics collected:")
    print(f"  - Courses tracked: {len(stats_course_counts)}")
    print(f"  - Unique questions: {len(stats_question_repeats)}")
    print(f"  - Repeated questions: {sum(1 for count in stats_question_repeats.values() if count > 1)}")

    # --- Insert Statistics section (two-column layout) before TOC ---
    # Add a new section (but keep 2-column layout)
    stats_section = doc.add_section(WD_SECTION.CONTINUOUS)

    # Ensure this new section keeps the same column layout (2 columns)
    sectPr = stats_section._sectPr
    cols = sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'), '2')

    # --- Add STATISTICS title and bookmark so it appears in TOC ---
    stats_para = doc.add_paragraph()
    stats_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    stats_run = stats_para.add_run("STATISTICS")
    stats_run.font.name = 'Montserrat'
    stats_run.font.size = Pt(14)
    stats_run.font.bold = True
    stats_run.font.color.rgb = theme_color

    # --- Questions per Course ---
    p = doc.add_paragraph()
    run = p.add_run("Questions per Course:")
    run.font.name = 'Montserrat'
    run.font.size = Pt(11)
    run.font.bold = True
    run.font.color.rgb = theme_color

    table = doc.add_table(rows=1, cols=2)
    table.style = 'Table Grid'
    hdr = table.rows[0].cells
    hdr[0].text = "Course"
    hdr[1].text = "Number of Questions"

    # Apply keep together to header cells
    for cell in hdr:
        for paragraph in cell.paragraphs:
            paragraph.paragraph_format.keep_together = True

    # Display courses in TOC order (module order, then course order within module)
    for module_name in modules:
        if module_name not in questions_by_module:
            continue

        course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))
        cours_titles = modules_data.get(module_name, {})

        for cours_num in course_order:
            if cours_num not in questions_by_module[module_name]:
                continue

            course_title = cours_titles.get(cours_num, f"Course {cours_num}")
            count = stats_course_counts.get(course_title, 0)

            row = table.add_row().cells
            row[0].text = str(course_title)
            row[1].text = str(count)

            # Apply keep together to each cell
            for cell in row:
                for paragraph in cell.paragraphs:
                    paragraph.paragraph_format.keep_together = True

    # Apply keep together to entire table rows
    for row in table.rows:
        tr = row._tr
        trPr = tr.get_or_add_trPr()
        cantSplit = OxmlElement('w:cantSplit')
        trPr.append(cantSplit)

    # --- Repeated Questions ---
    doc.add_paragraph()
    p2 = doc.add_paragraph()
    run2 = p2.add_run("Repeated Questions:")
    run2.font.name = 'Montserrat'
    run2.font.size = Pt(11)
    run2.font.bold = True
    run2.font.color.rgb = theme_color

    repeated = {q: c for q, c in stats_question_repeats.items() if c > 1}
    if repeated:
        rep_table = doc.add_table(rows=1, cols=2)
        rep_table.style = 'Table Grid'
        hdr2 = rep_table.rows[0].cells
        hdr2[0].text = "Question"
        hdr2[1].text = "Times Repeated"

        for q, c in sorted(repeated.items(), key=lambda x: x[1], reverse=True):
            row = rep_table.add_row().cells
            row[0].text = q
            row[1].text = str(c)

        # After creating and filling rep_table
        for row in rep_table.rows:
            tr = row._tr
            trPr = tr.get_or_add_trPr()
            cant_split = OxmlElement('w:cantSplit')
            trPr.append(cant_split)

    else:
        doc.add_paragraph("No repeated questions found.")

    # Save document
    doc.save(output_word_path)
    print(f"\nπŸŽ‰ SUCCESS: Document saved as: {output_word_path}")
    print(f"πŸ“š Total modules processed: {len(questions_by_module)}")
    print(f"🚫 Total S2/invalid questions skipped: {skipped_s2_questions}")
    print(f"πŸ“„ Questions sorted by module sheet order and course number")
    if total_e_choices > 0:
        print(f"✨ Dynamic E columns added for courses with 5-choice questions")


def debug_excel_structure(excel_file_path):
    """Debug function to analyze Excel structure"""
    print("=== DEBUGGING EXCEL STRUCTURE ===")

    # Read the Excel file
    xls = pd.ExcelFile(excel_file_path)
    first_sheet_name = xls.sheet_names[0]  # Get the first sheet name
    questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)

    print(f"Total rows: {len(questions_df)}")
    print(f"Columns: {list(questions_df.columns)}")

    # Check unique values in key columns
    if 'Numero' in questions_df.columns:
        try:
            print(f"Unique Numero values: {sorted(questions_df['Numero'].dropna().unique())}")
        except Exception as e:
            print(f"Unique Numero values: {list(questions_df['Numero'].dropna().unique())} (couldn't sort: {e})")

    if 'Order' in questions_df.columns:
        try:
            unique_orders = sorted(questions_df['Order'].dropna().unique())
            print(f"Unique Order values: {unique_orders}")
            # Check specifically for E choices
            e_count = sum(1 for order in questions_df['Order'].dropna() if str(order).strip().upper() == 'E')
            print(f"Total E choices found: {e_count}")
        except Exception as e:
            print(f"Unique Order values: {list(questions_df['Order'].dropna().unique())} (couldn't sort: {e})")

    if 'Cours' in questions_df.columns:
        unique_cours = questions_df['Cours'].dropna().unique()

        # Convert all to strings first for display, then separate by validity
        unique_cours_str = [str(c) for c in unique_cours]
        print(f"Unique Cours values: {unique_cours_str}")

        # Check which cours values are valid vs invalid
        valid_cours = []
        invalid_cours = []

        for c in unique_cours:
            if is_valid_cours_number(c):
                valid_cours.append(c)
            else:
                invalid_cours.append(str(c))

        # Sort valid ones (numeric) and invalid ones (as strings) separately
        try:
            valid_cours_sorted = sorted([float(c) for c in valid_cours])
            print(f"Valid cours values: {valid_cours_sorted}")
        except Exception:
            print(f"Valid cours values: {valid_cours}")

        try:
            invalid_cours_sorted = sorted(invalid_cours)
            print(f"Invalid/S2 cours values: {invalid_cours_sorted}")
        except Exception:
            print(f"Invalid/S2 cours values: {invalid_cours}")

    # Check module column and corresponding sheets
    if 'module' in questions_df.columns:
        unique_modules = questions_df['module'].dropna().unique()
        print(f"\nUnique Module values: {list(unique_modules)}")

        # Check if sheets exist for each module
        xls = pd.ExcelFile(excel_file_path)
        sheet_names = xls.sheet_names
        sheet_names_lower = [s.lower() for s in sheet_names]

        print("\nModule sheet availability:")
        for module in unique_modules:
            module_lower = str(module).strip().lower()
            if module_lower in sheet_names_lower:
                actual_sheet = sheet_names[sheet_names_lower.index(module_lower)]
                print(f"  βœ“ Module '{module}' -> Sheet '{actual_sheet}' found")

                # Try to read and show course info from this sheet
                try:
                    module_df = pd.read_excel(excel_file_path, sheet_name=actual_sheet)
                    if 'cours' in module_df.columns and 'titre' in module_df.columns:
                        print(f"    Courses in this module:")
                        for _, row in module_df.iterrows():
                            if pd.notna(row['cours']):
                                print(f"      - {row['cours']}: {row.get('titre', 'N/A')}")
                except Exception as e:
                    print(f"    Error reading sheet: {e}")
            else:
                print(f"  βœ— Module '{module}' -> No matching sheet found")

    # Check Cours sheet
    try:
        cours_df = pd.read_excel(excel_file_path, sheet_name='Cours')
        print(f"\nCours sheet - Total rows: {len(cours_df)}")
        print(f"Cours sheet columns: {list(cours_df.columns)}")
        if not cours_df.empty:
            print("Course titles:")
            for _, row in cours_df.iterrows():
                cours_val = row.get('cours', 'N/A')
                is_valid = is_valid_cours_number(cours_val)
                status = "βœ“" if is_valid else "βœ— (SKIPPED)"
                print(f"  Course {cours_val}: {row.get('titre', 'N/A')} {status}")
    except Exception as e:
        print(f"Error reading Cours sheet: {e}")