import pandas as pd from docx import Document from docx.shared import Pt, RGBColor from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.oxml.shared import OxmlElement, qn from docx.enum.section import WD_SECTION # Theme color configuration - change these to customize the document colors THEME_COLOR_HEX = "5FFFDF" # Hex version for XML elements THEME_COLOR = RGBColor.from_string(THEME_COLOR_HEX) # RGBColor version for direct use def set_zero_spacing(paragraph): """Force paragraph spacing to 0 before and after.""" paragraph.paragraph_format.space_before = Pt(0) paragraph.paragraph_format.space_after = Pt(0) def is_valid_cours_number(cours_value): """Check if cours value is valid (numeric and not 'S2')""" if pd.isna(cours_value): return False cours_str = str(cours_value).strip().upper() # Skip S2 courses and other specific invalid values if cours_str in ['S2', 'NAN', '']: return False # Try to convert to numeric - if it works and is positive, it's valid try: numeric_value = float(cours_str) # Check if it's a positive number (courses should be positive integers) return numeric_value > 0 and numeric_value == int(numeric_value) except (ValueError, TypeError, OverflowError): return False def check_if_course_has_e_choices(course_questions): """Check if any question in the course has an E choice""" for q_data in course_questions: for choice in q_data['choices']: if choice['letter'].upper() == 'E': return True return False def read_course_titles_from_module_sheet(excel_file_path, module_name): """Read course titles from a module-specific sheet (case-insensitive) Returns both titles dict and ordered list of course numbers""" cours_titles = {} cours_order = [] # NEW: Keep track of order courses appear in sheet print(f" DEBUG: Looking for sheet matching module '{module_name}'") # Get all sheet names from the Excel file xls = pd.ExcelFile(excel_file_path) sheet_names = xls.sheet_names # Find matching sheet (case-insensitive) target_sheet = None module_name_lower = str(module_name).strip().lower() print(f" DEBUG: Module name (lowercase): '{module_name_lower}'") print(f" DEBUG: Available sheets: {sheet_names}") for sheet in sheet_names: sheet_lower = sheet.strip().lower() print(f" DEBUG: Comparing '{module_name_lower}' with '{sheet_lower}'") if sheet_lower == module_name_lower: target_sheet = sheet print(f" DEBUG: MATCH FOUND! Using sheet '{target_sheet}'") break if target_sheet is None: print(f" DEBUG: No sheet found matching module '{module_name}'") return cours_titles, cours_order # Read the matching sheet cours_df = pd.read_excel(excel_file_path, sheet_name=target_sheet) print(f" DEBUG: Sheet '{target_sheet}' has {len(cours_df)} rows") print(f" DEBUG: Sheet columns: {list(cours_df.columns)}") if not cours_df.empty and 'cours' in cours_df.columns and 'titre' in cours_df.columns: for idx, row in cours_df.iterrows(): print(f" DEBUG: Row {idx}: cours={row['cours']}, titre={row.get('titre', 'N/A')}") if pd.notna(row['cours']) and pd.notna(row['titre']): # Only store valid numeric courses if is_valid_cours_number(row['cours']): cours_num = int(float(str(row['cours']).strip())) cours_titles[cours_num] = row['titre'] cours_order.append(cours_num) # NEW: Preserve order print(f" DEBUG: Added cours {cours_num}: {row['titre']}") else: print(f" DEBUG: Skipped invalid cours: {row['cours']}") print(f" DEBUG: Final count: {len(cours_titles)} course titles from sheet '{target_sheet}'") print(f" DEBUG: Course order: {cours_order}") else: print(f" DEBUG: Sheet '{target_sheet}' doesn't have expected structure") print(f" DEBUG: Has 'cours' column: {'cours' in cours_df.columns}") print(f" DEBUG: Has 'titre' column: {'titre' in cours_df.columns}") return cours_titles, cours_order def process_excel_to_word(excel_file_path, output_word_path, theme_hex=None): """Main function to process Excel and create Word document with improved column balancing and answer tables""" # Set default theme colors if not provided if theme_hex is None: theme_hex = THEME_COLOR_HEX theme_color = RGBColor.from_string(theme_hex) # Read the Excel file xls = pd.ExcelFile(excel_file_path) first_sheet_name = xls.sheet_names[0] # Get the first sheet name questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name) # Debug: Print the data structure print("DEBUG: Excel file loaded successfully") # Get unique modules from Questions sheet (case-insensitive) module_col = None for col in questions_df.columns: if col.lower().strip() == 'module': module_col = col break if module_col: # Get all sheet names from Excel (in order) xls_temp = pd.ExcelFile(excel_file_path) all_sheets = xls_temp.sheet_names print(f"DEBUG: All sheets in Excel (in order): {all_sheets}") # Skip the first sheet (Questions sheet) and use remaining sheets as module order module_sheets = all_sheets[1:] # Exclude Questions sheet print(f"DEBUG: Module sheets (in order): {module_sheets}") # Create lowercase mapping for comparison sheet_lower_map = {sheet.strip().lower(): sheet for sheet in module_sheets} # Get unique modules from Questions column modules_in_questions = questions_df[module_col].dropna().unique() print(f"DEBUG: Unique modules from Questions sheet: {list(modules_in_questions)}") # Map each module in Questions to its corresponding sheet name module_to_sheet = {} for module in modules_in_questions: module_lower = str(module).strip().lower() if module_lower in sheet_lower_map: module_to_sheet[module] = sheet_lower_map[module_lower] print(f"DEBUG: Mapped '{module}' -> '{sheet_lower_map[module_lower]}'") print(f"DEBUG: Module to sheet mapping: {module_to_sheet}") # Normalize all module names in the dataframe to use sheet names questions_df[module_col] = questions_df[module_col].apply( lambda x: module_to_sheet.get(x, x) if pd.notna(x) else x ) # Now create ordered list of modules based on sheet order modules = [] for sheet in module_sheets: if sheet in module_to_sheet.values(): modules.append(sheet) print(f"DEBUG: Final modules list in sheet order: {modules}") else: print("DEBUG: No 'module' column found in Questions sheet!") print(f"DEBUG: Available columns: {list(questions_df.columns)}") modules = [] # Read course titles from module-specific sheets and organize by module modules_data = {} # {module_name: {cours_num: cours_title}} modules_course_order = {} # NEW: {module_name: [ordered list of course numbers]} xls = pd.ExcelFile(excel_file_path) print(f"DEBUG: Available sheets in Excel file: {xls.sheet_names}") for module in modules: print(f"\nDEBUG: Processing module '{module}'...") try: cours_titles_for_module, cours_order = read_course_titles_from_module_sheet(excel_file_path, module) print(f"DEBUG: Got {len(cours_titles_for_module)} course titles from module '{module}'") print(f"DEBUG: Course titles: {cours_titles_for_module}") print(f"DEBUG: Course order: {cours_order}") modules_data[module] = cours_titles_for_module modules_course_order[module] = cours_order # NEW: Store order except Exception as e: print(f"DEBUG: Error reading module '{module}': {e}") import traceback traceback.print_exc() print(f"\nDEBUG: Modules data: {modules_data}") print(f"DEBUG: Modules course order: {modules_course_order}") # Debug: Print the data structure print("DEBUG: Excel file loaded successfully") print(f"DEBUG: Total rows in Questions sheet: {len(questions_df)}") print("DEBUG: Column names:", list(questions_df.columns)) # Clean column names (remove any extra spaces) questions_df.columns = questions_df.columns.str.strip() # Create Word document doc = Document() # --- Statistics collectors (questions per course and repeats) --- stats_course_counts = {} # { course_title: count } stats_question_repeats = {} # { question_text: count } # Process questions with their following choice rows, grouped by course processed_questions = [] current_question = None current_choices = [] skipped_s2_questions = 0 print("DEBUG: Processing rows sequentially to group choices...") for idx, row in questions_df.iterrows(): numero = row['Numero'] # If this row has a question number, it's a new question if pd.notna(numero): # If we were processing a previous question, save it (only if valid cours) if current_question is not None and current_choices and is_valid_cours_number(current_cours): processed_questions.append({ 'numero': current_question, 'question_text': current_question_text, 'source': current_source, 'comment': current_comment, 'cours': int(float(str(current_cours).strip())), # Convert to int 'module': current_module, 'choices': current_choices.copy() }) print(f"DEBUG: Saved question {current_question} with {len(current_choices)} choices") elif current_question is not None and not is_valid_cours_number(current_cours): skipped_s2_questions += 1 print(f"DEBUG: Skipped question {current_question} from cours '{current_cours}' (invalid/S2)") # Start new question current_question = numero current_question_text = str(row['Question']).strip() current_source = str(row['Source']).strip() if pd.notna(row['Source']) else "" current_comment = str(row['Comment']).strip() if pd.notna(row['Comment']) and str( row['Comment']).lower() != 'nan' else None current_cours = row['Cours'] if pd.notna(row['Cours']) else 1 # Default to course 1 current_module = row[module_col] if module_col and pd.notna(row[module_col]) else None current_choices = [] print(f"\nDEBUG: Starting new question {numero}, Course: {current_cours}") # Only add choices if the current cours is valid if is_valid_cours_number(current_cours): # Add this row as a choice (whether it's the question row or a choice row) choice_letter = str(row['Order']).strip().upper() choice_text = str(row['ChoiceText']).strip() ct_value = str(row['CT']).strip().upper() if pd.notna(row['CT']) else "" is_correct = ct_value == 'X' if choice_text and choice_text.lower() != 'nan' and choice_text != '': current_choices.append({ 'letter': choice_letter, 'text': choice_text, 'is_correct': is_correct }) # Don't forget the last question (only if valid cours) if current_question is not None and current_choices and is_valid_cours_number(current_cours): processed_questions.append({ 'numero': current_question, 'question_text': current_question_text, 'source': current_source, 'comment': current_comment, 'cours': int(float(str(current_cours).strip())), # Convert to int 'module': current_module, 'choices': current_choices.copy() }) elif current_question is not None and not is_valid_cours_number(current_cours): skipped_s2_questions += 1 print(f"DEBUG: Skipped final question {current_question} from cours '{current_cours}' (invalid/S2)") print(f"\nDEBUG: Total processed questions: {len(processed_questions)}") print(f"DEBUG: Total skipped S2/invalid questions: {skipped_s2_questions}") # Group questions by module and course, preserving module order # Use a regular dict (Python 3.7+ preserves insertion order) questions_by_module = {} # Initialize with ordered modules to preserve sheet order for module in modules: questions_by_module[module] = {} # Fill in the questions for q_data in processed_questions: module_name = q_data['module'] cours_num = q_data['cours'] # Only add if module is in our ordered list if module_name in questions_by_module: if cours_num not in questions_by_module[module_name]: questions_by_module[module_name][cours_num] = [] questions_by_module[module_name][cours_num].append(q_data) else: # Handle modules not in sheet list (shouldn't happen but just in case) if module_name not in questions_by_module: questions_by_module[module_name] = {} if cours_num not in questions_by_module[module_name]: questions_by_module[module_name][cours_num] = [] questions_by_module[module_name][cours_num].append(q_data) # NEW: Reorder courses within each module based on sheet order for module_name in list(questions_by_module.keys()): if module_name in modules_course_order: course_order = modules_course_order[module_name] # Create new ordered dict with courses in sheet order ordered_courses = {} for cours_num in course_order: if cours_num in questions_by_module[module_name]: ordered_courses[cours_num] = questions_by_module[module_name][cours_num] # Add any courses that weren't in the sheet (shouldn't happen, but just in case) for cours_num in questions_by_module[module_name]: if cours_num not in ordered_courses: ordered_courses[cours_num] = questions_by_module[module_name][cours_num] questions_by_module[module_name] = ordered_courses print(f"DEBUG: Reordered courses for module '{module_name}': {list(ordered_courses.keys())}") print(f"DEBUG: Questions grouped by modules (sheet order preserved): {list(questions_by_module.keys())}") # Check for E choices across all modules - use TOC order total_e_choices = 0 for module_name in modules: # Sheet order if module_name not in questions_by_module: continue course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys())) for cours_num in course_order: # Sheet order within module if cours_num not in questions_by_module[module_name]: continue course_questions = questions_by_module[module_name][cours_num] course_e_count = sum(1 for q_data in course_questions for choice in q_data['choices'] if choice['letter'].upper() == 'E') if course_e_count > 0: print(f"DEBUG: Module '{module_name}' Course {cours_num} has {course_e_count} E choices") total_e_choices += course_e_count print(f"DEBUG: Total E choices found across all modules: {total_e_choices}") # Collect statistics from processed questions # Use TOC order (modules in sheet order, courses in sheet order within module) for module_name in modules: # Already in sheet order if module_name not in questions_by_module: continue # Get course order for this module course_order = modules_course_order.get(module_name, []) # Iterate courses in sheet order for cours_num in course_order: if cours_num not in questions_by_module[module_name]: continue course_questions = questions_by_module[module_name][cours_num] # Get course title cours_titles = modules_data.get(module_name, {}) course_title = cours_titles.get(cours_num, f"Course {cours_num}") # Count questions per course stats_course_counts[course_title] = stats_course_counts.get(course_title, 0) + len(course_questions) # Count repeated questions for q_data in course_questions: q_text = str(q_data['question_text']).strip() stats_question_repeats[q_text] = stats_question_repeats.get(q_text, 0) + 1 print(f"\nDEBUG: Statistics collected:") print(f" - Courses tracked: {len(stats_course_counts)}") print(f" - Unique questions: {len(stats_question_repeats)}") print(f" - Repeated questions: {sum(1 for count in stats_question_repeats.values() if count > 1)}") # --- Insert Statistics section (two-column layout) before TOC --- # Add a new section (but keep 2-column layout) stats_section = doc.add_section(WD_SECTION.CONTINUOUS) # Ensure this new section keeps the same column layout (2 columns) sectPr = stats_section._sectPr cols = sectPr.xpath('./w:cols')[0] cols.set(qn('w:num'), '2') # --- Add STATISTICS title and bookmark so it appears in TOC --- stats_para = doc.add_paragraph() stats_para.alignment = WD_ALIGN_PARAGRAPH.CENTER stats_run = stats_para.add_run("STATISTICS") stats_run.font.name = 'Montserrat' stats_run.font.size = Pt(14) stats_run.font.bold = True stats_run.font.color.rgb = theme_color # --- Questions per Course --- p = doc.add_paragraph() run = p.add_run("Questions per Course:") run.font.name = 'Montserrat' run.font.size = Pt(11) run.font.bold = True run.font.color.rgb = theme_color table = doc.add_table(rows=1, cols=2) table.style = 'Table Grid' hdr = table.rows[0].cells hdr[0].text = "Course" hdr[1].text = "Number of Questions" # Apply keep together to header cells for cell in hdr: for paragraph in cell.paragraphs: paragraph.paragraph_format.keep_together = True # Display courses in TOC order (module order, then course order within module) for module_name in modules: if module_name not in questions_by_module: continue course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys())) cours_titles = modules_data.get(module_name, {}) for cours_num in course_order: if cours_num not in questions_by_module[module_name]: continue course_title = cours_titles.get(cours_num, f"Course {cours_num}") count = stats_course_counts.get(course_title, 0) row = table.add_row().cells row[0].text = str(course_title) row[1].text = str(count) # Apply keep together to each cell for cell in row: for paragraph in cell.paragraphs: paragraph.paragraph_format.keep_together = True # Apply keep together to entire table rows for row in table.rows: tr = row._tr trPr = tr.get_or_add_trPr() cantSplit = OxmlElement('w:cantSplit') trPr.append(cantSplit) # --- Repeated Questions --- doc.add_paragraph() p2 = doc.add_paragraph() run2 = p2.add_run("Repeated Questions:") run2.font.name = 'Montserrat' run2.font.size = Pt(11) run2.font.bold = True run2.font.color.rgb = theme_color repeated = {q: c for q, c in stats_question_repeats.items() if c > 1} if repeated: rep_table = doc.add_table(rows=1, cols=2) rep_table.style = 'Table Grid' hdr2 = rep_table.rows[0].cells hdr2[0].text = "Question" hdr2[1].text = "Times Repeated" for q, c in sorted(repeated.items(), key=lambda x: x[1], reverse=True): row = rep_table.add_row().cells row[0].text = q row[1].text = str(c) # After creating and filling rep_table for row in rep_table.rows: tr = row._tr trPr = tr.get_or_add_trPr() cant_split = OxmlElement('w:cantSplit') trPr.append(cant_split) else: doc.add_paragraph("No repeated questions found.") # Save document doc.save(output_word_path) print(f"\nšŸŽ‰ SUCCESS: Document saved as: {output_word_path}") print(f"šŸ“š Total modules processed: {len(questions_by_module)}") print(f"🚫 Total S2/invalid questions skipped: {skipped_s2_questions}") print(f"šŸ“„ Questions sorted by module sheet order and course number") if total_e_choices > 0: print(f"✨ Dynamic E columns added for courses with 5-choice questions") def debug_excel_structure(excel_file_path): """Debug function to analyze Excel structure""" print("=== DEBUGGING EXCEL STRUCTURE ===") # Read the Excel file xls = pd.ExcelFile(excel_file_path) first_sheet_name = xls.sheet_names[0] # Get the first sheet name questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name) print(f"Total rows: {len(questions_df)}") print(f"Columns: {list(questions_df.columns)}") # Check unique values in key columns if 'Numero' in questions_df.columns: try: print(f"Unique Numero values: {sorted(questions_df['Numero'].dropna().unique())}") except Exception as e: print(f"Unique Numero values: {list(questions_df['Numero'].dropna().unique())} (couldn't sort: {e})") if 'Order' in questions_df.columns: try: unique_orders = sorted(questions_df['Order'].dropna().unique()) print(f"Unique Order values: {unique_orders}") # Check specifically for E choices e_count = sum(1 for order in questions_df['Order'].dropna() if str(order).strip().upper() == 'E') print(f"Total E choices found: {e_count}") except Exception as e: print(f"Unique Order values: {list(questions_df['Order'].dropna().unique())} (couldn't sort: {e})") if 'Cours' in questions_df.columns: unique_cours = questions_df['Cours'].dropna().unique() # Convert all to strings first for display, then separate by validity unique_cours_str = [str(c) for c in unique_cours] print(f"Unique Cours values: {unique_cours_str}") # Check which cours values are valid vs invalid valid_cours = [] invalid_cours = [] for c in unique_cours: if is_valid_cours_number(c): valid_cours.append(c) else: invalid_cours.append(str(c)) # Sort valid ones (numeric) and invalid ones (as strings) separately try: valid_cours_sorted = sorted([float(c) for c in valid_cours]) print(f"Valid cours values: {valid_cours_sorted}") except Exception: print(f"Valid cours values: {valid_cours}") try: invalid_cours_sorted = sorted(invalid_cours) print(f"Invalid/S2 cours values: {invalid_cours_sorted}") except Exception: print(f"Invalid/S2 cours values: {invalid_cours}") # Check module column and corresponding sheets if 'module' in questions_df.columns: unique_modules = questions_df['module'].dropna().unique() print(f"\nUnique Module values: {list(unique_modules)}") # Check if sheets exist for each module xls = pd.ExcelFile(excel_file_path) sheet_names = xls.sheet_names sheet_names_lower = [s.lower() for s in sheet_names] print("\nModule sheet availability:") for module in unique_modules: module_lower = str(module).strip().lower() if module_lower in sheet_names_lower: actual_sheet = sheet_names[sheet_names_lower.index(module_lower)] print(f" āœ“ Module '{module}' -> Sheet '{actual_sheet}' found") # Try to read and show course info from this sheet try: module_df = pd.read_excel(excel_file_path, sheet_name=actual_sheet) if 'cours' in module_df.columns and 'titre' in module_df.columns: print(f" Courses in this module:") for _, row in module_df.iterrows(): if pd.notna(row['cours']): print(f" - {row['cours']}: {row.get('titre', 'N/A')}") except Exception as e: print(f" Error reading sheet: {e}") else: print(f" āœ— Module '{module}' -> No matching sheet found") # Check Cours sheet try: cours_df = pd.read_excel(excel_file_path, sheet_name='Cours') print(f"\nCours sheet - Total rows: {len(cours_df)}") print(f"Cours sheet columns: {list(cours_df.columns)}") if not cours_df.empty: print("Course titles:") for _, row in cours_df.iterrows(): cours_val = row.get('cours', 'N/A') is_valid = is_valid_cours_number(cours_val) status = "āœ“" if is_valid else "āœ— (SKIPPED)" print(f" Course {cours_val}: {row.get('titre', 'N/A')} {status}") except Exception as e: print(f"Error reading Cours sheet: {e}")