# Source: Hugging Face Spaces - TiH0 / Manhattan-Statistics (status: Sleeping)
# File size: 25,974 Bytes
# Revision: ad9b466

import pandas as pd
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import OxmlElement, qn
from docx.enum.section import WD_SECTION

# Theme color configuration - change THEME_COLOR_HEX to customize the document colors.
# process_excel_to_word() falls back to THEME_COLOR_HEX when no theme_hex argument is given.
THEME_COLOR_HEX = "5FFFDF"  # Color as a six-digit RRGGBB hex string (no leading '#')
THEME_COLOR = RGBColor.from_string(THEME_COLOR_HEX)  # The same color as an RGBColor object


def set_zero_spacing(paragraph):
    """Remove the space before and after *paragraph* by setting both to 0 pt."""
    fmt = paragraph.paragraph_format
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)


def is_valid_cours_number(cours_value):
    """Tell whether *cours_value* denotes a usable course number.

    A value is accepted when, once converted to text and trimmed, it parses
    as a strictly positive whole number (e.g. 3, "4", " 5.0 "). Missing
    values, the marker 'S2' (any case), empty text and anything that is not
    a positive integer are rejected.

    Returns:
        bool: True for a valid course number, False otherwise.
    """
    if pd.isna(cours_value):
        return False

    text = str(cours_value).strip().upper()

    # Explicitly excluded markers: second-semester courses and blank/NaN text
    if text in ('S2', 'NAN', ''):
        return False

    try:
        number = float(text)
        # Reject zero, negatives and NaN before the integer check
        if not number > 0:
            return False
        # int() raises OverflowError for infinity, handled below
        return number == int(number)
    except (ValueError, TypeError, OverflowError):
        return False


def check_if_course_has_e_choices(course_questions):
    """Return True when at least one question offers a choice lettered 'E'.

    Each item of *course_questions* is a dict with a 'choices' list whose
    entries carry a 'letter' key; the letter is compared case-insensitively.
    """
    return any(
        option['letter'].upper() == 'E'
        for question in course_questions
        for option in question['choices']
    )


def read_course_titles_from_module_sheet(excel_file_path, module_name):
    """Read the course titles listed on the sheet belonging to a module.

    The sheet is found by comparing every sheet name with *module_name*,
    ignoring case and surrounding whitespace; the first match wins. The
    sheet must provide a 'cours' column (course number) and a 'titre'
    column (course title). Rows with a missing value in either column, or
    whose course number is rejected by is_valid_cours_number(), are ignored.

    Args:
        excel_file_path: Workbook location, handed to pandas.ExcelFile.
        module_name: Module whose sheet should be read.

    Returns:
        tuple: (cours_titles, cours_order) where cours_titles maps the
        integer course number to its title and cours_order lists the course
        numbers in the order they first appear in the sheet (no duplicates).
        Both are empty when no sheet matches or the sheet lacks the expected
        columns.
    """
    cours_titles = {}
    cours_order = []

    print(f"  DEBUG: Looking for sheet matching module '{module_name}'")

    module_name_lower = str(module_name).strip().lower()

    # Open the workbook once; the context manager releases the file handle
    # even if reading the sheet raises.
    with pd.ExcelFile(excel_file_path) as xls:
        sheet_names = xls.sheet_names

        print(f"  DEBUG: Module name (lowercase): '{module_name_lower}'")
        print(f"  DEBUG: Available sheets: {sheet_names}")

        # Find matching sheet (case-insensitive)
        target_sheet = None
        for sheet in sheet_names:
            sheet_lower = sheet.strip().lower()
            print(f"  DEBUG: Comparing '{module_name_lower}' with '{sheet_lower}'")
            if sheet_lower == module_name_lower:
                target_sheet = sheet
                print(f"  DEBUG: MATCH FOUND! Using sheet '{target_sheet}'")
                break

        if target_sheet is None:
            print(f"  DEBUG: No sheet found matching module '{module_name}'")
            return cours_titles, cours_order

        # Reuse the already-open workbook instead of re-opening the file
        cours_df = pd.read_excel(xls, sheet_name=target_sheet)

    print(f"  DEBUG: Sheet '{target_sheet}' has {len(cours_df)} rows")
    print(f"  DEBUG: Sheet columns: {list(cours_df.columns)}")

    has_cours = 'cours' in cours_df.columns
    has_titre = 'titre' in cours_df.columns

    if cours_df.empty or not (has_cours and has_titre):
        print(f"  DEBUG: Sheet '{target_sheet}' doesn't have expected structure")
        print(f"  DEBUG: Has 'cours' column: {has_cours}")
        print(f"  DEBUG: Has 'titre' column: {has_titre}")
        return cours_titles, cours_order

    for idx, row in cours_df.iterrows():
        print(f"  DEBUG: Row {idx}: cours={row['cours']}, titre={row['titre']}")
        if pd.isna(row['cours']) or pd.isna(row['titre']):
            continue

        # Only store valid numeric courses
        if not is_valid_cours_number(row['cours']):
            print(f"  DEBUG: Skipped invalid cours: {row['cours']}")
            continue

        cours_num = int(float(str(row['cours']).strip()))
        # Record the position only on first sight: a duplicated row must not
        # make callers that iterate cours_order visit the course twice.
        if cours_num not in cours_titles:
            cours_order.append(cours_num)
        cours_titles[cours_num] = row['titre']
        print(f"  DEBUG: Added cours {cours_num}: {row['titre']}")

    print(f"  DEBUG: Final count: {len(cours_titles)} course titles from sheet '{target_sheet}'")
    print(f"  DEBUG: Course order: {cours_order}")

    return cours_titles, cours_order


def _group_question_rows(questions_df, module_col):
    """Fold the flat question/choice rows into one record per question.

    A row with a non-empty 'Numero' starts a new question and also carries
    its first choice; the rows that follow, up to the next 'Numero', hold
    the remaining choices of that question.

    Args:
        questions_df: Questions sheet whose column names are already stripped.
        module_col: Name of the module column, or None when there is none.

    Returns:
        tuple: (processed_questions, skipped_count). processed_questions is
        a list of dicts with keys 'numero', 'question_text', 'source',
        'comment', 'cours' (int), 'module' and 'choices'; skipped_count is
        the number of questions dropped for an invalid course value.
    """
    processed_questions = []
    skipped_count = 0
    current = None  # Question being assembled; None until the first 'Numero' row

    def finish(question, is_final):
        """Store *question* when usable; return 1 if it was skipped as invalid."""
        if question is None:
            return 0
        if not is_valid_cours_number(question['cours']):
            label = "final question" if is_final else "question"
            print(f"DEBUG: Skipped {label} {question['numero']} from cours '{question['cours']}' (invalid/S2)")
            return 1
        # A valid question without any choice is dropped without being counted
        if question['choices']:
            question['cours'] = int(float(str(question['cours']).strip()))
            processed_questions.append(question)
            if not is_final:
                print(f"DEBUG: Saved question {question['numero']} with {len(question['choices'])} choices")
        return 0

    for _, row in questions_df.iterrows():
        numero = row['Numero']

        # If this row has a question number, it's a new question
        if pd.notna(numero):
            skipped_count += finish(current, is_final=False)

            raw_comment = row['Comment']
            has_comment = pd.notna(raw_comment) and str(raw_comment).lower() != 'nan'
            has_module = module_col is not None and pd.notna(row[module_col])
            current = {
                'numero': numero,
                'question_text': str(row['Question']).strip(),
                'source': str(row['Source']).strip() if pd.notna(row['Source']) else "",
                'comment': str(raw_comment).strip() if has_comment else None,
                # A missing course number defaults to course 1
                'cours': row['Cours'] if pd.notna(row['Cours']) else 1,
                'module': row[module_col] if has_module else None,
                'choices': [],
            }
            print(f"\nDEBUG: Starting new question {numero}, Course: {current['cours']}")

        if current is None:
            # Choice row that precedes the first question row: there is no
            # question to attach it to, so ignore it instead of crashing.
            continue

        # Only add choices if the current cours is valid
        if not is_valid_cours_number(current['cours']):
            continue

        # The row is a choice whether it is the question row or a follow-up row
        choice_text = str(row['ChoiceText']).strip()
        if not choice_text or choice_text.lower() == 'nan':
            continue

        ct_value = str(row['CT']).strip().upper() if pd.notna(row['CT']) else ""
        current['choices'].append({
            'letter': str(row['Order']).strip().upper(),
            'text': choice_text,
            'is_correct': ct_value == 'X',  # An 'X' in the CT column marks the correct choice
        })

    # Don't forget the last question
    skipped_count += finish(current, is_final=True)
    return processed_questions, skipped_count


def _style_heading_run(run, size_pt, color):
    """Apply the bold Montserrat heading look used in the statistics section."""
    run.font.name = 'Montserrat'
    run.font.size = Pt(size_pt)
    run.font.bold = True
    run.font.color.rgb = color


def _forbid_row_splitting(table):
    """Add <w:cantSplit> to every row of *table* so rows are not broken apart."""
    for row in table.rows:
        row._tr.get_or_add_trPr().append(OxmlElement('w:cantSplit'))


def _add_statistics_section(doc, theme_color, course_rows, repeated):
    """Append the two-column STATISTICS section to *doc*.

    Args:
        doc: python-docx Document being built.
        theme_color: RGBColor applied to the section headings.
        course_rows: (course_title, question_count) pairs in display order.
        repeated: {question_text: occurrences} for questions seen more than once.
    """
    stats_section = doc.add_section(WD_SECTION.CONTINUOUS)

    # Switch the new section to a 2-column layout.
    # NOTE(review): assumes the section properties already contain a <w:cols>
    # element (an IndexError is raised otherwise) - confirm for custom templates.
    cols = stats_section._sectPr.xpath('./w:cols')[0]
    cols.set(qn('w:num'), '2')

    # --- STATISTICS title ---
    title_para = doc.add_paragraph()
    title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    _style_heading_run(title_para.add_run("STATISTICS"), 14, theme_color)

    # --- Questions per Course ---
    _style_heading_run(doc.add_paragraph().add_run("Questions per Course:"), 11, theme_color)

    table = doc.add_table(rows=1, cols=2)
    table.style = 'Table Grid'
    header_cells = table.rows[0].cells
    header_cells[0].text = "Course"
    header_cells[1].text = "Number of Questions"
    for cell in header_cells:
        for paragraph in cell.paragraphs:
            paragraph.paragraph_format.keep_together = True

    for course_title, count in course_rows:
        cells = table.add_row().cells
        cells[0].text = str(course_title)
        cells[1].text = str(count)
        for cell in cells:
            for paragraph in cell.paragraphs:
                paragraph.paragraph_format.keep_together = True

    _forbid_row_splitting(table)

    # --- Repeated Questions ---
    doc.add_paragraph()  # Blank spacer paragraph between the two tables
    _style_heading_run(doc.add_paragraph().add_run("Repeated Questions:"), 11, theme_color)

    if not repeated:
        doc.add_paragraph("No repeated questions found.")
        return

    rep_table = doc.add_table(rows=1, cols=2)
    rep_table.style = 'Table Grid'
    rep_header = rep_table.rows[0].cells
    rep_header[0].text = "Question"
    rep_header[1].text = "Times Repeated"

    # Most repeated first; sorted() is stable, so ties keep encounter order
    for question_text, times in sorted(repeated.items(), key=lambda item: item[1], reverse=True):
        cells = rep_table.add_row().cells
        cells[0].text = question_text
        cells[1].text = str(times)

    _forbid_row_splitting(rep_table)


def process_excel_to_word(excel_file_path, output_word_path, theme_hex=None):
    """Build a Word document with question statistics from an Excel workbook.

    The first sheet of the workbook holds the questions: one row per choice,
    where a row with a 'Numero' value starts a new question. Every other
    sheet is treated as a module sheet listing the module's courses (see
    read_course_titles_from_module_sheet). Module names in the questions
    sheet are matched to sheet names case-insensitively, and modules and
    courses are reported in sheet order. Courses that have questions but no
    row in the module sheet are listed after the sheet-ordered ones under the
    fallback title "Course <n>".

    The generated document contains a two-column STATISTICS section with the
    number of questions per course and a table of question texts that occur
    more than once.

    Args:
        excel_file_path: Workbook to read.
        output_word_path: Destination of the generated .docx file.
        theme_hex: Heading color as an RRGGBB hex string; defaults to
            THEME_COLOR_HEX.

    Returns:
        None. The document is written to *output_word_path* and progress is
        printed to stdout.
    """
    # Set default theme colors if not provided
    if theme_hex is None:
        theme_hex = THEME_COLOR_HEX
    theme_color = RGBColor.from_string(theme_hex)

    # Open the workbook once and release the handle as soon as the sheet
    # list and the questions sheet are loaded.
    with pd.ExcelFile(excel_file_path) as xls:
        all_sheets = list(xls.sheet_names)
        questions_df = pd.read_excel(xls, sheet_name=all_sheets[0])

    print("DEBUG: Excel file loaded successfully")

    # Strip header whitespace BEFORE any column is looked up, so the module
    # column name found below stays valid for the row-by-row processing.
    questions_df = questions_df.rename(
        columns=lambda col: col.strip() if isinstance(col, str) else col
    )

    # Locate the module column (case-insensitive)
    module_col = next(
        (col for col in questions_df.columns
         if isinstance(col, str) and col.lower() == 'module'),
        None,
    )

    modules = []
    if module_col is not None:
        print(f"DEBUG: All sheets in Excel (in order): {all_sheets}")

        # Skip the first sheet (Questions sheet) and use remaining sheets as module order
        module_sheets = all_sheets[1:]
        print(f"DEBUG: Module sheets (in order): {module_sheets}")

        # Create lowercase mapping for comparison
        sheet_lower_map = {sheet.strip().lower(): sheet for sheet in module_sheets}

        modules_in_questions = questions_df[module_col].dropna().unique()
        print(f"DEBUG: Unique modules from Questions sheet: {list(modules_in_questions)}")

        # Map each module in Questions to its corresponding sheet name
        module_to_sheet = {}
        for module in modules_in_questions:
            module_lower = str(module).strip().lower()
            if module_lower in sheet_lower_map:
                module_to_sheet[module] = sheet_lower_map[module_lower]
                print(f"DEBUG: Mapped '{module}' -> '{sheet_lower_map[module_lower]}'")

        print(f"DEBUG: Module to sheet mapping: {module_to_sheet}")

        # Normalize all module names in the dataframe to use sheet names
        questions_df[module_col] = questions_df[module_col].apply(
            lambda x: module_to_sheet.get(x, x) if pd.notna(x) else x
        )

        # Ordered list of modules based on sheet order
        mapped_sheets = set(module_to_sheet.values())
        modules = [sheet for sheet in module_sheets if sheet in mapped_sheets]

        print(f"DEBUG: Final modules list in sheet order: {modules}")
    else:
        print("DEBUG: No 'module' column found in Questions sheet!")
        print(f"DEBUG: Available columns: {list(questions_df.columns)}")

    # Read course titles from module-specific sheets and organize by module
    modules_data = {}  # {module_name: {cours_num: cours_title}}
    modules_course_order = {}  # {module_name: [course numbers in sheet order]}
    print(f"DEBUG: Available sheets in Excel file: {all_sheets}")

    for module in modules:
        print(f"\nDEBUG: Processing module '{module}'...")
        try:
            cours_titles_for_module, cours_order = read_course_titles_from_module_sheet(excel_file_path, module)
        except Exception as e:
            # Best effort: a broken module sheet must not abort the whole run;
            # its courses then fall back to generic titles and numeric order.
            print(f"DEBUG: Error reading module '{module}': {e}")
            import traceback
            traceback.print_exc()
            continue

        print(f"DEBUG: Got {len(cours_titles_for_module)} course titles from module '{module}'")
        print(f"DEBUG: Course titles: {cours_titles_for_module}")
        print(f"DEBUG: Course order: {cours_order}")
        modules_data[module] = cours_titles_for_module
        modules_course_order[module] = cours_order

    print(f"\nDEBUG: Modules data: {modules_data}")
    print(f"DEBUG: Modules course order: {modules_course_order}")

    print(f"DEBUG: Total rows in Questions sheet: {len(questions_df)}")
    print("DEBUG: Column names:", list(questions_df.columns))

    # Process questions with their following choice rows
    print("DEBUG: Processing rows sequentially to group choices...")
    processed_questions, skipped_s2_questions = _group_question_rows(questions_df, module_col)

    print(f"\nDEBUG: Total processed questions: {len(processed_questions)}")
    print(f"DEBUG: Total skipped S2/invalid questions: {skipped_s2_questions}")

    # Group questions by module and course. Seeding the dict with `modules`
    # keeps sheet order (dicts preserve insertion order); modules without a
    # sheet are appended as they are encountered.
    questions_by_module = {module: {} for module in modules}
    for q_data in processed_questions:
        by_course = questions_by_module.setdefault(q_data['module'], {})
        by_course.setdefault(q_data['cours'], []).append(q_data)

    # Order the courses of every module once, so every later step sees the
    # same sequence: sheet order first, then courses missing from the sheet;
    # numeric order when no sheet order is known.
    for module_name in list(questions_by_module):
        by_course = questions_by_module[module_name]
        sheet_order = modules_course_order.get(module_name)
        if sheet_order is None:
            ordered_numbers = sorted(by_course)
        else:
            listed = set(sheet_order)
            # dict.fromkeys drops duplicate sheet entries while keeping order
            ordered_numbers = [num for num in dict.fromkeys(sheet_order) if num in by_course]
            ordered_numbers += [num for num in by_course if num not in listed]
        questions_by_module[module_name] = {num: by_course[num] for num in ordered_numbers}
        print(f"DEBUG: Reordered courses for module '{module_name}': {ordered_numbers}")

    print(f"DEBUG: Questions grouped by modules (sheet order preserved): {list(questions_by_module.keys())}")

    # Check for E choices across all modules
    total_e_choices = 0
    for module_name in modules:
        for cours_num, course_questions in questions_by_module.get(module_name, {}).items():
            course_e_count = sum(1 for q_data in course_questions
                                 for choice in q_data['choices']
                                 if choice['letter'].upper() == 'E')
            if course_e_count > 0:
                print(f"DEBUG: Module '{module_name}' Course {cours_num} has {course_e_count} E choices")
                total_e_choices += course_e_count

    print(f"DEBUG: Total E choices found across all modules: {total_e_choices}")

    # Collect statistics. Counts are kept per (module, course) row rather than
    # per title, so equal titles in different modules are not merged.
    course_rows = []  # [(course_title, question_count)] in display order
    stats_question_repeats = {}  # {question_text: count}
    for module_name in modules:
        cours_titles = modules_data.get(module_name, {})
        for cours_num, course_questions in questions_by_module.get(module_name, {}).items():
            course_title = cours_titles.get(cours_num, f"Course {cours_num}")
            course_rows.append((course_title, len(course_questions)))

            for q_data in course_questions:
                q_text = str(q_data['question_text']).strip()
                stats_question_repeats[q_text] = stats_question_repeats.get(q_text, 0) + 1

    repeated = {q: c for q, c in stats_question_repeats.items() if c > 1}

    print(f"\nDEBUG: Statistics collected:")
    print(f"  - Courses tracked: {len(course_rows)}")
    print(f"  - Unique questions: {len(stats_question_repeats)}")
    print(f"  - Repeated questions: {len(repeated)}")

    # Create Word document and write the statistics section
    doc = Document()
    _add_statistics_section(doc, theme_color, course_rows, repeated)

    # Save document
    doc.save(output_word_path)
    print(f"\n🎉 SUCCESS: Document saved as: {output_word_path}")
    print(f"📚 Total modules processed: {len(questions_by_module)}")
    print(f"🚫 Total S2/invalid questions skipped: {skipped_s2_questions}")
    print(f"📄 Questions sorted by module sheet order and course number")
    if total_e_choices > 0:
        print(f"✨ Dynamic E columns added for courses with 5-choice questions")


def debug_excel_structure(excel_file_path):
    """Print a diagnostic summary of the workbook's structure.

    Reports, for the first (questions) sheet: row count, column names and
    the distinct values of the 'Numero', 'Order' and 'Cours' columns, with
    course values split into valid and invalid according to
    is_valid_cours_number(). Then checks, for every module named in the
    module column, whether a sheet with the same name exists (ignoring case
    and surrounding whitespace) and lists its courses. Finally dumps the
    sheet named 'Cours' if there is one.

    Args:
        excel_file_path: Workbook to inspect.

    Returns:
        None. Everything is printed to stdout.
    """
    print("=== DEBUGGING EXCEL STRUCTURE ===")

    # One handle serves every sheet read below and is closed on exit,
    # even when a read fails.
    with pd.ExcelFile(excel_file_path) as xls:
        sheet_names = list(xls.sheet_names)
        questions_df = pd.read_excel(xls, sheet_name=sheet_names[0])

        print(f"Total rows: {len(questions_df)}")
        print(f"Columns: {list(questions_df.columns)}")

        # Check unique values in key columns
        if 'Numero' in questions_df.columns:
            numeros = questions_df['Numero'].dropna().unique()
            try:
                print(f"Unique Numero values: {sorted(numeros)}")
            except (TypeError, ValueError) as e:
                # Mixed value types cannot be ordered; show them unsorted
                print(f"Unique Numero values: {list(numeros)} (couldn't sort: {e})")

        if 'Order' in questions_df.columns:
            orders = questions_df['Order'].dropna()
            unique_orders = orders.unique()
            try:
                print(f"Unique Order values: {sorted(unique_orders)}")
            except (TypeError, ValueError) as e:
                print(f"Unique Order values: {list(unique_orders)} (couldn't sort: {e})")

            # Counted outside the try so it is reported even when sorting fails
            e_count = sum(1 for order in orders if str(order).strip().upper() == 'E')
            print(f"Total E choices found: {e_count}")

        if 'Cours' in questions_df.columns:
            unique_cours = questions_df['Cours'].dropna().unique()
            print(f"Unique Cours values: {[str(c) for c in unique_cours]}")

            # Valid values are sorted numerically, invalid ones as text
            valid_cours = sorted(
                float(str(c).strip()) for c in unique_cours if is_valid_cours_number(c)
            )
            invalid_cours = sorted(
                str(c) for c in unique_cours if not is_valid_cours_number(c)
            )
            print(f"Valid cours values: {valid_cours}")
            print(f"Invalid/S2 cours values: {invalid_cours}")

        # Check module column and corresponding sheets. The column is matched
        # case-insensitively, like process_excel_to_word does.
        module_col = next(
            (col for col in questions_df.columns if str(col).strip().lower() == 'module'),
            None,
        )
        if module_col is not None:
            unique_modules = questions_df[module_col].dropna().unique()
            print(f"\nUnique Module values: {list(unique_modules)}")

            # Normalized sheet name -> actual sheet name (first match wins)
            sheet_by_key = {}
            for name in sheet_names:
                sheet_by_key.setdefault(str(name).strip().lower(), name)

            print("\nModule sheet availability:")
            for module in unique_modules:
                actual_sheet = sheet_by_key.get(str(module).strip().lower())
                if actual_sheet is None:
                    print(f"  ✗ Module '{module}' -> No matching sheet found")
                    continue

                print(f"  ✓ Module '{module}' -> Sheet '{actual_sheet}' found")

                # Try to read and show course info from this sheet
                try:
                    module_df = pd.read_excel(xls, sheet_name=actual_sheet)
                    if 'cours' in module_df.columns and 'titre' in module_df.columns:
                        print("    Courses in this module:")
                        for _, row in module_df.iterrows():
                            if pd.notna(row['cours']):
                                print(f"      - {row['cours']}: {row['titre']}")
                except Exception as e:
                    # Diagnostic tool: report the problem and keep going
                    print(f"    Error reading sheet: {e}")

        # Check Cours sheet
        try:
            cours_df = pd.read_excel(xls, sheet_name='Cours')
            print(f"\nCours sheet - Total rows: {len(cours_df)}")
            print(f"Cours sheet columns: {list(cours_df.columns)}")
            if not cours_df.empty:
                print("Course titles:")
                for _, row in cours_df.iterrows():
                    cours_val = row.get('cours', 'N/A')
                    status = "✓" if is_valid_cours_number(cours_val) else "✗ (SKIPPED)"
                    print(f"  Course {cours_val}: {row.get('titre', 'N/A')} {status}")
        except Exception as e:
            # A workbook without a 'Cours' sheet is reported, not fatal
            print(f"Error reading Cours sheet: {e}")