Spaces:

TiH0
/

Manhattan-Statistics

Sleeping

App Files Files Community

TiH0 commited on Oct 31, 2025

Commit

ad9b466

verified ·

1 Parent(s): a935d6c

Create stat.py

Browse files

Files changed (1) hide show

stat.py +601 -0

stat.py ADDED Viewed

	@@ -0,0 +1,601 @@

+import pandas as pd
+from docx import Document
+from docx.shared import Pt, RGBColor
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.oxml.shared import OxmlElement, qn
+from docx.enum.section import WD_SECTION
+# Theme color configuration - change these to customize the document colors
+THEME_COLOR_HEX = "5FFFDF"  # Hex version for XML elements
+THEME_COLOR = RGBColor.from_string(THEME_COLOR_HEX)  # RGBColor version for direct use
+def set_zero_spacing(paragraph):
+    """Force paragraph spacing to 0 before and after."""
+    paragraph.paragraph_format.space_before = Pt(0)
+    paragraph.paragraph_format.space_after = Pt(0)
+def is_valid_cours_number(cours_value):
+    """Check if cours value is valid (numeric and not 'S2')"""
+    if pd.isna(cours_value):
+        return False
+    cours_str = str(cours_value).strip().upper()
+    # Skip S2 courses and other specific invalid values
+    if cours_str in ['S2', 'NAN', '']:
+        return False
+    # Try to convert to numeric - if it works and is positive, it's valid
+    try:
+        numeric_value = float(cours_str)
+        # Check if it's a positive number (courses should be positive integers)
+        return numeric_value > 0 and numeric_value == int(numeric_value)
+    except (ValueError, TypeError, OverflowError):
+        return False
+def check_if_course_has_e_choices(course_questions):
+    """Check if any question in the course has an E choice"""
+    for q_data in course_questions:
+        for choice in q_data['choices']:
+            if choice['letter'].upper() == 'E':
+                return True
+    return False
+def read_course_titles_from_module_sheet(excel_file_path, module_name):
+    """Read course titles from a module-specific sheet (case-insensitive)
+    Returns both titles dict and ordered list of course numbers"""
+    cours_titles = {}
+    cours_order = []  # NEW: Keep track of order courses appear in sheet
+    print(f"  DEBUG: Looking for sheet matching module '{module_name}'")
+    # Get all sheet names from the Excel file
+    xls = pd.ExcelFile(excel_file_path)
+    sheet_names = xls.sheet_names
+    # Find matching sheet (case-insensitive)
+    target_sheet = None
+    module_name_lower = str(module_name).strip().lower()
+    print(f"  DEBUG: Module name (lowercase): '{module_name_lower}'")
+    print(f"  DEBUG: Available sheets: {sheet_names}")
+    for sheet in sheet_names:
+        sheet_lower = sheet.strip().lower()
+        print(f"  DEBUG: Comparing '{module_name_lower}' with '{sheet_lower}'")
+        if sheet_lower == module_name_lower:
+            target_sheet = sheet
+            print(f"  DEBUG: MATCH FOUND! Using sheet '{target_sheet}'")
+            break
+    if target_sheet is None:
+        print(f"  DEBUG: No sheet found matching module '{module_name}'")
+        return cours_titles, cours_order
+    # Read the matching sheet
+    cours_df = pd.read_excel(excel_file_path, sheet_name=target_sheet)
+    print(f"  DEBUG: Sheet '{target_sheet}' has {len(cours_df)} rows")
+    print(f"  DEBUG: Sheet columns: {list(cours_df.columns)}")
+    if not cours_df.empty and 'cours' in cours_df.columns and 'titre' in cours_df.columns:
+        for idx, row in cours_df.iterrows():
+            print(f"  DEBUG: Row {idx}: cours={row['cours']}, titre={row.get('titre', 'N/A')}")
+            if pd.notna(row['cours']) and pd.notna(row['titre']):
+                # Only store valid numeric courses
+                if is_valid_cours_number(row['cours']):
+                    cours_num = int(float(str(row['cours']).strip()))
+                    cours_titles[cours_num] = row['titre']
+                    cours_order.append(cours_num)  # NEW: Preserve order
+                    print(f"  DEBUG: Added cours {cours_num}: {row['titre']}")
+                else:
+                    print(f"  DEBUG: Skipped invalid cours: {row['cours']}")
+        print(f"  DEBUG: Final count: {len(cours_titles)} course titles from sheet '{target_sheet}'")
+        print(f"  DEBUG: Course order: {cours_order}")
+    else:
+        print(f"  DEBUG: Sheet '{target_sheet}' doesn't have expected structure")
+        print(f"  DEBUG: Has 'cours' column: {'cours' in cours_df.columns}")
+        print(f"  DEBUG: Has 'titre' column: {'titre' in cours_df.columns}")
+    return cours_titles, cours_order
+def process_excel_to_word(excel_file_path, output_word_path, theme_hex=None):
+    """Main function to process Excel and create Word document with improved column balancing and answer tables"""
+    # Set default theme colors if not provided
+    if theme_hex is None:
+        theme_hex = THEME_COLOR_HEX
+    theme_color = RGBColor.from_string(theme_hex)
+    # Read the Excel file
+    xls = pd.ExcelFile(excel_file_path)
+    first_sheet_name = xls.sheet_names[0]  # Get the first sheet name
+    questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)
+    # Debug: Print the data structure
+    print("DEBUG: Excel file loaded successfully")
+    # Get unique modules from Questions sheet (case-insensitive)
+    module_col = None
+    for col in questions_df.columns:
+        if col.lower().strip() == 'module':
+            module_col = col
+            break
+    if module_col:
+        # Get all sheet names from Excel (in order)
+        xls_temp = pd.ExcelFile(excel_file_path)
+        all_sheets = xls_temp.sheet_names
+        print(f"DEBUG: All sheets in Excel (in order): {all_sheets}")
+        # Skip the first sheet (Questions sheet) and use remaining sheets as module order
+        module_sheets = all_sheets[1:]  # Exclude Questions sheet
+        print(f"DEBUG: Module sheets (in order): {module_sheets}")
+        # Create lowercase mapping for comparison
+        sheet_lower_map = {sheet.strip().lower(): sheet for sheet in module_sheets}
+        # Get unique modules from Questions column
+        modules_in_questions = questions_df[module_col].dropna().unique()
+        print(f"DEBUG: Unique modules from Questions sheet: {list(modules_in_questions)}")
+        # Map each module in Questions to its corresponding sheet name
+        module_to_sheet = {}
+        for module in modules_in_questions:
+            module_lower = str(module).strip().lower()
+            if module_lower in sheet_lower_map:
+                module_to_sheet[module] = sheet_lower_map[module_lower]
+                print(f"DEBUG: Mapped '{module}' -> '{sheet_lower_map[module_lower]}'")
+        print(f"DEBUG: Module to sheet mapping: {module_to_sheet}")
+        # Normalize all module names in the dataframe to use sheet names
+        questions_df[module_col] = questions_df[module_col].apply(
+            lambda x: module_to_sheet.get(x, x) if pd.notna(x) else x
+        )
+        # Now create ordered list of modules based on sheet order
+        modules = []
+        for sheet in module_sheets:
+            if sheet in module_to_sheet.values():
+                modules.append(sheet)
+        print(f"DEBUG: Final modules list in sheet order: {modules}")
+    else:
+        print("DEBUG: No 'module' column found in Questions sheet!")
+        print(f"DEBUG: Available columns: {list(questions_df.columns)}")
+        modules = []
+    # Read course titles from module-specific sheets and organize by module
+    modules_data = {}  # {module_name: {cours_num: cours_title}}
+    modules_course_order = {}  # NEW: {module_name: [ordered list of course numbers]}
+    xls = pd.ExcelFile(excel_file_path)
+    print(f"DEBUG: Available sheets in Excel file: {xls.sheet_names}")
+    for module in modules:
+        print(f"\nDEBUG: Processing module '{module}'...")
+        try:
+            cours_titles_for_module, cours_order = read_course_titles_from_module_sheet(excel_file_path, module)
+            print(f"DEBUG: Got {len(cours_titles_for_module)} course titles from module '{module}'")
+            print(f"DEBUG: Course titles: {cours_titles_for_module}")
+            print(f"DEBUG: Course order: {cours_order}")
+            modules_data[module] = cours_titles_for_module
+            modules_course_order[module] = cours_order  # NEW: Store order
+        except Exception as e:
+            print(f"DEBUG: Error reading module '{module}': {e}")
+            import traceback
+            traceback.print_exc()
+    print(f"\nDEBUG: Modules data: {modules_data}")
+    print(f"DEBUG: Modules course order: {modules_course_order}")
+    # Debug: Print the data structure
+    print("DEBUG: Excel file loaded successfully")
+    print(f"DEBUG: Total rows in Questions sheet: {len(questions_df)}")
+    print("DEBUG: Column names:", list(questions_df.columns))
+    # Clean column names (remove any extra spaces)
+    questions_df.columns = questions_df.columns.str.strip()
+    # Create Word document
+    doc = Document()
+    # --- Statistics collectors (questions per course and repeats) ---
+    stats_course_counts = {}  # { course_title: count }
+    stats_question_repeats = {}  # { question_text: count }
+    # Process questions with their following choice rows, grouped by course
+    processed_questions = []
+    current_question = None
+    current_choices = []
+    skipped_s2_questions = 0
+    print("DEBUG: Processing rows sequentially to group choices...")
+    for idx, row in questions_df.iterrows():
+        numero = row['Numero']
+        # If this row has a question number, it's a new question
+        if pd.notna(numero):
+            # If we were processing a previous question, save it (only if valid cours)
+            if current_question is not None and current_choices and is_valid_cours_number(current_cours):
+                processed_questions.append({
+                    'numero': current_question,
+                    'question_text': current_question_text,
+                    'source': current_source,
+                    'comment': current_comment,
+                    'cours': int(float(str(current_cours).strip())),  # Convert to int
+                    'module': current_module,
+                    'choices': current_choices.copy()
+                })
+                print(f"DEBUG: Saved question {current_question} with {len(current_choices)} choices")
+            elif current_question is not None and not is_valid_cours_number(current_cours):
+                skipped_s2_questions += 1
+                print(f"DEBUG: Skipped question {current_question} from cours '{current_cours}' (invalid/S2)")
+            # Start new question
+            current_question = numero
+            current_question_text = str(row['Question']).strip()
+            current_source = str(row['Source']).strip() if pd.notna(row['Source']) else ""
+            current_comment = str(row['Comment']).strip() if pd.notna(row['Comment']) and str(
+                row['Comment']).lower() != 'nan' else None
+            current_cours = row['Cours'] if pd.notna(row['Cours']) else 1  # Default to course 1
+            current_module = row[module_col] if module_col and pd.notna(row[module_col]) else None
+            current_choices = []
+            print(f"\nDEBUG: Starting new question {numero}, Course: {current_cours}")
+        # Only add choices if the current cours is valid
+        if is_valid_cours_number(current_cours):
+            # Add this row as a choice (whether it's the question row or a choice row)
+            choice_letter = str(row['Order']).strip().upper()
+            choice_text = str(row['ChoiceText']).strip()
+            ct_value = str(row['CT']).strip().upper() if pd.notna(row['CT']) else ""
+            is_correct = ct_value == 'X'
+            if choice_text and choice_text.lower() != 'nan' and choice_text != '':
+                current_choices.append({
+                    'letter': choice_letter,
+                    'text': choice_text,
+                    'is_correct': is_correct
+                })
+    # Don't forget the last question (only if valid cours)
+    if current_question is not None and current_choices and is_valid_cours_number(current_cours):
+        processed_questions.append({
+            'numero': current_question,
+            'question_text': current_question_text,
+            'source': current_source,
+            'comment': current_comment,
+            'cours': int(float(str(current_cours).strip())),  # Convert to int
+            'module': current_module,
+            'choices': current_choices.copy()
+        })
+    elif current_question is not None and not is_valid_cours_number(current_cours):
+        skipped_s2_questions += 1
+        print(f"DEBUG: Skipped final question {current_question} from cours '{current_cours}' (invalid/S2)")
+    print(f"\nDEBUG: Total processed questions: {len(processed_questions)}")
+    print(f"DEBUG: Total skipped S2/invalid questions: {skipped_s2_questions}")
+    # Group questions by module and course, preserving module order
+    # Use a regular dict (Python 3.7+ preserves insertion order)
+    questions_by_module = {}
+    # Initialize with ordered modules to preserve sheet order
+    for module in modules:
+        questions_by_module[module] = {}
+    # Fill in the questions
+    for q_data in processed_questions:
+        module_name = q_data['module']
+        cours_num = q_data['cours']
+        # Only add if module is in our ordered list
+        if module_name in questions_by_module:
+            if cours_num not in questions_by_module[module_name]:
+                questions_by_module[module_name][cours_num] = []
+            questions_by_module[module_name][cours_num].append(q_data)
+        else:
+            # Handle modules not in sheet list (shouldn't happen but just in case)
+            if module_name not in questions_by_module:
+                questions_by_module[module_name] = {}
+            if cours_num not in questions_by_module[module_name]:
+                questions_by_module[module_name][cours_num] = []
+            questions_by_module[module_name][cours_num].append(q_data)
+    # NEW: Reorder courses within each module based on sheet order
+    for module_name in list(questions_by_module.keys()):
+        if module_name in modules_course_order:
+            course_order = modules_course_order[module_name]
+            # Create new ordered dict with courses in sheet order
+            ordered_courses = {}
+            for cours_num in course_order:
+                if cours_num in questions_by_module[module_name]:
+                    ordered_courses[cours_num] = questions_by_module[module_name][cours_num]
+            # Add any courses that weren't in the sheet (shouldn't happen, but just in case)
+            for cours_num in questions_by_module[module_name]:
+                if cours_num not in ordered_courses:
+                    ordered_courses[cours_num] = questions_by_module[module_name][cours_num]
+            questions_by_module[module_name] = ordered_courses
+            print(f"DEBUG: Reordered courses for module '{module_name}': {list(ordered_courses.keys())}")
+    print(f"DEBUG: Questions grouped by modules (sheet order preserved): {list(questions_by_module.keys())}")
+    # Check for E choices across all modules - use TOC order
+    total_e_choices = 0
+    for module_name in modules:  # Sheet order
+        if module_name not in questions_by_module:
+            continue
+        course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))
+        for cours_num in course_order:  # Sheet order within module
+            if cours_num not in questions_by_module[module_name]:
+                continue
+            course_questions = questions_by_module[module_name][cours_num]
+            course_e_count = sum(1 for q_data in course_questions
+                                 for choice in q_data['choices']
+                                 if choice['letter'].upper() == 'E')
+            if course_e_count > 0:
+                print(f"DEBUG: Module '{module_name}' Course {cours_num} has {course_e_count} E choices")
+                total_e_choices += course_e_count
+    print(f"DEBUG: Total E choices found across all modules: {total_e_choices}")
+    # Collect statistics from processed questions
+    # Use TOC order (modules in sheet order, courses in sheet order within module)
+    for module_name in modules:  # Already in sheet order
+        if module_name not in questions_by_module:
+            continue
+        # Get course order for this module
+        course_order = modules_course_order.get(module_name, [])
+        # Iterate courses in sheet order
+        for cours_num in course_order:
+            if cours_num not in questions_by_module[module_name]:
+                continue
+            course_questions = questions_by_module[module_name][cours_num]
+            # Get course title
+            cours_titles = modules_data.get(module_name, {})
+            course_title = cours_titles.get(cours_num, f"Course {cours_num}")
+            # Count questions per course
+            stats_course_counts[course_title] = stats_course_counts.get(course_title, 0) + len(course_questions)
+            # Count repeated questions
+            for q_data in course_questions:
+                q_text = str(q_data['question_text']).strip()
+                stats_question_repeats[q_text] = stats_question_repeats.get(q_text, 0) + 1
+    print(f"\nDEBUG: Statistics collected:")
+    print(f"  - Courses tracked: {len(stats_course_counts)}")
+    print(f"  - Unique questions: {len(stats_question_repeats)}")
+    print(f"  - Repeated questions: {sum(1 for count in stats_question_repeats.values() if count > 1)}")
+    # --- Insert Statistics section (two-column layout) before TOC ---
+    # Add a new section (but keep 2-column layout)
+    stats_section = doc.add_section(WD_SECTION.CONTINUOUS)
+    # Ensure this new section keeps the same column layout (2 columns)
+    sectPr = stats_section._sectPr
+    cols = sectPr.xpath('./w:cols')[0]
+    cols.set(qn('w:num'), '2')
+    # --- Add STATISTICS title and bookmark so it appears in TOC ---
+    stats_para = doc.add_paragraph()
+    stats_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    stats_run = stats_para.add_run("STATISTICS")
+    stats_run.font.name = 'Montserrat'
+    stats_run.font.size = Pt(14)
+    stats_run.font.bold = True
+    stats_run.font.color.rgb = theme_color
+    # --- Questions per Course ---
+    p = doc.add_paragraph()
+    run = p.add_run("Questions per Course:")
+    run.font.name = 'Montserrat'
+    run.font.size = Pt(11)
+    run.font.bold = True
+    run.font.color.rgb = theme_color
+    table = doc.add_table(rows=1, cols=2)
+    table.style = 'Table Grid'
+    hdr = table.rows[0].cells
+    hdr[0].text = "Course"
+    hdr[1].text = "Number of Questions"
+    # Apply keep together to header cells
+    for cell in hdr:
+        for paragraph in cell.paragraphs:
+            paragraph.paragraph_format.keep_together = True
+    # Display courses in TOC order (module order, then course order within module)
+    for module_name in modules:
+        if module_name not in questions_by_module:
+            continue
+        course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))
+        cours_titles = modules_data.get(module_name, {})
+        for cours_num in course_order:
+            if cours_num not in questions_by_module[module_name]:
+                continue
+            course_title = cours_titles.get(cours_num, f"Course {cours_num}")
+            count = stats_course_counts.get(course_title, 0)
+            row = table.add_row().cells
+            row[0].text = str(course_title)
+            row[1].text = str(count)
+            # Apply keep together to each cell
+            for cell in row:
+                for paragraph in cell.paragraphs:
+                    paragraph.paragraph_format.keep_together = True
+    # Apply keep together to entire table rows
+    for row in table.rows:
+        tr = row._tr
+        trPr = tr.get_or_add_trPr()
+        cantSplit = OxmlElement('w:cantSplit')
+        trPr.append(cantSplit)
+    # --- Repeated Questions ---
+    doc.add_paragraph()
+    p2 = doc.add_paragraph()
+    run2 = p2.add_run("Repeated Questions:")
+    run2.font.name = 'Montserrat'
+    run2.font.size = Pt(11)
+    run2.font.bold = True
+    run2.font.color.rgb = theme_color
+    repeated = {q: c for q, c in stats_question_repeats.items() if c > 1}
+    if repeated:
+        rep_table = doc.add_table(rows=1, cols=2)
+        rep_table.style = 'Table Grid'
+        hdr2 = rep_table.rows[0].cells
+        hdr2[0].text = "Question"
+        hdr2[1].text = "Times Repeated"
+        for q, c in sorted(repeated.items(), key=lambda x: x[1], reverse=True):
+            row = rep_table.add_row().cells
+            row[0].text = q
+            row[1].text = str(c)
+        # After creating and filling rep_table
+        for row in rep_table.rows:
+            tr = row._tr
+            trPr = tr.get_or_add_trPr()
+            cant_split = OxmlElement('w:cantSplit')
+            trPr.append(cant_split)
+    else:
+        doc.add_paragraph("No repeated questions found.")
+    # Save document
+    doc.save(output_word_path)
+    print(f"\n🎉 SUCCESS: Document saved as: {output_word_path}")
+    print(f"📚 Total modules processed: {len(questions_by_module)}")
+    print(f"🚫 Total S2/invalid questions skipped: {skipped_s2_questions}")
+    print(f"📄 Questions sorted by module sheet order and course number")
+    if total_e_choices > 0:
+        print(f"✨ Dynamic E columns added for courses with 5-choice questions")
+def debug_excel_structure(excel_file_path):
+    """Debug function to analyze Excel structure"""
+    print("=== DEBUGGING EXCEL STRUCTURE ===")
+    # Read the Excel file
+    xls = pd.ExcelFile(excel_file_path)
+    first_sheet_name = xls.sheet_names[0]  # Get the first sheet name
+    questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)
+    print(f"Total rows: {len(questions_df)}")
+    print(f"Columns: {list(questions_df.columns)}")
+    # Check unique values in key columns
+    if 'Numero' in questions_df.columns:
+        try:
+            print(f"Unique Numero values: {sorted(questions_df['Numero'].dropna().unique())}")
+        except Exception as e:
+            print(f"Unique Numero values: {list(questions_df['Numero'].dropna().unique())} (couldn't sort: {e})")
+    if 'Order' in questions_df.columns:
+        try:
+            unique_orders = sorted(questions_df['Order'].dropna().unique())
+            print(f"Unique Order values: {unique_orders}")
+            # Check specifically for E choices
+            e_count = sum(1 for order in questions_df['Order'].dropna() if str(order).strip().upper() == 'E')
+            print(f"Total E choices found: {e_count}")
+        except Exception as e:
+            print(f"Unique Order values: {list(questions_df['Order'].dropna().unique())} (couldn't sort: {e})")
+    if 'Cours' in questions_df.columns:
+        unique_cours = questions_df['Cours'].dropna().unique()
+        # Convert all to strings first for display, then separate by validity
+        unique_cours_str = [str(c) for c in unique_cours]
+        print(f"Unique Cours values: {unique_cours_str}")
+        # Check which cours values are valid vs invalid
+        valid_cours = []
+        invalid_cours = []
+        for c in unique_cours:
+            if is_valid_cours_number(c):
+                valid_cours.append(c)
+            else:
+                invalid_cours.append(str(c))
+        # Sort valid ones (numeric) and invalid ones (as strings) separately
+        try:
+            valid_cours_sorted = sorted([float(c) for c in valid_cours])
+            print(f"Valid cours values: {valid_cours_sorted}")
+        except Exception:
+            print(f"Valid cours values: {valid_cours}")
+        try:
+            invalid_cours_sorted = sorted(invalid_cours)
+            print(f"Invalid/S2 cours values: {invalid_cours_sorted}")
+        except Exception:
+            print(f"Invalid/S2 cours values: {invalid_cours}")
+    # Check module column and corresponding sheets
+    if 'module' in questions_df.columns:
+        unique_modules = questions_df['module'].dropna().unique()
+        print(f"\nUnique Module values: {list(unique_modules)}")
+        # Check if sheets exist for each module
+        xls = pd.ExcelFile(excel_file_path)
+        sheet_names = xls.sheet_names
+        sheet_names_lower = [s.lower() for s in sheet_names]
+        print("\nModule sheet availability:")
+        for module in unique_modules:
+            module_lower = str(module).strip().lower()
+            if module_lower in sheet_names_lower:
+                actual_sheet = sheet_names[sheet_names_lower.index(module_lower)]
+                print(f"  ✓ Module '{module}' -> Sheet '{actual_sheet}' found")
+                # Try to read and show course info from this sheet
+                try:
+                    module_df = pd.read_excel(excel_file_path, sheet_name=actual_sheet)
+                    if 'cours' in module_df.columns and 'titre' in module_df.columns:
+                        print(f"    Courses in this module:")
+                        for _, row in module_df.iterrows():
+                            if pd.notna(row['cours']):
+                                print(f"      - {row['cours']}: {row.get('titre', 'N/A')}")
+                except Exception as e:
+                    print(f"    Error reading sheet: {e}")
+            else:
+                print(f"  ✗ Module '{module}' -> No matching sheet found")
+    # Check Cours sheet
+    try:
+        cours_df = pd.read_excel(excel_file_path, sheet_name='Cours')
+        print(f"\nCours sheet - Total rows: {len(cours_df)}")
+        print(f"Cours sheet columns: {list(cours_df.columns)}")
+        if not cours_df.empty:
+            print("Course titles:")
+            for _, row in cours_df.iterrows():
+                cours_val = row.get('cours', 'N/A')
+                is_valid = is_valid_cours_number(cours_val)
+                status = "✓" if is_valid else "✗ (SKIPPED)"
+                print(f"  Course {cours_val}: {row.get('titre', 'N/A')} {status}")
+    except Exception as e:
+        print(f"Error reading Cours sheet: {e}")