TiH0's picture
Rename stat.py to statis.py
76b4094 verified
import pandas as pd
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import OxmlElement, qn
from docx.enum.section import WD_SECTION
# Theme color configuration - change these to customize the document colors
THEME_COLOR_HEX = "5FFFDF" # Hex version for XML elements
THEME_COLOR = RGBColor.from_string(THEME_COLOR_HEX) # RGBColor version for direct use
def set_zero_spacing(paragraph):
"""Force paragraph spacing to 0 before and after."""
paragraph.paragraph_format.space_before = Pt(0)
paragraph.paragraph_format.space_after = Pt(0)
def is_valid_cours_number(cours_value):
"""Check if cours value is valid (numeric and not 'S2')"""
if pd.isna(cours_value):
return False
cours_str = str(cours_value).strip().upper()
# Skip S2 courses and other specific invalid values
if cours_str in ['S2', 'NAN', '']:
return False
# Try to convert to numeric - if it works and is positive, it's valid
try:
numeric_value = float(cours_str)
# Check if it's a positive number (courses should be positive integers)
return numeric_value > 0 and numeric_value == int(numeric_value)
except (ValueError, TypeError, OverflowError):
return False
def check_if_course_has_e_choices(course_questions):
"""Check if any question in the course has an E choice"""
for q_data in course_questions:
for choice in q_data['choices']:
if choice['letter'].upper() == 'E':
return True
return False
def read_course_titles_from_module_sheet(excel_file_path, module_name):
"""Read course titles from a module-specific sheet (case-insensitive)
Returns both titles dict and ordered list of course numbers"""
cours_titles = {}
cours_order = [] # NEW: Keep track of order courses appear in sheet
print(f" DEBUG: Looking for sheet matching module '{module_name}'")
# Get all sheet names from the Excel file
xls = pd.ExcelFile(excel_file_path)
sheet_names = xls.sheet_names
# Find matching sheet (case-insensitive)
target_sheet = None
module_name_lower = str(module_name).strip().lower()
print(f" DEBUG: Module name (lowercase): '{module_name_lower}'")
print(f" DEBUG: Available sheets: {sheet_names}")
for sheet in sheet_names:
sheet_lower = sheet.strip().lower()
print(f" DEBUG: Comparing '{module_name_lower}' with '{sheet_lower}'")
if sheet_lower == module_name_lower:
target_sheet = sheet
print(f" DEBUG: MATCH FOUND! Using sheet '{target_sheet}'")
break
if target_sheet is None:
print(f" DEBUG: No sheet found matching module '{module_name}'")
return cours_titles, cours_order
# Read the matching sheet
cours_df = pd.read_excel(excel_file_path, sheet_name=target_sheet)
print(f" DEBUG: Sheet '{target_sheet}' has {len(cours_df)} rows")
print(f" DEBUG: Sheet columns: {list(cours_df.columns)}")
if not cours_df.empty and 'cours' in cours_df.columns and 'titre' in cours_df.columns:
for idx, row in cours_df.iterrows():
print(f" DEBUG: Row {idx}: cours={row['cours']}, titre={row.get('titre', 'N/A')}")
if pd.notna(row['cours']) and pd.notna(row['titre']):
# Only store valid numeric courses
if is_valid_cours_number(row['cours']):
cours_num = int(float(str(row['cours']).strip()))
cours_titles[cours_num] = row['titre']
cours_order.append(cours_num) # NEW: Preserve order
print(f" DEBUG: Added cours {cours_num}: {row['titre']}")
else:
print(f" DEBUG: Skipped invalid cours: {row['cours']}")
print(f" DEBUG: Final count: {len(cours_titles)} course titles from sheet '{target_sheet}'")
print(f" DEBUG: Course order: {cours_order}")
else:
print(f" DEBUG: Sheet '{target_sheet}' doesn't have expected structure")
print(f" DEBUG: Has 'cours' column: {'cours' in cours_df.columns}")
print(f" DEBUG: Has 'titre' column: {'titre' in cours_df.columns}")
return cours_titles, cours_order
def process_excel_to_word(excel_file_path, output_word_path, theme_hex=None):
"""Main function to process Excel and create Word document with improved column balancing and answer tables"""
# Set default theme colors if not provided
if theme_hex is None:
theme_hex = THEME_COLOR_HEX
theme_color = RGBColor.from_string(theme_hex)
# Read the Excel file
xls = pd.ExcelFile(excel_file_path)
first_sheet_name = xls.sheet_names[0] # Get the first sheet name
questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)
# Debug: Print the data structure
print("DEBUG: Excel file loaded successfully")
# Get unique modules from Questions sheet (case-insensitive)
module_col = None
for col in questions_df.columns:
if col.lower().strip() == 'module':
module_col = col
break
if module_col:
# Get all sheet names from Excel (in order)
xls_temp = pd.ExcelFile(excel_file_path)
all_sheets = xls_temp.sheet_names
print(f"DEBUG: All sheets in Excel (in order): {all_sheets}")
# Skip the first sheet (Questions sheet) and use remaining sheets as module order
module_sheets = all_sheets[1:] # Exclude Questions sheet
print(f"DEBUG: Module sheets (in order): {module_sheets}")
# Create lowercase mapping for comparison
sheet_lower_map = {sheet.strip().lower(): sheet for sheet in module_sheets}
# Get unique modules from Questions column
modules_in_questions = questions_df[module_col].dropna().unique()
print(f"DEBUG: Unique modules from Questions sheet: {list(modules_in_questions)}")
# Map each module in Questions to its corresponding sheet name
module_to_sheet = {}
for module in modules_in_questions:
module_lower = str(module).strip().lower()
if module_lower in sheet_lower_map:
module_to_sheet[module] = sheet_lower_map[module_lower]
print(f"DEBUG: Mapped '{module}' -> '{sheet_lower_map[module_lower]}'")
print(f"DEBUG: Module to sheet mapping: {module_to_sheet}")
# Normalize all module names in the dataframe to use sheet names
questions_df[module_col] = questions_df[module_col].apply(
lambda x: module_to_sheet.get(x, x) if pd.notna(x) else x
)
# Now create ordered list of modules based on sheet order
modules = []
for sheet in module_sheets:
if sheet in module_to_sheet.values():
modules.append(sheet)
print(f"DEBUG: Final modules list in sheet order: {modules}")
else:
print("DEBUG: No 'module' column found in Questions sheet!")
print(f"DEBUG: Available columns: {list(questions_df.columns)}")
modules = []
# Read course titles from module-specific sheets and organize by module
modules_data = {} # {module_name: {cours_num: cours_title}}
modules_course_order = {} # NEW: {module_name: [ordered list of course numbers]}
xls = pd.ExcelFile(excel_file_path)
print(f"DEBUG: Available sheets in Excel file: {xls.sheet_names}")
for module in modules:
print(f"\nDEBUG: Processing module '{module}'...")
try:
cours_titles_for_module, cours_order = read_course_titles_from_module_sheet(excel_file_path, module)
print(f"DEBUG: Got {len(cours_titles_for_module)} course titles from module '{module}'")
print(f"DEBUG: Course titles: {cours_titles_for_module}")
print(f"DEBUG: Course order: {cours_order}")
modules_data[module] = cours_titles_for_module
modules_course_order[module] = cours_order # NEW: Store order
except Exception as e:
print(f"DEBUG: Error reading module '{module}': {e}")
import traceback
traceback.print_exc()
print(f"\nDEBUG: Modules data: {modules_data}")
print(f"DEBUG: Modules course order: {modules_course_order}")
# Debug: Print the data structure
print("DEBUG: Excel file loaded successfully")
print(f"DEBUG: Total rows in Questions sheet: {len(questions_df)}")
print("DEBUG: Column names:", list(questions_df.columns))
# Clean column names (remove any extra spaces)
questions_df.columns = questions_df.columns.str.strip()
# Create Word document
doc = Document()
# --- Statistics collectors (questions per course and repeats) ---
stats_course_counts = {} # { course_title: count }
stats_question_repeats = {} # { question_text: count }
# Process questions with their following choice rows, grouped by course
processed_questions = []
current_question = None
current_choices = []
skipped_s2_questions = 0
print("DEBUG: Processing rows sequentially to group choices...")
for idx, row in questions_df.iterrows():
numero = row['Numero']
# If this row has a question number, it's a new question
if pd.notna(numero):
# If we were processing a previous question, save it (only if valid cours)
if current_question is not None and current_choices and is_valid_cours_number(current_cours):
processed_questions.append({
'numero': current_question,
'question_text': current_question_text,
'source': current_source,
'comment': current_comment,
'cours': int(float(str(current_cours).strip())), # Convert to int
'module': current_module,
'choices': current_choices.copy()
})
print(f"DEBUG: Saved question {current_question} with {len(current_choices)} choices")
elif current_question is not None and not is_valid_cours_number(current_cours):
skipped_s2_questions += 1
print(f"DEBUG: Skipped question {current_question} from cours '{current_cours}' (invalid/S2)")
# Start new question
current_question = numero
current_question_text = str(row['Question']).strip()
current_source = str(row['Source']).strip() if pd.notna(row['Source']) else ""
current_comment = str(row['Comment']).strip() if pd.notna(row['Comment']) and str(
row['Comment']).lower() != 'nan' else None
current_cours = row['Cours'] if pd.notna(row['Cours']) else 1 # Default to course 1
current_module = row[module_col] if module_col and pd.notna(row[module_col]) else None
current_choices = []
print(f"\nDEBUG: Starting new question {numero}, Course: {current_cours}")
# Only add choices if the current cours is valid
if is_valid_cours_number(current_cours):
# Add this row as a choice (whether it's the question row or a choice row)
choice_letter = str(row['Order']).strip().upper()
choice_text = str(row['ChoiceText']).strip()
ct_value = str(row['CT']).strip().upper() if pd.notna(row['CT']) else ""
is_correct = ct_value == 'X'
if choice_text and choice_text.lower() != 'nan' and choice_text != '':
current_choices.append({
'letter': choice_letter,
'text': choice_text,
'is_correct': is_correct
})
# Don't forget the last question (only if valid cours)
if current_question is not None and current_choices and is_valid_cours_number(current_cours):
processed_questions.append({
'numero': current_question,
'question_text': current_question_text,
'source': current_source,
'comment': current_comment,
'cours': int(float(str(current_cours).strip())), # Convert to int
'module': current_module,
'choices': current_choices.copy()
})
elif current_question is not None and not is_valid_cours_number(current_cours):
skipped_s2_questions += 1
print(f"DEBUG: Skipped final question {current_question} from cours '{current_cours}' (invalid/S2)")
print(f"\nDEBUG: Total processed questions: {len(processed_questions)}")
print(f"DEBUG: Total skipped S2/invalid questions: {skipped_s2_questions}")
# Group questions by module and course, preserving module order
# Use a regular dict (Python 3.7+ preserves insertion order)
questions_by_module = {}
# Initialize with ordered modules to preserve sheet order
for module in modules:
questions_by_module[module] = {}
# Fill in the questions
for q_data in processed_questions:
module_name = q_data['module']
cours_num = q_data['cours']
# Only add if module is in our ordered list
if module_name in questions_by_module:
if cours_num not in questions_by_module[module_name]:
questions_by_module[module_name][cours_num] = []
questions_by_module[module_name][cours_num].append(q_data)
else:
# Handle modules not in sheet list (shouldn't happen but just in case)
if module_name not in questions_by_module:
questions_by_module[module_name] = {}
if cours_num not in questions_by_module[module_name]:
questions_by_module[module_name][cours_num] = []
questions_by_module[module_name][cours_num].append(q_data)
# NEW: Reorder courses within each module based on sheet order
for module_name in list(questions_by_module.keys()):
if module_name in modules_course_order:
course_order = modules_course_order[module_name]
# Create new ordered dict with courses in sheet order
ordered_courses = {}
for cours_num in course_order:
if cours_num in questions_by_module[module_name]:
ordered_courses[cours_num] = questions_by_module[module_name][cours_num]
# Add any courses that weren't in the sheet (shouldn't happen, but just in case)
for cours_num in questions_by_module[module_name]:
if cours_num not in ordered_courses:
ordered_courses[cours_num] = questions_by_module[module_name][cours_num]
questions_by_module[module_name] = ordered_courses
print(f"DEBUG: Reordered courses for module '{module_name}': {list(ordered_courses.keys())}")
print(f"DEBUG: Questions grouped by modules (sheet order preserved): {list(questions_by_module.keys())}")
# Check for E choices across all modules - use TOC order
total_e_choices = 0
for module_name in modules: # Sheet order
if module_name not in questions_by_module:
continue
course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))
for cours_num in course_order: # Sheet order within module
if cours_num not in questions_by_module[module_name]:
continue
course_questions = questions_by_module[module_name][cours_num]
course_e_count = sum(1 for q_data in course_questions
for choice in q_data['choices']
if choice['letter'].upper() == 'E')
if course_e_count > 0:
print(f"DEBUG: Module '{module_name}' Course {cours_num} has {course_e_count} E choices")
total_e_choices += course_e_count
print(f"DEBUG: Total E choices found across all modules: {total_e_choices}")
# Collect statistics from processed questions
# Use TOC order (modules in sheet order, courses in sheet order within module)
for module_name in modules: # Already in sheet order
if module_name not in questions_by_module:
continue
# Get course order for this module
course_order = modules_course_order.get(module_name, [])
# Iterate courses in sheet order
for cours_num in course_order:
if cours_num not in questions_by_module[module_name]:
continue
course_questions = questions_by_module[module_name][cours_num]
# Get course title
cours_titles = modules_data.get(module_name, {})
course_title = cours_titles.get(cours_num, f"Course {cours_num}")
# Count questions per course
stats_course_counts[course_title] = stats_course_counts.get(course_title, 0) + len(course_questions)
# Count repeated questions
for q_data in course_questions:
q_text = str(q_data['question_text']).strip()
stats_question_repeats[q_text] = stats_question_repeats.get(q_text, 0) + 1
print(f"\nDEBUG: Statistics collected:")
print(f" - Courses tracked: {len(stats_course_counts)}")
print(f" - Unique questions: {len(stats_question_repeats)}")
print(f" - Repeated questions: {sum(1 for count in stats_question_repeats.values() if count > 1)}")
# --- Insert Statistics section (two-column layout) before TOC ---
# Add a new section (but keep 2-column layout)
stats_section = doc.add_section(WD_SECTION.CONTINUOUS)
# Ensure this new section keeps the same column layout (2 columns)
sectPr = stats_section._sectPr
cols = sectPr.xpath('./w:cols')[0]
cols.set(qn('w:num'), '2')
# --- Add STATISTICS title and bookmark so it appears in TOC ---
stats_para = doc.add_paragraph()
stats_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
stats_run = stats_para.add_run("STATISTICS")
stats_run.font.name = 'Montserrat'
stats_run.font.size = Pt(14)
stats_run.font.bold = True
stats_run.font.color.rgb = theme_color
# --- Questions per Course ---
p = doc.add_paragraph()
run = p.add_run("Questions per Course:")
run.font.name = 'Montserrat'
run.font.size = Pt(11)
run.font.bold = True
run.font.color.rgb = theme_color
table = doc.add_table(rows=1, cols=2)
table.style = 'Table Grid'
hdr = table.rows[0].cells
hdr[0].text = "Course"
hdr[1].text = "Number of Questions"
# Apply keep together to header cells
for cell in hdr:
for paragraph in cell.paragraphs:
paragraph.paragraph_format.keep_together = True
# Display courses in TOC order (module order, then course order within module)
for module_name in modules:
if module_name not in questions_by_module:
continue
course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))
cours_titles = modules_data.get(module_name, {})
for cours_num in course_order:
if cours_num not in questions_by_module[module_name]:
continue
course_title = cours_titles.get(cours_num, f"Course {cours_num}")
count = stats_course_counts.get(course_title, 0)
row = table.add_row().cells
row[0].text = str(course_title)
row[1].text = str(count)
# Apply keep together to each cell
for cell in row:
for paragraph in cell.paragraphs:
paragraph.paragraph_format.keep_together = True
# Apply keep together to entire table rows
for row in table.rows:
tr = row._tr
trPr = tr.get_or_add_trPr()
cantSplit = OxmlElement('w:cantSplit')
trPr.append(cantSplit)
# --- Repeated Questions ---
doc.add_paragraph()
p2 = doc.add_paragraph()
run2 = p2.add_run("Repeated Questions:")
run2.font.name = 'Montserrat'
run2.font.size = Pt(11)
run2.font.bold = True
run2.font.color.rgb = theme_color
repeated = {q: c for q, c in stats_question_repeats.items() if c > 1}
if repeated:
rep_table = doc.add_table(rows=1, cols=2)
rep_table.style = 'Table Grid'
hdr2 = rep_table.rows[0].cells
hdr2[0].text = "Question"
hdr2[1].text = "Times Repeated"
for q, c in sorted(repeated.items(), key=lambda x: x[1], reverse=True):
row = rep_table.add_row().cells
row[0].text = q
row[1].text = str(c)
# After creating and filling rep_table
for row in rep_table.rows:
tr = row._tr
trPr = tr.get_or_add_trPr()
cant_split = OxmlElement('w:cantSplit')
trPr.append(cant_split)
else:
doc.add_paragraph("No repeated questions found.")
# Save document
doc.save(output_word_path)
print(f"\nπŸŽ‰ SUCCESS: Document saved as: {output_word_path}")
print(f"πŸ“š Total modules processed: {len(questions_by_module)}")
print(f"🚫 Total S2/invalid questions skipped: {skipped_s2_questions}")
print(f"πŸ“„ Questions sorted by module sheet order and course number")
if total_e_choices > 0:
print(f"✨ Dynamic E columns added for courses with 5-choice questions")
def debug_excel_structure(excel_file_path):
"""Debug function to analyze Excel structure"""
print("=== DEBUGGING EXCEL STRUCTURE ===")
# Read the Excel file
xls = pd.ExcelFile(excel_file_path)
first_sheet_name = xls.sheet_names[0] # Get the first sheet name
questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)
print(f"Total rows: {len(questions_df)}")
print(f"Columns: {list(questions_df.columns)}")
# Check unique values in key columns
if 'Numero' in questions_df.columns:
try:
print(f"Unique Numero values: {sorted(questions_df['Numero'].dropna().unique())}")
except Exception as e:
print(f"Unique Numero values: {list(questions_df['Numero'].dropna().unique())} (couldn't sort: {e})")
if 'Order' in questions_df.columns:
try:
unique_orders = sorted(questions_df['Order'].dropna().unique())
print(f"Unique Order values: {unique_orders}")
# Check specifically for E choices
e_count = sum(1 for order in questions_df['Order'].dropna() if str(order).strip().upper() == 'E')
print(f"Total E choices found: {e_count}")
except Exception as e:
print(f"Unique Order values: {list(questions_df['Order'].dropna().unique())} (couldn't sort: {e})")
if 'Cours' in questions_df.columns:
unique_cours = questions_df['Cours'].dropna().unique()
# Convert all to strings first for display, then separate by validity
unique_cours_str = [str(c) for c in unique_cours]
print(f"Unique Cours values: {unique_cours_str}")
# Check which cours values are valid vs invalid
valid_cours = []
invalid_cours = []
for c in unique_cours:
if is_valid_cours_number(c):
valid_cours.append(c)
else:
invalid_cours.append(str(c))
# Sort valid ones (numeric) and invalid ones (as strings) separately
try:
valid_cours_sorted = sorted([float(c) for c in valid_cours])
print(f"Valid cours values: {valid_cours_sorted}")
except Exception:
print(f"Valid cours values: {valid_cours}")
try:
invalid_cours_sorted = sorted(invalid_cours)
print(f"Invalid/S2 cours values: {invalid_cours_sorted}")
except Exception:
print(f"Invalid/S2 cours values: {invalid_cours}")
# Check module column and corresponding sheets
if 'module' in questions_df.columns:
unique_modules = questions_df['module'].dropna().unique()
print(f"\nUnique Module values: {list(unique_modules)}")
# Check if sheets exist for each module
xls = pd.ExcelFile(excel_file_path)
sheet_names = xls.sheet_names
sheet_names_lower = [s.lower() for s in sheet_names]
print("\nModule sheet availability:")
for module in unique_modules:
module_lower = str(module).strip().lower()
if module_lower in sheet_names_lower:
actual_sheet = sheet_names[sheet_names_lower.index(module_lower)]
print(f" βœ“ Module '{module}' -> Sheet '{actual_sheet}' found")
# Try to read and show course info from this sheet
try:
module_df = pd.read_excel(excel_file_path, sheet_name=actual_sheet)
if 'cours' in module_df.columns and 'titre' in module_df.columns:
print(f" Courses in this module:")
for _, row in module_df.iterrows():
if pd.notna(row['cours']):
print(f" - {row['cours']}: {row.get('titre', 'N/A')}")
except Exception as e:
print(f" Error reading sheet: {e}")
else:
print(f" βœ— Module '{module}' -> No matching sheet found")
# Check Cours sheet
try:
cours_df = pd.read_excel(excel_file_path, sheet_name='Cours')
print(f"\nCours sheet - Total rows: {len(cours_df)}")
print(f"Cours sheet columns: {list(cours_df.columns)}")
if not cours_df.empty:
print("Course titles:")
for _, row in cours_df.iterrows():
cours_val = row.get('cours', 'N/A')
is_valid = is_valid_cours_number(cours_val)
status = "βœ“" if is_valid else "βœ— (SKIPPED)"
print(f" Course {cours_val}: {row.get('titre', 'N/A')} {status}")
except Exception as e:
print(f"Error reading Cours sheet: {e}")