Spaces:

TiH0
/

Manhattan-Statistics

Sleeping

App Files Files Community

Manhattan-Statistics / statis.py

TiH0

Rename stat.py to statis.py

76b4094 verified 5 months ago

raw

history blame contribute delete

26 kB

	import pandas as pd
	from docx import Document
	from docx.shared import Pt, RGBColor
	from docx.enum.text import WD_ALIGN_PARAGRAPH
	from docx.oxml.shared import OxmlElement, qn
	from docx.enum.section import WD_SECTION

	# Theme color configuration - change these to customize the document colors
	THEME_COLOR_HEX = "5FFFDF" # Hex version for XML elements
	THEME_COLOR = RGBColor.from_string(THEME_COLOR_HEX) # RGBColor version for direct use


	def set_zero_spacing(paragraph):
	"""Force paragraph spacing to 0 before and after."""
	paragraph.paragraph_format.space_before = Pt(0)
	paragraph.paragraph_format.space_after = Pt(0)


	def is_valid_cours_number(cours_value):
	"""Check if cours value is valid (numeric and not 'S2')"""
	if pd.isna(cours_value):
	return False

	cours_str = str(cours_value).strip().upper()

	# Skip S2 courses and other specific invalid values
	if cours_str in ['S2', 'NAN', '']:
	return False

	# Try to convert to numeric - if it works and is positive, it's valid
	try:
	numeric_value = float(cours_str)
	# Check if it's a positive number (courses should be positive integers)
	return numeric_value > 0 and numeric_value == int(numeric_value)
	except (ValueError, TypeError, OverflowError):
	return False


	def check_if_course_has_e_choices(course_questions):
	"""Check if any question in the course has an E choice"""
	for q_data in course_questions:
	for choice in q_data['choices']:
	if choice['letter'].upper() == 'E':
	return True
	return False


	def read_course_titles_from_module_sheet(excel_file_path, module_name):
	"""Read course titles from a module-specific sheet (case-insensitive)
	Returns both titles dict and ordered list of course numbers"""
	cours_titles = {}
	cours_order = [] # NEW: Keep track of order courses appear in sheet

	print(f" DEBUG: Looking for sheet matching module '{module_name}'")

	# Get all sheet names from the Excel file
	xls = pd.ExcelFile(excel_file_path)
	sheet_names = xls.sheet_names

	# Find matching sheet (case-insensitive)
	target_sheet = None
	module_name_lower = str(module_name).strip().lower()

	print(f" DEBUG: Module name (lowercase): '{module_name_lower}'")
	print(f" DEBUG: Available sheets: {sheet_names}")

	for sheet in sheet_names:
	sheet_lower = sheet.strip().lower()
	print(f" DEBUG: Comparing '{module_name_lower}' with '{sheet_lower}'")
	if sheet_lower == module_name_lower:
	target_sheet = sheet
	print(f" DEBUG: MATCH FOUND! Using sheet '{target_sheet}'")
	break

	if target_sheet is None:
	print(f" DEBUG: No sheet found matching module '{module_name}'")
	return cours_titles, cours_order

	# Read the matching sheet
	cours_df = pd.read_excel(excel_file_path, sheet_name=target_sheet)
	print(f" DEBUG: Sheet '{target_sheet}' has {len(cours_df)} rows")
	print(f" DEBUG: Sheet columns: {list(cours_df.columns)}")

	if not cours_df.empty and 'cours' in cours_df.columns and 'titre' in cours_df.columns:
	for idx, row in cours_df.iterrows():
	print(f" DEBUG: Row {idx}: cours={row['cours']}, titre={row.get('titre', 'N/A')}")
	if pd.notna(row['cours']) and pd.notna(row['titre']):
	# Only store valid numeric courses
	if is_valid_cours_number(row['cours']):
	cours_num = int(float(str(row['cours']).strip()))
	cours_titles[cours_num] = row['titre']
	cours_order.append(cours_num) # NEW: Preserve order
	print(f" DEBUG: Added cours {cours_num}: {row['titre']}")
	else:
	print(f" DEBUG: Skipped invalid cours: {row['cours']}")
	print(f" DEBUG: Final count: {len(cours_titles)} course titles from sheet '{target_sheet}'")
	print(f" DEBUG: Course order: {cours_order}")
	else:
	print(f" DEBUG: Sheet '{target_sheet}' doesn't have expected structure")
	print(f" DEBUG: Has 'cours' column: {'cours' in cours_df.columns}")
	print(f" DEBUG: Has 'titre' column: {'titre' in cours_df.columns}")

	return cours_titles, cours_order


	def process_excel_to_word(excel_file_path, output_word_path, theme_hex=None):
	"""Main function to process Excel and create Word document with improved column balancing and answer tables"""

	# Set default theme colors if not provided
	if theme_hex is None:
	theme_hex = THEME_COLOR_HEX
	theme_color = RGBColor.from_string(theme_hex)

	# Read the Excel file
	xls = pd.ExcelFile(excel_file_path)
	first_sheet_name = xls.sheet_names[0] # Get the first sheet name
	questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)

	# Debug: Print the data structure
	print("DEBUG: Excel file loaded successfully")

	# Get unique modules from Questions sheet (case-insensitive)
	module_col = None
	for col in questions_df.columns:
	if col.lower().strip() == 'module':
	module_col = col
	break

	if module_col:
	# Get all sheet names from Excel (in order)
	xls_temp = pd.ExcelFile(excel_file_path)
	all_sheets = xls_temp.sheet_names

	print(f"DEBUG: All sheets in Excel (in order): {all_sheets}")

	# Skip the first sheet (Questions sheet) and use remaining sheets as module order
	module_sheets = all_sheets[1:] # Exclude Questions sheet

	print(f"DEBUG: Module sheets (in order): {module_sheets}")

	# Create lowercase mapping for comparison
	sheet_lower_map = {sheet.strip().lower(): sheet for sheet in module_sheets}

	# Get unique modules from Questions column
	modules_in_questions = questions_df[module_col].dropna().unique()
	print(f"DEBUG: Unique modules from Questions sheet: {list(modules_in_questions)}")

	# Map each module in Questions to its corresponding sheet name
	module_to_sheet = {}
	for module in modules_in_questions:
	module_lower = str(module).strip().lower()
	if module_lower in sheet_lower_map:
	module_to_sheet[module] = sheet_lower_map[module_lower]
	print(f"DEBUG: Mapped '{module}' -> '{sheet_lower_map[module_lower]}'")

	print(f"DEBUG: Module to sheet mapping: {module_to_sheet}")

	# Normalize all module names in the dataframe to use sheet names
	questions_df[module_col] = questions_df[module_col].apply(
	lambda x: module_to_sheet.get(x, x) if pd.notna(x) else x
	)

	# Now create ordered list of modules based on sheet order
	modules = []
	for sheet in module_sheets:
	if sheet in module_to_sheet.values():
	modules.append(sheet)

	print(f"DEBUG: Final modules list in sheet order: {modules}")
	else:
	print("DEBUG: No 'module' column found in Questions sheet!")
	print(f"DEBUG: Available columns: {list(questions_df.columns)}")
	modules = []

	# Read course titles from module-specific sheets and organize by module
	modules_data = {} # {module_name: {cours_num: cours_title}}
	modules_course_order = {} # NEW: {module_name: [ordered list of course numbers]}
	xls = pd.ExcelFile(excel_file_path)
	print(f"DEBUG: Available sheets in Excel file: {xls.sheet_names}")

	for module in modules:
	print(f"\nDEBUG: Processing module '{module}'...")
	try:
	cours_titles_for_module, cours_order = read_course_titles_from_module_sheet(excel_file_path, module)
	print(f"DEBUG: Got {len(cours_titles_for_module)} course titles from module '{module}'")
	print(f"DEBUG: Course titles: {cours_titles_for_module}")
	print(f"DEBUG: Course order: {cours_order}")
	modules_data[module] = cours_titles_for_module
	modules_course_order[module] = cours_order # NEW: Store order
	except Exception as e:
	print(f"DEBUG: Error reading module '{module}': {e}")
	import traceback
	traceback.print_exc()

	print(f"\nDEBUG: Modules data: {modules_data}")
	print(f"DEBUG: Modules course order: {modules_course_order}")

	# Debug: Print the data structure
	print("DEBUG: Excel file loaded successfully")
	print(f"DEBUG: Total rows in Questions sheet: {len(questions_df)}")
	print("DEBUG: Column names:", list(questions_df.columns))

	# Clean column names (remove any extra spaces)
	questions_df.columns = questions_df.columns.str.strip()

	# Create Word document
	doc = Document()

	# --- Statistics collectors (questions per course and repeats) ---
	stats_course_counts = {} # { course_title: count }
	stats_question_repeats = {} # { question_text: count }

	# Process questions with their following choice rows, grouped by course
	processed_questions = []
	current_question = None
	current_choices = []
	skipped_s2_questions = 0

	print("DEBUG: Processing rows sequentially to group choices...")

	for idx, row in questions_df.iterrows():
	numero = row['Numero']

	# If this row has a question number, it's a new question
	if pd.notna(numero):
	# If we were processing a previous question, save it (only if valid cours)
	if current_question is not None and current_choices and is_valid_cours_number(current_cours):
	processed_questions.append({
	'numero': current_question,
	'question_text': current_question_text,
	'source': current_source,
	'comment': current_comment,
	'cours': int(float(str(current_cours).strip())), # Convert to int
	'module': current_module,
	'choices': current_choices.copy()
	})
	print(f"DEBUG: Saved question {current_question} with {len(current_choices)} choices")
	elif current_question is not None and not is_valid_cours_number(current_cours):
	skipped_s2_questions += 1
	print(f"DEBUG: Skipped question {current_question} from cours '{current_cours}' (invalid/S2)")

	# Start new question
	current_question = numero
	current_question_text = str(row['Question']).strip()
	current_source = str(row['Source']).strip() if pd.notna(row['Source']) else ""
	current_comment = str(row['Comment']).strip() if pd.notna(row['Comment']) and str(
	row['Comment']).lower() != 'nan' else None
	current_cours = row['Cours'] if pd.notna(row['Cours']) else 1 # Default to course 1
	current_module = row[module_col] if module_col and pd.notna(row[module_col]) else None
	current_choices = []

	print(f"\nDEBUG: Starting new question {numero}, Course: {current_cours}")

	# Only add choices if the current cours is valid
	if is_valid_cours_number(current_cours):
	# Add this row as a choice (whether it's the question row or a choice row)
	choice_letter = str(row['Order']).strip().upper()
	choice_text = str(row['ChoiceText']).strip()
	ct_value = str(row['CT']).strip().upper() if pd.notna(row['CT']) else ""
	is_correct = ct_value == 'X'

	if choice_text and choice_text.lower() != 'nan' and choice_text != '':
	current_choices.append({
	'letter': choice_letter,
	'text': choice_text,
	'is_correct': is_correct
	})

	# Don't forget the last question (only if valid cours)
	if current_question is not None and current_choices and is_valid_cours_number(current_cours):
	processed_questions.append({
	'numero': current_question,
	'question_text': current_question_text,
	'source': current_source,
	'comment': current_comment,
	'cours': int(float(str(current_cours).strip())), # Convert to int
	'module': current_module,
	'choices': current_choices.copy()
	})
	elif current_question is not None and not is_valid_cours_number(current_cours):
	skipped_s2_questions += 1
	print(f"DEBUG: Skipped final question {current_question} from cours '{current_cours}' (invalid/S2)")

	print(f"\nDEBUG: Total processed questions: {len(processed_questions)}")
	print(f"DEBUG: Total skipped S2/invalid questions: {skipped_s2_questions}")

	# Group questions by module and course, preserving module order
	# Use a regular dict (Python 3.7+ preserves insertion order)
	questions_by_module = {}

	# Initialize with ordered modules to preserve sheet order
	for module in modules:
	questions_by_module[module] = {}

	# Fill in the questions
	for q_data in processed_questions:
	module_name = q_data['module']
	cours_num = q_data['cours']

	# Only add if module is in our ordered list
	if module_name in questions_by_module:
	if cours_num not in questions_by_module[module_name]:
	questions_by_module[module_name][cours_num] = []

	questions_by_module[module_name][cours_num].append(q_data)
	else:
	# Handle modules not in sheet list (shouldn't happen but just in case)
	if module_name not in questions_by_module:
	questions_by_module[module_name] = {}
	if cours_num not in questions_by_module[module_name]:
	questions_by_module[module_name][cours_num] = []
	questions_by_module[module_name][cours_num].append(q_data)

	# NEW: Reorder courses within each module based on sheet order
	for module_name in list(questions_by_module.keys()):
	if module_name in modules_course_order:
	course_order = modules_course_order[module_name]
	# Create new ordered dict with courses in sheet order
	ordered_courses = {}
	for cours_num in course_order:
	if cours_num in questions_by_module[module_name]:
	ordered_courses[cours_num] = questions_by_module[module_name][cours_num]

	# Add any courses that weren't in the sheet (shouldn't happen, but just in case)
	for cours_num in questions_by_module[module_name]:
	if cours_num not in ordered_courses:
	ordered_courses[cours_num] = questions_by_module[module_name][cours_num]

	questions_by_module[module_name] = ordered_courses
	print(f"DEBUG: Reordered courses for module '{module_name}': {list(ordered_courses.keys())}")

	print(f"DEBUG: Questions grouped by modules (sheet order preserved): {list(questions_by_module.keys())}")

	# Check for E choices across all modules - use TOC order
	total_e_choices = 0
	for module_name in modules: # Sheet order
	if module_name not in questions_by_module:
	continue

	course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))

	for cours_num in course_order: # Sheet order within module
	if cours_num not in questions_by_module[module_name]:
	continue

	course_questions = questions_by_module[module_name][cours_num]
	course_e_count = sum(1 for q_data in course_questions
	for choice in q_data['choices']
	if choice['letter'].upper() == 'E')
	if course_e_count > 0:
	print(f"DEBUG: Module '{module_name}' Course {cours_num} has {course_e_count} E choices")
	total_e_choices += course_e_count

	print(f"DEBUG: Total E choices found across all modules: {total_e_choices}")

	# Collect statistics from processed questions
	# Use TOC order (modules in sheet order, courses in sheet order within module)
	for module_name in modules: # Already in sheet order
	if module_name not in questions_by_module:
	continue

	# Get course order for this module
	course_order = modules_course_order.get(module_name, [])

	# Iterate courses in sheet order
	for cours_num in course_order:
	if cours_num not in questions_by_module[module_name]:
	continue

	course_questions = questions_by_module[module_name][cours_num]

	# Get course title
	cours_titles = modules_data.get(module_name, {})
	course_title = cours_titles.get(cours_num, f"Course {cours_num}")

	# Count questions per course
	stats_course_counts[course_title] = stats_course_counts.get(course_title, 0) + len(course_questions)

	# Count repeated questions
	for q_data in course_questions:
	q_text = str(q_data['question_text']).strip()
	stats_question_repeats[q_text] = stats_question_repeats.get(q_text, 0) + 1

	print(f"\nDEBUG: Statistics collected:")
	print(f" - Courses tracked: {len(stats_course_counts)}")
	print(f" - Unique questions: {len(stats_question_repeats)}")
	print(f" - Repeated questions: {sum(1 for count in stats_question_repeats.values() if count > 1)}")

	# --- Insert Statistics section (two-column layout) before TOC ---
	# Add a new section (but keep 2-column layout)
	stats_section = doc.add_section(WD_SECTION.CONTINUOUS)

	# Ensure this new section keeps the same column layout (2 columns)
	sectPr = stats_section._sectPr
	cols = sectPr.xpath('./w:cols')[0]
	cols.set(qn('w:num'), '2')

	# --- Add STATISTICS title and bookmark so it appears in TOC ---
	stats_para = doc.add_paragraph()
	stats_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
	stats_run = stats_para.add_run("STATISTICS")
	stats_run.font.name = 'Montserrat'
	stats_run.font.size = Pt(14)
	stats_run.font.bold = True
	stats_run.font.color.rgb = theme_color

	# --- Questions per Course ---
	p = doc.add_paragraph()
	run = p.add_run("Questions per Course:")
	run.font.name = 'Montserrat'
	run.font.size = Pt(11)
	run.font.bold = True
	run.font.color.rgb = theme_color

	table = doc.add_table(rows=1, cols=2)
	table.style = 'Table Grid'
	hdr = table.rows[0].cells
	hdr[0].text = "Course"
	hdr[1].text = "Number of Questions"

	# Apply keep together to header cells
	for cell in hdr:
	for paragraph in cell.paragraphs:
	paragraph.paragraph_format.keep_together = True

	# Display courses in TOC order (module order, then course order within module)
	for module_name in modules:
	if module_name not in questions_by_module:
	continue

	course_order = modules_course_order.get(module_name, sorted(questions_by_module[module_name].keys()))
	cours_titles = modules_data.get(module_name, {})

	for cours_num in course_order:
	if cours_num not in questions_by_module[module_name]:
	continue

	course_title = cours_titles.get(cours_num, f"Course {cours_num}")
	count = stats_course_counts.get(course_title, 0)

	row = table.add_row().cells
	row[0].text = str(course_title)
	row[1].text = str(count)

	# Apply keep together to each cell
	for cell in row:
	for paragraph in cell.paragraphs:
	paragraph.paragraph_format.keep_together = True

	# Apply keep together to entire table rows
	for row in table.rows:
	tr = row._tr
	trPr = tr.get_or_add_trPr()
	cantSplit = OxmlElement('w:cantSplit')
	trPr.append(cantSplit)

	# --- Repeated Questions ---
	doc.add_paragraph()
	p2 = doc.add_paragraph()
	run2 = p2.add_run("Repeated Questions:")
	run2.font.name = 'Montserrat'
	run2.font.size = Pt(11)
	run2.font.bold = True
	run2.font.color.rgb = theme_color

	repeated = {q: c for q, c in stats_question_repeats.items() if c > 1}
	if repeated:
	rep_table = doc.add_table(rows=1, cols=2)
	rep_table.style = 'Table Grid'
	hdr2 = rep_table.rows[0].cells
	hdr2[0].text = "Question"
	hdr2[1].text = "Times Repeated"

	for q, c in sorted(repeated.items(), key=lambda x: x[1], reverse=True):
	row = rep_table.add_row().cells
	row[0].text = q
	row[1].text = str(c)

	# After creating and filling rep_table
	for row in rep_table.rows:
	tr = row._tr
	trPr = tr.get_or_add_trPr()
	cant_split = OxmlElement('w:cantSplit')
	trPr.append(cant_split)

	else:
	doc.add_paragraph("No repeated questions found.")

	# Save document
	doc.save(output_word_path)
	print(f"\n🎉 SUCCESS: Document saved as: {output_word_path}")
	print(f"📚 Total modules processed: {len(questions_by_module)}")
	print(f"🚫 Total S2/invalid questions skipped: {skipped_s2_questions}")
	print(f"📄 Questions sorted by module sheet order and course number")
	if total_e_choices > 0:
	print(f"✨ Dynamic E columns added for courses with 5-choice questions")


	def debug_excel_structure(excel_file_path):
	"""Debug function to analyze Excel structure"""
	print("=== DEBUGGING EXCEL STRUCTURE ===")

	# Read the Excel file
	xls = pd.ExcelFile(excel_file_path)
	first_sheet_name = xls.sheet_names[0] # Get the first sheet name
	questions_df = pd.read_excel(excel_file_path, sheet_name=first_sheet_name)

	print(f"Total rows: {len(questions_df)}")
	print(f"Columns: {list(questions_df.columns)}")

	# Check unique values in key columns
	if 'Numero' in questions_df.columns:
	try:
	print(f"Unique Numero values: {sorted(questions_df['Numero'].dropna().unique())}")
	except Exception as e:
	print(f"Unique Numero values: {list(questions_df['Numero'].dropna().unique())} (couldn't sort: {e})")

	if 'Order' in questions_df.columns:
	try:
	unique_orders = sorted(questions_df['Order'].dropna().unique())
	print(f"Unique Order values: {unique_orders}")
	# Check specifically for E choices
	e_count = sum(1 for order in questions_df['Order'].dropna() if str(order).strip().upper() == 'E')
	print(f"Total E choices found: {e_count}")
	except Exception as e:
	print(f"Unique Order values: {list(questions_df['Order'].dropna().unique())} (couldn't sort: {e})")

	if 'Cours' in questions_df.columns:
	unique_cours = questions_df['Cours'].dropna().unique()

	# Convert all to strings first for display, then separate by validity
	unique_cours_str = [str(c) for c in unique_cours]
	print(f"Unique Cours values: {unique_cours_str}")

	# Check which cours values are valid vs invalid
	valid_cours = []
	invalid_cours = []

	for c in unique_cours:
	if is_valid_cours_number(c):
	valid_cours.append(c)
	else:
	invalid_cours.append(str(c))

	# Sort valid ones (numeric) and invalid ones (as strings) separately
	try:
	valid_cours_sorted = sorted([float(c) for c in valid_cours])
	print(f"Valid cours values: {valid_cours_sorted}")
	except Exception:
	print(f"Valid cours values: {valid_cours}")

	try:
	invalid_cours_sorted = sorted(invalid_cours)
	print(f"Invalid/S2 cours values: {invalid_cours_sorted}")
	except Exception:
	print(f"Invalid/S2 cours values: {invalid_cours}")

	# Check module column and corresponding sheets
	if 'module' in questions_df.columns:
	unique_modules = questions_df['module'].dropna().unique()
	print(f"\nUnique Module values: {list(unique_modules)}")

	# Check if sheets exist for each module
	xls = pd.ExcelFile(excel_file_path)
	sheet_names = xls.sheet_names
	sheet_names_lower = [s.lower() for s in sheet_names]

	print("\nModule sheet availability:")
	for module in unique_modules:
	module_lower = str(module).strip().lower()
	if module_lower in sheet_names_lower:
	actual_sheet = sheet_names[sheet_names_lower.index(module_lower)]
	print(f" ✓ Module '{module}' -> Sheet '{actual_sheet}' found")

	# Try to read and show course info from this sheet
	try:
	module_df = pd.read_excel(excel_file_path, sheet_name=actual_sheet)
	if 'cours' in module_df.columns and 'titre' in module_df.columns:
	print(f" Courses in this module:")
	for _, row in module_df.iterrows():
	if pd.notna(row['cours']):
	print(f" - {row['cours']}: {row.get('titre', 'N/A')}")
	except Exception as e:
	print(f" Error reading sheet: {e}")
	else:
	print(f" ✗ Module '{module}' -> No matching sheet found")

	# Check Cours sheet
	try:
	cours_df = pd.read_excel(excel_file_path, sheet_name='Cours')
	print(f"\nCours sheet - Total rows: {len(cours_df)}")
	print(f"Cours sheet columns: {list(cours_df.columns)}")
	if not cours_df.empty:
	print("Course titles:")
	for _, row in cours_df.iterrows():
	cours_val = row.get('cours', 'N/A')
	is_valid = is_valid_cours_number(cours_val)
	status = "✓" if is_valid else "✗ (SKIPPED)"
	print(f" Course {cours_val}: {row.get('titre', 'N/A')} {status}")
	except Exception as e:
	print(f"Error reading Cours sheet: {e}")