IW2025's picture
Upload 7 files
4b63357 verified
raw
history blame
2.56 kB
"""
Utility functions for the Inclusive World Curriculum Assistant
"""
import re
from typing import List, Dict, Any
from pathlib import Path
import fitz
from config import CURRICULUM_TOPICS
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Characters other than word characters, whitespace, and the punctuation
    ``. , ! ? ; : - ( ) [ ] { }`` are removed. Every run of whitespace is
    then collapsed to a single space and the result is trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    # Remove special characters that might interfere with processing.
    # This runs before whitespace normalization: deleting a symbol that sits
    # between two spaces would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)
    # Collapse runs of whitespace (including newlines and tabs).
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def extract_curriculum_topics(text: str) -> List[str]:
    """Return the curriculum topics mentioned in the given text.

    A topic from ``CURRICULUM_TOPICS`` counts as mentioned when at least one
    of its whitespace-separated words occurs, case-insensitively, as a
    substring of ``text``. Results keep the order of ``CURRICULUM_TOPICS``.
    """
    haystack = text.lower()

    def _is_mentioned(topic: str) -> bool:
        # One matching keyword is enough to count the whole topic.
        for keyword in topic.lower().split():
            if keyword in haystack:
                return True
        return False

    return [topic for topic in CURRICULUM_TOPICS if _is_mentioned(topic)]
def create_curriculum_summary(docs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build an overview of a batch of processed curriculum documents.

    Each document is a dict read through its ``'content'`` and ``'filename'``
    keys (both default to an empty string when absent).

    Returns a dict with the number of documents, the combined length of
    their content, the topics found in the combined content, and a mapping
    of ``"Week <n>"`` labels to the filename carrying that week number.
    """
    contents = [entry.get('content', '') for entry in docs]
    combined_length = sum(len(body) for body in contents)

    # Map week labels to filenames; a later file with the same week number
    # replaces an earlier one.
    week_pattern = re.compile(r'week\s*(\d+)')
    weekly_files = {}
    for entry in docs:
        name = entry.get('filename', '')
        found = week_pattern.search(name.lower())
        if found is not None:
            weekly_files[f"Week {found.group(1)}"] = name

    # Topics are detected over all document contents joined together.
    topics = extract_curriculum_topics(' '.join(contents))

    return {
        "total_documents": len(docs),
        "total_content_length": combined_length,
        "topics_covered": topics,
        "document_types": weekly_files,
    }
def validate_pdf_file(file_path: str) -> bool:
    """Validate if a file is a readable PDF.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file opens, is reported as a PDF, and has at least one
        page. False otherwise, including when opening or reading it raises.
    """
    try:
        # The context manager closes the document on every path, including
        # when reading one of its properties raises.
        with fitz.open(file_path) as doc:
            # PyMuPDF also opens non-PDF formats (images, EPUB, XPS, ...),
            # so a successful open alone does not prove the file is a PDF.
            return bool(doc.is_pdf) and doc.page_count > 0
    except Exception:
        # Best-effort check: any failure means the file is not usable.
        return False
def get_file_info(file_path: str) -> Dict[str, Any]:
    """Get information about a PDF file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        On success, a dict with ``filename``, ``page_count``, ``file_size``
        (bytes, from the filesystem) and ``is_valid`` set to True. On any
        failure, a dict with ``filename``, ``error`` (the exception message)
        and ``is_valid`` set to False.
    """
    try:
        path = Path(file_path)
        # The context manager guarantees the document is closed even if
        # reading the page count or the file size raises.
        with fitz.open(file_path) as doc:
            return {
                "filename": path.name,
                "page_count": doc.page_count,
                "file_size": path.stat().st_size,
                "is_valid": True,
            }
    except Exception as e:
        # Report the failure to the caller instead of raising.
        return {
            "filename": Path(file_path).name,
            "error": str(e),
            "is_valid": False,
        }