Spaces:
Sleeping
Sleeping
| """ | |
| Utility functions for the Inclusive World Curriculum Assistant | |
| """ | |
| import re | |
| from typing import List, Dict, Any | |
| from pathlib import Path | |
| import fitz | |
| from config import CURRICULUM_TOPICS | |
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Characters other than word characters, whitespace and the punctuation
    ``. , ! ? ; : - ( ) [ ] { }`` are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    # Strip disallowed characters first: removing a symbol that sits between
    # two spaces would otherwise leave a double space in the result.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)
    # Collapse all whitespace runs (including newlines and tabs) to one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def extract_curriculum_topics(text: str) -> List[str]:
    """Return the entries of CURRICULUM_TOPICS that appear to occur in *text*.

    Matching is case-insensitive and substring-based: a topic is selected
    as soon as any one of its whitespace-separated words occurs anywhere in
    the text (including inside a longer word). Topics are returned in the
    order they appear in CURRICULUM_TOPICS.
    """
    haystack = text.lower()
    return [
        candidate
        for candidate in CURRICULUM_TOPICS
        if any(token in haystack for token in candidate.lower().split())
    ]
def create_curriculum_summary(docs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize a list of processed curriculum documents.

    Each document is a dict read through its ``'content'`` and ``'filename'``
    keys (both default to an empty string when missing).

    Returns a dict with:
        total_documents: number of documents.
        total_content_length: combined length of all ``'content'`` values.
        topics_covered: result of extract_curriculum_topics() on the
            space-joined content of every document.
        document_types: mapping of ``"Week <n>"`` to the filename whose
            name contains ``week<n>`` (case-insensitive); when several
            files share a week number the last one wins.
    """
    content_length = sum(len(entry.get('content', '')) for entry in docs)

    # Map "Week <n>" labels to filenames that carry a week number.
    weekly_files = {}
    for entry in docs:
        name = entry.get('filename', '')
        match = re.search(r'week\s*(\d+)', name.lower())
        if match is None:
            continue
        weekly_files[f"Week {match.group(1)}"] = name

    # Topics are detected over the combined text of every document.
    combined_text = ' '.join(entry.get('content', '') for entry in docs)

    return {
        "total_documents": len(docs),
        "total_content_length": content_length,
        "topics_covered": extract_curriculum_topics(combined_text),
        "document_types": weekly_files,
    }
def validate_pdf_file(file_path: str) -> bool:
    """Check whether a file can be opened by fitz and has at least one page.

    Args:
        file_path: Path of the file to check.

    Returns:
        True if the document opens and reports one or more pages; False if
        it has no pages or if any error occurs while opening or inspecting it.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return doc.page_count > 0
        finally:
            # Always release the document handle, even if inspection fails.
            doc.close()
    except Exception:
        # Best-effort validation: any failure means "not a usable document".
        return False
def get_file_info(file_path: str) -> Dict[str, Any]:
    """Collect basic information about a document file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        On success, a dict with ``filename``, ``page_count``, ``file_size``
        (bytes, from the filesystem) and ``is_valid`` set to True.
        If opening or inspecting the file raises, a dict with ``filename``,
        ``error`` (the exception message) and ``is_valid`` set to False.
    """
    path = Path(file_path)
    try:
        doc = fitz.open(file_path)
        try:
            return {
                "filename": path.name,
                "page_count": doc.page_count,
                "file_size": path.stat().st_size,
                "is_valid": True,
            }
        finally:
            # Close the handle even when page_count or stat() raises.
            doc.close()
    except Exception as exc:
        return {
            "filename": path.name,
            "error": str(exc),
            "is_valid": False,
        }