Spaces:
Sleeping
Sleeping
File size: 2,563 Bytes
93fe96e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
"""
Utility functions for the Inclusive World Curriculum Assistant
"""
import re
from typing import List, Dict, Any
from pathlib import Path
import fitz
from config import CURRICULUM_TOPICS
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Drops every character that is not a word character, whitespace, or one
    of ``. , ! ? ; : - ( ) [ ] { }``, then collapses each run of whitespace
    into a single space and trims both ends.

    Args:
        text: Raw text to normalize.

    Returns:
        The cleaned, single-spaced text.
    """
    # Remove special characters that might interfere with processing
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)
    # Collapse whitespace only after the removal: deleting a character that
    # sat between two spaces ("a @ b") would otherwise leave a double space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def extract_curriculum_topics(text: str) -> List[str]:
    """Return the curriculum topics that appear to be mentioned in ``text``.

    A topic from ``CURRICULUM_TOPICS`` is selected when at least one of its
    whitespace-separated words occurs, case-insensitively, as a substring of
    ``text``. Topics come back in ``CURRICULUM_TOPICS`` iteration order.
    """
    haystack = text.lower()

    def is_mentioned(topic):
        # Substring containment, not whole-word matching.
        for keyword in topic.lower().split():
            if keyword in haystack:
                return True
        return False

    return [topic for topic in CURRICULUM_TOPICS if is_mentioned(topic)]
def create_curriculum_summary(docs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize a batch of processed curriculum documents.

    The returned dict holds:
      - "total_documents": number of entries in ``docs``.
      - "total_content_length": combined length of every 'content' value.
      - "topics_covered": ``extract_curriculum_topics`` applied to all
        content joined with single spaces.
      - "document_types": maps "Week <n>" to a filename whose lowercased
        form contains "week" followed by digits; when several files share
        a week number, the last one in ``docs`` wins.
    """
    contents = [entry.get('content', '') for entry in docs]
    total_length = sum(len(body) for body in contents)

    # Index documents by the week number embedded in their filename.
    week_files = {}
    for entry in docs:
        name = entry.get('filename', '')
        match = re.search(r'week\s*(\d+)', name.lower())
        if match is None:
            continue
        week_files[f"Week {match.group(1)}"] = name

    # Topics are detected over the combined text of every document.
    topics = extract_curriculum_topics(' '.join(contents))

    return {
        "total_documents": len(docs),
        "total_content_length": total_length,
        "topics_covered": topics,
        "document_types": week_files,
    }
def validate_pdf_file(file_path: str) -> bool:
    """Validate if a file is a readable PDF.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when ``fitz.open`` succeeds and the document reports at least
        one page; False otherwise, including when any exception is raised
        while opening or inspecting the file.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return doc.page_count > 0
        finally:
            # Always release the handle, even if reading page_count fails.
            doc.close()
    except Exception:
        # Best-effort check: any failure means "not a usable PDF".
        return False
def get_file_info(file_path: str) -> Dict[str, Any]:
    """Get information about a PDF file.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        On success, a dict with "filename", "page_count", "file_size"
        (bytes, from ``Path.stat``) and "is_valid" set to True. If opening
        or inspecting the file raises, a dict with "filename", "error"
        (the exception message) and "is_valid" set to False.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return {
                "filename": Path(file_path).name,
                "page_count": doc.page_count,
                "file_size": Path(file_path).stat().st_size,
                "is_valid": True,
            }
        finally:
            # Close the document even when page_count or stat() raises, so
            # the handle is not leaked on the error path.
            doc.close()
    except Exception as e:
        return {
            "filename": Path(file_path).name,
            "error": str(e),
            "is_valid": False,
        }