Spaces:

IW2025
/

InclusiveWorldChatbotSpace

Sleeping

File size: 2,563 Bytes

93fe96e

"""
Utility functions for the Inclusive World Curriculum Assistant
"""

import re
from typing import List, Dict, Any
from pathlib import Path
import fitz
from config import CURRICULUM_TOPICS

def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Characters other than word characters, whitespace and the punctuation
    ``. , ! ? ; : - ( ) [ ] { }`` are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    # Strip disallowed characters first: removing a symbol that sits between
    # two spaces would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)
    # Collapse whitespace afterwards so the result is always single-spaced.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def extract_curriculum_topics(text: str) -> List[str]:
    """Return the configured curriculum topics that the text mentions.

    A topic from ``CURRICULUM_TOPICS`` is selected when at least one of its
    whitespace-separated words occurs, case-insensitively, as a substring of
    ``text``. Topics are returned in the order they appear in
    ``CURRICULUM_TOPICS``.
    """
    haystack = text.lower()

    def mentions(topic: str) -> bool:
        # Plain substring containment, not whole-word matching.
        for keyword in topic.lower().split():
            if keyword in haystack:
                return True
        return False

    return [topic for topic in CURRICULUM_TOPICS if mentions(topic)]

def create_curriculum_summary(docs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize a collection of processed curriculum documents.

    Each document is a dict; its ``'content'`` and ``'filename'`` entries are
    read and default to an empty string when absent.

    Returns a dict with:
        ``total_documents``: number of documents,
        ``total_content_length``: combined length of all contents,
        ``topics_covered``: topics found in the space-joined contents,
        ``document_types``: ``"Week N"`` label -> filename, for filenames
            containing ``week`` followed by digits (case-insensitive).
    """
    doc_count = len(docs)
    content_length = sum(len(entry.get('content', '')) for entry in docs)

    # Label files by week number; a later file with the same week number
    # replaces the earlier one.
    weekly_files = {}
    for entry in docs:
        name = entry.get('filename', '')
        match = re.search(r'week\s*(\d+)', name.lower())
        if match is None:
            continue
        weekly_files[f"Week {match.group(1)}"] = name

    # Topics are detected over all contents joined into one string.
    combined = ' '.join(entry.get('content', '') for entry in docs)

    return {
        "total_documents": doc_count,
        "total_content_length": content_length,
        "topics_covered": extract_curriculum_topics(combined),
        "document_types": weekly_files,
    }

def validate_pdf_file(file_path: str) -> bool:
    """Validate if a file is a readable PDF.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file can be opened with ``fitz`` and reports at least
        one page; False when it has no pages or when opening/inspecting it
        raises any exception.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return doc.page_count > 0
        finally:
            # Release the handle even if reading the page count fails.
            doc.close()
    except Exception:
        # Best-effort check: any failure means "not a usable PDF".
        return False

def get_file_info(file_path: str) -> Dict[str, Any]:
    """Get information about a PDF file.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        On success, a dict with ``filename``, ``page_count``, ``file_size``
        (bytes, from the file's stat) and ``is_valid`` set to True.
        If opening or inspecting the file raises, a dict with ``filename``,
        ``error`` (the exception message) and ``is_valid`` set to False.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return {
                "filename": Path(file_path).name,
                "page_count": doc.page_count,
                "file_size": Path(file_path).stat().st_size,
                "is_valid": True
            }
        finally:
            # Close the document even when gathering the metadata fails,
            # so the error path does not leak the open handle.
            doc.close()
    except Exception as e:
        return {
            "filename": Path(file_path).name,
            "error": str(e),
            "is_valid": False
        }