File size: 2,563 Bytes
93fe96e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
Utility functions for the Inclusive World Curriculum Assistant
"""

import re
from typing import List, Dict, Any
from pathlib import Path
import fitz
from config import CURRICULUM_TOPICS

def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Characters other than word characters, whitespace and the punctuation
    ``. , ! ? ; : - ( ) [ ] { }`` are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # Remove special characters that might interfere with processing.
    # This runs first: deleting a character that sat between two spaces
    # would otherwise leave a double space in the result.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)
    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def extract_curriculum_topics(text: str) -> List[str]:
    """Return the entries of CURRICULUM_TOPICS that are mentioned in text.

    Matching is case-insensitive. A topic is selected as soon as any one of
    the whitespace-separated words of its name occurs as a substring of the
    text, so multi-word topics match on a single word. Topics are returned
    in the order CURRICULUM_TOPICS yields them.
    """
    haystack = text.lower()
    return [
        candidate
        for candidate in CURRICULUM_TOPICS
        if any(part in haystack for part in candidate.lower().split())
    ]

def create_curriculum_summary(docs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build an overview dictionary for a list of processed documents.

    Each document is a dict that may carry 'content' and 'filename' keys;
    missing keys are treated as empty strings.

    The result holds:
        total_documents: number of entries in docs.
        total_content_length: summed length of every document's content.
        topics_covered: topics found in the space-joined content of all
            documents, as reported by extract_curriculum_topics.
        document_types: maps "Week <n>" to the filename whose lowercased
            name matches "week" followed by optional whitespace and digits;
            a later document with the same week number replaces an earlier
            one.
    """
    document_count = len(docs)
    content_length = sum(map(len, (entry.get('content', '') for entry in docs)))

    # Map week labels to filenames; the regex itself requires "week".
    weekly_files = {}
    for entry in docs:
        name = entry.get('filename', '')
        week_match = re.search(r'week\s*(\d+)', name.lower())
        if week_match is None:
            continue
        weekly_files[f"Week {week_match.group(1)}"] = name

    # Topics are detected over all document content at once.
    combined_content = ' '.join(entry.get('content', '') for entry in docs)
    topics = extract_curriculum_topics(combined_content)

    return {
        "total_documents": document_count,
        "total_content_length": content_length,
        "topics_covered": topics,
        "document_types": weekly_files,
    }

def validate_pdf_file(file_path: str) -> bool:
    """Validate if a file is a readable PDF.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file can be opened with fitz and has at least one
        page; False when it has no pages or when opening/reading it raises.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return doc.page_count > 0
        finally:
            # Always release the document handle, even if reading the page
            # count fails after a successful open.
            doc.close()
    except Exception:
        # Best-effort check: any failure means "not a usable PDF".
        return False

def get_file_info(file_path: str) -> Dict[str, Any]:
    """Get information about a PDF file.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        On success, a dict with "filename" (final path component),
        "page_count", "file_size" (bytes, from the file's stat) and
        "is_valid" set to True. If opening or inspecting the file raises,
        a dict with "filename", "error" (the exception message) and
        "is_valid" set to False.
    """
    path = Path(file_path)
    try:
        doc = fitz.open(file_path)
        try:
            return {
                "filename": path.name,
                "page_count": doc.page_count,
                "file_size": path.stat().st_size,
                "is_valid": True
            }
        finally:
            # Close the document even when page_count or stat() raises,
            # so a failure after a successful open does not leak the handle.
            doc.close()
    except Exception as e:
        return {
            "filename": path.name,
            "error": str(e),
            "is_valid": False
        }