File size: 3,680 Bytes
fed9d9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import fitz                          # PyMuPDF
from langdetect import detect
from config import DOCUMENTS_DIR, CHUNK_SIZE, CHUNK_OVERLAP


# ── Detect Language ───────────────────────────────────────────────────────────
def detect_language(text: str) -> str:
    """
    Classify text as Arabic ("ar") or English ("en").

    Only the first 500 characters are handed to langdetect's `detect`.
    Every detected language other than Arabic is reported as "en", and
    "en" is also the fallback whenever detection raises.
    """
    try:
        detected = detect(text[:500])
    except Exception:
        # Best-effort: any failure during detection falls back to English.
        return "en"

    if detected == "ar":
        return "ar"
    return "en"


# ── Load a Single PDF ─────────────────────────────────────────────────────────
def load_pdf(pdf_path: str) -> list[dict]:
    """
    Extract the non-empty pages of a PDF.

    Returns a list of page dicts:
    { text, page_number, source, language }

    `page_number` is 1-based and `source` is the file name without its
    extension. Pages whose stripped text is empty are skipped.

    Errors while opening or reading the file are printed, not raised;
    pages collected before the failure are still returned.
    """
    pages = []
    doc_name = os.path.splitext(os.path.basename(pdf_path))[0]

    try:
        # The context manager closes the document even when a page fails
        # part-way through, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if not text:          # skip empty pages
                    continue
                pages.append({
                    "text"       : text,
                    "page_number": page_number,
                    "source"     : doc_name,
                    "language"   : detect_language(text),
                })
    except Exception as e:
        print(f"[ERROR] Could not load {pdf_path}: {e}")

    return pages


# ── Chunk a List of Pages ─────────────────────────────────────────────────────
def chunk_pages(
    pages: list[dict],
    chunk_size: "int | None" = None,
    chunk_overlap: "int | None" = None,
) -> list[dict]:
    """
    Splits page text into overlapping chunks.
    Each chunk keeps the source metadata.

    Text is split on whitespace; a chunk is up to `chunk_size` words
    joined by single spaces, and consecutive chunks of the same page share
    `chunk_overlap` words. Chunks never span two pages.

    Args:
        pages: page dicts with "text", "page_number", "source" and
            "language" keys.
        chunk_size: words per chunk; defaults to CHUNK_SIZE.
        chunk_overlap: words shared by neighbouring chunks; defaults to
            CHUNK_OVERLAP.

    Returns:
        A list of chunk dicts: { text, page_number, source, language }.

    Raises:
        ValueError: if the chunk size is not positive or the overlap is
            not smaller than the chunk size (the window would never
            advance).
    """
    size    = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    step    = size - overlap

    # A non-positive step would keep the window in place forever.
    if size <= 0 or step <= 0:
        raise ValueError(
            f"chunk size must be positive and larger than the overlap "
            f"(got size={size}, overlap={overlap})"
        )

    chunks = []

    for page in pages:
        words = page["text"].split()

        for start in range(0, len(words), step):
            end = start + size

            chunks.append({
                "text"       : " ".join(words[start:end]),
                "page_number": page["page_number"],
                "source"     : page["source"],
                "language"   : page["language"],
            })

            # This chunk already reaches the last word; another window
            # would only repeat words from the overlap region.
            if end >= len(words):
                break

    return chunks


# ── Load ALL PDFs in the documents/ folder ───────────────────────────────────
def load_all_documents(documents_dir: "str | None" = None) -> list[dict]:
    """
    Load and chunk every PDF found directly inside the documents folder.

    Args:
        documents_dir: folder to scan; defaults to DOCUMENTS_DIR.

    Returns:
        The chunks of all PDFs, concatenated in file-name order. The list
        is empty when the folder had to be created or holds no PDFs.

    Progress and warnings are reported with print().
    """
    folder     = DOCUMENTS_DIR if documents_dir is None else documents_dir
    all_chunks = []

    if not os.path.exists(folder):
        # exist_ok guards against another process creating the folder
        # between the check above and this call.
        os.makedirs(folder, exist_ok=True)
        print(f"[INFO] Created '{folder}' — add your PDFs there.")
        return all_chunks

    # os.listdir returns entries in arbitrary order; sort so the chunk
    # order is reproducible between runs.
    pdf_files = sorted(
        f for f in os.listdir(folder)
        if f.lower().endswith(".pdf")
    )

    if not pdf_files:
        print(f"[WARN] No PDFs found in '{folder}'.")
        return all_chunks

    for pdf_file in pdf_files:
        path   = os.path.join(folder, pdf_file)
        pages  = load_pdf(path)
        chunks = chunk_pages(pages)
        all_chunks.extend(chunks)
        print(f"[INFO] Loaded '{pdf_file}' → {len(chunks)} chunks")

    print(f"[INFO] Total chunks: {len(all_chunks)}")
    return all_chunks


# ── Load a Single Uploaded PDF (for the Upload Tab) ──────────────────────────
def load_uploaded_pdf(pdf_path: str) -> list[dict]:
    """
    Load one PDF and split it into chunks.

    Runs load_pdf followed by chunk_pages on `pdf_path`, prints the number
    of chunks produced, and returns the chunk dicts.
    """
    pages  = load_pdf(pdf_path)
    chunks = chunk_pages(pages)
    # The arrow was stored as mis-decoded bytes and printed as garbage.
    print(f"[INFO] Uploaded PDF → {len(chunks)} chunks")
    return chunks