import os

import fitz  # PyMuPDF
from langdetect import detect

from config import DOCUMENTS_DIR, CHUNK_SIZE, CHUNK_OVERLAP


# ── Detect Language ───────────────────────────────────────────────────────────
def detect_language(text: str) -> str:
    """Classify *text* as Arabic (``"ar"``) or English (``"en"``).

    Only the first 500 characters are passed to ``langdetect.detect``. Any
    detected language other than ``"ar"`` is reported as ``"en"``.

    Returns:
        ``"ar"`` or ``"en"``.
    """
    try:
        lang = detect(text[:500])
    except Exception:
        # Best-effort: if detection fails for any reason, default to English
        # instead of failing the whole document load.
        return "en"
    return "ar" if lang == "ar" else "en"


# ── Load a Single PDF ─────────────────────────────────────────────────────────
def load_pdf(pdf_path: str) -> list[dict]:
    """Extract the non-empty pages of a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A list of page dicts: ``{ text, page_number, source, language }``.
        ``page_number`` is 1-based and ``source`` is the file name without
        its extension. If the file cannot be opened or read, the error is
        printed and the pages collected so far (possibly none) are returned.
    """
    pages = []
    doc_name = os.path.splitext(os.path.basename(pdf_path))[0]

    try:
        doc = fitz.open(pdf_path)
        try:
            for i, page in enumerate(doc):
                text = page.get_text().strip()
                if not text:  # skip empty pages
                    continue
                pages.append({
                    "text": text,
                    "page_number": i + 1,
                    "source": doc_name,
                    "language": detect_language(text),
                })
        finally:
            # Close the document even when a page fails mid-iteration.
            doc.close()
    except Exception as e:
        print(f"[ERROR] Could not load {pdf_path}: {e}")

    return pages


# ── Chunk a List of Pages ─────────────────────────────────────────────────────
def chunk_pages(pages: list[dict]) -> list[dict]:
    """Split page text into overlapping word-based chunks.

    Each page is split on whitespace and cut into windows of ``CHUNK_SIZE``
    words, with consecutive windows sharing ``CHUNK_OVERLAP`` words. Each
    chunk keeps the source metadata of the page it came from; chunks never
    span two pages.

    Args:
        pages: Page dicts as produced by ``load_pdf``.

    Returns:
        A list of chunk dicts: ``{ text, page_number, source, language }``.

    Raises:
        ValueError: If ``CHUNK_SIZE`` is not positive or ``CHUNK_OVERLAP`` is
            not smaller than ``CHUNK_SIZE``.
    """
    step = CHUNK_SIZE - CHUNK_OVERLAP
    if CHUNK_SIZE <= 0 or step <= 0:
        # A non-positive step would never advance the window.
        raise ValueError(
            "CHUNK_SIZE must be positive and CHUNK_OVERLAP must be smaller "
            f"than CHUNK_SIZE (got CHUNK_SIZE={CHUNK_SIZE}, "
            f"CHUNK_OVERLAP={CHUNK_OVERLAP})"
        )

    chunks = []
    for page in pages:
        words = page["text"].split()
        for start in range(0, len(words), step):
            end = start + CHUNK_SIZE
            chunks.append({
                "text": " ".join(words[start:end]),
                "page_number": page["page_number"],
                "source": page["source"],
                "language": page["language"],
            })
            if end >= len(words):
                # This window already reached the end of the page; a further
                # window would only repeat words from the overlap region.
                break

    return chunks


# ── Load ALL PDFs in the documents/ folder ───────────────────────────────────
def load_all_documents() -> list[dict]:
    """Load and chunk every PDF found directly inside ``DOCUMENTS_DIR``.

    If the directory does not exist it is created and an empty list is
    returned. Files are matched by a case-insensitive ``.pdf`` suffix and
    processed in sorted name order so the chunk order is reproducible.

    Returns:
        The chunks of all PDFs, concatenated in file order.
    """
    all_chunks = []

    if not os.path.exists(DOCUMENTS_DIR):
        os.makedirs(DOCUMENTS_DIR)
        print(f"[INFO] Created '{DOCUMENTS_DIR}' — add your PDFs there.")
        return all_chunks

    # os.listdir returns entries in arbitrary order; sort for determinism.
    pdf_files = sorted(
        f for f in os.listdir(DOCUMENTS_DIR)
        if f.lower().endswith(".pdf")
    )

    if not pdf_files:
        print(f"[WARN] No PDFs found in '{DOCUMENTS_DIR}'.")
        return all_chunks

    for pdf_file in pdf_files:
        path = os.path.join(DOCUMENTS_DIR, pdf_file)
        pages = load_pdf(path)
        chunks = chunk_pages(pages)
        all_chunks.extend(chunks)
        print(f"[INFO] Loaded '{pdf_file}' → {len(chunks)} chunks")

    print(f"[INFO] Total chunks: {len(all_chunks)}")
    return all_chunks


# ── Load a Single Uploaded PDF (for the Upload Tab) ──────────────────────────
def load_uploaded_pdf(pdf_path: str) -> list[dict]:
    """Load and chunk a single PDF given by path.

    Args:
        pdf_path: Path to the uploaded PDF file.

    Returns:
        The chunk dicts for that PDF (empty if it could not be read).
    """
    pages = load_pdf(pdf_path)
    chunks = chunk_pages(pages)
    print(f"[INFO] Uploaded PDF → {len(chunks)} chunks")
    return chunks