Spaces:

wassim2433
/

RAG1

Running

File size: 3,680 Bytes

fed9d9d

import os
import fitz                          # PyMuPDF
from langdetect import detect
from config import DOCUMENTS_DIR, CHUNK_SIZE, CHUNK_OVERLAP


# ── Detect Language ───────────────────────────────────────────────────────────
def detect_language(text: str) -> str:
    """
    Classify *text* as Arabic ("ar") or English ("en").

    Only the first 500 characters are handed to the detector. Every
    detected language other than Arabic, and every detection failure,
    is reported as "en".
    """
    try:
        detected = detect(text[:500])
    except Exception:
        # Best-effort: fall back to English when detection fails.
        return "en"

    if detected == "ar":
        return "ar"
    return "en"


# ── Load a Single PDF ─────────────────────────────────────────────────────────
def load_pdf(pdf_path: str) -> list[dict]:
    """
    Extract the text of every non-empty page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of page dicts, one per page that contains text:
        { text, page_number, source, language }
        page_number is 1-based and source is the file name without its
        extension. If loading fails, the error is printed and whatever
        pages were collected so far (possibly none) are returned.
    """
    pages = []
    doc_name = os.path.splitext(os.path.basename(pdf_path))[0]

    try:
        # The context manager closes the document even when a page raises
        # part-way through, which a trailing doc.close() would not do.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if not text:          # skip empty pages
                    continue
                pages.append({
                    "text"       : text,
                    "page_number": page_number,
                    "source"     : doc_name,
                    "language"   : detect_language(text),
                })
    except Exception as e:
        # Best-effort loader: report the failure and keep what was read.
        print(f"[ERROR] Could not load {pdf_path}: {e}")

    return pages


# ── Chunk a List of Pages ─────────────────────────────────────────────────────
def chunk_pages(
    pages: list[dict],
    *,
    chunk_size: "int | None" = None,
    chunk_overlap: "int | None" = None,
) -> list[dict]:
    """
    Splits page text into overlapping word-based chunks.
    Each chunk keeps the source metadata of the page it came from.

    Args:
        pages: Page dicts with the keys text, page_number, source, language.
        chunk_size: Maximum number of words per chunk. Defaults to the
            module-level CHUNK_SIZE.
        chunk_overlap: Number of words shared by consecutive chunks.
            Defaults to the module-level CHUNK_OVERLAP.

    Returns:
        A list of chunk dicts { text, page_number, source, language }.
        Pages whose text contains no words produce no chunks.

    Raises:
        ValueError: If the chunk size is below 1 or the overlap is not
            smaller than the chunk size. The window would never advance
            and the loop would not terminate.
    """
    size    = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    step    = size - overlap

    if size < 1 or step < 1:
        raise ValueError(
            f"Invalid chunking config: chunk size {size} must be >= 1 and "
            f"overlap {overlap} must be smaller than the chunk size."
        )

    chunks = []

    for page in pages:
        words = page["text"].split()

        # Each window starts `step` words after the previous one, so
        # neighbouring chunks share `overlap` words.
        for start in range(0, len(words), step):
            chunks.append({
                "text"       : " ".join(words[start:start + size]),
                "page_number": page["page_number"],
                "source"     : page["source"],
                "language"   : page["language"],
            })

    return chunks


# ── Load ALL PDFs in the documents/ folder ───────────────────────────────────
def load_all_documents() -> list[dict]:
    """
    Load and chunk every PDF found directly inside DOCUMENTS_DIR.

    If the folder does not exist it is created and an empty list is
    returned. Files are processed in sorted name order so the resulting
    chunk order is the same on every run.

    Returns:
        The chunks of all PDFs, concatenated in file-name order. Empty
        when the folder was missing or contains no PDFs.
    """
    all_chunks = []

    if not os.path.exists(DOCUMENTS_DIR):
        # exist_ok avoids a crash if the folder appears between the check
        # and the creation.
        os.makedirs(DOCUMENTS_DIR, exist_ok=True)
        print(f"[INFO] Created '{DOCUMENTS_DIR}' — add your PDFs there.")
        return all_chunks

    # os.listdir yields entries in arbitrary order; sort for reproducibility.
    pdf_files = sorted(
        f for f in os.listdir(DOCUMENTS_DIR)
        if f.lower().endswith(".pdf")
    )

    if not pdf_files:
        print(f"[WARN] No PDFs found in '{DOCUMENTS_DIR}'.")
        return all_chunks

    for pdf_file in pdf_files:
        path   = os.path.join(DOCUMENTS_DIR, pdf_file)
        pages  = load_pdf(path)
        chunks = chunk_pages(pages)
        all_chunks.extend(chunks)
        print(f"[INFO] Loaded '{pdf_file}' → {len(chunks)} chunks")

    print(f"[INFO] Total chunks: {len(all_chunks)}")
    return all_chunks


# ── Load a Single Uploaded PDF (for the Upload Tab) ──────────────────────────
def load_uploaded_pdf(pdf_path: str) -> list[dict]:
    """
    Load one uploaded PDF and return its chunks.

    The file is read with load_pdf, split with chunk_pages, and the
    number of chunks produced is printed.
    """
    uploaded_chunks = chunk_pages(load_pdf(pdf_path))
    print(f"[INFO] Uploaded PDF → {len(uploaded_chunks)} chunks")
    return uploaded_chunks