Spaces:
Running
Running
| import os | |
| import fitz # PyMuPDF | |
| from langdetect import detect | |
| from config import DOCUMENTS_DIR, CHUNK_SIZE, CHUNK_OVERLAP | |
# ── Detect Language ───────────────────────────────────────────────────────────
def detect_language(text: str) -> str:
    """Classify *text* as Arabic (``"ar"``) or English (``"en"``).

    Only the first 500 characters are handed to ``detect``. Any language
    other than Arabic, and any failure raised while detecting, is reported
    as ``"en"``.
    """
    try:
        detected = detect(text[:500])
    except Exception:
        # Best-effort: detection problems fall back to English.
        return "en"
    if detected == "ar":
        return "ar"
    return "en"
# ── Load a Single PDF ─────────────────────────────────────────────────────────
def load_pdf(pdf_path: str) -> list[dict]:
    """Extract the non-empty pages of a PDF as a list of page dicts.

    Each returned dict has the keys:
        text:        the page text with surrounding whitespace stripped
        page_number: 1-based position of the page in the document
        source:      file name of the PDF without directory or extension
        language:    result of ``detect_language`` for the page text

    Pages whose extracted text is empty are skipped. Loading is best-effort:
    if the document cannot be opened or a page fails, the error is printed
    and the pages collected so far (possibly none) are returned.
    """
    pages = []
    doc_name = os.path.splitext(os.path.basename(pdf_path))[0]
    try:
        # The context manager closes the document even when reading a page
        # raises, so a failure halfway through does not leak the handle.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if not text:  # skip empty pages
                    continue
                pages.append({
                    "text": text,
                    "page_number": page_number,
                    "source": doc_name,
                    "language": detect_language(text),
                })
    except Exception as e:
        print(f"[ERROR] Could not load {pdf_path}: {e}")
    return pages
# ── Chunk a List of Pages ─────────────────────────────────────────────────────
def chunk_pages(
    pages: list[dict],
    chunk_size: "int | None" = None,
    chunk_overlap: "int | None" = None,
) -> list[dict]:
    """Split each page's text into overlapping word-based chunks.

    The text of every page is split on whitespace and cut into windows of
    ``chunk_size`` words; consecutive windows share ``chunk_overlap`` words.
    Chunks never span two pages, and every chunk carries the
    ``page_number``, ``source`` and ``language`` of the page it came from.

    Args:
        pages: Page dicts with the keys ``text``, ``page_number``,
            ``source`` and ``language``.
        chunk_size: Number of words per chunk. Defaults to ``CHUNK_SIZE``.
        chunk_overlap: Number of words shared by consecutive chunks.
            Defaults to ``CHUNK_OVERLAP``.

    Returns:
        A list of chunk dicts with the keys ``text``, ``page_number``,
        ``source`` and ``language``. Pages without any words yield no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is not in ``[0, chunk_size)``. Such values would make the
            window stop advancing and the loop would never terminate.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    if size <= 0 or not 0 <= overlap < size:
        raise ValueError(
            f"chunk_size must be > 0 and 0 <= chunk_overlap < chunk_size "
            f"(got chunk_size={size}, chunk_overlap={overlap})"
        )
    step = size - overlap

    chunks = []
    for page in pages:
        words = page["text"].split()
        for start in range(0, len(words), step):
            end = start + size
            chunks.append({
                "text": " ".join(words[start:end]),
                "page_number": page["page_number"],
                "source": page["source"],
                "language": page["language"],
            })
            # Once a window reaches the end of the page, any further window
            # would only repeat words already contained in this chunk.
            if end >= len(words):
                break
    return chunks
# ── Load ALL PDFs in the documents/ folder ────────────────────────────────────
def load_all_documents() -> list[dict]:
    """Load and chunk every PDF found directly inside ``DOCUMENTS_DIR``.

    If the folder does not exist it is created and an empty list is
    returned. Files are selected by a case-insensitive ``.pdf`` suffix and
    processed in sorted name order so the resulting chunk order is
    reproducible across runs and platforms. Progress is reported with
    ``print``.

    Returns:
        The chunks of all PDFs concatenated in processing order, or an
        empty list when the folder was just created or holds no PDFs.
    """
    all_chunks = []

    if not os.path.exists(DOCUMENTS_DIR):
        # exist_ok avoids a crash if the folder appears between the check
        # above and this call.
        os.makedirs(DOCUMENTS_DIR, exist_ok=True)
        print(f"[INFO] Created '{DOCUMENTS_DIR}' β add your PDFs there.")
        return all_chunks

    # os.listdir() yields names in arbitrary order; sort for determinism.
    pdf_files = sorted(
        f for f in os.listdir(DOCUMENTS_DIR)
        if f.lower().endswith(".pdf")
    )

    if not pdf_files:
        print(f"[WARN] No PDFs found in '{DOCUMENTS_DIR}'.")
        return all_chunks

    for pdf_file in pdf_files:
        path = os.path.join(DOCUMENTS_DIR, pdf_file)
        pages = load_pdf(path)
        chunks = chunk_pages(pages)
        all_chunks.extend(chunks)
        print(f"[INFO] Loaded '{pdf_file}' β {len(chunks)} chunks")

    print(f"[INFO] Total chunks: {len(all_chunks)}")
    return all_chunks
# ── Load a Single Uploaded PDF (for the Upload Tab) ───────────────────────────
def load_uploaded_pdf(pdf_path: str) -> list[dict]:
    """Load one PDF from *pdf_path*, chunk its pages, and return the chunks.

    The number of chunks produced is reported with ``print``.
    """
    uploaded_chunks = chunk_pages(load_pdf(pdf_path))
    chunk_count = len(uploaded_chunks)
    print(f"[INFO] Uploaded PDF β {chunk_count} chunks")
    return uploaded_chunks