Spaces:
Running
Running
File size: 3,680 Bytes
fed9d9d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | import os
import fitz # PyMuPDF
from langdetect import detect
from config import DOCUMENTS_DIR, CHUNK_SIZE, CHUNK_OVERLAP
# ── Detect Language ───────────────────────────────────────────────────────────
def detect_language(text: str) -> str:
    """
    Classify text as Arabic ("ar") or English ("en").

    Only the first 500 characters are passed to the detector. Every
    detected language other than "ar" is reported as "en", and any
    detector failure also falls back to "en".
    """
    code = "en"
    try:
        if detect(text[:500]) == "ar":
            code = "ar"
    except Exception:
        # Best-effort: an undetectable sample is treated as English.
        code = "en"
    return code
# ── Load a Single PDF ─────────────────────────────────────────────────────────
def load_pdf(pdf_path: str) -> list[dict]:
    """
    Extract the non-empty pages of a single PDF.

    Returns a list of page dicts:
    { text, page_number, source, language }

    page_number is 1-based and source is the file name without its
    extension. Pages whose stripped text is empty are skipped.

    Any failure while opening or reading the file is printed and the
    pages collected so far (possibly none) are returned; nothing is raised.
    """
    pages = []
    doc_name = os.path.splitext(os.path.basename(pdf_path))[0]
    try:
        # Open via the context manager so the document is closed even when
        # a page fails mid-iteration; a trailing doc.close() would be
        # skipped in that case and leak the file handle.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if not text:  # skip empty pages
                    continue
                pages.append({
                    "text": text,
                    "page_number": page_number,
                    "source": doc_name,
                    "language": detect_language(text),
                })
    except Exception as e:
        print(f"[ERROR] Could not load {pdf_path}: {e}")
    return pages
# ── Chunk a List of Pages ─────────────────────────────────────────────────────
def chunk_pages(
    pages: list[dict],
    chunk_size: "int | None" = None,
    chunk_overlap: "int | None" = None,
) -> list[dict]:
    """
    Splits page text into overlapping word-based chunks.
    Each chunk keeps the source metadata of its page
    (page_number, source, language).

    Args:
        pages: page dicts with "text", "page_number", "source" and
            "language" keys.
        chunk_size: maximum number of whitespace-separated words per
            chunk. Defaults to CHUNK_SIZE.
        chunk_overlap: number of words shared between consecutive chunks.
            Defaults to CHUNK_OVERLAP.

    Returns:
        A flat list of chunk dicts, in page order. Chunks never span two
        pages, and a page with no words yields no chunks.

    Raises:
        ValueError: if chunk_size < 1, or chunk_overlap is not in
            [0, chunk_size). Such values would give a non-positive or
            word-skipping stride.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    if size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {size}")
    if not 0 <= overlap < size:
        # overlap >= size would make the stride <= 0 and loop forever.
        raise ValueError(
            f"chunk_overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={size}"
        )
    step = size - overlap

    chunks = []
    for page in pages:
        words = page["text"].split()
        for start in range(0, len(words), step):
            end = start + size
            chunks.append({
                "text": " ".join(words[start:end]),
                "page_number": page["page_number"],
                "source": page["source"],
                "language": page["language"],
            })
            if end >= len(words):
                # This window already reached the last word; any further
                # window would be wholly contained in this one.
                break
    return chunks
# ── Load ALL PDFs in the documents/ folder ────────────────────────────────────
def load_all_documents() -> list[dict]:
    """
    Load and chunk every PDF found directly inside DOCUMENTS_DIR.

    If the directory does not exist it is created and an empty list is
    returned. If it contains no file whose name ends in ".pdf"
    (case-insensitive), a warning is printed and an empty list is returned.

    Returns:
        The chunks of all PDFs, concatenated in sorted file-name order.
    """
    all_chunks = []
    if not os.path.exists(DOCUMENTS_DIR):
        # exist_ok guards against another process creating the directory
        # between the existence check and this call.
        os.makedirs(DOCUMENTS_DIR, exist_ok=True)
        print(f"[INFO] Created '{DOCUMENTS_DIR}' β add your PDFs there.")
        return all_chunks
    # os.listdir returns names in arbitrary order; sorting keeps the chunk
    # order stable across runs and platforms.
    pdf_files = sorted(
        f for f in os.listdir(DOCUMENTS_DIR)
        if f.lower().endswith(".pdf")
    )
    if not pdf_files:
        print(f"[WARN] No PDFs found in '{DOCUMENTS_DIR}'.")
        return all_chunks
    for pdf_file in pdf_files:
        path = os.path.join(DOCUMENTS_DIR, pdf_file)
        pages = load_pdf(path)
        chunks = chunk_pages(pages)
        all_chunks.extend(chunks)
        print(f"[INFO] Loaded '{pdf_file}' β {len(chunks)} chunks")
    print(f"[INFO] Total chunks: {len(all_chunks)}")
    return all_chunks
# ── Load a Single Uploaded PDF (for the Upload Tab) ───────────────────────────
def load_uploaded_pdf(pdf_path: str) -> list[dict]:
    """
    Load one PDF from pdf_path and return its chunks.

    Runs load_pdf followed by chunk_pages and prints the resulting
    chunk count.
    """
    uploaded_chunks = chunk_pages(load_pdf(pdf_path))
    print(f"[INFO] Uploaded PDF β {len(uploaded_chunks)} chunks")
    return uploaded_chunks
|