Spaces:

wassim2433
/

RAG1

Running

App Files Files Community

RAG1 / document_processor.py

wassim2433

inital

fed9d9d 23 days ago

raw

history blame contribute delete

3.68 kB

	import os
	import fitz # PyMuPDF
	from langdetect import detect
	from config import DOCUMENTS_DIR, CHUNK_SIZE, CHUNK_OVERLAP


	# ── Detect Language ───────────────────────────────────────────────────────────
	def detect_language(text: str) -> str:
	    """
	    Classify *text* as Arabic ("ar") or, by default, English ("en").

	    Only the first 500 characters are passed to the detector. Any result
	    other than "ar", and any error raised while detecting, maps to "en".
	    """
	    try:
	        detected = detect(text[:500])
	    except Exception:
	        # Best-effort: a failed detection falls back to English.
	        return "en"
	    if detected == "ar":
	        return "ar"
	    return "en"


	# ── Load a Single PDF ─────────────────────────────────────────────────────────
	def load_pdf(pdf_path: str) -> list[dict]:
	    """
	    Extract the text of every non-empty page of a PDF.

	    Returns a list of page dicts:
	    { text, page_number, source, language }

	    ``page_number`` is 1-based and ``source`` is the file name without its
	    extension. Errors are printed instead of raised; pages collected
	    before the error are still returned (an empty list if the file could
	    not be opened at all).
	    """
	    pages = []
	    doc_name = os.path.splitext(os.path.basename(pdf_path))[0]

	    try:
	        # The context manager closes the document even if a page fails
	        # part-way through, which a trailing doc.close() would not.
	        with fitz.open(pdf_path) as doc:
	            for page_number, page in enumerate(doc, start=1):
	                text = page.get_text().strip()
	                if not text:  # skip empty pages
	                    continue
	                pages.append({
	                    "text": text,
	                    "page_number": page_number,
	                    "source": doc_name,
	                    "language": detect_language(text),
	                })
	    except Exception as e:
	        print(f"[ERROR] Could not load {pdf_path}: {e}")

	    return pages


	# ── Chunk a List of Pages ─────────────────────────────────────────────────────
	def chunk_pages(
	    pages: list[dict],
	    chunk_size: "int | None" = None,
	    chunk_overlap: "int | None" = None,
	) -> list[dict]:
	    """
	    Splits page text into overlapping chunks.
	    Each chunk keeps the source metadata.

	    Text is split on whitespace. A chunk holds up to ``chunk_size`` words
	    and consecutive chunks of the same page share ``chunk_overlap`` words.
	    Chunks never span two pages.

	    Args:
	        pages: page dicts with "text", "page_number", "source" and
	            "language" keys.
	        chunk_size: words per chunk; defaults to CHUNK_SIZE.
	        chunk_overlap: words shared by neighbouring chunks; defaults to
	            CHUNK_OVERLAP.

	    Returns:
	        A list of chunk dicts with the same four keys as the pages.

	    Raises:
	        ValueError: if chunk_size is not positive or chunk_overlap is not
	            in the range [0, chunk_size).
	    """
	    size = CHUNK_SIZE if chunk_size is None else chunk_size
	    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

	    # A step of zero or less would never advance through the words.
	    if size <= 0 or not 0 <= overlap < size:
	        raise ValueError(
	            f"chunk_overlap ({overlap}) must be >= 0 and smaller than "
	            f"chunk_size ({size}), which must be positive"
	        )
	    step = size - overlap

	    chunks = []
	    for page in pages:
	        words = page["text"].split()
	        for start in range(0, len(words), step):
	            end = start + size
	            chunks.append({
	                "text": " ".join(words[start:end]),
	                "page_number": page["page_number"],
	                "source": page["source"],
	                "language": page["language"],
	            })
	            # This window already reached the last word; another one would
	            # only repeat overlap words contained in this chunk.
	            if end >= len(words):
	                break

	    return chunks


	# ── Load ALL PDFs in the documents/ folder ───────────────────────────────────
	def load_all_documents() -> list[dict]:
	    """
	    Load and chunk every PDF found directly inside DOCUMENTS_DIR.

	    If the directory does not exist it is created and an empty list is
	    returned. Files are matched by a case-insensitive ".pdf" suffix and
	    processed in sorted name order so the chunk order is the same on
	    every run. Progress is printed to stdout.

	    Returns:
	        The chunks of all PDFs, concatenated in processing order.
	    """
	    all_chunks = []

	    if not os.path.exists(DOCUMENTS_DIR):
	        # exist_ok avoids a crash if the directory appears between the
	        # check above and this call.
	        os.makedirs(DOCUMENTS_DIR, exist_ok=True)
	        print(f"[INFO] Created '{DOCUMENTS_DIR}' — add your PDFs there.")
	        return all_chunks

	    # os.listdir returns entries in arbitrary order; sort for stable output.
	    pdf_files = sorted(
	        f for f in os.listdir(DOCUMENTS_DIR)
	        if f.lower().endswith(".pdf")
	    )

	    if not pdf_files:
	        print(f"[WARN] No PDFs found in '{DOCUMENTS_DIR}'.")
	        return all_chunks

	    for pdf_file in pdf_files:
	        path = os.path.join(DOCUMENTS_DIR, pdf_file)
	        chunks = chunk_pages(load_pdf(path))
	        all_chunks.extend(chunks)
	        print(f"[INFO] Loaded '{pdf_file}' → {len(chunks)} chunks")

	    print(f"[INFO] Total chunks: {len(all_chunks)}")
	    return all_chunks


	# ── Load a Single Uploaded PDF (for the Upload Tab) ──────────────────────────
	def load_uploaded_pdf(pdf_path: str) -> list[dict]:
	    """
	    Load one uploaded PDF and return its chunks.

	    Runs the file through load_pdf and chunk_pages, prints how many
	    chunks were produced, and returns them.
	    """
	    uploaded_chunks = chunk_pages(load_pdf(pdf_path))
	    print(f"[INFO] Uploaded PDF → {len(uploaded_chunks)} chunks")
	    return uploaded_chunks