Spaces:

jessica45
/

rag

Sleeping

App Files Files Community

rag / index_docs.py

jessica45

updated rag

5f04d6e verified 4 months ago

raw

history blame contribute delete

2.91 kB

	from typing import Optional
	from docx import Document
	try:
	import fitz # PyMuPDF
	except Exception:
	# fall back to pymupdf module name if present
	import pymupdf as fitz

	def load_pdf_text(file_path: str) -> str:
	    """Extract the text of every page of a PDF as a single string.

	    Pages that yield no text are skipped; the remaining page texts are
	    joined with newlines and the result is stripped of surrounding
	    whitespace.

	    Args:
	        file_path: Path of the PDF file to open.

	    Returns:
	        The extracted text, or "" if the file could not be opened or read
	        (the error is printed rather than raised).
	    """
	    try:
	        doc = fitz.open(file_path)
	        try:
	            page_texts = []
	            for page in doc:
	                try:
	                    page_text = page.get_text()
	                except Exception:
	                    # Best effort per page: try the alternate method name used
	                    # by older versions, otherwise treat the page as empty.
	                    page_text = page.getText() if hasattr(page, 'getText') else ''
	                if page_text:
	                    page_texts.append(page_text)
	        finally:
	            # Release the document even when reading a page failed part-way.
	            try:
	                doc.close()
	            except Exception:
	                # A failed close must not discard text that was already read.
	                pass
	        # Joining once avoids rebuilding the string for every page.
	        return "\n".join(page_texts).strip()
	    except Exception as e:
	        print(f"Error reading PDF {file_path}: {e}")
	        return ""


	def load_docx_text(file_path: str) -> str:
	    """Return the non-empty paragraphs of a DOCX file joined by newlines.

	    Args:
	        file_path: Path of the .docx file to read.

	    Returns:
	        The paragraph text stripped of surrounding whitespace, or "" when
	        the document cannot be read (the error is printed, not raised).
	    """
	    try:
	        document = Document(file_path)
	        # Read each paragraph's text once and drop the empty ones.
	        non_empty = filter(None, (para.text for para in document.paragraphs))
	        combined = "\n".join(non_empty)
	    except Exception as err:
	        print(f"Error reading DOCX {file_path}: {err}")
	        return ""
	    return combined.strip()


	def load_txt_text(file_path: str) -> str:
	    """Read a UTF-8 text file and return its content unchanged.

	    Args:
	        file_path: Path of the text file to read.

	    Returns:
	        The file content, or "" if the file cannot be opened or decoded
	        (the error is printed rather than raised).
	    """
	    try:
	        # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops
	        # a leading byte-order mark, which would otherwise end up as a stray
	        # '\ufeff' character at the start of the returned text.
	        with open(file_path, 'r', encoding='utf-8-sig') as f:
	            return f.read()
	    except Exception as e:
	        print(f"Error reading TXT {file_path}: {e}")
	        return ""


	def extract_text_from_path(path: str) -> Optional[str]:
	    """Pick a loader from the file extension and return the extracted text.

	    The extension check is case-insensitive. Supported suffixes are
	    '.pdf', '.docx' and '.txt'.

	    Args:
	        path: Path of the file to extract text from.

	    Returns:
	        Whatever the matching loader returns, or None when the suffix is
	        not one of the supported ones.
	    """
	    lowered = path.lower()
	    if lowered.endswith('.pdf'):
	        extracted = load_pdf_text(path)
	    elif lowered.endswith('.docx'):
	        extracted = load_docx_text(path)
	    elif lowered.endswith('.txt'):
	        extracted = load_txt_text(path)
	    else:
	        extracted = None
	    return extracted


	def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
	    """Split text into fixed-size character windows that overlap.

	    Consecutive chunks start ``chunk_size - overlap`` characters apart.
	    Chunking stops as soon as a window reaches the end of the text, so no
	    trailing chunk is emitted that lies entirely inside the previous one.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of characters per chunk; must be > 0.
	        overlap: Number of characters shared by neighbouring chunks; must
	            satisfy ``0 <= overlap < chunk_size``.

	    Returns:
	        The list of chunk strings, empty when ``text`` is empty.

	    Raises:
	        ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
	            non-positive step would otherwise never advance through the
	            text, and a negative overlap would silently skip characters.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    if not 0 <= overlap < chunk_size:
	        raise ValueError(
	            f"overlap must satisfy 0 <= overlap < chunk_size, "
	            f"got overlap={overlap}, chunk_size={chunk_size}"
	        )

	    step = chunk_size - overlap
	    text_length = len(text)
	    chunks = []
	    for start in range(0, text_length, step):
	        end = start + chunk_size
	        chunks.append(text[start:end])
	        if end >= text_length:
	            # This window already covers the tail; another one would only
	            # repeat characters contained in this chunk.
	            break
	    return chunks


	def main(argv: list) -> int:
	    """Command-line smoke test: extract text from a file and chunk it.

	    Args:
	        argv: Command-line arguments in ``sys.argv`` layout: program name,
	            the path to read, and an optional chunk size.

	    Returns:
	        The process exit code: 0 on success, 1 for missing or invalid
	        arguments or when no text could be extracted.
	    """
	    if len(argv) < 2:
	        print('Usage: python src/index_docs.py <path-to-file-or-folder> [chunk_size]')
	        return 1

	    path = argv[1]
	    # Same value as chunk_text's default overlap; passed explicitly so the
	    # validation below and the chunking call cannot disagree.
	    overlap = 100
	    chunk_size = 500
	    if len(argv) > 2:
	        try:
	            chunk_size = int(argv[2])
	        except ValueError:
	            # Report bad input instead of dying with a traceback.
	            print(f'chunk_size must be an integer, got: {argv[2]!r}')
	            return 1
	    if chunk_size <= overlap:
	        # With chunk_size <= overlap the chunking step would be zero or
	        # negative and the chunker would never reach the end of the text.
	        print(f'chunk_size must be greater than the overlap ({overlap}).')
	        return 1

	    print(f'Testing extraction for: {path}')
	    text = extract_text_from_path(path)
	    if not text:
	        print('No text extracted or unsupported file type.')
	        return 1

	    print('Characters extracted:', len(text))
	    chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
	    print('Chunks produced:', len(chunks))
	    if chunks:
	        preview = 300
	        print('\n--- First chunk preview ---')
	        print(chunks[0][:preview])
	        print('\n--- Second chunk preview ---')
	        print(chunks[1][:preview] if len(chunks) > 1 else '<none>')
	    return 0


	if __name__ == '__main__':
	    import sys

	    sys.exit(main(sys.argv))