Spaces:

Rivalcoder
/

rag-hackrx

Sleeping

rag-hackrx / Extraction_Models /document_extractor.py

Rivalcoder

add For Hosting

cddddfc 4 months ago

2.02 kB

	import fitz
	from concurrent.futures import ThreadPoolExecutor

	def _extract_text(page):
	text = page.get_text()
	return text.strip() if text and text.strip() else None

	def parse_pdf_from_url_multithreaded(content, max_workers=2, chunk_size=1):
	try:
	with fitz.open(stream=content, filetype="pdf") as doc:
	pages = list(doc)
	with ThreadPoolExecutor(max_workers=max_workers) as executor:
	texts = list(executor.map(_extract_text, pages))
	if chunk_size > 1:
	chunks = []
	for i in range(0, len(texts), chunk_size):
	chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
	if chunk:
	chunks.append(chunk)
	return chunks if chunks else ["No data found in this document (empty PDF)"]
	return [t for t in texts if t] or ["No data found in this document (empty PDF)"]
	except Exception as e:
	print(f"Failed to parse as PDF: {str(e)}")
	return [f"No data found in this document (not PDF or corrupted)"]

	def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
	try:
	with fitz.open(file_path) as doc:
	pages = list(doc)
	with ThreadPoolExecutor(max_workers=max_workers) as executor:
	texts = list(executor.map(_extract_text, pages))
	if chunk_size > 1:
	chunks = []
	for i in range(0, len(texts), chunk_size):
	chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
	if chunk:
	chunks.append(chunk)
	return chunks if chunks else ["No data found in this document (local PDF empty)"]
	return [t for t in texts if t] or ["No data found in this document (local PDF empty)"]
	except Exception as e:
	print(f"Failed to open local file: {str(e)}")
	return [f"No data found in this document (local file error)"]