Spaces:

LeonceNsh
/

kora-synth

Sleeping

kora-synth / pdf_parser.py

Upload folder using huggingface_hub

26cf0a8 verified 5 months ago

1.25 kB

	import os
	import tempfile
	import nest_asyncio
	from llama_parse import LlamaParse
	from llama_index.core.node_parser import SimpleNodeParser
	from dotenv import load_dotenv

	def pdf_parser(pdf_file):
	    """Parse an uploaded PDF into text chunks using LlamaParse.

	    The PDF bytes are spooled to a temporary ``.pdf`` file on disk, sent
	    through ``LlamaParse`` (``result_type="text"``), and the resulting
	    documents are split into nodes with ``SimpleNodeParser``
	    (``chunk_size=500``, ``chunk_overlap=20``).

	    Args:
	        pdf_file: A binary file-like object; ``pdf_file.read()`` must return
	            the PDF contents as bytes.

	    Returns:
	        A list of dicts, one per node, each with the keys ``'text'`` (the
	        node content) and ``'page'`` (the node's ``page_label`` metadata,
	        or ``'N/A'`` when that key is absent).

	    Raises:
	        ValueError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment
	            (after loading a ``.env`` file via ``load_dotenv``).
	    """
	    load_dotenv()
	    api_key = os.getenv("LLAMA_CLOUD_API_KEY")
	    if not api_key:
	        raise ValueError("LLAMA_CLOUD_API_KEY is not set in the environment variables.")

	    nest_asyncio.apply()

	    # delete=False so the file survives being closed and can be re-opened by
	    # path by the parser; we take responsibility for removing it below.
	    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
	    temp_file_path = temp_file.name
	    try:
	        # Close (and flush) the handle before the parser reads the path.
	        with temp_file:
	            temp_file.write(pdf_file.read())

	        parser = LlamaParse(api_key=api_key, result_type="text", verbose=True)
	        documents = parser.load_data(temp_file_path)

	        node_parser = SimpleNodeParser.from_defaults(chunk_size=500, chunk_overlap=20)
	        nodes = node_parser.get_nodes_from_documents(documents)

	        pdf_content = [
	            {
	                'text': node.get_content(),
	                'page': node.metadata.get('page_label', 'N/A'),
	            }
	            for node in nodes
	        ]
	    finally:
	        # Always remove the temp file, even when reading or parsing fails;
	        # previously an exception left the file behind on disk.
	        os.unlink(temp_file_path)

	    print(f"Parsed {len(pdf_content)} chunks from the PDF.")
	    print(f"{pdf_content}")
	    return pdf_content


	if __name__ == "__main__":
	    # Manual smoke test: parse a sample report from the working directory.
	    # The context manager closes the file handle once parsing finishes
	    # (or fails), instead of leaving it open until interpreter exit.
	    with open("FP-Juliett-Final-Report.pdf", "rb") as sample_pdf:
	        pdf_parser(sample_pdf)