Spaces:

tensorlake
/

document-extractors

Sleeping

App Files Community

rishiraj committed on Jun 7, 2024

Commit

33f6a35

verified ·

1 Parent(s): 4b3bfb7

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -65

app.py CHANGED Viewed

@@ -23,49 +23,7 @@ def use_marker(pdf_filepath):
 	config = MarkdownExtractorConfig(batch_multiplier=2)
 	result = markdown_extractor.extract(content, config)
-	return str(result)
-@spaces.GPU
-def use_pdf_extractor(pdf_filepath):
-	if pdf_filepath is None:
-		raise gr.Error("Please provide some input PDF: upload an PDF file")
-	with open(pdf_filepath, "rb") as f:
-		pdf_data = f.read()
-	content = Content(content_type="application/pdf", data=pdf_data)
-	config = PDFExtractorConfig(output_types=["text", "table"])
-	result = pdf_extractor.extract(content, config)
-	return str(result)
-@spaces.GPU
-def use_gemini(pdf_filepath, key):
-	if pdf_filepath is None:
-		raise gr.Error("Please provide some input PDF: upload an PDF file")
-	with open(pdf_filepath, "rb") as f:
-		pdf_data = f.read()
-	content = Content(content_type="application/pdf", data=pdf_data)
-	config = GeminiExtractorConfig(prompt="Extract all text from the document.", model_name="gemini-1.5-flash", key=key)
-	result = gemini_extractor.extract(content, config)
-	return str(result)
-@spaces.GPU
-def use_openai(pdf_filepath, key):
-	if pdf_filepath is None:
-		raise gr.Error("Please provide some input PDF: upload an PDF file")
-	with open(pdf_filepath, "rb") as f:
-		pdf_data = f.read()
-	content = Content(content_type="application/pdf", data=pdf_data)
-	config = OAIExtractorConfig(prompt="Extract all text from the document.", model_name="gpt-4o", key=key)
-	result = oai_extractor.extract(content, config)
-	return str(result)
 with gr.Blocks(title="PDF data extraction with Marker & Indexify") as marker_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with Marker & <a href='https://getindexify.ai/'>Indexify</a></h1>")
@@ -83,7 +41,7 @@ with gr.Blocks(title="PDF data extraction with Marker & Indexify") as marker_dem
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
-			pdf_file_marker = gr.File(type="filepath")
 		with gr.Column():
 			gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
@@ -93,9 +51,9 @@ with gr.Blocks(title="PDF data extraction with Marker & Indexify") as marker_dem
 				variant="primary",
 			)
-			model_output_text_box_marker = gr.Textbox(
 				label="Extractor Output",
-				elem_id="model_output_text_box_marker",
 			)
 	with gr.Row():
@@ -109,10 +67,24 @@ with gr.Blocks(title="PDF data extraction with Marker & Indexify") as marker_dem
 	go_button.click(
 		fn=use_marker,
-		inputs = [pdf_file_marker],
-		outputs = [model_output_text_box_marker]
 	)
 with gr.Blocks(title="PDF data extraction with PDF Extractor & Indexify") as pdf_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with PDF Extractor & <a href='https://getindexify.ai/'>Indexify</a></h1>")
 	gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
@@ -129,7 +101,7 @@ with gr.Blocks(title="PDF data extraction with PDF Extractor & Indexify") as pdf
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
-			pdf_file_pdf = gr.File(type="filepath")
 		with gr.Column():
 			gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
@@ -139,9 +111,9 @@ with gr.Blocks(title="PDF data extraction with PDF Extractor & Indexify") as pdf
 				variant="primary",
 			)
-			model_output_text_box_pdf = gr.Textbox(
 				label="Extractor Output",
-				elem_id="model_output_text_box_pdf",
 			)
 	with gr.Row():
@@ -155,10 +127,23 @@ with gr.Blocks(title="PDF data extraction with PDF Extractor & Indexify") as pdf
 	go_button.click(
 		fn=use_pdf_extractor,
-		inputs = [pdf_file_pdf],
-		outputs = [model_output_text_box_pdf]
 	)
 with gr.Blocks(title="PDF data extraction with Gemini & Indexify") as gemini_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with Gemini & <a href='https://getindexify.ai/'>Indexify</a></h1>")
 	gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
@@ -175,11 +160,11 @@ with gr.Blocks(title="PDF data extraction with Gemini & Indexify") as gemini_dem
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
-			pdf_file_gemini = gr.File(type="filepath")
 			gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
-			key_gemini = gr.Textbox(
                 info="Please enter your GEMINI_API_KEY",
 				label="Key:"
 			)
@@ -192,9 +177,9 @@ with gr.Blocks(title="PDF data extraction with Gemini & Indexify") as gemini_dem
 				variant="primary",
 			)
-			model_output_text_box_gemini = gr.Textbox(
 				label="Extractor Output",
-				elem_id="model_output_text_box_gemini",
 			)
 	with gr.Row():
@@ -208,10 +193,23 @@ with gr.Blocks(title="PDF data extraction with Gemini & Indexify") as gemini_dem
 	go_button.click(
 		fn=use_gemini,
-		inputs = [pdf_file_gemini, key_gemini],
-		outputs = [model_output_text_box_gemini]
 	)
 with gr.Blocks(title="PDF data extraction with OpenAI & Indexify") as openai_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with OpenAI & <a href='https://getindexify.ai/'>Indexify</a></h1>")
 	gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
@@ -228,11 +226,11 @@ with gr.Blocks(title="PDF data extraction with OpenAI & Indexify") as openai_dem
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
-			pdf_file_oai = gr.File(type="filepath")
 			gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
-			key_oai = gr.Textbox(
                 info="Please enter your OPENAI_API_KEY",
 				label="Key:"
 			)
@@ -245,9 +243,9 @@ with gr.Blocks(title="PDF data extraction with OpenAI & Indexify") as openai_dem
 				variant="primary",
 			)
-			model_output_text_box_oai = gr.Textbox(
 				label="Extractor Output",
-				elem_id="model_output_text_box_oai",
 			)
 	with gr.Row():
@@ -261,8 +259,8 @@ with gr.Blocks(title="PDF data extraction with OpenAI & Indexify") as openai_dem
 	go_button.click(
 		fn=use_openai,
-		inputs = [pdf_file_oai, key_oai],
-		outputs = [model_output_text_box_oai]
 	)
 demo = gr.TabbedInterface([marker_demo, pdf_demo, gemini_demo, openai_demo], ["Marker Extractor", "PDF Extractor", "Gemini Extractor", "OpenAI Extractor"], theme=gr.themes.Soft())

 	config = MarkdownExtractorConfig(batch_multiplier=2)
 	result = markdown_extractor.extract(content, config)
+	return result
 with gr.Blocks(title="PDF data extraction with Marker & Indexify") as marker_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with Marker & <a href='https://getindexify.ai/'>Indexify</a></h1>")
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
+			pdf_file = gr.File(type="filepath")
 		with gr.Column():
 			gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
 				variant="primary",
 			)
+			model_output_text_box = gr.Textbox(
 				label="Extractor Output",
+				elem_id="model_output_text_box",
 			)
 	with gr.Row():
 	go_button.click(
 		fn=use_marker,
+		inputs = [pdf_file],
+		outputs = [model_output_text_box]
 	)
+@spaces.GPU
+def use_pdf_extractor(pdf_filepath):
+	if pdf_filepath is None:
+		raise gr.Error("Please provide some input PDF: upload an PDF file")
+	with open(pdf_filepath, "rb") as f:
+		pdf_data = f.read()
+	content = Content(content_type="application/pdf", data=pdf_data)
+	config = PDFExtractorConfig(output_types=["text", "table"])
+	result = pdf_extractor.extract(content, config)
+	return result
 with gr.Blocks(title="PDF data extraction with PDF Extractor & Indexify") as pdf_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with PDF Extractor & <a href='https://getindexify.ai/'>Indexify</a></h1>")
 	gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
+			pdf_file = gr.File(type="filepath")
 		with gr.Column():
 			gr.HTML("<p><b>Step 2:</b> Run the extractor.</p>")
 				variant="primary",
 			)
+			model_output_text_box = gr.Textbox(
 				label="Extractor Output",
+				elem_id="model_output_text_box",
 			)
 	with gr.Row():
 	go_button.click(
 		fn=use_pdf_extractor,
+		inputs = [pdf_file],
+		outputs = [model_output_text_box]
 	)
+def use_gemini(pdf_filepath, key):
+	if pdf_filepath is None:
+		raise gr.Error("Please provide some input PDF: upload an PDF file")
+	with open(pdf_filepath, "rb") as f:
+		pdf_data = f.read()
+	content = Content(content_type="application/pdf", data=pdf_data)
+	config = GeminiExtractorConfig(prompt="Extract all text from the document.", model_name="gemini-1.5-flash", key=key)
+	result = gemini_extractor.extract(content, config)
+	return result
 with gr.Blocks(title="PDF data extraction with Gemini & Indexify") as gemini_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with Gemini & <a href='https://getindexify.ai/'>Indexify</a></h1>")
 	gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
+			pdf_file = gr.File(type="filepath")
 			gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
+			key = gr.Textbox(
                 info="Please enter your GEMINI_API_KEY",
 				label="Key:"
 			)
 				variant="primary",
 			)
+			model_output_text_box = gr.Textbox(
 				label="Extractor Output",
+				elem_id="model_output_text_box",
 			)
 	with gr.Row():
 	go_button.click(
 		fn=use_gemini,
+		inputs = [pdf_file, key],
+		outputs = [model_output_text_box]
 	)
+def use_openai(pdf_filepath, key):
+	if pdf_filepath is None:
+		raise gr.Error("Please provide some input PDF: upload an PDF file")
+	with open(pdf_filepath, "rb") as f:
+		pdf_data = f.read()
+	content = Content(content_type="application/pdf", data=pdf_data)
+	config = OAIExtractorConfig(prompt="Extract all text from the document.", model_name="gpt-4o", key=key)
+	result = oai_extractor.extract(content, config)
+	return result
 with gr.Blocks(title="PDF data extraction with OpenAI & Indexify") as openai_demo:
 	gr.HTML("<h1 style='text-align: center'>PDF data extraction with OpenAI & <a href='https://getindexify.ai/'>Indexify</a></h1>")
 	gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
 				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
 			)
+			pdf_file = gr.File(type="filepath")
 			gr.HTML("<p><b>Step 2:</b> Enter your API key.</p>")
+			key = gr.Textbox(
                 info="Please enter your OPENAI_API_KEY",
 				label="Key:"
 			)
 				variant="primary",
 			)
+			model_output_text_box = gr.Textbox(
 				label="Extractor Output",
+				elem_id="model_output_text_box",
 			)
 	with gr.Row():
 	go_button.click(
 		fn=use_openai,
+		inputs = [pdf_file, key],
+		outputs = [model_output_text_box]
 	)
 demo = gr.TabbedInterface([marker_demo, pdf_demo, gemini_demo, openai_demo], ["Marker Extractor", "PDF Extractor", "Gemini Extractor", "OpenAI Extractor"], theme=gr.themes.Soft())