Spaces:
Build error
Build error
Commit
·
20f0ac1
1
Parent(s):
2de9223
Redesigned interface
Browse files
Rebuilt the gradio interface
app.py
CHANGED
|
@@ -214,9 +214,7 @@ def format_output(extracted_values):
|
|
| 214 |
return output
|
| 215 |
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
def pdf_ocr(file):
|
| 220 |
# Convert PDF to image
|
| 221 |
with tempfile.TemporaryDirectory() as path:
|
| 222 |
with open(file, "rb") as f:
|
|
@@ -240,28 +238,64 @@ def pdf_ocr(file):
|
|
| 240 |
# Clear the image list to free up memory
|
| 241 |
del images
|
| 242 |
|
| 243 |
-
# Call extractor_clean and format_output functions
|
| 244 |
ks = ('mq', 'metri quadri', 'm2')
|
| 245 |
-
tra = 'it5/it5-base-question-answering'
|
| 246 |
quest = "Quanti metri quadri misura la superficie?"
|
| 247 |
totalK = ['totale', 'complessivo', 'complessiva']
|
| 248 |
|
| 249 |
-
extracted_values = extractor_clean(text=text, k_words=ks, transformer=
|
| 250 |
-
values_output = extracted_values[0][0]
|
| 251 |
-
total_output = extracted_values[0][1]
|
| 252 |
text_output = extracted_values[2]
|
| 253 |
|
| 254 |
-
|
|
|
|
| 255 |
|
| 256 |
-
|
|
|
|
|
|
|
| 257 |
# Call the pdf_ocr function
|
| 258 |
-
values, total, text = pdf_ocr(pdf_file.name)
|
| 259 |
return values, total, text
|
| 260 |
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
return output
|
| 215 |
|
| 216 |
|
| 217 |
+
def pdf_ocr(file, model_t, question):
|
|
|
|
|
|
|
| 218 |
# Convert PDF to image
|
| 219 |
with tempfile.TemporaryDirectory() as path:
|
| 220 |
with open(file, "rb") as f:
|
|
|
|
| 238 |
# Clear the image list to free up memory
|
| 239 |
del images
|
| 240 |
|
|
|
|
| 241 |
ks = ('mq', 'metri quadri', 'm2')
|
|
|
|
| 242 |
quest = "Quanti metri quadri misura la superficie?"
|
| 243 |
totalK = ['totale', 'complessivo', 'complessiva']
|
| 244 |
|
| 245 |
+
extracted_values = extractor_clean(text=text, k_words=ks, transformer=model_t, question=question, total_kwords=totalK, return_text=True)
|
| 246 |
+
values_output = extracted_values[0][0]
|
| 247 |
+
total_output = f'{extracted_values[0][1]} Mq'
|
| 248 |
text_output = extracted_values[2]
|
| 249 |
|
| 250 |
+
immobile_values = [f'{i + 1}. Immobile : {value} Mq\n' for i, value in enumerate(values_output)]
|
| 251 |
+
immobile_values = '\n'.join(immobile_values)
|
| 252 |
|
| 253 |
+
return immobile_values, total_output, text_output
|
| 254 |
+
|
| 255 |
+
def ocr_interface(pdf_file, model_t, question):
    """Callback wired to the "Extract" button: run ``pdf_ocr`` on an upload.

    Args:
        pdf_file: Uploaded-file object; only its ``.name`` attribute is read
            and handed to ``pdf_ocr`` as the file to open.
        model_t: Forwarded unchanged to ``pdf_ocr``.
        question: Forwarded unchanged to ``pdf_ocr``.

    Returns:
        The three values produced by ``pdf_ocr``, as a tuple in the same
        order (values, total, text).
    """
    # pdf_ocr takes the file by name, not the upload object itself.
    result_values, result_total, result_text = pdf_ocr(
        pdf_file.name, model_t, question
    )
    return result_values, result_total, result_text
|
| 259 |
|
| 260 |
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# Gradio UI: an "Extractor" tab (inputs, extracted values, trigger button) and a
# "Ref. Text" tab showing the text the values were extracted from.
# NOTE(review): the nesting of rows/columns was reconstructed from the statement
# order -- confirm it matches the intended layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    gr.Markdown(
        '''
        # PDF Mq Extractor
        Set the params and switch the tabs to see the output.
        ''')

    # `scroll_to_output` is an event-listener option, not a Tab option, so it
    # is passed to `.click(...)` below instead of to `gr.Tab`.
    with gr.Tab("Extractor"):
        with gr.Row():
            # Top-level components replace the deprecated gr.inputs.* /
            # gr.outputs.* namespaces, which are not meant for use in Blocks.
            pdf_input = gr.File(label="PDF File")

        with gr.Row():
            # Explicit initial values so the callback never receives an empty
            # model name or question when the user leaves the dropdowns alone.
            model_input = gr.Dropdown(
                choices=[
                    'it5/it5-base-question-answering',
                    'it5/it5-small-question-answering',
                ],
                value='it5/it5-base-question-answering',
                label='Select model',
            )
            question_input = gr.Dropdown(
                choices=["Quanti metri quadri misura l'immobile?"],
                value="Quanti metri quadri misura l'immobile?",
                label='Question',
            )

        with gr.Column():
            gr.Markdown(
                '''
                # Output values
                Values extracted from the pdf document
                ''')

        with gr.Row():
            values_output = gr.Textbox(label="Area Values")
            total_output = gr.Textbox(label="Total")

        with gr.Row():
            extract_button = gr.Button("Extract")

    with gr.Tab("Ref. Text"):
        text_output = gr.Textbox(label="Ref. Text")

    # Events must be registered inside the Blocks context.
    extract_button.click(
        fn=ocr_interface,
        inputs=[pdf_input, model_input, question_input],
        outputs=[values_output, total_output, text_output],
        scroll_to_output=True,
    )


demo.launch()
|
| 300 |
+
|
| 301 |
+
|